feat: add Telegram integration with forum topic support and creator tracking

Adds full Telegram ZIP ingestion pipeline: TDLib worker service scans source
channels for archive files, deduplicates by content hash, extracts metadata,
uploads to archive channel, and indexes in Postgres. Forum supergroups are
scanned per-topic with topic names used as creator. Filename-based creator
extraction (e.g. "Mammoth Factory - 2026-01.zip") serves as fallback.

Includes admin UI for managing accounts/channels, simplified account setup
(API credentials via env vars), auth code/password submission dialog,
package browser with creator column, and live ingestion activity tracking.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
xCyanGrizzly
2026-02-24 16:02:06 +01:00
parent beb9cfb312
commit b427193d17
70 changed files with 8627 additions and 2 deletions

View File

@@ -0,0 +1,21 @@
/**
 * Extract a creator name from common archive file naming patterns.
 *
 * Priority in the worker: topic name > filename extraction.
 * This is the fallback when no forum topic name is available.
 *
 * Patterns handled (split on ` - `):
 *   "Mammoth Factory - 2026-01.zip"        → "Mammoth Factory"
 *   "Artist Name - Pack Title.part01.rar"  → "Artist Name"
 *   "some_random_file.zip"                 → null
 */
export function extractCreatorFromFileName(fileName: string): string | null {
  // Drop trailing archive extensions (.zip, .rar, .part01.rar, .z01, …)
  const withoutExtension = fileName.replace(
    /(\.(part\d+\.rar|z\d{2}|zip|rar))+$/i,
    ""
  );
  // The creator is everything before the first " - " separator; a separator
  // at position 0 (or no separator at all) means there is no creator prefix.
  const separatorIndex = withoutExtension.indexOf(" - ");
  if (separatorIndex < 1) return null;
  const candidate = withoutExtension.substring(0, separatorIndex).trim();
  return candidate.length > 0 ? candidate : null;
}

View File

@@ -0,0 +1,96 @@
export type ArchiveFormat = "ZIP" | "RAR";
export interface MultipartInfo {
  baseName: string;
  partNumber: number;
  format: ArchiveFormat;
  pattern: "ZIP_NUMBERED" | "ZIP_LEGACY" | "RAR_PART" | "RAR_LEGACY" | "SINGLE";
}

/** A recognizer for one multipart archive naming convention. */
interface MultipartRule {
  regex: RegExp;
  format: ArchiveFormat;
  pattern: MultipartInfo["pattern"];
  getBaseName: (match: RegExpMatchArray) => string;
  getPartNumber: (match: RegExpMatchArray) => number;
}

/** Multipart naming conventions, tried in order. */
const MULTIPART_RULES: MultipartRule[] = [
  // pack.zip.001, pack.zip.002 — numeric suffix after the full .zip name
  {
    regex: /^(.+\.zip)\.(\d{3,})$/i,
    format: "ZIP",
    pattern: "ZIP_NUMBERED",
    getBaseName: (m) => m[1],
    getPartNumber: (m) => parseInt(m[2], 10),
  },
  // pack.z01, pack.z02 (legacy split — final part is pack.zip)
  {
    regex: /^(.+)\.z(\d{2,})$/i,
    format: "ZIP",
    pattern: "ZIP_LEGACY",
    getBaseName: (m) => m[1],
    getPartNumber: (m) => parseInt(m[2], 10),
  },
  // pack.part1.rar, pack.part2.rar
  {
    regex: /^(.+)\.part(\d+)\.rar$/i,
    format: "RAR",
    pattern: "RAR_PART",
    getBaseName: (m) => m[1],
    getPartNumber: (m) => parseInt(m[2], 10),
  },
  // pack.r00, pack.r01 (legacy split — final part is pack.rar)
  {
    regex: /^(.+)\.r(\d{2,})$/i,
    format: "RAR",
    pattern: "RAR_LEGACY",
    getBaseName: (m) => m[1],
    getPartNumber: (m) => parseInt(m[2], 10),
  },
];

/**
 * Build the info record for a standalone archive. partNumber -1 signals
 * "could be a single file OR the final part of a legacy split set".
 */
function singlePartInfo(
  fileName: string,
  stripRe: RegExp,
  format: ArchiveFormat
): MultipartInfo {
  return {
    baseName: fileName.replace(stripRe, ""),
    partNumber: -1,
    format,
    pattern: "SINGLE",
  };
}

/**
 * Detect whether a filename is an archive and, if so, extract multipart info.
 * Returns null for names that match no known archive convention.
 */
export function detectArchive(fileName: string): MultipartInfo | null {
  // Explicit multipart conventions take precedence over the bare extensions.
  for (const rule of MULTIPART_RULES) {
    const match = fileName.match(rule.regex);
    if (!match) continue;
    return {
      baseName: rule.getBaseName(match),
      partNumber: rule.getPartNumber(match),
      format: rule.format,
      pattern: rule.pattern,
    };
  }
  // Bare .zip — standalone, or the final part of a ZIP_LEGACY set
  if (/\.zip$/i.test(fileName)) return singlePartInfo(fileName, /\.zip$/i, "ZIP");
  // Bare .rar — standalone, or the final part of a RAR_LEGACY set
  if (/\.rar$/i.test(fileName)) return singlePartInfo(fileName, /\.rar$/i, "RAR");
  return null;
}

/** True when the filename matches any archive convention we should process. */
export function isArchiveAttachment(fileName: string): boolean {
  return Boolean(detectArchive(fileName));
}

View File

@@ -0,0 +1,25 @@
import { createReadStream } from "fs";
import { createHash } from "crypto";
import { pipeline } from "stream/promises";
import { PassThrough } from "stream";
/**
 * Compute SHA-256 hash of one or more files by streaming them in order.
 * Memory usage: O(1) — chunks are folded into the hash as they are read.
 * For multipart archives, pass all parts sorted by part number; the result
 * is the digest over the concatenated bytes of every file.
 */
export async function hashParts(filePaths: string[]): Promise<string> {
  const digest = createHash("sha256");
  for (const partPath of filePaths) {
    // Async iteration over the read stream yields Buffer chunks and
    // propagates stream errors as rejections.
    for await (const chunk of createReadStream(partPath)) {
      digest.update(chunk);
    }
  }
  return digest.digest("hex");
}

View File

@@ -0,0 +1,100 @@
import { detectArchive, type ArchiveFormat, type MultipartInfo } from "./detect.js";
import { config } from "../util/config.js";
import { childLogger } from "../util/logger.js";
const log = childLogger("multipart");
/** Minimal projection of a Telegram message carrying an archive attachment. */
export interface TelegramMessage {
  /** Telegram message ID. */
  id: bigint;
  /** Attachment filename as posted. */
  fileName: string;
  /** TDLib file identifier, used later for downloading. */
  fileId: string;
  /** Attachment size in bytes. */
  fileSize: bigint;
  /** Message timestamp. */
  date: Date;
}
/** One logical archive: a single file, or an ordered multipart set. */
export interface ArchiveSet {
  type: ArchiveFormat;
  /** Shared base name the parts were grouped under. */
  baseName: string;
  /** Member messages; for multipart sets, sorted by part number. */
  parts: TelegramMessage[];
  isMultipart: boolean;
}
/**
 * Group messages into archive sets (single files + multipart groups).
 * Messages should be pre-filtered to only include archive attachments.
 *
 * Grouping key is format + lowercased base name. A group containing any
 * explicit multipart entry becomes one multipart set (absorbing SINGLE
 * entries with the same base name — they may be the final part of a legacy
 * split); a group of only SINGLE entries yields one set per entry.
 * Multipart groups whose messages span longer than the configured timeout
 * are skipped entirely.
 */
export function groupArchiveSets(messages: TelegramMessage[]): ArchiveSet[] {
  type Annotated = { msg: TelegramMessage; info: MultipartInfo };

  // Annotate each message with its detection result, dropping non-archives.
  const annotated = messages
    .map((msg) => ({ msg, info: detectArchive(msg.fileName) }))
    .filter((x): x is Annotated => x.info !== null);

  // Bucket by format + base name (case-insensitive).
  const byKey = new Map<string, Annotated[]>();
  for (const entry of annotated) {
    const key = `${entry.info.format}:${entry.info.baseName.toLowerCase()}`;
    const bucket = byKey.get(key);
    if (bucket) {
      bucket.push(entry);
    } else {
      byKey.set(key, [entry]);
    }
  }

  const sets: ArchiveSet[] = [];
  for (const bucket of byKey.values()) {
    const format = bucket[0].info.format;
    const baseName = bucket[0].info.baseName;
    const explicitParts = bucket.filter((e) => e.info.pattern !== "SINGLE");
    const singles = bucket.filter((e) => e.info.pattern === "SINGLE");

    if (explicitParts.length === 0) {
      // Only standalone archives — each one is its own set.
      for (const single of singles) {
        sets.push({
          type: format,
          baseName: single.info.baseName,
          parts: [single.msg],
          isMultipart: false,
        });
      }
      continue;
    }

    // Multipart set: singles with the same base name may be the final part
    // of a legacy split, so they join the set.
    const members = [...explicitParts, ...singles];

    // Reject sets whose parts were posted too far apart in time.
    const timestamps = members.map((e) => e.msg.date.getTime());
    const spanMs = Math.max(...timestamps) - Math.min(...timestamps);
    const allowedMs = config.multipartTimeoutHours * 60 * 60 * 1000;
    if (spanMs > allowedMs) {
      log.warn(
        { baseName, format, span: spanMs / 3600000 },
        "Multipart set spans too long, skipping"
      );
      continue;
    }

    // Order by part number; SINGLE entries (partNumber -1) sort last —
    // they are the final part of a legacy split.
    const sortKey = (e: Annotated) =>
      e.info.partNumber === -1 ? 999999 : e.info.partNumber;
    members.sort((a, b) => sortKey(a) - sortKey(b));

    sets.push({
      type: format,
      baseName,
      parts: members.map((e) => e.msg),
      isMultipart: true,
    });
  }
  return sets;
}

View File

@@ -0,0 +1,90 @@
import { execFile } from "child_process";
import { promisify } from "util";
import path from "path";
import { childLogger } from "../util/logger.js";
import type { FileEntry } from "./zip-reader.js";
const execFileAsync = promisify(execFile);
const log = childLogger("rar-reader");

/**
 * List the contents of a RAR archive via the `unrar` CLI without extracting.
 * unrar automatically discovers sibling parts when they're co-located.
 * Any failure (missing binary, timeout, corrupt archive, oversized listing)
 * is logged and resolves to an empty list.
 */
export async function readRarContents(
  firstPartPath: string
): Promise<FileEntry[]> {
  const execOptions = {
    timeout: 30_000,
    maxBuffer: 10 * 1024 * 1024, // 10MB of listing output for very large archives
  };
  try {
    const listing = await execFileAsync(
      "unrar",
      ["l", "-v", firstPartPath],
      execOptions
    );
    return parseUnrarOutput(listing.stdout);
  } catch (err) {
    log.warn({ err, file: firstPartPath }, "Failed to read RAR contents");
    return []; // fallback: best effort, never throw
  }
}
/**
 * Parse the tabular output of `unrar l -v`.
 *
 * Example output format:
 *   Archive: test.rar
 *   Details: RAR 5
 *
 *   Attributes      Size    Packed Ratio   Date   Time   CRC-32  Name
 *   -----------  --------- --------- ----- -------- -----  -------- ----
 *      ..A....       12345     10234  83%  2024-01-15 10:30  DEADBEEF folder/file.stl
 *   -----------  --------- --------- ----- -------- -----  -------- ----
 *
 * Only rows between the first and second separator line are file entries;
 * directory rows (trailing slash/backslash) are skipped.
 */
const RAR_ENTRY_LINE =
  /^\S+\s+(\d+)\s+(\d+)\s+\d+%\s+\S+\s+\S+\s+([0-9A-Fa-f]+)\s+(.+)$/;

function parseUnrarOutput(output: string): FileEntry[] {
  const entries: FileEntry[] = [];
  let separatorsSeen = 0;
  for (const rawLine of output.split("\n")) {
    const line = rawLine.trim();
    // Separator lines (runs of dashes) delimit the file table.
    if (/^-{5,}/.test(line)) {
      separatorsSeen++;
      continue;
    }
    // Inside the table only after the first separator and before the second.
    if (separatorsSeen !== 1) continue;

    const m = line.match(RAR_ENTRY_LINE);
    if (!m) continue;
    const [, sizeStr, packedStr, crcHex, entryPath] = m;
    // Directory entries end with a path separator — skip them.
    if (entryPath.endsWith("/") || entryPath.endsWith("\\")) continue;

    const ext = path.extname(entryPath).toLowerCase();
    entries.push({
      path: entryPath,
      fileName: path.basename(entryPath),
      extension: ext ? ext.slice(1) : null, // without the leading dot
      compressedSize: BigInt(packedStr),
      uncompressedSize: BigInt(sizeStr),
      crc32: crcHex.toLowerCase(),
    });
  }
  return entries;
}

View File

@@ -0,0 +1,48 @@
import { createReadStream, createWriteStream } from "fs";
import { stat } from "fs/promises";
import path from "path";
import { pipeline } from "stream/promises";
import { childLogger } from "../util/logger.js";
const log = childLogger("split");
/** 2GB in bytes — Telegram's file size limit */
const MAX_PART_SIZE = 2n * 1024n * 1024n * 1024n;

/**
 * Split a file into ≤2GB parts using byte-level splitting (parts are named
 * `<original>.001`, `<original>.002`, …). Returns paths to the split parts;
 * if the file is already ≤2GB, returns the original path untouched.
 */
export async function byteLevelSplit(filePath: string): Promise<string[]> {
  const { size } = await stat(filePath);
  if (BigInt(size) <= MAX_PART_SIZE) {
    return [filePath];
  }

  const chunkSize = Number(MAX_PART_SIZE);
  const partCount = Math.ceil(size / chunkSize);
  const dir = path.dirname(filePath);
  const base = path.basename(filePath);
  log.info({ filePath, fileSize: size, totalParts: partCount }, "Splitting file");

  const produced: string[] = [];
  for (let index = 0; index < partCount; index++) {
    const suffix = String(index + 1).padStart(3, "0");
    const target = path.join(dir, `${base}.${suffix}`);
    // Inclusive byte range for this part; the last part is clamped to EOF.
    const start = index * chunkSize;
    const end = Math.min(start + chunkSize - 1, size - 1);
    await pipeline(
      createReadStream(filePath, { start, end }),
      createWriteStream(target)
    );
    produced.push(target);
  }
  log.info({ filePath, parts: produced.length }, "File split complete");
  return produced;
}

View File

@@ -0,0 +1,61 @@
import yauzl from "yauzl";
import path from "path";
import { childLogger } from "../util/logger.js";
const log = childLogger("zip-reader");
/** Metadata for one file inside an archive (shared by the ZIP and RAR readers). */
export interface FileEntry {
  /** Full path inside the archive. */
  path: string;
  /** Basename of `path`. */
  fileName: string;
  /** Lowercased extension without the dot, or null when the name has none. */
  extension: string | null;
  compressedSize: bigint;
  uncompressedSize: bigint;
  /** Lowercase hex CRC-32, or null when unavailable. */
  crc32: string | null;
}
/**
 * Read the central directory of a ZIP file without extracting any contents.
 * For multipart ZIPs, pass the paths sorted by part order — the central
 * directory lives at the end of the archive, so only the last part is opened.
 * Errors are non-fatal: the promise resolves with whatever entries were
 * collected so far (possibly an empty list).
 */
export async function readZipCentralDirectory(
  filePaths: string[]
): Promise<FileEntry[]> {
  const lastPart = filePaths[filePaths.length - 1];
  return new Promise((resolve) => {
    yauzl.open(lastPart, { lazyEntries: true, autoClose: true }, (err, zipFile) => {
      if (err) {
        log.warn({ err, file: lastPart }, "Failed to open ZIP for reading");
        resolve([]); // fallback: best effort, never reject
        return;
      }
      const collected: FileEntry[] = [];
      zipFile.on("entry", (entry: yauzl.Entry) => {
        // Directory entries end with "/" and carry no file data.
        if (!entry.fileName.endsWith("/")) {
          collected.push(toFileEntry(entry));
        }
        zipFile.readEntry(); // lazyEntries mode: pull the next entry
      });
      zipFile.on("end", () => resolve(collected));
      zipFile.on("error", (error) => {
        log.warn({ error, file: lastPart }, "Error reading ZIP entries");
        resolve(collected); // return whatever we got
      });
      zipFile.readEntry(); // kick off iteration
    });
  });
}

/** Map a yauzl entry to our shared FileEntry shape. */
function toFileEntry(entry: yauzl.Entry): FileEntry {
  const ext = path.extname(entry.fileName).toLowerCase();
  return {
    path: entry.fileName,
    fileName: path.basename(entry.fileName),
    extension: ext ? ext.slice(1) : null, // without the leading dot
    compressedSize: BigInt(entry.compressedSize),
    uncompressedSize: BigInt(entry.uncompressedSize),
    // A zero CRC is treated as "absent" (mirrors the original behavior;
    // NOTE(review): a real CRC-32 can legitimately be 0).
    crc32: entry.crc32 !== 0 ? entry.crc32.toString(16).padStart(8, "0") : null,
  };
}

14
worker/src/db/client.ts Normal file
View File

@@ -0,0 +1,14 @@
import { PrismaClient } from "@prisma/client";
import { PrismaPg } from "@prisma/adapter-pg";
import pg from "pg";
import { config } from "../util/config.js";
// Single shared pg pool; kept small — the worker processes accounts sequentially.
const pool = new pg.Pool({
  connectionString: config.databaseUrl,
  max: 5,
});
// Prisma rides on the same pg pool via the driver adapter.
const adapter = new PrismaPg(pool);
export const db = new PrismaClient({ adapter });
// The raw pool is exported for SQL Prisma can't express (e.g. advisory locks).
export { pool };

56
worker/src/db/locks.ts Normal file
View File

@@ -0,0 +1,56 @@
import { pool } from "./client.js";
import { childLogger } from "../util/logger.js";
const log = childLogger("locks");
/**
 * Derive a stable lock ID from an account ID string using the classic
 * 31-multiplier string hash, reduced to an unsigned 32-bit integer.
 *
 * `>>> 0` (rather than the previous Math.abs) is used for the final
 * reduction: Math.abs(-2147483648) is 2147483648, which does not fit in a
 * signed 32-bit int, and abs also collapses +h/-h pairs onto the same ID.
 */
function hashToLockId(accountId: string): number {
  let hash = 0;
  for (let i = 0; i < accountId.length; i++) {
    hash = (hash << 5) - hash + accountId.charCodeAt(i); // hash * 31 + char
    hash |= 0; // keep intermediate values in 32-bit range
  }
  return hash >>> 0;
}
/**
 * Clients currently holding an advisory lock, keyed by account ID.
 *
 * PostgreSQL advisory locks are *session*-scoped: a lock must be released on
 * the same connection that acquired it. The previous implementation released
 * the client back to the pool right after acquiring, so (a) the pool could
 * hand the lock-holding session to other queries, and (b) releaseLock ran on
 * a potentially different session, where pg_advisory_unlock fails and the
 * lock stayed held until that pooled connection closed.
 */
const lockHolders = new Map<string, Awaited<ReturnType<typeof pool.connect>>>();
/**
 * Try to acquire a PostgreSQL advisory lock for an account.
 * Returns true if acquired, false if already held by another session.
 * On success the underlying client stays checked out of the pool until
 * releaseLock() is called for the same account.
 */
export async function tryAcquireLock(accountId: string): Promise<boolean> {
  const lockId = hashToLockId(accountId);
  const client = await pool.connect();
  let acquired = false;
  try {
    const result = await client.query<{ pg_try_advisory_lock: boolean }>(
      "SELECT pg_try_advisory_lock($1)",
      [lockId]
    );
    acquired = result.rows[0]?.pg_try_advisory_lock ?? false;
    if (acquired) {
      // Hold on to this client — the lock lives on its session.
      lockHolders.set(accountId, client);
      log.debug({ accountId, lockId }, "Advisory lock acquired");
    } else {
      log.debug({ accountId, lockId }, "Advisory lock already held");
    }
    return acquired;
  } finally {
    // Only return the client to the pool when we do NOT hold the lock.
    if (!acquired) client.release();
  }
}
/**
 * Release the advisory lock for an account on the session that acquired it,
 * then return that client to the pool. No-op when this process holds no lock
 * for the account.
 */
export async function releaseLock(accountId: string): Promise<void> {
  const client = lockHolders.get(accountId);
  if (!client) {
    log.debug({ accountId }, "No advisory lock held by this process");
    return;
  }
  lockHolders.delete(accountId);
  const lockId = hashToLockId(accountId);
  try {
    await client.query("SELECT pg_advisory_unlock($1)", [lockId]);
    log.debug({ accountId, lockId }, "Advisory lock released");
  } finally {
    client.release();
  }
}

270
worker/src/db/queries.ts Normal file
View File

@@ -0,0 +1,270 @@
import { db } from "./client.js";
import type { ArchiveType } from "@prisma/client";
/** Accounts eligible for an ingestion cycle: active AND fully authenticated. */
export async function getActiveAccounts() {
  return db.telegramAccount.findMany({
    where: { isActive: true, authState: "AUTHENTICATED" },
  });
}
/**
 * READER mappings for this account onto active SOURCE channels — the
 * channels the account scans for archives. Channel row is included.
 */
export async function getSourceChannelMappings(accountId: string) {
  return db.accountChannelMap.findMany({
    where: {
      accountId,
      role: "READER",
      channel: { type: "SOURCE", isActive: true },
    },
    include: { channel: true },
  });
}
/**
 * The account's active DESTINATION (archive upload) channel via its WRITER
 * mapping, or null when none is configured.
 */
export async function getDestinationChannel(accountId: string) {
  const mapping = await db.accountChannelMap.findFirst({
    where: {
      accountId,
      role: "WRITER",
      channel: { type: "DESTINATION", isActive: true },
    },
    include: { channel: true },
  });
  return mapping?.channel ?? null;
}
/** Dedup check: true when a package with this content hash is already indexed. */
export async function packageExistsByHash(contentHash: string) {
  const pkg = await db.package.findUnique({
    where: { contentHash },
    select: { id: true },
  });
  return pkg !== null;
}
/** Input for createPackageWithFiles: one indexed archive plus its file listing. */
export interface CreatePackageInput {
  /** SHA-256 over all archive parts, used for deduplication. */
  contentHash: string;
  fileName: string;
  fileSize: bigint;
  archiveType: ArchiveType;
  sourceChannelId: string;
  sourceMessageId: bigint;
  /** Forum topic the archive was found in, when the source is a forum. */
  sourceTopicId?: bigint | null;
  destChannelId?: string;
  destMessageId?: bigint;
  isMultipart: boolean;
  partCount: number;
  ingestionRunId: string;
  /** Creator name (forum topic name, or derived from the filename), if known. */
  creator?: string | null;
  /** Raw preview-image bytes, when a preview photo was matched. */
  previewData?: Buffer | null;
  previewMsgId?: bigint | null;
  files: {
    path: string;
    fileName: string;
    extension: string | null;
    compressedSize: bigint;
    uncompressedSize: bigint;
    crc32: string | null;
  }[];
}
/**
 * Create the package row together with its nested file rows in one query.
 * `?? undefined` (and plain undefined) leave optional columns unset so they
 * take their DB defaults instead of being written as NULL.
 */
export async function createPackageWithFiles(input: CreatePackageInput) {
  return db.package.create({
    data: {
      contentHash: input.contentHash,
      fileName: input.fileName,
      fileSize: input.fileSize,
      archiveType: input.archiveType,
      sourceChannelId: input.sourceChannelId,
      sourceMessageId: input.sourceMessageId,
      sourceTopicId: input.sourceTopicId ?? undefined,
      destChannelId: input.destChannelId,
      destMessageId: input.destMessageId,
      isMultipart: input.isMultipart,
      partCount: input.partCount,
      fileCount: input.files.length,
      ingestionRunId: input.ingestionRunId,
      creator: input.creator ?? undefined,
      // Prisma bytes columns take a Uint8Array
      previewData: input.previewData ? new Uint8Array(input.previewData) : undefined,
      previewMsgId: input.previewMsgId ?? undefined,
      files: {
        create: input.files,
      },
    },
  });
}
/** Open a new RUNNING ingestion run row for an account. */
export async function createIngestionRun(accountId: string) {
  return db.ingestionRun.create({
    data: {
      accountId,
      status: "RUNNING",
      currentActivity: "Starting ingestion run",
      currentStep: "initializing",
      lastActivityAt: new Date(),
    },
  });
}
/**
 * Live-progress snapshot written while a run is executing.
 * Fields left undefined are not touched in the DB.
 */
export interface ActivityUpdate {
  currentActivity: string;
  currentStep: string;
  currentChannel?: string | null;
  currentFile?: string | null;
  currentFileNum?: number | null;
  totalFiles?: number | null;
  downloadedBytes?: bigint | null;
  totalBytes?: bigint | null;
  downloadPercent?: number | null;
  messagesScanned?: number;
  zipsFound?: number;
  zipsDuplicate?: number;
  zipsIngested?: number;
}
/**
 * Write a live-activity update and bump lastActivityAt.
 *
 * NOTE(review): `?? undefined` means passing null here does NOT clear a
 * column — it leaves the stored value unchanged. Activity columns are only
 * cleared when the run reaches a terminal state (see CLEAR_ACTIVITY).
 * Counter fields are written only when explicitly provided.
 */
export async function updateRunActivity(
  runId: string,
  activity: ActivityUpdate
) {
  return db.ingestionRun.update({
    where: { id: runId },
    data: {
      currentActivity: activity.currentActivity,
      currentStep: activity.currentStep,
      currentChannel: activity.currentChannel ?? undefined,
      currentFile: activity.currentFile ?? undefined,
      currentFileNum: activity.currentFileNum ?? undefined,
      totalFiles: activity.totalFiles ?? undefined,
      downloadedBytes: activity.downloadedBytes ?? undefined,
      totalBytes: activity.totalBytes ?? undefined,
      downloadPercent: activity.downloadPercent ?? undefined,
      lastActivityAt: new Date(),
      ...(activity.messagesScanned !== undefined && { messagesScanned: activity.messagesScanned }),
      ...(activity.zipsFound !== undefined && { zipsFound: activity.zipsFound }),
      ...(activity.zipsDuplicate !== undefined && { zipsDuplicate: activity.zipsDuplicate }),
      ...(activity.zipsIngested !== undefined && { zipsIngested: activity.zipsIngested }),
    },
  });
}
/**
 * Column values that reset the live-activity fields when a run reaches a
 * terminal state. Deliberately does NOT include lastActivityAt: this object
 * is evaluated once at module load, so a timestamp captured here would be
 * frozen at process start. (The previous version included
 * `lastActivityAt: new Date()` here, which stamped every completed/failed
 * run with the worker's boot time instead of the finish time.)
 */
const CLEAR_ACTIVITY = {
  currentActivity: null,
  currentStep: null,
  currentChannel: null,
  currentFile: null,
  currentFileNum: null,
  totalFiles: null,
  downloadedBytes: null,
  totalBytes: null,
  downloadPercent: null,
};
/**
 * Mark a run COMPLETED, persist its final counters, and clear live activity.
 */
export async function completeIngestionRun(
  runId: string,
  counters: {
    messagesScanned: number;
    zipsFound: number;
    zipsDuplicate: number;
    zipsIngested: number;
  }
) {
  return db.ingestionRun.update({
    where: { id: runId },
    data: {
      status: "COMPLETED",
      finishedAt: new Date(),
      lastActivityAt: new Date(), // stamped per call, not at module load
      ...counters,
      ...CLEAR_ACTIVITY,
    },
  });
}
/** Mark a run FAILED with an error message and clear live activity. */
export async function failIngestionRun(runId: string, errorMessage: string) {
  return db.ingestionRun.update({
    where: { id: runId },
    data: {
      status: "FAILED",
      finishedAt: new Date(),
      lastActivityAt: new Date(), // stamped per call, not at module load
      errorMessage,
      ...CLEAR_ACTIVITY,
    },
  });
}
/** Persist the scan high-water mark for a channel mapping. */
export async function updateLastProcessedMessage(
  mappingId: string,
  messageId: bigint
) {
  return db.accountChannelMap.update({
    where: { id: mappingId },
    data: { lastProcessedMessageId: messageId },
  });
}
/**
 * Startup recovery: any run still marked RUNNING must be a leftover from a
 * previous process, so flag it FAILED with an explanatory message.
 */
export async function markStaleRunsAsFailed() {
  return db.ingestionRun.updateMany({
    where: { status: "RUNNING" },
    data: {
      status: "FAILED",
      finishedAt: new Date(),
      errorMessage: "Worker restarted — run was still marked as RUNNING",
    },
  });
}
/**
 * Transition an account's auth state.
 * - lastSeenAt is bumped only when transitioning to AUTHENTICATED.
 * - authCode: pass a string to store a submitted code, null to clear it, or
 *   omit it (undefined) to leave the stored value untouched.
 *   NOTE(review): callers that omit it keep any previously stored code in
 *   the row — confirm that is intended for each transition.
 */
export async function updateAccountAuthState(
  accountId: string,
  authState: "PENDING" | "AWAITING_CODE" | "AWAITING_PASSWORD" | "AUTHENTICATED" | "EXPIRED",
  authCode?: string | null
) {
  return db.telegramAccount.update({
    where: { id: accountId },
    data: { authState, authCode, lastSeenAt: authState === "AUTHENTICATED" ? new Date() : undefined },
  });
}
/** Read the auth code (and current state) submitted via the admin UI, if any. */
export async function getAccountAuthCode(accountId: string) {
  const account = await db.telegramAccount.findUnique({
    where: { id: accountId },
    select: { authCode: true, authState: true },
  });
  return account;
}
// ── Forum / Topic progress ──
/** Record whether a channel is a forum supergroup (forums are scanned per-topic). */
export async function setChannelForum(channelId: string, isForum: boolean) {
  return db.telegramChannel.update({
    where: { id: channelId },
    data: { isForum },
  });
}
/** All per-topic progress rows for one account↔channel mapping. */
export async function getTopicProgress(mappingId: string) {
  return db.topicProgress.findMany({
    where: { accountChannelMapId: mappingId },
  });
}
/**
 * Create or update the per-topic scan high-water mark, caching the topic
 * name alongside it (forum topic names are used as the package creator).
 */
export async function upsertTopicProgress(
  mappingId: string,
  topicId: bigint,
  topicName: string | null,
  lastProcessedMessageId: bigint
) {
  return db.topicProgress.upsert({
    where: {
      accountChannelMapId_topicId: {
        accountChannelMapId: mappingId,
        topicId,
      },
    },
    create: {
      accountChannelMapId: mappingId,
      topicId,
      topicName,
      lastProcessedMessageId,
    },
    update: {
      topicName,
      lastProcessedMessageId,
    },
  });
}

50
worker/src/index.ts Normal file
View File

@@ -0,0 +1,50 @@
import { mkdir } from "fs/promises";
import { config } from "./util/config.js";
import { logger } from "./util/logger.js";
import { markStaleRunsAsFailed } from "./db/queries.js";
import { cleanupTempDir } from "./worker.js";
import { startScheduler, stopScheduler } from "./scheduler.js";
import { db, pool } from "./db/client.js";
const log = logger.child({ module: "main" });
/**
 * Worker entry point: prepare directories, clear stale state left by a
 * previous process, then hand control to the scheduler (whose pending timer
 * keeps the process alive between cycles).
 */
async function main(): Promise<void> {
  log.info("DragonsStash Telegram Worker starting");
  // Redact the connection string before logging the configuration.
  log.info({ config: { ...config, databaseUrl: "***" } }, "Configuration loaded");
  // Ensure temp directory exists
  await mkdir(config.tempDir, { recursive: true });
  await mkdir(config.tdlibStateDir, { recursive: true });
  // Clean up stale state from a previous process
  await cleanupTempDir();
  await markStaleRunsAsFailed();
  // Start the scheduler (runs an immediate cycle, then recurs on a timer)
  await startScheduler();
}
// Graceful shutdown.
// NOTE(review): this does not wait for an in-flight ingestion cycle; a run
// active at shutdown stays RUNNING and is marked FAILED by
// markStaleRunsAsFailed() on the next boot — confirm that is acceptable.
function shutdown(signal: string): void {
  log.info({ signal }, "Shutdown signal received");
  stopScheduler();
  // Close DB connections
  Promise.all([db.$disconnect(), pool.end()])
    .then(() => {
      log.info("Shutdown complete");
      process.exit(0);
    })
    .catch((err) => {
      log.error({ err }, "Error during shutdown");
      process.exit(1);
    });
}
process.on("SIGTERM", () => shutdown("SIGTERM"));
process.on("SIGINT", () => shutdown("SIGINT"));
main().catch((err) => {
  log.fatal({ err }, "Worker failed to start");
  process.exit(1);
});

View File

@@ -0,0 +1,86 @@
import { childLogger } from "../util/logger.js";
const log = childLogger("preview-match");
export interface TelegramPhoto {
id: bigint;
date: Date;
/** Caption text on the photo message (if any). */
caption: string;
/** The smallest photo size available — used as thumbnail. */
fileId: string;
fileSize: number;
}
export interface ArchiveRef {
baseName: string;
firstMessageId: bigint;
firstMessageDate: Date;
}
/**
 * Match preview photos to archives.
 *
 * A photo counts as a preview for an archive when:
 *  1. its normalized caption and the archive's normalized base name contain
 *     each other (either direction), and
 *  2. it was posted within ±6 hours of the archive's first message.
 *
 * For each archive the closest-in-time matching photo wins.
 * Returns a map keyed by the archive's baseName.
 */
export function matchPreviewToArchive(
  photos: TelegramPhoto[],
  archives: ArchiveRef[]
): Map<string, TelegramPhoto> {
  const TIME_WINDOW_MS = 6 * 60 * 60 * 1000; // ±6 hours
  const matched = new Map<string, TelegramPhoto>();
  for (const archive of archives) {
    const needle = normalizeForMatch(archive.baseName);
    if (!needle) continue;

    let best: TelegramPhoto | null = null;
    let bestDelta = Infinity;
    for (const photo of photos) {
      const delta = Math.abs(
        photo.date.getTime() - archive.firstMessageDate.getTime()
      );
      // Outside the window, or no closer than the current best → skip early.
      if (delta > TIME_WINDOW_MS || delta >= bestDelta) continue;

      const haystack = normalizeForMatch(photo.caption);
      if (!haystack) continue;
      if (haystack.includes(needle) || needle.includes(haystack)) {
        best = photo;
        bestDelta = delta;
      }
    }

    if (best) {
      log.debug(
        { baseName: archive.baseName, photoId: best.id.toString() },
        "Matched preview photo to archive"
      );
      matched.set(archive.baseName, best);
    }
  }
  return matched;
}
/**
 * Lowercase, strip a short trailing extension, and collapse separator runs
 * (underscore, dash, dot, whitespace) into single spaces for fuzzy matching.
 */
function normalizeForMatch(input: string): string {
  const lowered = input.toLowerCase();
  const withoutExtension = lowered.replace(/\.[a-z0-9]{1,5}$/i, "");
  return withoutExtension.replace(/[_\-.\s]+/g, " ").trim();
}

92
worker/src/scheduler.ts Normal file
View File

@@ -0,0 +1,92 @@
import { config } from "./util/config.js";
import { childLogger } from "./util/logger.js";
import { getActiveAccounts } from "./db/queries.js";
import { runWorkerForAccount } from "./worker.js";
const log = childLogger("scheduler");
// Re-entrancy guard: true while a cycle is executing.
let running = false;
// Pending timer for the next cycle; null when none is scheduled.
let timer: ReturnType<typeof setTimeout> | null = null;
/**
 * Run one ingestion cycle: process all active, authenticated accounts sequentially.
 * The `running` flag makes this a no-op when the previous cycle is still in
 * flight, so timer-driven cycles cannot overlap.
 */
async function runCycle(): Promise<void> {
  if (running) {
    log.warn("Previous cycle still running, skipping");
    return;
  }
  running = true;
  log.info("Starting ingestion cycle");
  try {
    const accounts = await getActiveAccounts();
    if (accounts.length === 0) {
      log.info("No active authenticated accounts, nothing to do");
      return;
    }
    log.info({ accountCount: accounts.length }, "Processing accounts");
    // Accounts are processed one at a time (sequential awaits).
    for (const account of accounts) {
      await runWorkerForAccount(account);
    }
    log.info("Ingestion cycle complete");
  } catch (err) {
    log.error({ err }, "Ingestion cycle failed");
  } finally {
    running = false;
  }
}
/**
 * Schedule the next cycle with jitter.
 * Jitter is additive only, so the effective delay lies in
 * [interval, interval + jitter] minutes.
 */
function scheduleNext(): void {
  const intervalMs = config.workerIntervalMinutes * 60 * 1000;
  const jitterMs = Math.random() * config.jitterMinutes * 60 * 1000;
  const delay = intervalMs + jitterMs;
  log.info(
    { nextRunInMinutes: Math.round(delay / 60000) },
    "Next cycle scheduled"
  );
  // runCycle never rejects (errors are caught inside), so this async
  // callback cannot produce an unhandled rejection.
  timer = setTimeout(async () => {
    await runCycle();
    scheduleNext();
  }, delay);
}
/**
 * Start the scheduler. Runs an immediate first cycle, then schedules subsequent ones.
 */
export async function startScheduler(): Promise<void> {
  log.info(
    {
      intervalMinutes: config.workerIntervalMinutes,
      jitterMinutes: config.jitterMinutes,
    },
    "Scheduler starting"
  );
  // Run immediately on start
  await runCycle();
  // Then schedule recurring cycles
  scheduleNext();
}
/**
 * Stop the scheduler gracefully.
 * Only cancels the *pending* timer — it does not interrupt a cycle that is
 * already in flight.
 */
export function stopScheduler(): void {
  if (timer) {
    clearTimeout(timer);
    timer = null;
  }
  log.info("Scheduler stopped");
}

120
worker/src/tdlib/client.ts Normal file
View File

@@ -0,0 +1,120 @@
import tdl, { createClient, type Client } from "tdl";
import { getTdjson } from "prebuilt-tdlib";
import path from "path";
import { config } from "../util/config.js";
import { childLogger } from "../util/logger.js";
import {
updateAccountAuthState,
getAccountAuthCode,
} from "../db/queries.js";
const log = childLogger("tdlib-client");
// Point tdl at the prebuilt tdjson shared library (no system TDLib required).
tdl.configure({ tdjson: getTdjson() });
/** The subset of account fields needed to open a TDLib session. */
interface AccountConfig {
  id: string;
  phone: string;
}
/**
 * Create and authenticate a TDLib client for a Telegram account.
 *
 * Authentication flow communicates with the admin UI via the database:
 * - Worker sets authState to AWAITING_CODE / AWAITING_PASSWORD when TDLib
 *   asks for the phone code or the 2FA password
 * - Admin enters the value via the UI, which writes it to the authCode field
 * - Worker polls the DB for the value and feeds it to TDLib
 *
 * The account is marked AUTHENTICATED only after login() succeeds; on any
 * auth failure it is marked EXPIRED and the error is rethrown.
 */
export async function createTdlibClient(
  account: AccountConfig
): Promise<Client> {
  const dbPath = path.join(config.tdlibStateDir, account.id);
  const client = createClient({
    apiId: config.telegramApiId,
    apiHash: config.telegramApiHash,
    databaseDirectory: dbPath,
    filesDirectory: path.join(dbPath, "files"),
  });
  client.on("error", (err) => {
    log.error({ err, accountId: account.id }, "TDLib client error");
  });
  try {
    await client.login(() => ({
      getPhoneNumber: async () => {
        log.info({ accountId: account.id }, "TDLib requesting phone number");
        return account.phone;
      },
      getAuthCode: async () => {
        log.info({ accountId: account.id }, "TDLib requesting auth code");
        // Clear any stale code BEFORE polling: a leftover value from a
        // previous attempt would otherwise be replayed instantly.
        await updateAccountAuthState(account.id, "AWAITING_CODE", null);
        const code = await pollForAuthCode(account.id);
        if (!code) {
          throw new Error("Auth code not provided within timeout");
        }
        // Consume the code but stay in AWAITING_CODE: TDLib has not yet
        // verified it, and a wrong code makes TDLib call getAuthCode again.
        // (Previously the state flipped to AUTHENTICATED here, which hid
        // the retry prompt from the admin UI.)
        await updateAccountAuthState(account.id, "AWAITING_CODE", null);
        return code;
      },
      getPassword: async () => {
        log.info({ accountId: account.id }, "TDLib requesting 2FA password");
        // Same clear-before-poll / clear-after-read dance as getAuthCode;
        // the password is transported through the shared authCode field.
        await updateAccountAuthState(account.id, "AWAITING_PASSWORD", null);
        const password = await pollForAuthCode(account.id);
        if (!password) {
          throw new Error("2FA password not provided within timeout");
        }
        await updateAccountAuthState(account.id, "AWAITING_PASSWORD", null);
        return password;
      },
    }));
    await updateAccountAuthState(account.id, "AUTHENTICATED");
    log.info({ accountId: account.id }, "TDLib client authenticated");
    return client;
  } catch (err) {
    log.error({ err, accountId: account.id }, "TDLib authentication failed");
    await updateAccountAuthState(account.id, "EXPIRED");
    throw err;
  }
}
/**
 * Poll the database for an auth code/password submitted via the admin UI.
 * Checks every 5 seconds until the deadline (default 5 minutes); returns the
 * value, or null on timeout.
 */
async function pollForAuthCode(
  accountId: string,
  timeoutMs = 300_000
): Promise<string | null> {
  const deadline = Date.now() + timeoutMs;
  while (Date.now() < deadline) {
    const account = await getAccountAuthCode(accountId);
    const code = account?.authCode;
    if (code) return code;
    await sleep(5000);
  }
  return null;
}
/** Promise-based delay helper. */
const sleep = (ms: number): Promise<void> =>
  new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
/**
 * Close a TDLib client gracefully.
 * Close errors are logged and swallowed — shutdown paths should never throw.
 */
export async function closeTdlibClient(client: Client): Promise<void> {
  try {
    await client.close();
  } catch (err) {
    log.warn({ err }, "Error closing TDLib client");
  }
}

View File

@@ -0,0 +1,389 @@
import type { Client } from "tdl";
import { readFile, rename, stat } from "fs/promises";
import { config } from "../util/config.js";
import { childLogger } from "../util/logger.js";
import { isArchiveAttachment } from "../archive/detect.js";
import type { TelegramMessage } from "../archive/multipart.js";
import type { TelegramPhoto } from "../preview/match.js";
const log = childLogger("download");
// The Td* interfaces below declare only the fields this module reads from
// TDLib's JSON objects — they are deliberately partial. Shapes are assumed
// to mirror TDLib's JSON API (photoSize / message / file); confirm against
// the TDLib documentation before adding fields.
interface TdPhotoSize {
  type: string;
  photo: {
    id: number;
    size: number;
    expected_size: number;
    local?: {
      path?: string;
      is_downloading_active?: boolean;
      is_downloading_completed?: boolean;
      downloaded_size?: number;
    };
  };
  width: number;
  height: number;
}
interface TdMessage {
  id: number;
  date: number; // unix seconds
  content: {
    _: string;
    document?: {
      file_name?: string;
      document?: {
        id: number;
        size: number;
        local?: {
          path?: string;
          is_downloading_completed?: boolean;
        };
      };
    };
    photo?: {
      sizes?: TdPhotoSize[];
    };
    caption?: {
      text?: string;
    };
  };
}
interface TdFile {
  id: number;
  size: number;
  expected_size: number;
  local: {
    path: string;
    is_downloading_active: boolean;
    is_downloading_completed: boolean;
    downloaded_size: number;
    download_offset: number;
  };
}
/** Result of scanning a channel: archive documents plus preview-photo candidates. */
export interface ChannelScanResult {
  archives: TelegramMessage[];
  photos: TelegramPhoto[];
}
/**
 * Page through a chat's history, collecting archive attachments and photo
 * messages (photos are candidates for preview matching).
 * Returns both lists in chronological order (oldest first).
 *
 * NOTE(review): TDLib's getChatHistory returns messages *older* than
 * `from_message_id` (newest first, walking backward in time), so despite the
 * original "since a given message ID" description this scans BACKWARD from
 * the given ID — confirm callers pass the boundary they intend.
 *
 * NOTE(review): `limit` caps only the page size (≤100); the loop keeps
 * paging until history is exhausted, so the total is unbounded.
 */
export async function getChannelMessages(
  client: Client,
  chatId: bigint,
  fromMessageId?: bigint | null,
  limit = 100
): Promise<ChannelScanResult> {
  const archives: TelegramMessage[] = [];
  const photos: TelegramPhoto[] = [];
  // 0 means "start from the latest message" in getChatHistory.
  // assumes chat/message IDs fit in a double — TODO confirm for large IDs
  let currentFromId = fromMessageId ? Number(fromMessageId) : 0;
  // eslint-disable-next-line no-constant-condition
  while (true) {
    const result = (await client.invoke({
      _: "getChatHistory",
      chat_id: Number(chatId),
      from_message_id: currentFromId,
      offset: 0,
      limit: Math.min(limit, 100),
      only_local: false,
    })) as { messages: TdMessage[] };
    if (!result.messages || result.messages.length === 0) break;
    for (const msg of result.messages) {
      // Check for archive documents
      const doc = msg.content?.document;
      if (doc?.file_name && doc.document && isArchiveAttachment(doc.file_name)) {
        archives.push({
          id: BigInt(msg.id),
          fileName: doc.file_name,
          fileId: String(doc.document.id),
          fileSize: BigInt(doc.document.size),
          date: new Date(msg.date * 1000),
        });
        continue;
      }
      // Check for photo messages (potential previews)
      const photo = msg.content?.photo;
      const caption = msg.content?.caption?.text ?? "";
      if (photo?.sizes && photo.sizes.length > 0) {
        // Pick the smallest size for thumbnail (type "s" or "m")
        // TDLib photo sizes are ordered from smallest to largest
        const smallest = photo.sizes[0];
        photos.push({
          id: BigInt(msg.id),
          date: new Date(msg.date * 1000),
          caption,
          fileId: String(smallest.photo.id),
          fileSize: smallest.photo.size || smallest.photo.expected_size,
        });
      }
    }
    // Continue paging from the oldest message seen so far.
    currentFromId = result.messages[result.messages.length - 1].id;
    // NOTE(review): compares against the hard-coded page size, not `limit` —
    // with limit < 100 the first page always ends the loop.
    if (result.messages.length < 100) break;
    // Rate limit delay
    await sleep(config.apiDelayMs);
  }
  // Return in chronological order (oldest first)
  return {
    archives: archives.reverse(),
    photos: photos.reverse(),
  };
}
/**
 * Download a photo thumbnail from Telegram and return its raw bytes.
 * Uses synchronous download (photos are small, typically < 100KB).
 * Returns null if download fails (non-critical).
 */
export async function downloadPhotoThumbnail(
  client: Client,
  fileId: string
): Promise<Buffer | null> {
  try {
    const file = (await client.invoke({
      _: "downloadFile",
      file_id: parseInt(fileId, 10),
      priority: 1, // Low priority — thumbnails are nice-to-have
      offset: 0,
      limit: 0,
      synchronous: true, // Small file — wait for it
    })) as TdFile;
    // Guard clause: bail out unless TDLib reports a completed local copy
    if (!file?.local?.is_downloading_completed || !file.local.path) {
      return null;
    }
    const data = await readFile(file.local.path);
    log.debug({ fileId, bytes: data.length }, "Downloaded photo thumbnail");
    return data;
  } catch (err) {
    log.warn({ fileId, err }, "Failed to download photo thumbnail");
    return null;
  }
}
/** Snapshot of an in-flight Telegram file download, reported to callers. */
export interface DownloadProgress {
  fileId: string;
  fileName: string;
  downloadedBytes: number;
  totalBytes: number;
  percent: number; // 0-100, rounded
  isComplete: boolean;
}
/** Invoked once with the initial state, then on each TDLib updateFile event. */
export type ProgressCallback = (progress: DownloadProgress) => void;
/**
 * Download a file from Telegram to a local path with progress tracking
 * and integrity verification.
 *
 * Progress flow:
 * 1. Starts async download via TDLib
 * 2. Listens for `updateFile` events to track download progress
 * 3. Logs progress at every 10% increment
 * 4. Once complete, verifies the local file size matches the expected size
 * 5. Moves the file from TDLib's cache to the destination path
 *
 * Verification:
 * - Compares actual file size on disk to the expected size from Telegram
 * - Throws on mismatch (partial/corrupt download)
 * - Throws on timeout (configurable, scales with file size)
 * - Throws if download stops without completing (network error, etc.)
 */
export async function downloadFile(
  client: Client,
  fileId: string,
  destPath: string,
  expectedSize: bigint,
  fileName: string,
  onProgress?: ProgressCallback
): Promise<void> {
  const numericId = parseInt(fileId, 10);
  const totalBytes = Number(expectedSize);
  log.info(
    { fileId, fileName, destPath, totalBytes },
    "Starting file download"
  );
  // Report initial progress
  onProgress?.({
    fileId,
    fileName,
    downloadedBytes: 0,
    totalBytes,
    percent: 0,
    isComplete: false,
  });
  return new Promise<void>((resolve, reject) => {
    let lastLoggedPercent = 0;
    // `settled` guards every terminal path (timeout / complete / stopped /
    // invoke error) so the promise resolves or rejects exactly once.
    let settled = false;
    // Timeout: 10 minutes per GB, minimum 5 minutes
    const timeoutMs = Math.max(
      5 * 60_000,
      (totalBytes / (1024 * 1024 * 1024)) * 10 * 60_000
    );
    const timer = setTimeout(() => {
      if (!settled) {
        settled = true;
        cleanup();
        reject(
          new Error(
            `Download timed out after ${Math.round(timeoutMs / 60_000)}min for ${fileName}`
          )
        );
      }
    }, timeoutMs);
    // Listen for file update events to track progress
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const handleUpdate = (update: any) => {
      if (update?._ !== "updateFile") return;
      const file = update.file as TdFile | undefined;
      if (!file || file.id !== numericId) return;
      const downloaded = file.local.downloaded_size;
      const percent =
        totalBytes > 0 ? Math.round((downloaded / totalBytes) * 100) : 0;
      // Log at every 10% increment
      if (percent >= lastLoggedPercent + 10) {
        // Snap to the nearest 10% boundary below `percent`
        lastLoggedPercent = percent - (percent % 10);
        log.info(
          { fileId, fileName, downloaded, totalBytes, percent: `${percent}%` },
          "Download progress"
        );
      }
      // Report to callback
      onProgress?.({
        fileId,
        fileName,
        downloadedBytes: downloaded,
        totalBytes,
        percent,
        isComplete: file.local.is_downloading_completed,
      });
      // Download finished
      if (file.local.is_downloading_completed) {
        if (!settled) {
          settled = true;
          cleanup();
          verifyAndMove(file.local.path, destPath, totalBytes, fileName, fileId)
            .then(resolve)
            .catch(reject);
        }
      }
      // Download stopped without completing (network error, cancelled, etc.)
      // NOTE(review): an updateFile event for this file id that arrives before
      // TDLib marks the download active would also match this condition and
      // reject early — confirm TDLib always reports is_downloading_active=true
      // in the first event after downloadFile is invoked.
      if (
        !file.local.is_downloading_active &&
        !file.local.is_downloading_completed
      ) {
        if (!settled) {
          settled = true;
          cleanup();
          reject(
            new Error(
              `Download stopped unexpectedly for ${fileName} ` +
                `(${downloaded}/${totalBytes} bytes, ${percent}%)`
            )
          );
        }
      }
    };
    // Tear down the timer and listener on every terminal path
    const cleanup = () => {
      clearTimeout(timer);
      client.off("update", handleUpdate);
    };
    // Subscribe to updates BEFORE starting download
    client.on("update", handleUpdate);
    // Start async download (non-blocking — progress via updateFile events)
    client
      .invoke({
        _: "downloadFile",
        file_id: numericId,
        // High priority for archive payloads (thumbnails use priority 1)
        priority: 32,
        offset: 0,
        limit: 0,
        synchronous: false,
      })
      .then((result: unknown) => {
        // If the file was already cached locally, invoke returns immediately
        const file = result as TdFile | undefined;
        if (file?.local?.is_downloading_completed && !settled) {
          settled = true;
          cleanup();
          verifyAndMove(file.local.path, destPath, totalBytes, fileName, fileId)
            .then(resolve)
            .catch(reject);
        }
      })
      .catch((err: unknown) => {
        if (!settled) {
          settled = true;
          cleanup();
          reject(err);
        }
      });
  });
}
/**
* Verify the downloaded file's size matches the expected size,
* then move it to the destination path.
*/
async function verifyAndMove(
localPath: string,
destPath: string,
expectedBytes: number,
fileName: string,
fileId: string
): Promise<void> {
const stats = await stat(localPath);
const actualBytes = stats.size;
if (expectedBytes > 0 && actualBytes !== expectedBytes) {
log.error(
{ fileId, fileName, expectedBytes, actualBytes },
"Download size mismatch — file is incomplete or corrupted"
);
throw new Error(
`Download verification failed for ${fileName}: ` +
`expected ${expectedBytes} bytes, got ${actualBytes} bytes`
);
}
log.info(
{ fileId, fileName, bytes: actualBytes, destPath },
"File verified and complete"
);
// Move from TDLib's cache to our temp directory
await rename(localPath, destPath);
}
/** Pause for the given number of milliseconds (rate-limit helper). */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}

222
worker/src/tdlib/topics.ts Normal file
View File

@@ -0,0 +1,222 @@
import type { Client } from "tdl";
import { config } from "../util/config.js";
import { childLogger } from "../util/logger.js";
import { isArchiveAttachment } from "../archive/detect.js";
import type { TelegramMessage } from "../archive/multipart.js";
import type { TelegramPhoto } from "../preview/match.js";
import type { ChannelScanResult } from "./download.js";
const log = childLogger("topics");
/** A forum topic (thread) inside a Telegram forum supergroup. */
export interface ForumTopic {
  topicId: bigint; // TDLib message_thread_id of the topic
  name: string; // topic title; used downstream as the creator name
}
/**
 * Check if a chat is a forum supergroup (topics enabled).
 * Returns false (after logging a warning) when the lookup fails.
 */
export async function isChatForum(
  client: Client,
  chatId: bigint
): Promise<boolean> {
  try {
    const chat = (await client.invoke({
      _: "getChat",
      chat_id: Number(chatId),
    })) as {
      type?: {
        _: string;
        supergroup_id?: number;
        is_forum?: boolean;
      };
    };
    // Only supergroups can be forums
    if (chat.type?._ !== "chatTypeSupergroup") return false;
    if (chat.type.is_forum) return true;
    // Also check via getSupergroup for older TDLib versions
    if (chat.type.supergroup_id) {
      const sg = (await client.invoke({
        _: "getSupergroup",
        supergroup_id: chat.type.supergroup_id,
      })) as { is_forum?: boolean };
      return sg.is_forum === true;
    }
    return false;
  } catch (err) {
    log.warn({ err, chatId: chatId.toString() }, "Failed to check if chat is forum");
    return false;
  }
}
/**
 * Get all forum topics in a supergroup.
 * Pages through getForumTopics until TDLib stops returning next-offset
 * values; the "General" topic is skipped because it is not creator-specific.
 */
export async function getForumTopicList(
  client: Client,
  chatId: bigint
): Promise<ForumTopic[]> {
  const topics: ForumTopic[] = [];
  const offset = { date: 0, messageId: 0, threadId: 0 };
  // eslint-disable-next-line no-constant-condition
  while (true) {
    const page = (await client.invoke({
      _: "getForumTopics",
      chat_id: Number(chatId),
      query: "",
      offset_date: offset.date,
      offset_message_id: offset.messageId,
      offset_message_thread_id: offset.threadId,
      limit: 100,
    })) as {
      topics?: {
        info?: {
          message_thread_id?: number;
          name?: string;
          is_general?: boolean;
        };
      }[];
      next_offset_date?: number;
      next_offset_message_id?: number;
      next_offset_message_thread_id?: number;
    };
    if (!page.topics || page.topics.length === 0) break;
    for (const entry of page.topics) {
      const info = entry.info;
      if (!info?.message_thread_id) continue;
      // Skip the "General" topic — it's not creator-specific
      if (info.is_general) continue;
      topics.push({
        topicId: BigInt(info.message_thread_id),
        name: info.name ?? "Unnamed",
      });
    }
    // All next-offset fields absent/zero means this was the last page
    const hasMore =
      Boolean(page.next_offset_date) ||
      Boolean(page.next_offset_message_id) ||
      Boolean(page.next_offset_message_thread_id);
    if (!hasMore) break;
    offset.date = page.next_offset_date ?? 0;
    offset.messageId = page.next_offset_message_id ?? 0;
    offset.threadId = page.next_offset_message_thread_id ?? 0;
    await sleep(config.apiDelayMs);
  }
  log.info(
    { chatId: chatId.toString(), topicCount: topics.length },
    "Enumerated forum topics"
  );
  return topics;
}
/**
 * Fetch messages from a specific forum topic (thread).
 * Uses getMessageThreadHistory to scan within a topic.
 *
 * NOTE(review): like getChatHistory, getMessageThreadHistory with `offset: 0`
 * pages BACKWARDS (older than `from_message_id`) — confirm this matches the
 * intended "since message X" semantics at the call sites.
 *
 * @param topicId the topic's message_thread_id
 * @param fromMessageId message id to page from (0 / null = newest message)
 * @param limit requested page size per API call (capped at TDLib's max of 100)
 */
export async function getTopicMessages(
  client: Client,
  chatId: bigint,
  topicId: bigint,
  fromMessageId?: bigint | null,
  limit = 100
): Promise<ChannelScanResult> {
  const archives: TelegramMessage[] = [];
  const photos: TelegramPhoto[] = [];
  const pageSize = Math.min(limit, 100);
  let currentFromId = fromMessageId ? Number(fromMessageId) : 0;
  // eslint-disable-next-line no-constant-condition
  while (true) {
    const result = (await client.invoke({
      _: "getMessageThreadHistory",
      chat_id: Number(chatId),
      message_id: Number(topicId),
      from_message_id: currentFromId,
      offset: 0,
      limit: pageSize,
    })) as {
      messages?: {
        id: number;
        date: number;
        content: {
          _: string;
          document?: {
            file_name?: string;
            document?: {
              id: number;
              size: number;
            };
          };
          photo?: {
            sizes?: {
              type: string;
              photo: { id: number; size: number; expected_size: number };
              width: number;
              height: number;
            }[];
          };
          caption?: { text?: string };
        };
      }[];
    };
    // TDLib may return FEWER messages than requested even when more exist,
    // so the only reliable end-of-history signal is an empty page.
    // (The previous `length < 100` check could terminate the scan early,
    // and was also wrong whenever `limit` was below 100.)
    if (!result.messages || result.messages.length === 0) break;
    for (const msg of result.messages) {
      // Check for archive documents
      const doc = msg.content?.document;
      if (doc?.file_name && doc.document && isArchiveAttachment(doc.file_name)) {
        archives.push({
          id: BigInt(msg.id),
          fileName: doc.file_name,
          fileId: String(doc.document.id),
          fileSize: BigInt(doc.document.size),
          date: new Date(msg.date * 1000),
        });
        continue;
      }
      // Check for photo messages (potential previews)
      const photo = msg.content?.photo;
      const caption = msg.content?.caption?.text ?? "";
      if (photo?.sizes && photo.sizes.length > 0) {
        const smallest = photo.sizes[0];
        photos.push({
          id: BigInt(msg.id),
          date: new Date(msg.date * 1000),
          caption,
          fileId: String(smallest.photo.id),
          fileSize: smallest.photo.size || smallest.photo.expected_size,
        });
      }
    }
    currentFromId = result.messages[result.messages.length - 1].id;
    await sleep(config.apiDelayMs);
  }
  return {
    archives: archives.reverse(),
    photos: photos.reverse(),
  };
}
/** Pause for the given number of milliseconds (rate-limit helper). */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}

View File

@@ -0,0 +1,76 @@
import type { Client } from "tdl";
import { config } from "../util/config.js";
import { childLogger } from "../util/logger.js";
const log = childLogger("upload");
/** Result of uploading an archive set to the destination channel. */
export interface UploadResult {
  messageId: bigint; // message id of the FIRST uploaded part
}
/**
 * Upload one or more files to a destination Telegram channel.
 * For multipart archives, each file is sent as a separate message;
 * only the first message carries the caption.
 * Returns the message ID of the first uploaded message.
 *
 * NOTE(review): sendMessage resolves with a provisional message whose id can
 * change once TDLib actually finishes sending (updateMessageSendSucceeded) —
 * confirm the stored id remains valid for later lookups.
 */
export async function uploadToChannel(
  client: Client,
  chatId: bigint,
  filePaths: string[],
  caption?: string
): Promise<UploadResult> {
  let firstMessageId: bigint | null = null;
  for (const [index, filePath] of filePaths.entries()) {
    const partCaption = index === 0 && caption ? caption : undefined;
    log.debug(
      { chatId: Number(chatId), filePath, part: index + 1, total: filePaths.length },
      "Uploading file to channel"
    );
    const sent = (await client.invoke({
      _: "sendMessage",
      chat_id: Number(chatId),
      input_message_content: {
        _: "inputMessageDocument",
        document: {
          _: "inputFileLocal",
          path: filePath,
        },
        caption: partCaption
          ? {
              _: "formattedText",
              text: partCaption,
            }
          : undefined,
      },
    })) as { id: number };
    if (index === 0) {
      firstMessageId = BigInt(sent.id);
    }
    // Rate limit delay between uploads
    if (index < filePaths.length - 1) {
      await sleep(config.apiDelayMs);
    }
  }
  if (firstMessageId === null) {
    throw new Error("Upload failed: no messages sent");
  }
  log.info(
    { chatId: Number(chatId), messageId: Number(firstMessageId), files: filePaths.length },
    "Upload complete"
  );
  return { messageId: firstMessageId };
}
/** Pause for the given number of milliseconds (rate-limit helper). */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}

18
worker/src/util/config.ts Normal file
View File

@@ -0,0 +1,18 @@
/** Read an integer environment variable, falling back on missing or non-numeric values. */
function envInt(name: string, fallback: number): number {
  const raw = process.env[name];
  if (raw === undefined) return fallback;
  const parsed = parseInt(raw, 10);
  // parseInt returns NaN for garbage input — never let NaN leak into intervals/limits
  return Number.isNaN(parsed) ? fallback : parsed;
}

/** Permitted log levels; anything else falls back to "info". */
const LOG_LEVELS = ["debug", "info", "warn", "error"] as const;

/** Read LOG_LEVEL, validating against the allowed set instead of blindly casting. */
function envLogLevel(): (typeof LOG_LEVELS)[number] {
  const raw = process.env.LOG_LEVEL ?? "info";
  return (LOG_LEVELS as readonly string[]).includes(raw)
    ? (raw as (typeof LOG_LEVELS)[number])
    : "info";
}

/** Worker configuration, sourced from environment variables with safe defaults. */
export const config = {
  databaseUrl: process.env.DATABASE_URL ?? "",
  workerIntervalMinutes: envInt("WORKER_INTERVAL_MINUTES", 60),
  tempDir: process.env.WORKER_TEMP_DIR ?? "/tmp/zips",
  tdlibStateDir: process.env.TDLIB_STATE_DIR ?? "/data/tdlib",
  maxZipSizeMB: envInt("WORKER_MAX_ZIP_SIZE_MB", 4096),
  logLevel: envLogLevel(),
  telegramApiId: envInt("TELEGRAM_API_ID", 0),
  telegramApiHash: process.env.TELEGRAM_API_HASH ?? "",
  /** Maximum jitter added to scheduler interval (in minutes) */
  jitterMinutes: 5,
  /** Maximum time between multipart archive parts (in hours) */
  multipartTimeoutHours: 24,
  /** Delay between Telegram API calls (in ms) to avoid rate limits */
  apiDelayMs: 1000,
  /** Max retries for rate-limited requests */
  maxRetries: 5,
} as const;

14
worker/src/util/logger.ts Normal file
View File

@@ -0,0 +1,14 @@
import pino from "pino";
import { config } from "./config.js";
// In debug mode, route output through pino/file to fd 1 (stdout);
// otherwise let pino use its default destination.
const debugTransport =
  config.logLevel === "debug"
    ? { target: "pino/file", options: { destination: 1 } }
    : undefined;

/** Root pino logger for the worker process. */
export const logger = pino({
  level: config.logLevel,
  transport: debugTransport,
});
/** Create a child logger tagged with a module name plus optional extra bindings. */
export function childLogger(name: string, extra?: Record<string, unknown>) {
  const bindings = { module: name, ...extra };
  return logger.child(bindings);
}

665
worker/src/worker.ts Normal file
View File

@@ -0,0 +1,665 @@
import path from "path";
import { unlink, readdir } from "fs/promises";
import { config } from "./util/config.js";
import { childLogger } from "./util/logger.js";
import { tryAcquireLock, releaseLock } from "./db/locks.js";
import {
getSourceChannelMappings,
getDestinationChannel,
packageExistsByHash,
createPackageWithFiles,
createIngestionRun,
completeIngestionRun,
failIngestionRun,
updateLastProcessedMessage,
updateRunActivity,
setChannelForum,
getTopicProgress,
upsertTopicProgress,
} from "./db/queries.js";
import type { ActivityUpdate } from "./db/queries.js";
import { createTdlibClient, closeTdlibClient } from "./tdlib/client.js";
import { getChannelMessages, downloadFile, downloadPhotoThumbnail } from "./tdlib/download.js";
import type { DownloadProgress, ChannelScanResult } from "./tdlib/download.js";
import { isChatForum, getForumTopicList, getTopicMessages } from "./tdlib/topics.js";
import { matchPreviewToArchive } from "./preview/match.js";
import { groupArchiveSets } from "./archive/multipart.js";
import type { ArchiveSet } from "./archive/multipart.js";
import { extractCreatorFromFileName } from "./archive/creator.js";
import { hashParts } from "./archive/hash.js";
import { readZipCentralDirectory } from "./archive/zip-reader.js";
import { readRarContents } from "./archive/rar-reader.js";
import { byteLevelSplit } from "./archive/split.js";
import { uploadToChannel } from "./upload/channel.js";
import type { TelegramAccount, TelegramChannel } from "@prisma/client";
import type { Client } from "tdl";
const log = childLogger("worker");
/**
 * Throttle DB writes for download progress to avoid hammering the DB.
 * Only writes if at least `minIntervalMs` has passed since the last write;
 * otherwise the latest update is buffered and flushed when the interval
 * elapses. Only the most recent update is kept — intermediate ones are
 * intentionally dropped.
 *
 * @param runId ingestion run to write activity for
 * @param minIntervalMs minimum gap between DB writes (default 2s)
 */
function createThrottledActivityUpdater(runId: string, minIntervalMs = 2000) {
  let lastWriteTime = 0;
  let pendingUpdate: ActivityUpdate | null = null;
  let flushTimer: ReturnType<typeof setTimeout> | null = null;
  const flush = async () => {
    if (pendingUpdate) {
      const update = pendingUpdate;
      pendingUpdate = null;
      lastWriteTime = Date.now();
      // Activity tracking is best-effort — never let a DB hiccup kill a run
      await updateRunActivity(runId, update).catch(() => {});
    }
  };
  return {
    update: (activity: ActivityUpdate) => {
      pendingUpdate = activity;
      const elapsed = Date.now() - lastWriteTime;
      if (elapsed >= minIntervalMs) {
        if (flushTimer) {
          clearTimeout(flushTimer);
          // BUG FIX: the handle must be reset too, otherwise the
          // `else if (!flushTimer)` branch below would see a stale
          // (already-cleared) timer and never schedule another flush,
          // leaving the final buffered update stuck forever.
          flushTimer = null;
        }
        void flush();
      } else if (!flushTimer) {
        flushTimer = setTimeout(() => {
          flushTimer = null;
          void flush();
        }, minIntervalMs - elapsed);
      }
    },
    flush,
  };
}
/** Shared context passed to the archive processing pipeline. */
interface PipelineContext {
  client: Client; // authenticated TDLib client for the current account
  runId: string; // ingestion run being tracked in the DB
  channelTitle: string; // display label (includes topic name when scanning a forum topic)
  channel: TelegramChannel; // source channel DB row
  destChannelTelegramId: bigint; // Telegram chat id of the archive (destination) channel
  destChannelId: string; // DB id of the archive channel
  throttled: ReturnType<typeof createThrottledActivityUpdater>; // rate-limited activity writer
  counters: {
    messagesScanned: number;
    zipsFound: number;
    zipsDuplicate: number;
    zipsIngested: number;
  };
  /** Creator from forum topic name (null for non-forum). */
  topicCreator: string | null;
  /** Forum topic ID (null for non-forum). */
  sourceTopicId: bigint | null;
  accountLog: ReturnType<typeof childLogger>; // logger pre-tagged with account id/phone
}
/**
 * Run a full ingestion cycle for a single Telegram account.
 * Every step writes live activity to the DB so the admin UI can display it.
 *
 * Flow: acquire advisory lock → create ingestion run → connect TDLib →
 * for each assigned source channel, scan flat or per-topic (forums) →
 * process found archives → record progress → complete/fail the run →
 * always release the lock.
 */
export async function runWorkerForAccount(
  account: TelegramAccount
): Promise<void> {
  const accountLog = childLogger("worker", { accountId: account.id, phone: account.phone });
  // 1. Acquire advisory lock
  const acquired = await tryAcquireLock(account.id);
  if (!acquired) {
    accountLog.info("Account already locked, skipping");
    return;
  }
  let runId: string | undefined;
  try {
    // 2. Create ingestion run
    const run = await createIngestionRun(account.id);
    runId = run.id;
    // Non-optional alias so closures below see `string`, not `string | undefined`
    const activeRunId = runId;
    accountLog.info({ runId }, "Ingestion run started");
    const throttled = createThrottledActivityUpdater(activeRunId);
    // 3. Initialize TDLib client
    await updateRunActivity(activeRunId, {
      currentActivity: "Connecting to Telegram",
      currentStep: "connecting",
    });
    const client = await createTdlibClient({
      id: account.id,
      phone: account.phone,
    });
    const counters = {
      messagesScanned: 0,
      zipsFound: 0,
      zipsDuplicate: 0,
      zipsIngested: 0,
    };
    try {
      // 4. Get assigned source channels and destination
      const channelMappings = await getSourceChannelMappings(account.id);
      const destChannel = await getDestinationChannel(account.id);
      if (!destChannel) {
        throw new Error("No active destination channel configured");
      }
      for (const mapping of channelMappings) {
        const channel = mapping.channel;
        // ── Check if channel is a forum ──
        // Persist the flag whenever it changed since the last run
        const forum = await isChatForum(client, channel.telegramId);
        if (forum !== channel.isForum) {
          await setChannelForum(channel.id, forum);
          accountLog.info(
            { channelId: channel.id, title: channel.title, isForum: forum },
            "Updated channel forum status"
          );
        }
        // Mutable context reused across topics/channel; creator fields are
        // overwritten per scan before processArchiveSets is called
        const pipelineCtx: PipelineContext = {
          client,
          runId: activeRunId,
          channelTitle: channel.title,
          channel,
          destChannelTelegramId: destChannel.telegramId,
          destChannelId: destChannel.id,
          throttled,
          counters,
          topicCreator: null,
          sourceTopicId: null,
          accountLog,
        };
        if (forum) {
          // ── Forum channel: scan per-topic ──
          await updateRunActivity(activeRunId, {
            currentActivity: `Enumerating topics in "${channel.title}"`,
            currentStep: "scanning",
            currentChannel: channel.title,
            currentFile: null,
            currentFileNum: null,
            totalFiles: null,
            downloadedBytes: null,
            totalBytes: null,
            downloadPercent: null,
          });
          const topics = await getForumTopicList(client, channel.telegramId);
          const topicProgressList = await getTopicProgress(mapping.id);
          accountLog.info(
            { channelId: channel.id, title: channel.title, topicCount: topics.length },
            "Scanning forum channel by topic"
          );
          for (const topic of topics) {
            // Resume from the last processed message id for this topic (if any)
            const progress = topicProgressList.find(
              (tp) => tp.topicId === topic.topicId
            );
            await updateRunActivity(activeRunId, {
              currentActivity: `Scanning topic "${topic.name}" in "${channel.title}"`,
              currentStep: "scanning",
              currentChannel: `${channel.title} ${topic.name}`,
              currentFile: null,
              currentFileNum: null,
              totalFiles: null,
              downloadedBytes: null,
              totalBytes: null,
              downloadPercent: null,
            });
            const scanResult = await getTopicMessages(
              client,
              channel.telegramId,
              topic.topicId,
              progress?.lastProcessedMessageId
            );
            if (scanResult.archives.length === 0) {
              accountLog.debug(
                { channelId: channel.id, topic: topic.name },
                "No new archives in topic"
              );
              continue;
            }
            accountLog.info(
              { topic: topic.name, archives: scanResult.archives.length, photos: scanResult.photos.length },
              "Found messages in topic"
            );
            // Process archives with topic creator
            pipelineCtx.topicCreator = topic.name;
            pipelineCtx.sourceTopicId = topic.topicId;
            pipelineCtx.channelTitle = `${channel.title} ${topic.name}`;
            await processArchiveSets(pipelineCtx, scanResult, run.id);
            // Update topic progress
            const allMsgIds = [
              ...scanResult.archives.map((m) => m.id),
              ...scanResult.photos.map((p) => p.id),
            ];
            if (allMsgIds.length > 0) {
              // Highest message id seen in this scan (bigint-safe max)
              const maxId = allMsgIds.reduce((a, b) => (a > b ? a : b));
              await upsertTopicProgress(
                mapping.id,
                topic.topicId,
                topic.name,
                maxId
              );
            }
          }
        } else {
          // ── Non-forum channel: flat scan (existing behavior) ──
          await updateRunActivity(activeRunId, {
            currentActivity: `Scanning "${channel.title}" for new archives`,
            currentStep: "scanning",
            currentChannel: channel.title,
            currentFile: null,
            currentFileNum: null,
            totalFiles: null,
            downloadedBytes: null,
            totalBytes: null,
            downloadPercent: null,
          });
          accountLog.info(
            { channelId: channel.id, title: channel.title },
            "Processing source channel"
          );
          const scanResult = await getChannelMessages(
            client,
            channel.telegramId,
            mapping.lastProcessedMessageId
          );
          if (scanResult.archives.length === 0) {
            accountLog.debug({ channelId: channel.id }, "No new archives");
            continue;
          }
          accountLog.info(
            { archives: scanResult.archives.length, photos: scanResult.photos.length },
            "Found messages in channel"
          );
          // For non-forum, creator comes from filename (set to null, resolved per-archive)
          pipelineCtx.topicCreator = null;
          pipelineCtx.sourceTopicId = null;
          pipelineCtx.channelTitle = channel.title;
          await processArchiveSets(pipelineCtx, scanResult, run.id);
          // Update last processed message
          const allMsgIds = [
            ...scanResult.archives.map((m) => m.id),
            ...scanResult.photos.map((p) => p.id),
          ];
          if (allMsgIds.length > 0) {
            // Highest message id seen in this scan (bigint-safe max)
            const maxId = allMsgIds.reduce((a, b) => (a > b ? a : b));
            await updateLastProcessedMessage(mapping.id, maxId);
          }
        }
      }
      // ── Done ──
      await completeIngestionRun(activeRunId, counters);
      accountLog.info({ counters }, "Ingestion run completed");
    } finally {
      await closeTdlibClient(client);
    }
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    accountLog.error({ err }, "Ingestion run failed");
    if (runId) {
      await failIngestionRun(runId, message).catch((e) =>
        accountLog.error({ e }, "Failed to mark run as failed")
      );
    }
  } finally {
    await releaseLock(account.id);
  }
}
/**
 * Process a scan result through the archive pipeline:
 * group → download → hash → dedup → metadata → split → upload → preview → index.
 *
 * Groups raw messages into archive sets, pairs preview photos with sets,
 * records counters/activity, then runs each set through processOneArchiveSet.
 */
async function processArchiveSets(
  ctx: PipelineContext,
  scanResult: ChannelScanResult,
  ingestionRunId: string
): Promise<void> {
  // Destructure only the fields used HERE (`client`, `channel`, `throttled`
  // were previously destructured but never read); the full context is still
  // forwarded to processOneArchiveSet.
  const { runId, channelTitle, counters, accountLog } = ctx;
  // Group into archive sets
  const archiveSets = groupArchiveSets(scanResult.archives);
  counters.zipsFound += archiveSets.length;
  // Match preview photos to archive sets
  const previewMatches = matchPreviewToArchive(
    scanResult.photos,
    archiveSets.map((s) => ({
      baseName: s.baseName,
      firstMessageId: s.parts[0].id,
      firstMessageDate: s.parts[0].date,
    }))
  );
  if (previewMatches.size > 0) {
    accountLog.info(
      { matched: previewMatches.size, total: archiveSets.length },
      "Matched preview photos to archives"
    );
  }
  await updateRunActivity(runId, {
    currentActivity: `Found ${archiveSets.length} archive(s) in "${channelTitle}"`,
    currentStep: "scanning",
    currentChannel: channelTitle,
    totalFiles: archiveSets.length,
    zipsFound: counters.zipsFound,
  });
  for (let setIdx = 0; setIdx < archiveSets.length; setIdx++) {
    await processOneArchiveSet(
      ctx,
      archiveSets[setIdx],
      setIdx,
      archiveSets.length,
      previewMatches,
      ingestionRunId
    );
  }
}
/**
 * Process a single archive set through the full pipeline:
 * download all parts → hash → dedup check → read archive metadata →
 * optional split (>2GB) → upload to archive channel → optional preview
 * thumbnail → resolve creator → persist package + file index.
 * Temp files are always deleted, even on failure.
 */
async function processOneArchiveSet(
  ctx: PipelineContext,
  archiveSet: ArchiveSet,
  setIdx: number,
  totalSets: number,
  previewMatches: Map<string, { id: bigint; fileId: string }>,
  ingestionRunId: string
): Promise<void> {
  const {
    client, runId, channelTitle, channel,
    destChannelTelegramId, destChannelId,
    throttled, counters, topicCreator, sourceTopicId, accountLog,
  } = ctx;
  // Each part of a multipart set arrived as its own Telegram message
  counters.messagesScanned += archiveSet.parts.length;
  // Display name for the whole set = the first part's filename
  const archiveName = archiveSet.parts[0].fileName;
  const tempPaths: string[] = [];
  let splitPaths: string[] = [];
  try {
    // ── Downloading ──
    for (let partIdx = 0; partIdx < archiveSet.parts.length; partIdx++) {
      const part = archiveSet.parts[partIdx];
      const tempPath = path.join(
        config.tempDir,
        `${ingestionRunId}_${part.id}_${part.fileName}`
      );
      const partLabel = archiveSet.parts.length > 1
        ? ` (part ${partIdx + 1}/${archiveSet.parts.length})`
        : "";
      await updateRunActivity(runId, {
        currentActivity: `Downloading ${part.fileName}${partLabel}`,
        currentStep: "downloading",
        currentChannel: channelTitle,
        currentFile: part.fileName,
        currentFileNum: setIdx + 1,
        totalFiles: totalSets,
        downloadedBytes: 0n,
        totalBytes: part.fileSize,
        downloadPercent: 0,
        messagesScanned: counters.messagesScanned,
      });
      accountLog.info(
        {
          fileName: part.fileName,
          fileSize: Number(part.fileSize),
          part: partIdx + 1,
          totalParts: archiveSet.parts.length,
        },
        "Downloading archive part"
      );
      await downloadFile(
        client,
        part.fileId,
        tempPath,
        part.fileSize,
        part.fileName,
        (progress: DownloadProgress) => {
          // Progress goes through the throttled writer to limit DB traffic
          throttled.update({
            currentActivity: `Downloading ${part.fileName}${partLabel}${progress.percent}%`,
            currentStep: "downloading",
            currentChannel: channelTitle,
            currentFile: part.fileName,
            currentFileNum: setIdx + 1,
            totalFiles: totalSets,
            downloadedBytes: BigInt(progress.downloadedBytes),
            totalBytes: BigInt(progress.totalBytes),
            downloadPercent: progress.percent,
          });
        }
      );
      await throttled.flush();
      tempPaths.push(tempPath);
    }
    // ── Hashing ──
    await updateRunActivity(runId, {
      currentActivity: `Computing hash for ${archiveName}`,
      currentStep: "hashing",
      currentChannel: channelTitle,
      currentFile: archiveName,
      currentFileNum: setIdx + 1,
      totalFiles: totalSets,
      downloadedBytes: null,
      totalBytes: null,
      downloadPercent: null,
    });
    const contentHash = await hashParts(tempPaths);
    // ── Deduplicating ──
    await updateRunActivity(runId, {
      currentActivity: `Checking if ${archiveName} is a duplicate`,
      currentStep: "deduplicating",
      currentChannel: channelTitle,
      currentFile: archiveName,
      currentFileNum: setIdx + 1,
      totalFiles: totalSets,
    });
    const exists = await packageExistsByHash(contentHash);
    if (exists) {
      counters.zipsDuplicate++;
      accountLog.debug({ contentHash }, "Duplicate archive, skipping");
      await updateRunActivity(runId, {
        currentActivity: `Skipped ${archiveName} (duplicate)`,
        currentStep: "deduplicating",
        currentChannel: channelTitle,
        currentFile: archiveName,
        currentFileNum: setIdx + 1,
        totalFiles: totalSets,
        zipsDuplicate: counters.zipsDuplicate,
      });
      return;
    }
    // ── Reading metadata ──
    await updateRunActivity(runId, {
      currentActivity: `Reading file list from ${archiveName}`,
      currentStep: "reading_metadata",
      currentChannel: channelTitle,
      currentFile: archiveName,
      currentFileNum: setIdx + 1,
      totalFiles: totalSets,
    });
    let entries: { path: string; fileName: string; extension: string | null; compressedSize: bigint; uncompressedSize: bigint; crc32: string | null }[] = [];
    try {
      if (archiveSet.type === "ZIP") {
        entries = await readZipCentralDirectory(tempPaths);
      } else {
        entries = await readRarContents(tempPaths[0]);
      }
    } catch (err) {
      // Metadata extraction is best-effort — the package is still ingested
      accountLog.warn({ err, baseName: archiveSet.baseName }, "Failed to read archive metadata, ingesting without file list");
    }
    // ── Splitting (if needed) ──
    let uploadPaths = tempPaths;
    const totalSize = archiveSet.parts.reduce(
      (sum, p) => sum + p.fileSize,
      0n
    );
    // NOTE(review): the 2 GiB threshold presumably mirrors Telegram's upload
    // size limit — confirm (premium accounts allow larger uploads).
    if (!archiveSet.isMultipart && totalSize > 2n * 1024n * 1024n * 1024n) {
      await updateRunActivity(runId, {
        currentActivity: `Splitting ${archiveName} for upload (>2GB)`,
        currentStep: "splitting",
        currentChannel: channelTitle,
        currentFile: archiveName,
        currentFileNum: setIdx + 1,
        totalFiles: totalSets,
      });
      splitPaths = await byteLevelSplit(tempPaths[0]);
      uploadPaths = splitPaths;
    }
    // ── Uploading ──
    const uploadLabel = uploadPaths.length > 1
      ? ` (${uploadPaths.length} parts)`
      : "";
    await updateRunActivity(runId, {
      currentActivity: `Uploading ${archiveName} to archive channel${uploadLabel}`,
      currentStep: "uploading",
      currentChannel: channelTitle,
      currentFile: archiveName,
      currentFileNum: setIdx + 1,
      totalFiles: totalSets,
    });
    // NOTE(review): sendMessage can return a provisional message id — confirm
    // destResult.messageId is stable before persisting it below.
    const destResult = await uploadToChannel(
      client,
      destChannelTelegramId,
      uploadPaths
    );
    // ── Preview thumbnail ──
    let previewData: Buffer | null = null;
    let previewMsgId: bigint | null = null;
    const matchedPhoto = previewMatches.get(archiveSet.baseName);
    if (matchedPhoto) {
      await updateRunActivity(runId, {
        currentActivity: `Downloading preview image for ${archiveName}`,
        currentStep: "preview",
        currentChannel: channelTitle,
        currentFile: archiveName,
        currentFileNum: setIdx + 1,
        totalFiles: totalSets,
      });
      previewData = await downloadPhotoThumbnail(client, matchedPhoto.fileId);
      previewMsgId = matchedPhoto.id;
    }
    // ── Resolve creator: topic name > filename extraction > null ──
    const creator = topicCreator ?? extractCreatorFromFileName(archiveName) ?? null;
    // ── Indexing ──
    await updateRunActivity(runId, {
      currentActivity: `Saving metadata for ${archiveName} (${entries.length} files)`,
      currentStep: "indexing",
      currentChannel: channelTitle,
      currentFile: archiveName,
      currentFileNum: setIdx + 1,
      totalFiles: totalSets,
    });
    await createPackageWithFiles({
      contentHash,
      fileName: archiveName,
      fileSize: totalSize,
      archiveType: archiveSet.type,
      sourceChannelId: channel.id,
      sourceMessageId: archiveSet.parts[0].id,
      sourceTopicId,
      destChannelId,
      destMessageId: destResult.messageId,
      isMultipart:
        archiveSet.parts.length > 1 || uploadPaths.length > 1,
      partCount: uploadPaths.length,
      ingestionRunId,
      creator,
      previewData,
      previewMsgId,
      files: entries,
    });
    counters.zipsIngested++;
    await updateRunActivity(runId, {
      currentActivity: `Ingested ${archiveName} (${entries.length} files indexed)`,
      currentStep: "complete",
      currentChannel: channelTitle,
      currentFile: archiveName,
      currentFileNum: setIdx + 1,
      totalFiles: totalSets,
      zipsIngested: counters.zipsIngested,
    });
    accountLog.info(
      { fileName: archiveName, contentHash, fileCount: entries.length, creator },
      "Archive ingested"
    );
  } finally {
    // ALWAYS delete temp files
    await deleteFiles([...tempPaths, ...splitPaths]);
  }
}
/** Best-effort removal of temp files; missing or already-deleted files are ignored. */
async function deleteFiles(paths: string[]): Promise<void> {
  for (const filePath of paths) {
    // File may already be deleted or never created — swallow failures
    await unlink(filePath).catch(() => {});
  }
}
/**
 * Clean up any leftover temp files from previous runs.
 * Silently does nothing when the temp directory does not exist yet.
 */
export async function cleanupTempDir(): Promise<void> {
  try {
    const leftovers = await readdir(config.tempDir);
    for (const name of leftovers) {
      await unlink(path.join(config.tempDir, name)).catch(() => {});
    }
    if (leftovers.length > 0) {
      log.info({ count: leftovers.length }, "Cleaned up stale temp files");
    }
  } catch {
    // Directory might not exist yet
  }
}