feat: add Telegram integration with forum topic support and creator tracking

Adds full Telegram ZIP ingestion pipeline: TDLib worker service scans source
channels for archive files, deduplicates by content hash, extracts metadata,
uploads to archive channel, and indexes in Postgres. Forum supergroups are
scanned per-topic with topic names used as creator. Filename-based creator
extraction (e.g. "Mammoth Factory - 2026-01.zip") serves as fallback.

Includes admin UI for managing accounts/channels, simplified account setup
(API credentials via env vars), auth code/password submission dialog,
package browser with creator column, and live ingestion activity tracking.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
xCyanGrizzly
2026-02-24 16:02:06 +01:00
parent beb9cfb312
commit b427193d17
70 changed files with 8627 additions and 2 deletions

120
worker/src/tdlib/client.ts Normal file
View File

@@ -0,0 +1,120 @@
import tdl, { createClient, type Client } from "tdl";
import { getTdjson } from "prebuilt-tdlib";
import path from "path";
import { config } from "../util/config.js";
import { childLogger } from "../util/logger.js";
import {
updateAccountAuthState,
getAccountAuthCode,
} from "../db/queries.js";
// Module-scoped logger; "tdlib-client" is the name handed to childLogger.
const log = childLogger("tdlib-client");
// Configure tdl to use the prebuilt tdjson shared library.
// This is a top-level statement, so it runs once when the module is loaded.
tdl.configure({ tdjson: getTdjson() });
/**
 * Minimal account data needed to create and log in a TDLib client.
 */
interface AccountConfig {
// Account identifier: used as the per-account sub-directory under
// config.tdlibStateDir and as the key for auth-state reads/updates.
id: string;
// Phone number handed to TDLib when it asks for one during login.
phone: string;
}
/**
 * Create and authenticate a TDLib client for a Telegram account.
 *
 * Authentication flow communicates with the admin UI via the database:
 * - Worker sets authState to AWAITING_CODE when TDLib asks for the phone code
 *   (AWAITING_PASSWORD when it asks for the 2FA password)
 * - Admin enters the value via UI, which writes it to the authCode field
 * - Worker polls DB for the value and feeds it to TDLib
 *
 * If anything in the login sequence fails, the account is marked EXPIRED and
 * the half-initialised client is closed, so a failed attempt does not leave a
 * TDLib instance running against the account's state directory.
 *
 * @param account - Account id (also the TDLib state sub-directory name) and phone number.
 * @returns The authenticated client.
 * @throws The original login error, including the timeout errors raised when
 *   no code or password is supplied in time. Failures while recording the
 *   EXPIRED state or closing the client are logged and never mask that error.
 */
export async function createTdlibClient(
  account: AccountConfig
): Promise<Client> {
  const dbPath = path.join(config.tdlibStateDir, account.id);
  const client = createClient({
    apiId: config.telegramApiId,
    apiHash: config.telegramApiHash,
    databaseDirectory: dbPath,
    filesDirectory: path.join(dbPath, "files"),
  });

  client.on("error", (err) => {
    log.error({ err, accountId: account.id }, "TDLib client error");
  });

  try {
    await client.login(() => ({
      getPhoneNumber: async () => {
        log.info({ accountId: account.id }, "TDLib requesting phone number");
        return account.phone;
      },
      getAuthCode: async () => {
        log.info({ accountId: account.id }, "TDLib requesting auth code");
        await updateAccountAuthState(account.id, "AWAITING_CODE");
        // Poll database for the code entered via admin UI
        const code = await pollForAuthCode(account.id);
        if (!code) {
          throw new Error("Auth code not provided within timeout");
        }
        // Clear the code after reading so it cannot be picked up again.
        // NOTE(review): this records AUTHENTICATED before TDLib has accepted
        // the code — confirm the admin UI tolerates that intermediate state.
        await updateAccountAuthState(account.id, "AUTHENTICATED", null);
        return code;
      },
      getPassword: async () => {
        log.info({ accountId: account.id }, "TDLib requesting 2FA password");
        await updateAccountAuthState(account.id, "AWAITING_PASSWORD");
        // The password travels through the same authCode field as the code
        const password = await pollForAuthCode(account.id);
        if (!password) {
          throw new Error("2FA password not provided within timeout");
        }
        await updateAccountAuthState(account.id, "AUTHENTICATED", null);
        return password;
      },
    }));

    await updateAccountAuthState(account.id, "AUTHENTICATED");
    log.info({ accountId: account.id }, "TDLib client authenticated");
    return client;
  } catch (err) {
    log.error({ err, accountId: account.id }, "TDLib authentication failed");

    // Best effort: a DB failure here must not replace the real login error
    try {
      await updateAccountAuthState(account.id, "EXPIRED");
    } catch (stateErr) {
      log.warn(
        { err: stateErr, accountId: account.id },
        "Failed to mark account as EXPIRED"
      );
    }

    // Release the TDLib instance; the caller never receives it on failure
    try {
      await client.close();
    } catch (closeErr) {
      log.warn(
        { err: closeErr, accountId: account.id },
        "Error closing TDLib client after failed login"
      );
    }

    throw err;
  }
}
/**
 * Repeatedly look up the account's auth code until one is present or the
 * time budget is used up, pausing 5 seconds between lookups.
 *
 * @param accountId - Account whose authCode field is read.
 * @param timeoutMs - Overall polling budget in milliseconds (default 5 minutes).
 * @returns The first non-empty code found, or null when the budget ran out.
 */
async function pollForAuthCode(
  accountId: string,
  timeoutMs = 300_000
): Promise<string | null> {
  const deadline = Date.now() + timeoutMs;

  while (Date.now() < deadline) {
    const row = await getAccountAuthCode(accountId);
    const submitted = row?.authCode;
    if (submitted) {
      return submitted;
    }
    await sleep(5000);
  }

  return null;
}
/** Resolve (with no value) once `ms` milliseconds have elapsed. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Shut a TDLib client down without ever throwing.
 * A failure while closing is logged as a warning and otherwise ignored.
 *
 * @param client - The client to close.
 */
export async function closeTdlibClient(client: Client): Promise<void> {
  try {
    await client.close();
    return;
  } catch (closeError) {
    log.warn({ err: closeError }, "Error closing TDLib client");
  }
}

View File

@@ -0,0 +1,389 @@
import type { Client } from "tdl";
import { copyFile, readFile, rename, stat, unlink } from "fs/promises";
import { config } from "../util/config.js";
import { childLogger } from "../util/logger.js";
import { isArchiveAttachment } from "../archive/detect.js";
import type { TelegramMessage } from "../archive/multipart.js";
import type { TelegramPhoto } from "../preview/match.js";
// Module-scoped logger; "download" is the name handed to childLogger.
const log = childLogger("download");
/**
 * The fields of one entry in a photo message's `sizes` array that this
 * module declares. Only `photo.id`, `photo.size` and `photo.expected_size`
 * are read by the scanner in this file.
 */
interface TdPhotoSize {
type: string;
// File descriptor for this size variant
photo: {
// Stringified and stored as the photo's fileId
id: number;
// May be falsy; the scanner then falls back to expected_size
size: number;
expected_size: number;
// Local download state, when present
local?: {
path?: string;
is_downloading_active?: boolean;
is_downloading_completed?: boolean;
downloaded_size?: number;
};
};
width: number;
height: number;
}
/**
 * The part of a message returned by getChatHistory that the channel scanner
 * inspects: its id, its date, and the document/photo/caption content.
 */
interface TdMessage {
id: number;
// Seconds since the Unix epoch (multiplied by 1000 when converted to a Date)
date: number;
content: {
// Content type discriminator (not inspected by the scanner, which checks
// for the presence of `document` / `photo` instead)
_: string;
// Present on document messages; file_name decides whether it is an archive
document?: {
file_name?: string;
// The underlying file: id is used for downloading, size for verification
document?: {
id: number;
size: number;
local?: {
path?: string;
is_downloading_completed?: boolean;
};
};
};
// Present on photo messages; the first entry of `sizes` is used as thumbnail
photo?: {
sizes?: TdPhotoSize[];
};
// Caption text; the scanner substitutes "" when it is missing
caption?: {
text?: string;
};
};
}
/**
 * Shape this module assumes for the file object carried by downloadFile
 * responses and by `updateFile` events.
 */
interface TdFile {
// Compared against the requested file id to match update events
id: number;
size: number;
expected_size: number;
// Local download state
local: {
// Location of the downloaded file on disk
path: string;
// False together with is_downloading_completed=false is treated as
// "download stopped unexpectedly" by downloadFile
is_downloading_active: boolean;
is_downloading_completed: boolean;
// Bytes downloaded so far; drives the progress percentage
downloaded_size: number;
download_offset: number;
};
}
/**
 * Result of scanning a chat (or forum topic) for ingestible content.
 */
export interface ChannelScanResult {
// Document messages whose file name passed isArchiveAttachment
archives: TelegramMessage[];
// Photo messages (potential previews for the archives)
photos: TelegramPhoto[];
}
/**
 * Fetch messages from a channel that are newer than a given message ID.
 * Collects both archive attachments AND photo messages (for preview matching).
 *
 * TDLib's getChatHistory pages backwards (newest to oldest) from
 * `from_message_id`, and may return fewer messages than requested even when
 * more history exists. The scan therefore:
 * - always starts at the newest message (from_message_id = 0),
 * - walks backwards until it meets `fromMessageId` or an empty page,
 * - never treats a short page as the end of the history.
 *
 * @param client - Authenticated TDLib client.
 * @param chatId - Chat to scan.
 * @param fromMessageId - Last message already processed; only messages with a
 *   greater ID are returned. Omit, or pass null/0, to scan the whole history.
 * @param limit - Page size per request (clamped to 1–100).
 * @returns Archives and photos in chronological order (oldest first).
 */
export async function getChannelMessages(
  client: Client,
  chatId: bigint,
  fromMessageId?: bigint | null,
  limit = 100
): Promise<ChannelScanResult> {
  const archives: TelegramMessage[] = [];
  const photos: TelegramPhoto[] = [];

  // Messages with an ID at or below this one were handled by an earlier scan
  const stopAtId = fromMessageId ? Number(fromMessageId) : 0;
  const pageSize = Math.max(1, Math.min(limit, 100));

  let cursor = 0; // 0 = start from the newest message
  let reachedBoundary = false;

  while (!reachedBoundary) {
    const result = (await client.invoke({
      _: "getChatHistory",
      chat_id: Number(chatId),
      from_message_id: cursor,
      offset: 0,
      limit: pageSize,
      only_local: false,
    })) as { messages?: TdMessage[] };

    const page = result.messages ?? [];
    // Only an empty page means the history is exhausted
    if (page.length === 0) break;

    for (const msg of page) {
      // Pages are newest-first, so everything from here on is already known
      if (msg.id <= stopAtId) {
        reachedBoundary = true;
        break;
      }

      // Check for archive documents
      const doc = msg.content?.document;
      if (doc?.file_name && doc.document && isArchiveAttachment(doc.file_name)) {
        archives.push({
          id: BigInt(msg.id),
          fileName: doc.file_name,
          fileId: String(doc.document.id),
          fileSize: BigInt(doc.document.size),
          date: new Date(msg.date * 1000),
        });
        continue;
      }

      // Check for photo messages (potential previews)
      const photo = msg.content?.photo;
      const caption = msg.content?.caption?.text ?? "";
      if (photo?.sizes && photo.sizes.length > 0) {
        // Pick the smallest size for thumbnail (type "s" or "m")
        // TDLib photo sizes are ordered from smallest to largest
        const smallest = photo.sizes[0];
        photos.push({
          id: BigInt(msg.id),
          date: new Date(msg.date * 1000),
          caption,
          fileId: String(smallest.photo.id),
          fileSize: smallest.photo.size || smallest.photo.expected_size,
        });
      }
    }

    const oldestId = page[page.length - 1].id;
    // A cursor that does not move backwards would loop forever
    if (cursor !== 0 && oldestId >= cursor) break;
    cursor = oldestId;

    if (!reachedBoundary) {
      // Rate limit delay
      await sleep(config.apiDelayMs);
    }
  }

  // Return in chronological order (oldest first)
  return {
    archives: archives.reverse(),
    photos: photos.reverse(),
  };
}
/**
 * Fetch a photo thumbnail through TDLib and hand back its bytes.
 *
 * The download is requested synchronously at low priority (thumbnails are
 * small and optional). Any failure is logged as a warning and reported as
 * null rather than thrown, because previews are non-critical.
 *
 * @param client - Authenticated TDLib client.
 * @param fileId - TDLib file id of the thumbnail, as a decimal string.
 * @returns The file contents, or null when the download did not complete or failed.
 */
export async function downloadPhotoThumbnail(
  client: Client,
  fileId: string
): Promise<Buffer | null> {
  const tdFileId = parseInt(fileId, 10);

  try {
    const file = (await client.invoke({
      _: "downloadFile",
      file_id: tdFileId,
      priority: 1, // Low priority — thumbnails are nice-to-have
      offset: 0,
      limit: 0,
      synchronous: true, // Small file — wait for it
    })) as TdFile;

    // Only a completed download with a known path is usable
    const cachedPath = file?.local?.is_downloading_completed
      ? file.local.path
      : "";
    if (!cachedPath) {
      return null;
    }

    const bytes = await readFile(cachedPath);
    log.debug({ fileId, bytes: bytes.length }, "Downloaded photo thumbnail");
    return bytes;
  } catch (downloadError) {
    log.warn({ fileId, err: downloadError }, "Failed to download photo thumbnail");
    return null;
  }
}
/**
 * Snapshot of a running download, as passed to a ProgressCallback.
 */
export interface DownloadProgress {
// TDLib file id (decimal string) and display name, echoed from the request
fileId: string;
fileName: string;
// Bytes downloaded so far, and the expected total
downloadedBytes: number;
totalBytes: number;
// Rounded downloadedBytes/totalBytes * 100; 0 when totalBytes is 0
percent: number;
// True once TDLib reports the download as completed
isComplete: boolean;
}
/** Receives a progress snapshot: once at the start, then on every matching file update. */
export type ProgressCallback = (progress: DownloadProgress) => void;
/**
 * Download a file from Telegram to a local path with progress tracking
 * and integrity verification.
 *
 * Progress flow:
 * 1. Starts async download via TDLib
 * 2. Listens for `updateFile` events to track download progress
 * 3. Logs progress at every 10% increment
 * 4. Once complete, verifies the local file size matches the expected size
 * 5. Moves the file from TDLib's cache to the destination path
 *
 * Failure handling:
 * - Rejects on size mismatch (partial/corrupt download)
 * - Rejects on timeout (10 minutes per GB, minimum 5 minutes) and asks TDLib
 *   to cancel the transfer so it does not keep running in the background
 * - Rejects if the download stops without completing (network error, etc.)
 * - Rejects immediately when `fileId` is not a valid integer
 *
 * The update listener and the timeout are always removed before the returned
 * promise settles, and the promise settles exactly once.
 *
 * @param client - Authenticated TDLib client.
 * @param fileId - TDLib file id as a decimal string.
 * @param destPath - Where the verified file is moved to.
 * @param expectedSize - Size reported by Telegram; 0 disables the size check.
 * @param fileName - Display name used in logs and error messages.
 * @param onProgress - Optional callback invoked at start and on every update.
 */
export async function downloadFile(
  client: Client,
  fileId: string,
  destPath: string,
  expectedSize: bigint,
  fileName: string,
  onProgress?: ProgressCallback
): Promise<void> {
  const numericId = parseInt(fileId, 10);
  if (!Number.isSafeInteger(numericId)) {
    // NaN would never match an update event and would only fail inside TDLib
    throw new Error(`Invalid Telegram file id "${fileId}" for ${fileName}`);
  }
  const totalBytes = Number(expectedSize);

  log.info(
    { fileId, fileName, destPath, totalBytes },
    "Starting file download"
  );

  // Report initial progress
  onProgress?.({
    fileId,
    fileName,
    downloadedBytes: 0,
    totalBytes,
    percent: 0,
    isComplete: false,
  });

  return new Promise<void>((resolve, reject) => {
    let lastLoggedPercent = 0;
    let settled = false;

    // Timeout: 10 minutes per GB, minimum 5 minutes
    const timeoutMs = Math.max(
      5 * 60_000,
      (totalBytes / (1024 * 1024 * 1024)) * 10 * 60_000
    );

    // Run `outcome` at most once, after detaching the listener and the timer
    const settle = (outcome: () => void): void => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      client.off("update", handleUpdate);
      outcome();
    };

    // Verify the cached file and move it into place, then settle the promise
    const completeWith = (cachedPath: string): void => {
      settle(() => {
        verifyAndMove(cachedPath, destPath, totalBytes, fileName, fileId)
          .then(resolve)
          .catch(reject);
      });
    };

    // Listen for file update events to track progress
    const handleUpdate = (update: unknown): void => {
      const event = update as { _?: string; file?: TdFile } | null | undefined;
      if (event?._ !== "updateFile") return;
      const file = event.file;
      if (!file || file.id !== numericId) return;

      const downloaded = file.local.downloaded_size;
      const percent =
        totalBytes > 0 ? Math.round((downloaded / totalBytes) * 100) : 0;

      // Log at every 10% increment
      if (percent >= lastLoggedPercent + 10) {
        lastLoggedPercent = percent - (percent % 10);
        log.info(
          { fileId, fileName, downloaded, totalBytes, percent: `${percent}%` },
          "Download progress"
        );
      }

      // Report to callback
      onProgress?.({
        fileId,
        fileName,
        downloadedBytes: downloaded,
        totalBytes,
        percent,
        isComplete: file.local.is_downloading_completed,
      });

      // Download finished
      if (file.local.is_downloading_completed) {
        completeWith(file.local.path);
        return;
      }

      // Download stopped without completing (network error, cancelled, etc.)
      if (!file.local.is_downloading_active) {
        settle(() => {
          reject(
            new Error(
              `Download stopped unexpectedly for ${fileName} ` +
                `(${downloaded}/${totalBytes} bytes, ${percent}%)`
            )
          );
        });
      }
    };

    const timer = setTimeout(() => {
      settle(() => {
        // Stop the transfer; otherwise TDLib keeps downloading after we gave up
        client
          .invoke({
            _: "cancelDownloadFile",
            file_id: numericId,
            only_if_pending: false,
          })
          .catch((cancelErr: unknown) => {
            log.warn(
              { fileId, fileName, err: cancelErr },
              "Failed to cancel timed-out download"
            );
          });
        reject(
          new Error(
            `Download timed out after ${Math.round(timeoutMs / 60_000)}min for ${fileName}`
          )
        );
      });
    }, timeoutMs);

    // Subscribe to updates BEFORE starting download
    client.on("update", handleUpdate);

    // Start async download (non-blocking — progress via updateFile events)
    client
      .invoke({
        _: "downloadFile",
        file_id: numericId,
        priority: 32,
        offset: 0,
        limit: 0,
        synchronous: false,
      })
      .then((result: unknown) => {
        // If the file was already cached locally, invoke returns immediately
        const file = result as TdFile | undefined;
        if (file?.local?.is_downloading_completed) {
          completeWith(file.local.path);
        }
      })
      .catch((err: unknown) => {
        settle(() => reject(err));
      });
  });
}
/**
 * Verify the downloaded file's size matches the expected size,
 * then move it to the destination path.
 *
 * The move is a rename when possible. When TDLib's cache and the destination
 * live on different filesystems (rename fails with EXDEV, e.g. separate
 * container volumes) the file is copied and the cached original removed.
 *
 * @param localPath - Path of the completed download inside TDLib's cache.
 * @param destPath - Final location of the file.
 * @param expectedBytes - Size reported by Telegram; 0 or less skips the check.
 * @param fileName - Display name for logs and error messages.
 * @param fileId - TDLib file id, for logs.
 * @throws If the file is missing, its size differs from `expectedBytes`, or
 *   it cannot be moved/copied to `destPath`.
 */
async function verifyAndMove(
  localPath: string,
  destPath: string,
  expectedBytes: number,
  fileName: string,
  fileId: string
): Promise<void> {
  const stats = await stat(localPath);
  const actualBytes = stats.size;

  if (expectedBytes > 0 && actualBytes !== expectedBytes) {
    log.error(
      { fileId, fileName, expectedBytes, actualBytes },
      "Download size mismatch — file is incomplete or corrupted"
    );
    throw new Error(
      `Download verification failed for ${fileName}: ` +
        `expected ${expectedBytes} bytes, got ${actualBytes} bytes`
    );
  }

  log.info(
    { fileId, fileName, bytes: actualBytes, destPath },
    "File verified and complete"
  );

  // Move from TDLib's cache to our temp directory
  try {
    await rename(localPath, destPath);
  } catch (err) {
    // rename cannot cross filesystem boundaries; everything else is a real error
    const code = (err as { code?: unknown } | null)?.code;
    if (code !== "EXDEV") throw err;

    await copyFile(localPath, destPath);
    // The file has been delivered; a leftover cache entry is not fatal
    await unlink(localPath).catch((cleanupErr: unknown) => {
      log.warn(
        { fileId, fileName, localPath, err: cleanupErr },
        "Could not remove cached file after cross-device copy"
      );
    });
  }
}
/** Pause for `ms` milliseconds; used as the delay between paged API calls. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}

222
worker/src/tdlib/topics.ts Normal file
View File

@@ -0,0 +1,222 @@
import type { Client } from "tdl";
import { config } from "../util/config.js";
import { childLogger } from "../util/logger.js";
import { isArchiveAttachment } from "../archive/detect.js";
import type { TelegramMessage } from "../archive/multipart.js";
import type { TelegramPhoto } from "../preview/match.js";
import type { ChannelScanResult } from "./download.js";
// Module-scoped logger; "topics" is the name handed to childLogger.
const log = childLogger("topics");
/**
 * A forum topic as returned by getForumTopicList.
 */
export interface ForumTopic {
// The topic's message_thread_id, converted to bigint
topicId: bigint;
// Topic name; "Unnamed" when TDLib reports none
name: string;
}
/**
 * Tell whether a chat is a supergroup with forum topics enabled.
 *
 * The forum flag is looked for on the chat's type first and, failing that,
 * on the supergroup record itself. Any lookup error is logged and treated
 * as "not a forum".
 *
 * @param client - Authenticated TDLib client.
 * @param chatId - Chat to inspect.
 * @returns True only when a forum flag was positively found.
 */
export async function isChatForum(
  client: Client,
  chatId: bigint
): Promise<boolean> {
  try {
    const chat = (await client.invoke({
      _: "getChat",
      chat_id: Number(chatId),
    })) as {
      type?: {
        _: string;
        supergroup_id?: number;
        is_forum?: boolean;
      };
    };

    const chatType = chat.type;
    // Only supergroups can be forums
    if (chatType?._ !== "chatTypeSupergroup") {
      return false;
    }
    if (chatType.is_forum) {
      return true;
    }
    if (!chatType.supergroup_id) {
      return false;
    }

    // Also check via getSupergroup for older TDLib versions
    const supergroup = (await client.invoke({
      _: "getSupergroup",
      supergroup_id: chatType.supergroup_id,
    })) as { is_forum?: boolean };
    return supergroup.is_forum === true;
  } catch (lookupError) {
    log.warn({ err: lookupError, chatId: chatId.toString() }, "Failed to check if chat is forum");
    return false;
  }
}
/**
 * Enumerate the forum topics of a supergroup, page by page.
 *
 * Topics without a thread id and the "General" topic are left out. Paging
 * follows the next_offset_* values returned with each page and stops on an
 * empty page or when no next offset is reported. A rate-limit delay is
 * inserted between pages.
 *
 * @param client - Authenticated TDLib client.
 * @param chatId - Forum supergroup to enumerate.
 * @returns The collected topics, in the order TDLib returned them.
 */
export async function getForumTopicList(
  client: Client,
  chatId: bigint
): Promise<ForumTopic[]> {
  const collected: ForumTopic[] = [];
  const cursor = { date: 0, messageId: 0, threadId: 0 };

  for (;;) {
    const page = (await client.invoke({
      _: "getForumTopics",
      chat_id: Number(chatId),
      query: "",
      offset_date: cursor.date,
      offset_message_id: cursor.messageId,
      offset_message_thread_id: cursor.threadId,
      limit: 100,
    })) as {
      topics?: {
        info?: {
          message_thread_id?: number;
          name?: string;
          is_general?: boolean;
        };
      }[];
      next_offset_date?: number;
      next_offset_message_id?: number;
      next_offset_message_thread_id?: number;
    };

    if (!page.topics || page.topics.length === 0) {
      break;
    }

    for (const entry of page.topics) {
      const info = entry.info;
      // Skip entries without a thread id, and the "General" topic —
      // it's not creator-specific
      if (!info?.message_thread_id || info.is_general) {
        continue;
      }
      collected.push({
        topicId: BigInt(info.message_thread_id),
        name: info.name ?? "Unnamed",
      });
    }

    // Check if there are more pages
    const hasNextPage = Boolean(
      page.next_offset_date ||
        page.next_offset_message_id ||
        page.next_offset_message_thread_id
    );
    if (!hasNextPage) {
      break;
    }

    cursor.date = page.next_offset_date ?? 0;
    cursor.messageId = page.next_offset_message_id ?? 0;
    cursor.threadId = page.next_offset_message_thread_id ?? 0;
    await sleep(config.apiDelayMs);
  }

  log.info(
    { chatId: chatId.toString(), topicCount: collected.length },
    "Enumerated forum topics"
  );
  return collected;
}
/**
 * Fetch messages from a specific forum topic (thread) that are newer than a
 * given message ID, collecting archive attachments and photo messages.
 *
 * Uses getMessageThreadHistory, which pages backwards (newest to oldest) from
 * `from_message_id` and may return fewer messages than requested even when
 * more exist. The scan therefore:
 * - always starts at the newest message (from_message_id = 0),
 * - walks backwards until it meets `fromMessageId` or an empty page,
 * - never treats a short page as the end of the thread.
 *
 * @param client - Authenticated TDLib client.
 * @param chatId - Forum supergroup containing the topic.
 * @param topicId - Thread id of the topic (passed to TDLib as `message_id`).
 * @param fromMessageId - Last message already processed; only messages with a
 *   greater ID are returned. Omit, or pass null/0, to scan the whole topic.
 * @param limit - Page size per request (clamped to 1–100).
 * @returns Archives and photos in chronological order (oldest first).
 */
export async function getTopicMessages(
  client: Client,
  chatId: bigint,
  topicId: bigint,
  fromMessageId?: bigint | null,
  limit = 100
): Promise<ChannelScanResult> {
  const archives: TelegramMessage[] = [];
  const photos: TelegramPhoto[] = [];

  // Messages with an ID at or below this one were handled by an earlier scan
  const stopAtId = fromMessageId ? Number(fromMessageId) : 0;
  const pageSize = Math.max(1, Math.min(limit, 100));

  let cursor = 0; // 0 = start from the newest message
  let reachedBoundary = false;

  while (!reachedBoundary) {
    const result = (await client.invoke({
      _: "getMessageThreadHistory",
      chat_id: Number(chatId),
      message_id: Number(topicId),
      from_message_id: cursor,
      offset: 0,
      limit: pageSize,
    })) as {
      messages?: {
        id: number;
        date: number;
        content: {
          _: string;
          document?: {
            file_name?: string;
            document?: {
              id: number;
              size: number;
            };
          };
          photo?: {
            sizes?: {
              type: string;
              photo: { id: number; size: number; expected_size: number };
              width: number;
              height: number;
            }[];
          };
          caption?: { text?: string };
        };
      }[];
    };

    const page = result.messages ?? [];
    // Only an empty page means the thread is exhausted
    if (page.length === 0) break;

    for (const msg of page) {
      // Pages are newest-first, so everything from here on is already known
      if (msg.id <= stopAtId) {
        reachedBoundary = true;
        break;
      }

      // Check for archive documents
      const doc = msg.content?.document;
      if (doc?.file_name && doc.document && isArchiveAttachment(doc.file_name)) {
        archives.push({
          id: BigInt(msg.id),
          fileName: doc.file_name,
          fileId: String(doc.document.id),
          fileSize: BigInt(doc.document.size),
          date: new Date(msg.date * 1000),
        });
        continue;
      }

      // Check for photo messages (potential previews)
      const photo = msg.content?.photo;
      const caption = msg.content?.caption?.text ?? "";
      if (photo?.sizes && photo.sizes.length > 0) {
        // First entry is used as the thumbnail candidate
        const smallest = photo.sizes[0];
        photos.push({
          id: BigInt(msg.id),
          date: new Date(msg.date * 1000),
          caption,
          fileId: String(smallest.photo.id),
          fileSize: smallest.photo.size || smallest.photo.expected_size,
        });
      }
    }

    const oldestId = page[page.length - 1].id;
    // A cursor that does not move backwards would loop forever
    if (cursor !== 0 && oldestId >= cursor) break;
    cursor = oldestId;

    if (!reachedBoundary) {
      // Rate limit delay
      await sleep(config.apiDelayMs);
    }
  }

  // Return in chronological order (oldest first)
  return {
    archives: archives.reverse(),
    photos: photos.reverse(),
  };
}
/** Return a promise that settles after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((finish) => {
    setTimeout(finish, ms);
  });
}