Mirror of https://github.com/xCyanGrizzly/DragonsStash.git, synced 2026-05-11 06:11:15 +00:00
feat: add Telegram integration with forum topic support and creator tracking
Adds a full Telegram ZIP ingestion pipeline: a TDLib worker service scans source channels for archive files, deduplicates them by content hash, extracts metadata, uploads to an archive channel, and indexes everything in Postgres.

Forum supergroups are scanned per topic, with the topic name used as the creator. Filename-based creator extraction (e.g. "Mammoth Factory - 2026-01.zip") serves as the fallback.

Also includes an admin UI for managing accounts/channels, simplified account setup (API credentials via env vars), an auth code/password submission dialog, a package browser with a creator column, and live ingestion activity tracking.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
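The resolution order stated above (topic name first, filename parsing as fallback) reduces to a small helper at the ingestion call site. A minimal sketch, assuming a hypothetical resolveCreator helper, topicName value, and import path that are not part of this commit:

import { extractCreatorFromFileName } from "./archive/creator.js";

// Illustrative call site: the topic name wins when the source is a forum
// supergroup; otherwise fall back to parsing the attachment's filename.
function resolveCreator(topicName: string | null, fileName: string): string | null {
  return topicName ?? extractCreatorFromFileName(fileName);
}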
worker/src/archive/creator.ts (new file, 21 lines)
@@ -0,0 +1,21 @@
/**
 * Extract a creator name from common archive file naming patterns.
 *
 * Priority in the worker: topic name > filename extraction.
 * This is the fallback when no forum topic name is available.
 *
 * Patterns handled (split on ` - `):
 *   "Mammoth Factory - 2026-01.zip" → "Mammoth Factory"
 *   "Artist Name - Pack Title.part01.rar" → "Artist Name"
 *   "some_random_file.zip" → null
 */
export function extractCreatorFromFileName(fileName: string): string | null {
  // Strip archive extensions (.zip, .rar, .part01.rar, .z01, etc.)
  const bare = fileName.replace(/(\.(part\d+\.rar|z\d{2}|zip|rar))+$/i, "");

  const idx = bare.indexOf(" - ");
  if (idx <= 0) return null;

  const creator = bare.slice(0, idx).trim();
  return creator.length > 0 ? creator : null;
}
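A quick usage sketch (not part of the commit); the expected values follow the doc comment above:

extractCreatorFromFileName("Mammoth Factory - 2026-01.zip");       // "Mammoth Factory"
extractCreatorFromFileName("Artist Name - Pack Title.part01.rar"); // "Artist Name"
extractCreatorFromFileName("some_random_file.zip");                // null
extractCreatorFromFileName(" - orphan.zip");                       // null (separator at index 0)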
worker/src/archive/detect.ts (new file, 96 lines)
@@ -0,0 +1,96 @@
export type ArchiveFormat = "ZIP" | "RAR";

export interface MultipartInfo {
  baseName: string;
  partNumber: number;
  format: ArchiveFormat;
  pattern: "ZIP_NUMBERED" | "ZIP_LEGACY" | "RAR_PART" | "RAR_LEGACY" | "SINGLE";
}

const patterns: {
  regex: RegExp;
  format: ArchiveFormat;
  pattern: MultipartInfo["pattern"];
  getBaseName: (match: RegExpMatchArray) => string;
  getPartNumber: (match: RegExpMatchArray) => number;
}[] = [
  // pack.zip.001, pack.zip.002
  {
    regex: /^(.+\.zip)\.(\d{3,})$/i,
    format: "ZIP",
    pattern: "ZIP_NUMBERED",
    getBaseName: (m) => m[1],
    getPartNumber: (m) => parseInt(m[2], 10),
  },
  // pack.z01, pack.z02 (legacy split — final part is pack.zip)
  {
    regex: /^(.+)\.z(\d{2,})$/i,
    format: "ZIP",
    pattern: "ZIP_LEGACY",
    getBaseName: (m) => m[1],
    getPartNumber: (m) => parseInt(m[2], 10),
  },
  // pack.part1.rar, pack.part2.rar
  {
    regex: /^(.+)\.part(\d+)\.rar$/i,
    format: "RAR",
    pattern: "RAR_PART",
    getBaseName: (m) => m[1],
    getPartNumber: (m) => parseInt(m[2], 10),
  },
  // pack.r00, pack.r01 (legacy split — final part is pack.rar)
  {
    regex: /^(.+)\.r(\d{2,})$/i,
    format: "RAR",
    pattern: "RAR_LEGACY",
    getBaseName: (m) => m[1],
    getPartNumber: (m) => parseInt(m[2], 10),
  },
];

/**
 * Detect if a filename is an archive and extract multipart info.
 */
export function detectArchive(fileName: string): MultipartInfo | null {
  // Check multipart patterns first
  for (const p of patterns) {
    const match = fileName.match(p.regex);
    if (match) {
      return {
        baseName: p.getBaseName(match),
        partNumber: p.getPartNumber(match),
        format: p.format,
        pattern: p.pattern,
      };
    }
  }

  // Single .zip file — could be standalone or the final part of a ZIP_LEGACY set
  if (/\.zip$/i.test(fileName)) {
    return {
      baseName: fileName.replace(/\.zip$/i, ""),
      partNumber: -1, // -1 signals "could be single or final legacy part"
      format: "ZIP",
      pattern: "SINGLE",
    };
  }

  // Single .rar file — could be standalone or the final part of a RAR_LEGACY set
  if (/\.rar$/i.test(fileName)) {
    return {
      baseName: fileName.replace(/\.rar$/i, ""),
      partNumber: -1,
      format: "RAR",
      pattern: "SINGLE",
    };
  }

  return null;
}

/**
 * Check if a filename looks like any archive attachment we should process.
 */
export function isArchiveAttachment(fileName: string): boolean {
  return detectArchive(fileName) !== null;
}
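For reference, what detectArchive returns for each naming scheme, hand-traced from the patterns above (not part of the commit):

detectArchive("pack.zip.001");
// { baseName: "pack.zip", partNumber: 1, format: "ZIP", pattern: "ZIP_NUMBERED" }
detectArchive("pack.z01");
// { baseName: "pack", partNumber: 1, format: "ZIP", pattern: "ZIP_LEGACY" }
detectArchive("pack.part2.rar");
// { baseName: "pack", partNumber: 2, format: "RAR", pattern: "RAR_PART" }
detectArchive("pack.zip");
// { baseName: "pack", partNumber: -1, format: "ZIP", pattern: "SINGLE" }
detectArchive("notes.txt");
// null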
worker/src/archive/hash.ts (new file, 25 lines)
@@ -0,0 +1,25 @@
import { createReadStream } from "fs";
import { createHash } from "crypto";

/**
 * Compute SHA-256 hash of one or more files by streaming them in order.
 * Memory usage: O(1) — reads in 64KB chunks regardless of total size.
 * For multipart archives, pass all parts sorted by part number.
 */
export async function hashParts(filePaths: string[]): Promise<string> {
  const hash = createHash("sha256");
  for (const filePath of filePaths) {
    // fs.ReadStream is async-iterable: feed each chunk straight into the
    // hash rather than routing it through a pipeline() + PassThrough pair,
    // whose unconsumed readable side can stall the pipeline promise on
    // some Node versions.
    for await (const chunk of createReadStream(filePath)) {
      hash.update(chunk);
    }
  }
  return hash.digest("hex");
}
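Usage sketch with hypothetical paths; the order of filePaths changes the digest, so multipart callers must sort by part number before hashing:

const contentHash = await hashParts([
  "/tmp/work/pack.z01",
  "/tmp/work/pack.z02",
  "/tmp/work/pack.zip",
]);
// contentHash is a 64-char hex string, presumably the dedup key the
// commit message describes for the Postgres index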
worker/src/archive/multipart.ts (new file, 100 lines)
@@ -0,0 +1,100 @@
import { detectArchive, type ArchiveFormat, type MultipartInfo } from "./detect.js";
import { config } from "../util/config.js";
import { childLogger } from "../util/logger.js";

const log = childLogger("multipart");

export interface TelegramMessage {
  id: bigint;
  fileName: string;
  fileId: string;
  fileSize: bigint;
  date: Date;
}

export interface ArchiveSet {
  type: ArchiveFormat;
  baseName: string;
  parts: TelegramMessage[];
  isMultipart: boolean;
}

/**
 * Group messages into archive sets (single files + multipart groups).
 * Messages should be pre-filtered to only include archive attachments.
 */
export function groupArchiveSets(messages: TelegramMessage[]): ArchiveSet[] {
  // Detect and annotate each message
  const annotated: { msg: TelegramMessage; info: MultipartInfo }[] = [];
  for (const msg of messages) {
    const info = detectArchive(msg.fileName);
    if (info) {
      annotated.push({ msg, info });
    }
  }

  // Group by baseName + format
  const groups = new Map<string, { msg: TelegramMessage; info: MultipartInfo }[]>();
  for (const item of annotated) {
    const key = `${item.info.format}:${item.info.baseName.toLowerCase()}`;
    const group = groups.get(key) ?? [];
    group.push(item);
    groups.set(key, group);
  }

  const results: ArchiveSet[] = [];

  for (const [, group] of groups) {
    const format = group[0].info.format;
    const baseName = group[0].info.baseName;

    // Separate explicit multipart entries from potential singles
    const multipartEntries = group.filter((g) => g.info.pattern !== "SINGLE");
    const singleEntries = group.filter((g) => g.info.pattern === "SINGLE");

    if (multipartEntries.length > 0) {
      // This is a multipart set
      // Check if any single entry is the "final part" of a legacy split
      const allEntries = [...multipartEntries, ...singleEntries];

      // Check time span — skip if parts span too long
      const dates = allEntries.map((e) => e.msg.date.getTime());
      const span = Math.max(...dates) - Math.min(...dates);
      const maxSpanMs = config.multipartTimeoutHours * 60 * 60 * 1000;

      if (span > maxSpanMs) {
        log.warn(
          { baseName, format, span: span / 3600000 },
          "Multipart set spans too long, skipping"
        );
        continue;
      }

      // Sort by part number (singles get a very high number so they come last — they're the final part)
      allEntries.sort((a, b) => {
        const aNum = a.info.partNumber === -1 ? 999999 : a.info.partNumber;
        const bNum = b.info.partNumber === -1 ? 999999 : b.info.partNumber;
        return aNum - bNum;
      });

      results.push({
        type: format,
        baseName,
        parts: allEntries.map((e) => e.msg),
        isMultipart: true,
      });
    } else {
      // All entries are singles — each is its own archive set
      for (const entry of singleEntries) {
        results.push({
          type: format,
          baseName: entry.info.baseName,
          parts: [entry.msg],
          isMultipart: false,
        });
      }
    }
  }

  return results;
}
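A hand-traced example with hypothetical message data, assuming config.multipartTimeoutHours covers the 9-minute span (not part of the commit):

const sets = groupArchiveSets([
  { id: 1n, fileName: "pack.z01", fileId: "a", fileSize: 100n, date: new Date("2026-01-01T10:00:00Z") },
  { id: 2n, fileName: "pack.z02", fileId: "b", fileSize: 100n, date: new Date("2026-01-01T10:05:00Z") },
  { id: 3n, fileName: "pack.zip", fileId: "c", fileSize: 40n, date: new Date("2026-01-01T10:09:00Z") },
  { id: 4n, fileName: "solo.zip", fileId: "d", fileSize: 10n, date: new Date("2026-01-01T11:00:00Z") },
]);
// sets (parts abbreviated to filenames):
//   { type: "ZIP", baseName: "pack", parts: [pack.z01, pack.z02, pack.zip], isMultipart: true }
//   { type: "ZIP", baseName: "solo", parts: [solo.zip], isMultipart: false }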
worker/src/archive/rar-reader.ts (new file, 90 lines)
@@ -0,0 +1,90 @@
import { execFile } from "child_process";
import { promisify } from "util";
import path from "path";
import { childLogger } from "../util/logger.js";
import type { FileEntry } from "./zip-reader.js";

const execFileAsync = promisify(execFile);
const log = childLogger("rar-reader");

/**
 * Parse output of `unrar l -v <file>` to extract file metadata.
 * unrar automatically discovers sibling parts when they're co-located.
 */
export async function readRarContents(
  firstPartPath: string
): Promise<FileEntry[]> {
  try {
    const { stdout } = await execFileAsync("unrar", ["l", "-v", firstPartPath], {
      timeout: 30000,
      maxBuffer: 10 * 1024 * 1024, // 10MB for very large archives
    });

    return parseUnrarOutput(stdout);
  } catch (err) {
    log.warn({ err, file: firstPartPath }, "Failed to read RAR contents");
    return []; // Fallback: return empty on error
  }
}

/**
 * Parse the tabular output of `unrar l -v`.
 *
 * Example output format:
 *   Archive: test.rar
 *   Details: RAR 5
 *
 *   Attributes Size Packed Ratio Date Time CRC-32 Name
 *   ----------- --------- --------- ----- -------- ----- -------- ----
 *   ...A.... 12345 10234 83% 2024-01-15 10:30 DEADBEEF folder/file.stl
 *   ----------- --------- --------- ----- -------- ----- -------- ----
 */
function parseUnrarOutput(output: string): FileEntry[] {
  const entries: FileEntry[] = [];
  const lines = output.split("\n");

  let inFileList = false;
  let separatorCount = 0;

  for (const line of lines) {
    const trimmed = line.trim();

    // Detect separator lines (------- pattern)
    if (/^-{5,}/.test(trimmed)) {
      separatorCount++;
      if (separatorCount === 1) {
        inFileList = true;
      } else if (separatorCount >= 2) {
        inFileList = false;
      }
      continue;
    }

    if (!inFileList) continue;

    // Parse file entry line
    // Format: Attributes Size Packed Ratio Date Time CRC Name
    const match = trimmed.match(
      /^\S+\s+(\d+)\s+(\d+)\s+\d+%\s+\S+\s+\S+\s+([0-9A-Fa-f]+)\s+(.+)$/
    );

    if (match) {
      const [, uncompressedStr, compressedStr, crc32, filePath] = match;

      // Skip directory entries (typically end with / or have size 0 with dir attributes)
      if (filePath.endsWith("/") || filePath.endsWith("\\")) continue;

      const ext = path.extname(filePath).toLowerCase();
      entries.push({
        path: filePath,
        fileName: path.basename(filePath),
        extension: ext ? ext.slice(1) : null,
        compressedSize: BigInt(compressedStr),
        uncompressedSize: BigInt(uncompressedStr),
        crc32: crc32.toLowerCase(),
      });
    }
  }

  return entries;
}
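Usage sketch with a hypothetical path; per the doc comment, unrar discovers the sibling .part2.rar etc. on its own when they sit in the same directory:

const entries = await readRarContents("/tmp/work/pack.part1.rar");
// Each entry reuses the FileEntry shape from zip-reader.ts. The sample row in
// the comment above would parse as:
// { path: "folder/file.stl", fileName: "file.stl", extension: "stl",
//   compressedSize: 10234n, uncompressedSize: 12345n, crc32: "deadbeef" }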
worker/src/archive/split.ts (new file, 48 lines)
@@ -0,0 +1,48 @@
import { createReadStream, createWriteStream } from "fs";
import { stat } from "fs/promises";
import path from "path";
import { pipeline } from "stream/promises";
import { childLogger } from "../util/logger.js";

const log = childLogger("split");

/** 2GB in bytes — Telegram's file size limit */
const MAX_PART_SIZE = 2n * 1024n * 1024n * 1024n;

/**
 * Split a file into ≤2GB parts using byte-level splitting.
 * Returns paths to the split parts. If the file is already ≤2GB, returns the original path.
 */
export async function byteLevelSplit(filePath: string): Promise<string[]> {
  const stats = await stat(filePath);
  const fileSize = BigInt(stats.size);

  if (fileSize <= MAX_PART_SIZE) {
    return [filePath];
  }

  const dir = path.dirname(filePath);
  const baseName = path.basename(filePath);
  const partSize = Number(MAX_PART_SIZE);
  const totalParts = Math.ceil(Number(fileSize) / partSize);
  const parts: string[] = [];

  log.info({ filePath, fileSize: Number(fileSize), totalParts }, "Splitting file");

  for (let i = 0; i < totalParts; i++) {
    const partNum = String(i + 1).padStart(3, "0");
    const partPath = path.join(dir, `${baseName}.${partNum}`);
    const start = i * partSize;
    const end = Math.min(start + partSize - 1, Number(fileSize) - 1);

    await pipeline(
      createReadStream(filePath, { start, end }),
      createWriteStream(partPath)
    );

    parts.push(partPath);
  }

  log.info({ filePath, parts: parts.length }, "File split complete");
  return parts;
}
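Naming sketch with a hypothetical path; note the .zip.NNN suffix lines up with the ZIP_NUMBERED pattern in detect.ts, so split uploads can be regrouped on a later scan:

await byteLevelSplit("/tmp/out/pack.zip");
// 5 GiB input → ["/tmp/out/pack.zip.001", "/tmp/out/pack.zip.002", "/tmp/out/pack.zip.003"]
// 1 GiB input → ["/tmp/out/pack.zip"] (already under the limit, returned as-is)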
worker/src/archive/zip-reader.ts (new file, 61 lines)
@@ -0,0 +1,61 @@
import yauzl from "yauzl";
import path from "path";
import { childLogger } from "../util/logger.js";

const log = childLogger("zip-reader");

export interface FileEntry {
  path: string;
  fileName: string;
  extension: string | null;
  compressedSize: bigint;
  uncompressedSize: bigint;
  crc32: string | null;
}

/**
 * Read the central directory of a ZIP file without extracting any contents.
 * For multipart ZIPs, pass the paths sorted by part order.
 * We attempt to read from the last part first (central directory is at the end).
 */
export async function readZipCentralDirectory(
  filePaths: string[]
): Promise<FileEntry[]> {
  // The central directory lives at the end of the last file
  const targetFile = filePaths[filePaths.length - 1];

  return new Promise((resolve) => {
    yauzl.open(targetFile, { lazyEntries: true, autoClose: true }, (err, zipFile) => {
      if (err) {
        log.warn({ err, file: targetFile }, "Failed to open ZIP for reading");
        resolve([]); // Fallback: return empty on error
        return;
      }

      const entries: FileEntry[] = [];

      zipFile.readEntry();
      zipFile.on("entry", (entry: yauzl.Entry) => {
        // Skip directories
        if (!entry.fileName.endsWith("/")) {
          const ext = path.extname(entry.fileName).toLowerCase();
          entries.push({
            path: entry.fileName,
            fileName: path.basename(entry.fileName),
            extension: ext ? ext.slice(1) : null, // Remove leading dot
            compressedSize: BigInt(entry.compressedSize),
            uncompressedSize: BigInt(entry.uncompressedSize),
            crc32: entry.crc32 !== 0 ? entry.crc32.toString(16).padStart(8, "0") : null,
          });
        }
        zipFile.readEntry();
      });

      zipFile.on("end", () => resolve(entries));
      zipFile.on("error", (error) => {
        log.warn({ error, file: targetFile }, "Error reading ZIP entries");
        resolve(entries); // Return whatever we got
      });
    });
  });
}
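Usage sketch with hypothetical paths; for a legacy split the bare .zip is the final part, so it must come last, since that is the file actually opened:

const entries = await readZipCentralDirectory([
  "/tmp/work/pack.z01",
  "/tmp/work/pack.z02",
  "/tmp/work/pack.zip", // last part holds the central directory
]);
// FileEntry[] with paths, sizes, and CRCs; no file contents are extracted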