mirror of
https://github.com/xCyanGrizzly/DragonsStash.git
synced 2026-05-10 22:01:16 +00:00
When TDLib's event stream degrades, uploads complete (bytes sent) but confirmations never arrive. Previously the worker retried 3x with the same broken client, wasting 60+ minutes per archive and holding the mutex.

- Add UploadStallError class to distinguish stalls from other failures
- Reduce stall-detection timeout from 5 min to 3 min (faster detection)
- Recreate the TDLib client after consecutive upload stalls instead of retrying on the same degraded connection
- Add forceReleaseMutex() to prevent cascade failures when one account blocks others via a stuck mutex after a cycle timeout

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
204 lines
6.4 KiB
TypeScript
import { config } from "./util/config.js";
|
|
import { childLogger } from "./util/logger.js";
|
|
import { withTdlibMutex, forceReleaseMutex } from "./util/mutex.js";
|
|
import { getActiveAccounts, getPendingAccounts } from "./db/queries.js";
|
|
import { runWorkerForAccount, authenticateAccount } from "./worker.js";
|
|
import { runIntegrityAudit } from "./audit.js";
|
|
|
|
const log = childLogger("scheduler");

// True while a cycle is executing; runCycle() uses this to refuse overlap.
let running = false;
// Handle for the next scheduled cycle; cleared by stopScheduler().
let timer: ReturnType<typeof setTimeout> | null = null;
// Monotonically increasing cycle counter, used only for log context.
let cycleCount = 0;
// Promise for the in-flight cycle, so stopScheduler() can await completion
// before the caller tears down shared resources (e.g. DB connections).
let activeCyclePromise: Promise<void> | null = null;
|
|
/**
|
|
* Maximum time for a single ingestion cycle (ms).
|
|
* After this, new accounts won't be started (in-progress work finishes).
|
|
* Default: 4 hours. Configurable via WORKER_CYCLE_TIMEOUT_MINUTES.
|
|
*/
|
|
const CYCLE_TIMEOUT_MS = (parseInt(process.env.WORKER_CYCLE_TIMEOUT_MINUTES ?? "240", 10)) * 60 * 1000;
|
|
|
|
/**
|
|
* Run one ingestion cycle:
|
|
* 1. Authenticate any PENDING accounts (triggers SMS code flow + auto-fetch channels)
|
|
* 2. Process all active AUTHENTICATED accounts for ingestion
|
|
*
|
|
* Each account's TDLib operations are wrapped in a per-key mutex so different
|
|
* accounts run concurrently while the same account is still serialized.
|
|
*
|
|
* The cycle has a configurable timeout (WORKER_CYCLE_TIMEOUT_MINUTES, default 4h).
|
|
* Once the timeout elapses, no new accounts will be started but any in-progress
|
|
* account processing is allowed to finish its current archive set.
|
|
*/
|
|
async function runCycle(): Promise<void> {
|
|
if (running) {
|
|
log.warn("Previous cycle still running, skipping");
|
|
return;
|
|
}
|
|
|
|
running = true;
|
|
cycleCount++;
|
|
const cycleStart = Date.now();
|
|
log.info({ cycle: cycleCount, timeoutMinutes: CYCLE_TIMEOUT_MS / 60_000 }, "Starting ingestion cycle");
|
|
|
|
try {
|
|
// ── Phase 1: Authenticate pending accounts ──
|
|
const pendingAccounts = await getPendingAccounts();
|
|
if (pendingAccounts.length > 0) {
|
|
log.info(
|
|
{ count: pendingAccounts.length },
|
|
"Found pending accounts, starting authentication"
|
|
);
|
|
for (const account of pendingAccounts) {
|
|
if (Date.now() - cycleStart > CYCLE_TIMEOUT_MS) {
|
|
log.warn("Cycle timeout reached during authentication phase, stopping");
|
|
break;
|
|
}
|
|
await withTdlibMutex(account.phone, `auth:${account.phone}`, () =>
|
|
authenticateAccount(account)
|
|
);
|
|
}
|
|
}
|
|
|
|
// ── Phase 2: Ingest for authenticated accounts ──
|
|
const accounts = await getActiveAccounts();
|
|
|
|
if (accounts.length === 0) {
|
|
log.info("No active authenticated accounts, nothing to ingest");
|
|
return;
|
|
}
|
|
|
|
log.info({ accountCount: accounts.length }, "Processing accounts");
|
|
|
|
const results = await Promise.allSettled(
|
|
accounts.map((account) => {
|
|
let timer: ReturnType<typeof setTimeout>;
|
|
return Promise.race([
|
|
withTdlibMutex(account.phone, `ingest:${account.phone}`, () =>
|
|
runWorkerForAccount(account)
|
|
),
|
|
new Promise<never>((_, reject) => {
|
|
timer = setTimeout(
|
|
() => reject(new Error(`Account ${account.phone} ingestion timed out after ${CYCLE_TIMEOUT_MS / 60_000}min`)),
|
|
CYCLE_TIMEOUT_MS
|
|
);
|
|
}),
|
|
]).finally(() => clearTimeout(timer));
|
|
})
|
|
);
|
|
|
|
for (let i = 0; i < results.length; i++) {
|
|
if (results[i].status === "rejected") {
|
|
const reason = (results[i] as PromiseRejectedResult).reason;
|
|
log.error(
|
|
{ phone: accounts[i].phone, err: reason },
|
|
"Account ingestion failed"
|
|
);
|
|
// If the cycle timed out, force-release the mutex so the next cycle
|
|
// (or other operations like fetch-channels) can proceed immediately
|
|
// instead of waiting 30 minutes for the mutex timeout.
|
|
const errMsg = reason instanceof Error ? reason.message : String(reason);
|
|
if (errMsg.includes("timed out") || errMsg.includes("mutex wait timeout")) {
|
|
forceReleaseMutex(accounts[i].phone);
|
|
}
|
|
}
|
|
}
|
|
|
|
log.info(
|
|
{ elapsed: Math.round((Date.now() - cycleStart) / 1000) },
|
|
"Ingestion cycle complete"
|
|
);
|
|
|
|
// Run integrity audit after all accounts are processed
|
|
try {
|
|
const auditResult = await runIntegrityAudit();
|
|
if (auditResult.issues > 0) {
|
|
log.info({ ...auditResult }, "Integrity audit found issues");
|
|
}
|
|
} catch (auditErr) {
|
|
log.warn({ err: auditErr }, "Integrity audit failed");
|
|
}
|
|
} catch (err) {
|
|
log.error({ err }, "Ingestion cycle failed");
|
|
} finally {
|
|
running = false;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Schedule the next cycle with jitter.
|
|
*/
|
|
function scheduleNext(): void {
|
|
const intervalMs = config.workerIntervalMinutes * 60 * 1000;
|
|
const jitterMs = Math.random() * config.jitterMinutes * 60 * 1000;
|
|
const delay = intervalMs + jitterMs;
|
|
|
|
log.info(
|
|
{ nextRunInMinutes: Math.round(delay / 60000) },
|
|
"Next cycle scheduled"
|
|
);
|
|
|
|
timer = setTimeout(async () => {
|
|
activeCyclePromise = runCycle();
|
|
await activeCyclePromise;
|
|
activeCyclePromise = null;
|
|
scheduleNext();
|
|
}, delay);
|
|
}
|
|
|
|
/**
|
|
* Start the scheduler. Runs an immediate first cycle, then schedules subsequent ones.
|
|
*/
|
|
export async function startScheduler(): Promise<void> {
|
|
log.info(
|
|
{
|
|
intervalMinutes: config.workerIntervalMinutes,
|
|
jitterMinutes: config.jitterMinutes,
|
|
},
|
|
"Scheduler starting"
|
|
);
|
|
|
|
// Run immediately on start
|
|
activeCyclePromise = runCycle();
|
|
await activeCyclePromise;
|
|
activeCyclePromise = null;
|
|
|
|
// Then schedule recurring cycles
|
|
scheduleNext();
|
|
}
|
|
|
|
/**
|
|
* Trigger an immediate ingestion cycle (e.g. from the admin UI).
|
|
* If a cycle is already running, this is a no-op.
|
|
*/
|
|
export async function triggerImmediateCycle(): Promise<void> {
|
|
if (running) {
|
|
log.info("Cycle already running, ignoring trigger");
|
|
return;
|
|
}
|
|
log.info("Immediate cycle triggered via UI");
|
|
await runCycle();
|
|
}
|
|
|
|
/**
|
|
* Stop the scheduler gracefully.
|
|
* Returns a promise that resolves when any active cycle finishes,
|
|
* so callers can wait before closing DB connections.
|
|
*/
|
|
export function stopScheduler(): Promise<void> {
|
|
if (timer) {
|
|
clearTimeout(timer);
|
|
timer = null;
|
|
}
|
|
if (activeCyclePromise) {
|
|
log.info("Scheduler stopping — waiting for active cycle to finish");
|
|
return activeCyclePromise.finally(() => {
|
|
activeCyclePromise = null;
|
|
log.info("Scheduler stopped");
|
|
});
|
|
}
|
|
log.info("Scheduler stopped");
|
|
return Promise.resolve();
|
|
}
|