From 84cc8d995b56bb6e08883aa0d68e702d6ff5418e Mon Sep 17 00:00:00 2001 From: xCyanGrizzly Date: Fri, 22 May 2026 22:47:08 +0200 Subject: [PATCH] fix: fail fast on upload stall instead of retrying on broken client MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously a single TDLib event-stream degradation cost ~45 minutes per archive: 3 retries x 15-min minimum timeout, all on the same broken client. The retries had no chance of succeeding because the underlying issue (missing updateMessageSendSucceeded events) is a client-level problem, not a transient send failure. Now the first stall throws UploadStallError immediately. The caller in processArchiveSets already recreates the TDLib client on UploadStallError, so we drop from ~45 min recovery to ~15 min (one timeout cycle) per stalled archive. The stalled set is recorded in SkippedPackage; with the watermark cap from d99a506 it gets retried on the next ingestion cycle with a fresh client. FLOOD_WAIT retries inside sendWithRetry are unchanged — those handle legitimate rate limiting, not stalls. Co-Authored-By: Claude Opus 4.7 (1M context) --- worker/src/upload/channel.ts | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/worker/src/upload/channel.ts b/worker/src/upload/channel.ts index d22a0a0..301c3e0 100644 --- a/worker/src/upload/channel.ts +++ b/worker/src/upload/channel.ts @@ -119,22 +119,20 @@ async function sendWithRetry( continue; } - // Stall or timeout — retry with a cooldown + // Stall or timeout — fail fast and let the caller recreate the TDLib + // client. Retrying on the same degraded event stream wastes ~15 min + // per attempt because the underlying issue (missing send-success + // events) is client-level, not transient. The set ends up in + // SkippedPackage and the caller's watermark cap ensures it gets + // retried next cycle on a fresh client. const errMsg = err instanceof Error ? err.message : ""; if (errMsg.includes("stalled") || errMsg.includes("timed out")) { - if (!isLastAttempt) { - log.warn( - { fileName, attempt: attempt + 1, maxRetries: MAX_UPLOAD_RETRIES }, - "Upload stalled/timed out — retrying" - ); - await sleep(10_000); - continue; - } - // All stall retries exhausted — throw UploadStallError so the caller - // knows the TDLib client's event stream is likely degraded and can - // recreate the client before continuing. + log.warn( + { fileName, attempt: attempt + 1 }, + "Upload stalled — failing fast so caller can recreate TDLib client" + ); throw new UploadStallError( - `Upload stalled after ${MAX_UPLOAD_RETRIES} retries for ${fileName}` + `Upload stalled for ${fileName}: ${errMsg}` ); }