From 59038889aeaab784736d6bd72186520399399bbf Mon Sep 17 00:00:00 2001
From: xCyanGrizzly
Date: Wed, 6 May 2026 20:39:00 +0200
Subject: [PATCH] fix: prevent pool exhaustion that caused 4-hour duplicate
 check stall
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The pg pool had max=5 connections shared between Prisma operations and
advisory locks. With 2 account locks held permanently and hash locks from
timed-out (but still running) background work, pool.connect() would block
forever, causing the Turnbase.7z stall.

- Increase pool max from 5 to 15 for headroom
- Add 30s connectionTimeoutMillis so pool.connect() throws instead of
  hanging forever when the pool is exhausted
- On startup, terminate zombie PostgreSQL sessions from previous worker
  instances that hold stale advisory locks

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 worker/src/db/client.ts |  9 ++++++++-
 worker/src/index.ts     | 27 +++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/worker/src/db/client.ts b/worker/src/db/client.ts
index 1b125a1..01342bf 100644
--- a/worker/src/db/client.ts
+++ b/worker/src/db/client.ts
@@ -5,7 +5,14 @@ import { config } from "../util/config.js";
 
 const pool = new pg.Pool({
   connectionString: config.databaseUrl,
-  max: 5,
+  // Pool needs headroom for: 2 account advisory locks (held for entire cycle),
+  // up to 2 concurrent hash locks, plus Prisma operations from both accounts.
+  // Previously max=5 caused pool exhaustion and indefinite hangs.
+  max: 15,
+  // Prevent pool.connect() from blocking forever when pool is exhausted.
+  // Throws an error after 30s so the operation can fail and retry instead of
+  // silently hanging for hours (as happened with the Turnbase.7z stall).
+  connectionTimeoutMillis: 30_000,
 });
 
 const adapter = new PrismaPg(pool);
diff --git a/worker/src/index.ts b/worker/src/index.ts
index dc2e556..037865b 100644
--- a/worker/src/index.ts
+++ b/worker/src/index.ts
@@ -27,6 +27,33 @@ async function main(): Promise<void> {
   await cleanupTempDir();
   await markStaleRunsAsFailed();
 
+  // Release any advisory locks orphaned by a previous worker instance.
+  // When Docker kills a container, PostgreSQL may keep the session alive
+  // (zombie connections), holding advisory locks that block the new worker.
+  try {
+    const result = await pool.query(`
+      SELECT pid, state, left(query, 80) as query, age(clock_timestamp(), state_change) as idle_time
+      FROM pg_stat_activity
+      WHERE datname = current_database()
+        AND pid != pg_backend_pid()
+        AND state = 'idle'
+        AND query LIKE '%pg_try_advisory_lock%'
+        AND state_change < clock_timestamp() - interval '5 minutes'
+    `);
+    for (const row of result.rows) {
+      log.warn(
+        { pid: row.pid, idleTime: row.idle_time, query: row.query },
+        "Terminating stale advisory lock session from previous worker"
+      );
+      await pool.query("SELECT pg_terminate_backend($1)", [row.pid]);
+    }
+    if (result.rows.length > 0) {
+      log.info({ terminated: result.rows.length }, "Cleaned up stale advisory lock sessions");
+    }
+  } catch (err) {
+    log.warn({ err }, "Failed to clean up stale advisory locks (non-fatal)");
+  }
+
   // Verify destination messages exist for all "uploaded" packages.
   // Resets any packages whose dest message is missing so they get re-processed.
   await recoverIncompleteUploads();
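
Reviewer notes (not part of the commit message or diff):

The connectionTimeoutMillis behavior is worth verifying in isolation. The
sketch below is a minimal, self-contained demonstration rather than code
from this repo: it forces exhaustion with max=1 and a short timeout, and
assumes a reachable DATABASE_URL. With the timeout set, the second
pool.connect() rejects instead of queueing forever.

import pg from "pg";

const pool = new pg.Pool({
  connectionString: process.env.DATABASE_URL,
  max: 1, // deliberately tiny so the second connect() finds the pool exhausted
  connectionTimeoutMillis: 1_000,
});

async function demo(): Promise<void> {
  const held = await pool.connect(); // checks out the only connection
  try {
    await pool.connect(); // pool exhausted: rejects after ~1s
  } catch (err) {
    console.log("connect() failed fast:", (err as Error).message);
  } finally {
    held.release();
    await pool.end();
  }
}

demo().catch(console.error);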
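
The pool-sizing comment relies on the fact that a session-level advisory
lock pins one pooled connection for as long as it is held: the lock belongs
to the session, so the client cannot be returned to the pool. Below is a
sketch of that pattern, assuming locks keyed by account id; the helper name
and key scheme are hypothetical, not taken from this repo.

import pg from "pg";

async function withAccountLock<T>(
  pool: pg.Pool,
  accountId: number,
  fn: () => Promise<T>
): Promise<T | undefined> {
  const client = await pool.connect(); // pinned while the lock is held
  try {
    const { rows } = await client.query(
      "SELECT pg_try_advisory_lock($1) AS locked",
      [accountId]
    );
    if (!rows[0].locked) return undefined; // another worker holds the lock
    try {
      return await fn();
    } finally {
      await client.query("SELECT pg_advisory_unlock($1)", [accountId]);
    }
  } finally {
    client.release(); // only now does the connection return to the pool
  }
}

With two such locks held for an entire cycle plus up to two concurrent hash
locks, four of the old five connections could be pinned at once, leaving
Prisma operations to starve, which matches the failure mode described above.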
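
One caveat on the startup sweep: it identifies lock holders by matching the
session's last query text, which can miss a zombie whose final statement was
something other than pg_try_advisory_lock. An alternative worth considering
(an assumption, not what this patch implements) is to join pg_locks, which
lists the sessions that actually hold advisory locks:

import pg from "pg";

// List sessions (other than the backend running this query) that currently
// hold advisory locks, regardless of what their last query text was.
async function listAdvisoryLockHolders(pool: pg.Pool): Promise<void> {
  const { rows } = await pool.query(`
    SELECT a.pid, a.state, l.classid, l.objid,
           age(clock_timestamp(), a.state_change) AS idle_time
    FROM pg_locks l
    JOIN pg_stat_activity a ON a.pid = l.pid
    WHERE l.locktype = 'advisory'
      AND a.pid != pg_backend_pid()
  `);
  for (const row of rows) {
    console.log(row);
  }
}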