From 59038889aeaab784736d6bd72186520399399bbf Mon Sep 17 00:00:00 2001
From: xCyanGrizzly
Date: Wed, 6 May 2026 20:39:00 +0200
Subject: [PATCH] fix: prevent pool exhaustion that caused 4-hour duplicate
 check stall
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The pg pool had max=5 connections shared between Prisma operations and
advisory locks. With 2 account locks held permanently and hash locks from
timed-out (but still running) background work, pool.connect() would block
forever, causing the Turnbase.7z stall.

- Increase pool max from 5 to 15 for headroom
- Add 30s connectionTimeoutMillis so pool.connect() throws instead of
  hanging forever when the pool is exhausted
- On startup, terminate zombie PostgreSQL sessions from previous worker
  instances that hold stale advisory locks

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 worker/src/db/client.ts |  9 ++++++++-
 worker/src/index.ts     | 27 +++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/worker/src/db/client.ts b/worker/src/db/client.ts
index 1b125a1..01342bf 100644
--- a/worker/src/db/client.ts
+++ b/worker/src/db/client.ts
@@ -5,7 +5,14 @@ import { config } from "../util/config.js";
 
 const pool = new pg.Pool({
   connectionString: config.databaseUrl,
-  max: 5,
+  // Pool needs headroom for: 2 account advisory locks (held for entire cycle),
+  // up to 2 concurrent hash locks, plus Prisma operations from both accounts.
+  // Previously max=5 caused pool exhaustion and indefinite hangs.
+  max: 15,
+  // Prevent pool.connect() from blocking forever when pool is exhausted.
+  // Throws an error after 30s so the operation can fail and retry instead of
+  // silently hanging for hours (as happened with the Turnbase.7z stall).
+  connectionTimeoutMillis: 30_000,
 });
 
 const adapter = new PrismaPg(pool);
diff --git a/worker/src/index.ts b/worker/src/index.ts
index dc2e556..037865b 100644
--- a/worker/src/index.ts
+++ b/worker/src/index.ts
@@ -27,6 +27,33 @@ async function main(): Promise<void> {
   await cleanupTempDir();
   await markStaleRunsAsFailed();
 
+  // Release any advisory locks orphaned by a previous worker instance.
+  // When Docker kills a container, PostgreSQL may keep the session alive
+  // (zombie connections), holding advisory locks that block the new worker.
+  try {
+    const result = await pool.query(`
+      SELECT pid, state, left(query, 80) as query, age(clock_timestamp(), state_change) as idle_time
+      FROM pg_stat_activity
+      WHERE datname = current_database()
+        AND pid != pg_backend_pid()
+        AND state = 'idle'
+        AND query LIKE '%pg_try_advisory_lock%'
+        AND state_change < clock_timestamp() - interval '5 minutes'
+    `);
+    for (const row of result.rows) {
+      log.warn(
+        { pid: row.pid, idleTime: row.idle_time, query: row.query },
+        "Terminating stale advisory lock session from previous worker"
+      );
+      await pool.query("SELECT pg_terminate_backend($1)", [row.pid]);
+    }
+    if (result.rows.length > 0) {
+      log.info({ terminated: result.rows.length }, "Cleaned up stale advisory lock sessions");
+    }
+  } catch (err) {
+    log.warn({ err }, "Failed to clean up stale advisory locks (non-fatal)");
+  }
+
   // Verify destination messages exist for all "uploaded" packages.
   // Resets any packages whose dest message is missing so they get re-processed.
   await recoverIncompleteUploads();
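
Reviewer notes (not part of the commit message or diff):

The connectionTimeoutMillis behavior is worth verifying in isolation. The
sketch below is a minimal, self-contained demonstration rather than code
from this repo: it forces exhaustion with max=1 and a short timeout, and
assumes a reachable DATABASE_URL. With the timeout set, the second
pool.connect() rejects instead of queueing forever.

import pg from "pg";

const pool = new pg.Pool({
  connectionString: process.env.DATABASE_URL,
  max: 1, // deliberately tiny so the second connect() finds the pool exhausted
  connectionTimeoutMillis: 1_000,
});

async function demo(): Promise<void> {
  const held = await pool.connect(); // checks out the only connection
  try {
    await pool.connect(); // pool exhausted: rejects after ~1s
  } catch (err) {
    console.log("connect() failed fast:", (err as Error).message);
  } finally {
    held.release();
    await pool.end();
  }
}

demo().catch(console.error);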
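
The pool-sizing comment relies on the fact that a session-level advisory
lock pins one pooled connection for as long as it is held: the lock belongs
to the session, so the client cannot be returned to the pool. Below is a
sketch of that pattern, assuming locks keyed by account id; the helper name
and key scheme are hypothetical, not taken from this repo.

import pg from "pg";

async function withAccountLock<T>(
  pool: pg.Pool,
  accountId: number,
  fn: () => Promise<T>
): Promise<T | undefined> {
  const client = await pool.connect(); // pinned while the lock is held
  try {
    const { rows } = await client.query(
      "SELECT pg_try_advisory_lock($1) AS locked",
      [accountId]
    );
    if (!rows[0].locked) return undefined; // another worker holds the lock
    try {
      return await fn();
    } finally {
      await client.query("SELECT pg_advisory_unlock($1)", [accountId]);
    }
  } finally {
    client.release(); // only now does the connection return to the pool
  }
}

With two such locks held for an entire cycle plus up to two concurrent hash
locks, four of the old five connections could be pinned at once, leaving
Prisma operations to starve, which matches the failure mode described above.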
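
One caveat on the startup sweep: it identifies lock holders by matching the
session's last query text, which can miss a zombie whose final statement was
something other than pg_try_advisory_lock. An alternative worth considering
(an assumption, not what this patch implements) is to join pg_locks, which
lists the sessions that actually hold advisory locks:

import pg from "pg";

// List sessions (other than the backend running this query) that currently
// hold advisory locks, regardless of what their last query text was.
async function listAdvisoryLockHolders(pool: pg.Pool): Promise<void> {
  const { rows } = await pool.query(`
    SELECT a.pid, a.state, l.classid, l.objid,
           age(clock_timestamp(), a.state_change) AS idle_time
    FROM pg_locks l
    JOIN pg_stat_activity a ON a.pid = l.pid
    WHERE l.locktype = 'advisory'
      AND a.pid != pg_backend_pid()
  `);
  for (const row of rows) {
    console.log(row);
  }
}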