Commit

59038889ae fix: prevent pool exhaustion that caused 4-hour duplicate check stall
The pg pool had max=5 connections shared between Prisma operations and
advisory locks. With 2 account locks held permanently and hash locks
from timed-out (but still running) background work, pool.connect()
would block forever — causing the Turnbase.7z stall.

- Increase pool max from 5 to 15 for headroom
- Add 30s connectionTimeoutMillis so pool.connect() throws instead of
  hanging forever when the pool is exhausted
- On startup, terminate zombie PostgreSQL sessions from previous worker
  instances that hold stale advisory locks

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-06 20:39:00 +02:00
2 changed files with 35 additions and 1 deletion
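
For context on the failure mode described above: PostgreSQL session-level advisory locks belong to a single connection, so a worker holding such a lock must keep a client checked out of the pool for the lock's entire lifetime. A minimal sketch of that pattern, assuming a lock helper roughly like the one below (the helper name and lock keys are illustrative, not taken from this repo):

// Hypothetical sketch: each held session-level advisory lock pins one pool client.
import pg from "pg";

const pool = new pg.Pool({ max: 5 });

async function withAccountLock(accountId: number, fn: () => Promise<void>): Promise<void> {
  const client = await pool.connect(); // checks one client out of the pool
  try {
    // pg_advisory_lock is session-scoped: it can only be released from this same
    // connection, so the client cannot return to the pool until fn() completes.
    await client.query("SELECT pg_advisory_lock($1)", [accountId]);
    await fn();
  } finally {
    await client.query("SELECT pg_advisory_unlock($1)", [accountId]);
    client.release(); // only now is the connection available to the pool again
  }
}

With two account locks held for the whole cycle and up to two hash locks pinned the same way, four of the five clients are unavailable; one slow Prisma query can then take the fifth, and the next pool.connect() waits indefinitely when no connectionTimeoutMillis is configured.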


@@ -5,7 +5,14 @@ import { config } from "../util/config.js";

 const pool = new pg.Pool({
   connectionString: config.databaseUrl,
-  max: 5,
+  // Pool needs headroom for: 2 account advisory locks (held for entire cycle),
+  // up to 2 concurrent hash locks, plus Prisma operations from both accounts.
+  // Previously max=5 caused pool exhaustion and indefinite hangs.
+  max: 15,
+  // Prevent pool.connect() from blocking forever when pool is exhausted.
+  // Throws an error after 30s so the operation can fail and retry instead of
+  // silently hanging for hours (as happened with the Turnbase.7z stall).
+  connectionTimeoutMillis: 30_000,
 });
 const adapter = new PrismaPg(pool);
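
With connectionTimeoutMillis set, a checkout from an exhausted pool rejects after 30s instead of queueing forever; this applies to pool.connect() and to pool.query(), which checks a client out internally. A hypothetical retry wrapper a caller could layer on top (not part of this commit):

// Hypothetical helper: turn the pool-checkout timeout into bounded retries.
async function withRetry<T>(op: () => Promise<T>, attempts = 3): Promise<T> {
  for (let i = 1; ; i++) {
    try {
      return await op();
    } catch (err) {
      if (i >= attempts) throw err;
      console.warn(`operation failed (attempt ${i} of ${attempts}), retrying`, err);
    }
  }
}

// Usage sketch: await withRetry(() => pool.query("SELECT 1"));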


@@ -27,6 +27,33 @@ async function main(): Promise<void> {
   await cleanupTempDir();
   await markStaleRunsAsFailed();

+  // Release any advisory locks orphaned by a previous worker instance.
+  // When Docker kills a container, PostgreSQL may keep the session alive
+  // (zombie connections), holding advisory locks that block the new worker.
+  try {
+    const result = await pool.query(`
+      SELECT pid, state, left(query, 80) as query, age(clock_timestamp(), state_change) as idle_time
+      FROM pg_stat_activity
+      WHERE datname = current_database()
+        AND pid != pg_backend_pid()
+        AND state = 'idle'
+        AND query LIKE '%pg_try_advisory_lock%'
+        AND state_change < clock_timestamp() - interval '5 minutes'
+    `);
+    for (const row of result.rows) {
+      log.warn(
+        { pid: row.pid, idleTime: row.idle_time, query: row.query },
+        "Terminating stale advisory lock session from previous worker"
+      );
+      await pool.query("SELECT pg_terminate_backend($1)", [row.pid]);
+    }
+    if (result.rows.length > 0) {
+      log.info({ terminated: result.rows.length }, "Cleaned up stale advisory lock sessions");
+    }
+  } catch (err) {
+    log.warn({ err }, "Failed to clean up stale advisory locks (non-fatal)");
+  }
+
   // Verify destination messages exist for all "uploaded" packages.
   // Resets any packages whose dest message is missing so they get re-processed.
   await recoverIncompleteUploads();
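
To verify that the startup sweep actually freed the orphaned locks, the held advisory locks can be inspected directly in pg_locks. A hypothetical debugging helper, not part of this diff:

import pg from "pg";

// Hypothetical helper: list advisory locks currently held on the database.
// Session-level advisory locks appear in pg_locks with locktype = 'advisory';
// pid identifies the holding session, and classid/objid encode the lock key.
async function listAdvisoryLocks(pool: pg.Pool) {
  const { rows } = await pool.query(
    `SELECT pid, classid, objid, granted
       FROM pg_locks
      WHERE locktype = 'advisory'`
  );
  return rows;
}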