feat: add two-phase DB write and hash advisory lock to prevent double-uploads

2026-07-24 07:42:47 +00:00 · 2026-05-02 23:13:55 +02:00
parent 614c8e5b74
commit b48cc510a4
1 changed files with 139 additions and 105 deletions
--- a/worker/src/worker.ts
+++ b/worker/src/worker.ts
@@ -2,13 +2,14 @@ import path from "path";
 import { unlink, readdir, mkdir, rm } from "fs/promises";
 import { config } from "./util/config.js";
 import { childLogger } from "./util/logger.js";
-import { tryAcquireLock, releaseLock } from "./db/locks.js";
+import { tryAcquireLock, releaseLock, tryAcquireHashLock, releaseHashLock } from "./db/locks.js";
 import {
  getSourceChannelMappings,
  getGlobalDestinationChannel,
  packageExistsByHash,
  packageExistsBySourceMessage,
-  createPackageWithFiles,
+  createPackageStub,
  updatePackageWithMetadata,
  createIngestionRun,
  completeIngestionRun,
  failIngestionRun,
@@ -301,6 +302,7 @@ interface PipelineContext {
  /** Forum topic ID (null for non-forum). */
  sourceTopicId: bigint | null;
  accountLog: ReturnType<typeof childLogger>;
  maxUploadSize: bigint;
 }
 /**
@@ -453,6 +455,7 @@ export async function runWorkerForAccount(
          topicCreator: null,
          sourceTopicId: null,
          accountLog,
          maxUploadSize: BigInt(config.maxPartSizeMB) * 1024n * 1024n,
        };
        if (forum) {
@@ -1028,6 +1031,30 @@ async function processOneArchiveSet(
      return null;
    }
    // ── Hash lock: prevent concurrent workers racing on shared-channel archives ──
    const hashLockAcquired = await tryAcquireHashLock(contentHash);
    if (!hashLockAcquired) {
      counters.zipsDuplicate++;
      accountLog.info(
        { fileName: archiveName, hash: contentHash.slice(0, 16) },
        "Hash lock held by another worker — skipping concurrent duplicate"
      );
      return null;
    }
    // Re-check after acquiring lock: another worker may have finished between
    // the first check above and this point.
    const existsAfterLock = await packageExistsByHash(contentHash);
    if (existsAfterLock) {
      await releaseHashLock(contentHash);
      counters.zipsDuplicate++;
      accountLog.debug(
        { fileName: archiveName, hash: contentHash.slice(0, 16) },
        "Duplicate detected after acquiring hash lock — skipping"
      );
      return null;
    }
    // ── Reading metadata ──
    await updateRunActivity(runId, {
      currentActivity: `Reading file list from ${archiveName}`,
@@ -1070,7 +1097,7 @@ async function processOneArchiveSet(
      (sum, p) => sum + p.fileSize,
      0n
    );
-    const MAX_UPLOAD_SIZE = BigInt(config.maxPartSizeMB) * 1024n * 1024n;
+    const MAX_UPLOAD_SIZE = ctx.maxUploadSize;
    const hasOversizedPart = archiveSet.parts.some((p) => p.fileSize > MAX_UPLOAD_SIZE);
    if (hasOversizedPart) {
@@ -1140,6 +1167,12 @@ async function processOneArchiveSet(
      );
    }
    // Hoist creator/tags so they're visible after try block for logging
    let creator: string | null = null;
    const tags: string[] = [];
    let stub: { id: string } | null = null;
    try {
      // ── Uploading ──
      // Check if a prior run already uploaded this file (orphaned upload scenario:
      // file reached Telegram but DB write failed or worker crashed before indexing)
@@ -1205,7 +1238,47 @@ async function processOneArchiveSet(
        }
      }
      // ── Phase 1: Stub record — persisted immediately after upload ──
      await deleteOrphanedPackageByHash(contentHash);
      creator =
        topicCreator ??
        extractCreatorFromFileName(archiveName) ??
        extractCreatorFromChannelTitle(channelTitle) ??
        null;
      if (channel.category) {
        tags.push(channel.category);
      }
      stub = await createPackageStub({
        contentHash,
        fileName: archiveName,
        fileSize: totalSize,
        archiveType: archiveSet.type === "7Z" ? "SEVEN_Z" : archiveSet.type,
        sourceChannelId: channel.id,
        sourceMessageId: archiveSet.parts[0].id,
        sourceTopicId,
        destChannelId,
        destMessageId: destResult.messageId,
        destMessageIds: destResult.messageIds,
        isMultipart: archiveSet.parts.length > 1 || uploadPaths.length > 1,
        partCount: uploadPaths.length,
        ingestionRunId,
        creator,
        tags,
      });
      counters.zipsIngested++;
      await deleteSkippedPackage(channel.id, archiveSet.parts[0].id);
    } finally {
      await releaseHashLock(contentHash);
    }
    if (!stub) return null;
    // ── Preview thumbnail ──
    // (moved here from before stub creation — lock is released, preview doesn't need it)
    let previewData: Buffer | null = null;
    let previewMsgId: bigint | null = null;
    const matchedPhoto = previewMatches.get(archiveSet.baseName);
@@ -1219,8 +1292,6 @@ async function processOneArchiveSet(
        totalFiles: totalSets,
      });
      previewData = await downloadPhotoThumbnail(client, matchedPhoto.fileId);
      // Only set previewMsgId if we actually got the image data —
      // otherwise the UI thinks there's a preview but the API returns 404
      if (previewData) {
        previewMsgId = matchedPhoto.id;
      }
@@ -1243,13 +1314,7 @@ async function processOneArchiveSet(
      }
    }
-    // ── Resolve creator: topic name > filename extraction > channel title > null ──
+    // ── Phase 2: Update stub with file entries and preview ──
    const creator = topicCreator
      ?? extractCreatorFromFileName(archiveName)
      ?? extractCreatorFromChannelTitle(channelTitle)
      ?? null;
    // ── Indexing ──
    await updateRunActivity(runId, {
      currentActivity: `Saving metadata for ${archiveName} (${entries.length} files)`,
      currentStep: "indexing",
@@ -1259,43 +1324,12 @@ async function processOneArchiveSet(
      totalFiles: totalSets,
    });
-    // Clean up any orphaned record (same hash but no dest upload) before creating
+    await updatePackageWithMetadata(stub.id, {
-    await deleteOrphanedPackageByHash(contentHash);
+      files: entries,
    // Auto-inherit source channel category as initial tag
    const tags: string[] = [];
    if (channel.category) {
      tags.push(channel.category);
    }
    const pkg = await createPackageWithFiles({
      contentHash,
      fileName: archiveName,
      fileSize: totalSize,
      archiveType: archiveSet.type === "7Z" ? "SEVEN_Z" : archiveSet.type,
      sourceChannelId: channel.id,
      sourceMessageId: archiveSet.parts[0].id,
      sourceTopicId,
      destChannelId,
      destMessageId: destResult.messageId,
      destMessageIds: destResult.messageIds,
      isMultipart:
        archiveSet.parts.length > 1 || uploadPaths.length > 1,
      partCount: uploadPaths.length,
      ingestionRunId,
      creator,
      tags,
      previewData,
      previewMsgId,
      sourceCaption: archiveSet.parts[0].caption ?? null,
      replyToMessageId: archiveSet.parts[0].replyToMessageId ?? null,
      files: entries,
    });
    counters.zipsIngested++;
    // Clean up any prior skip record for this archive
    await deleteSkippedPackage(channel.id, archiveSet.parts[0].id);
    await updateRunActivity(runId, {
      currentActivity: `Ingested ${archiveName} (${entries.length} files indexed)`,
      currentStep: "complete",
@@ -1311,7 +1345,7 @@ async function processOneArchiveSet(
      "Archive ingested"
    );
-    return pkg.id;
+    return stub.id;
  } finally {
    // ALWAYS delete temp files and the set directory
    await deleteFiles([...tempPaths, ...splitPaths]);