feat: add two-phase DB write and hash advisory lock to prevent double-uploads

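Before this change the only DB write happened at the very end of ingestion, after metadata extraction and preview download. A crash between the Telegram upload and that single write left the file on the destination channel with no index row, so the next run uploaded it again. The write is now split in two: Phase 1 (createPackageStub) persists the hash-to-destination-message mapping in a try/finally immediately after upload; Phase 2 (updatePackageWithMetadata) fills in the file entries and preview afterwards. A per-content-hash advisory lock (tryAcquireHashLock/releaseHashLock) closes the remaining race where two workers watching the same source channel pick up the same archive concurrently: check for a duplicate, acquire the lock, then re-check before uploading.
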
2026-05-02 23:13:55 +02:00
parent 614c8e5b74
commit b48cc510a4


@@ -2,13 +2,14 @@ import path from "path";
 import { unlink, readdir, mkdir, rm } from "fs/promises";
 import { config } from "./util/config.js";
 import { childLogger } from "./util/logger.js";
-import { tryAcquireLock, releaseLock } from "./db/locks.js";
+import { tryAcquireLock, releaseLock, tryAcquireHashLock, releaseHashLock } from "./db/locks.js";
 import {
   getSourceChannelMappings,
   getGlobalDestinationChannel,
   packageExistsByHash,
   packageExistsBySourceMessage,
-  createPackageWithFiles,
+  createPackageStub,
+  updatePackageWithMetadata,
   createIngestionRun,
   completeIngestionRun,
   failIngestionRun,
@@ -301,6 +302,7 @@ interface PipelineContext {
   /** Forum topic ID (null for non-forum). */
   sourceTopicId: bigint | null;
   accountLog: ReturnType<typeof childLogger>;
+  maxUploadSize: bigint;
 }

 /**
@@ -453,6 +455,7 @@ export async function runWorkerForAccount(
     topicCreator: null,
     sourceTopicId: null,
     accountLog,
+    maxUploadSize: BigInt(config.maxPartSizeMB) * 1024n * 1024n,
   };

   if (forum) {
@@ -1028,6 +1031,30 @@ async function processOneArchiveSet(
      return null;
    }

+    // ── Hash lock: prevent concurrent workers racing on shared-channel archives ──
+    const hashLockAcquired = await tryAcquireHashLock(contentHash);
+    if (!hashLockAcquired) {
+      counters.zipsDuplicate++;
+      accountLog.info(
+        { fileName: archiveName, hash: contentHash.slice(0, 16) },
+        "Hash lock held by another worker — skipping concurrent duplicate"
+      );
+      return null;
+    }
+
+    // Re-check after acquiring lock: another worker may have finished between
+    // the first check above and this point.
+    const existsAfterLock = await packageExistsByHash(contentHash);
+    if (existsAfterLock) {
+      await releaseHashLock(contentHash);
+      counters.zipsDuplicate++;
+      accountLog.debug(
+        { fileName: archiveName, hash: contentHash.slice(0, 16) },
+        "Duplicate detected after acquiring hash lock — skipping"
+      );
+      return null;
+    }
+
    // ── Reading metadata ──
    await updateRunActivity(runId, {
      currentActivity: `Reading file list from ${archiveName}`,
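The tryAcquireHashLock/releaseHashLock helpers live in ./db/locks.js, which this diff does not touch. Acquisition is non-blocking by design: a second worker skips instead of waiting, and the re-check above closes the window between the first duplicate check and lock acquisition. For reference, a minimal sketch of the shape such helpers could take, assuming the backing store is Postgres and the node-postgres driver; the key derivation (first 8 bytes of the hex content hash as a signed 64-bit integer) and the pinned-connection bookkeeping are assumptions, not necessarily what locks.js does:

// Sketch only: the real helpers are in ./db/locks.js, outside this diff.
// Postgres advisory locks are session-scoped, so each held lock pins the pool
// connection it was acquired on until release. pg_try_advisory_lock takes one
// bigint key; folding the hex hash down to its first 64 bits is an assumed
// convention here (a collision merely serializes two unrelated archives).
import pg from "pg";

const pool = new pg.Pool(); // connection settings from PG* env vars
const held = new Map<bigint, pg.PoolClient>();

function hashToLockKey(contentHash: string): bigint {
  return BigInt.asIntN(64, BigInt("0x" + contentHash.slice(0, 16)));
}

export async function tryAcquireHashLock(contentHash: string): Promise<boolean> {
  const key = hashToLockKey(contentHash);
  const client = await pool.connect();
  const { rows } = await client.query(
    "SELECT pg_try_advisory_lock($1) AS locked",
    [key.toString()]
  );
  if (!rows[0].locked) {
    client.release(); // held by another worker; hand the connection back
    return false;
  }
  held.set(key, client); // keep this session open while we hold the lock
  return true;
}

export async function releaseHashLock(contentHash: string): Promise<void> {
  const key = hashToLockKey(contentHash);
  const client = held.get(key);
  if (!client) return; // never acquired, or already released
  await client.query("SELECT pg_advisory_unlock($1)", [key.toString()]);
  held.delete(key);
  client.release();
}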
@@ -1070,7 +1097,7 @@ async function processOneArchiveSet(
      (sum, p) => sum + p.fileSize,
      0n
    );
-    const MAX_UPLOAD_SIZE = BigInt(config.maxPartSizeMB) * 1024n * 1024n;
+    const MAX_UPLOAD_SIZE = ctx.maxUploadSize;
    const hasOversizedPart = archiveSet.parts.some((p) => p.fileSize > MAX_UPLOAD_SIZE);

    if (hasOversizedPart) {
@@ -1140,72 +1167,118 @@ async function processOneArchiveSet(
      );
    }

-    // ── Uploading ──
-    // Check if a prior run already uploaded this file (orphaned upload scenario:
-    // file reached Telegram but DB write failed or worker crashed before indexing)
-    const existingUpload = await getUploadedPackageByHash(contentHash);
-    let destResult: { messageId: bigint; messageIds: bigint[] };
-    if (existingUpload && existingUpload.destMessageId) {
-      accountLog.info(
-        { fileName: archiveName, destMessageId: Number(existingUpload.destMessageId) },
-        "Reusing existing upload (file already on destination channel)"
-      );
-      destResult = {
-        messageId: existingUpload.destMessageId,
-        messageIds: existingUpload.destMessageIds?.length
-          ? (existingUpload.destMessageIds as bigint[])
-          : [existingUpload.destMessageId],
-      };
-    } else {
-      const uploadLabel = uploadPaths.length > 1
-        ? ` (${uploadPaths.length} parts)`
-        : "";
-      await updateRunActivity(runId, {
-        currentActivity: `Uploading ${archiveName} to archive channel${uploadLabel}`,
-        currentStep: "uploading",
-        currentChannel: channelTitle,
-        currentFile: archiveName,
-        currentFileNum: setIdx + 1,
-        totalFiles: totalSets,
-      });
-      destResult = await uploadToChannel(
-        client,
-        destChannelTelegramId,
-        uploadPaths
-      );
-    }
-
-    // ── Post-upload integrity check ──
-    // Verify the files on disk still match before we index
-    if (uploadPaths.length > 0 && !existingUpload) {
-      try {
-        const postUploadHash = await hashParts(uploadPaths);
-        if (splitPaths.length > 0) {
-          // Split files — hash should match the split hash (already verified above)
-          // No additional check needed since we verified split hash = original hash
-        } else if (postUploadHash !== contentHash) {
-          accountLog.error(
-            { fileName: archiveName, originalHash: contentHash, postUploadHash },
-            "Hash changed between hashing and upload — possible disk corruption"
-          );
-          await db.systemNotification.create({
-            data: {
-              type: "HASH_MISMATCH",
-              severity: "ERROR",
-              title: `Post-upload hash mismatch: ${archiveName}`,
-              message: `Hash changed between download and upload. Original: ${contentHash.slice(0, 16)}…, post-upload: ${postUploadHash.slice(0, 16)}`,
-              context: { fileName: archiveName, originalHash: contentHash, postUploadHash, sourceChannelId: channel.id },
-            },
-          });
-        }
-      } catch {
-        // Best-effort — don't fail the ingestion
-      }
-    }
+    // Hoist creator/tags so they're visible after try block for logging
+    let creator: string | null = null;
+    const tags: string[] = [];
+
+    let stub: { id: string } | null = null;
+    try {
+      // ── Uploading ──
+      // Check if a prior run already uploaded this file (orphaned upload scenario:
+      // file reached Telegram but DB write failed or worker crashed before indexing)
+      const existingUpload = await getUploadedPackageByHash(contentHash);
+      let destResult: { messageId: bigint; messageIds: bigint[] };
+
+      if (existingUpload && existingUpload.destMessageId) {
+        accountLog.info(
+          { fileName: archiveName, destMessageId: Number(existingUpload.destMessageId) },
+          "Reusing existing upload (file already on destination channel)"
+        );
+        destResult = {
+          messageId: existingUpload.destMessageId,
+          messageIds: existingUpload.destMessageIds?.length
+            ? (existingUpload.destMessageIds as bigint[])
+            : [existingUpload.destMessageId],
+        };
+      } else {
+        const uploadLabel = uploadPaths.length > 1
+          ? ` (${uploadPaths.length} parts)`
+          : "";
+        await updateRunActivity(runId, {
+          currentActivity: `Uploading ${archiveName} to archive channel${uploadLabel}`,
+          currentStep: "uploading",
+          currentChannel: channelTitle,
+          currentFile: archiveName,
+          currentFileNum: setIdx + 1,
+          totalFiles: totalSets,
+        });
+        destResult = await uploadToChannel(
+          client,
+          destChannelTelegramId,
+          uploadPaths
+        );
+      }
+
+      // ── Post-upload integrity check ──
+      // Verify the files on disk still match before we index
+      if (uploadPaths.length > 0 && !existingUpload) {
+        try {
+          const postUploadHash = await hashParts(uploadPaths);
+          if (splitPaths.length > 0) {
+            // Split files — hash should match the split hash (already verified above)
+            // No additional check needed since we verified split hash = original hash
+          } else if (postUploadHash !== contentHash) {
+            accountLog.error(
+              { fileName: archiveName, originalHash: contentHash, postUploadHash },
+              "Hash changed between hashing and upload — possible disk corruption"
+            );
+            await db.systemNotification.create({
+              data: {
+                type: "HASH_MISMATCH",
+                severity: "ERROR",
+                title: `Post-upload hash mismatch: ${archiveName}`,
+                message: `Hash changed between download and upload. Original: ${contentHash.slice(0, 16)}…, post-upload: ${postUploadHash.slice(0, 16)}`,
+                context: { fileName: archiveName, originalHash: contentHash, postUploadHash, sourceChannelId: channel.id },
+              },
+            });
+          }
+        } catch {
+          // Best-effort — don't fail the ingestion
+        }
+      }
+
+      // ── Phase 1: Stub record — persisted immediately after upload ──
+      await deleteOrphanedPackageByHash(contentHash);
+
+      creator =
+        topicCreator ??
+        extractCreatorFromFileName(archiveName) ??
+        extractCreatorFromChannelTitle(channelTitle) ??
+        null;
+
+      if (channel.category) {
+        tags.push(channel.category);
+      }
+
+      stub = await createPackageStub({
+        contentHash,
+        fileName: archiveName,
+        fileSize: totalSize,
+        archiveType: archiveSet.type === "7Z" ? "SEVEN_Z" : archiveSet.type,
+        sourceChannelId: channel.id,
+        sourceMessageId: archiveSet.parts[0].id,
+        sourceTopicId,
+        destChannelId,
+        destMessageId: destResult.messageId,
+        destMessageIds: destResult.messageIds,
+        isMultipart: archiveSet.parts.length > 1 || uploadPaths.length > 1,
+        partCount: uploadPaths.length,
+        ingestionRunId,
+        creator,
+        tags,
+      });
+      counters.zipsIngested++;
+      await deleteSkippedPackage(channel.id, archiveSet.parts[0].id);
+    } finally {
+      await releaseHashLock(contentHash);
+    }
+
+    if (!stub) return null;

    // ── Preview thumbnail ──
+    // (moved here from before stub creation — lock is released, preview doesn't need it)
    let previewData: Buffer | null = null;
    let previewMsgId: bigint | null = null;
    const matchedPhoto = previewMatches.get(archiveSet.baseName);
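What makes Phase 1 crash-safe is the lookup at the top of the try block: if a previous run died after uploading but before indexing, the stub row still maps the content hash to the destination message IDs, so the next run reuses the upload instead of re-sending the bytes. A sketch of what getUploadedPackageByHash presumably queries, using the same Prisma client the diff calls db; the model and field names are assumptions:

// Sketch only: getUploadedPackageByHash is in the db layer, not in this diff.
// Any package row that already carries a destination message ID represents a
// completed upload, whether or not Phase 2 ever ran.
export async function getUploadedPackageByHash(contentHash: string) {
  return db.package.findFirst({
    where: { contentHash, destMessageId: { not: null } },
    select: { destMessageId: true, destMessageIds: true },
  });
}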
@@ -1219,8 +1292,6 @@ async function processOneArchiveSet(
          totalFiles: totalSets,
        });
        previewData = await downloadPhotoThumbnail(client, matchedPhoto.fileId);
-        // Only set previewMsgId if we actually got the image data —
-        // otherwise the UI thinks there's a preview but the API returns 404
        if (previewData) {
          previewMsgId = matchedPhoto.id;
        }
@@ -1243,13 +1314,7 @@ async function processOneArchiveSet(
      }
    }

-    // ── Resolve creator: topic name > filename extraction > channel title > null ──
-    const creator = topicCreator
-      ?? extractCreatorFromFileName(archiveName)
-      ?? extractCreatorFromChannelTitle(channelTitle)
-      ?? null;
-
-    // ── Indexing ──
+    // ── Phase 2: Update stub with file entries and preview ──
    await updateRunActivity(runId, {
      currentActivity: `Saving metadata for ${archiveName} (${entries.length} files)`,
      currentStep: "indexing",
@@ -1259,43 +1324,12 @@ async function processOneArchiveSet(
      totalFiles: totalSets,
    });

-    // Clean up any orphaned record (same hash but no dest upload) before creating
-    await deleteOrphanedPackageByHash(contentHash);
-
-    // Auto-inherit source channel category as initial tag
-    const tags: string[] = [];
-    if (channel.category) {
-      tags.push(channel.category);
-    }
-
-    const pkg = await createPackageWithFiles({
-      contentHash,
-      fileName: archiveName,
-      fileSize: totalSize,
-      archiveType: archiveSet.type === "7Z" ? "SEVEN_Z" : archiveSet.type,
-      sourceChannelId: channel.id,
-      sourceMessageId: archiveSet.parts[0].id,
-      sourceTopicId,
-      destChannelId,
-      destMessageId: destResult.messageId,
-      destMessageIds: destResult.messageIds,
-      isMultipart:
-        archiveSet.parts.length > 1 || uploadPaths.length > 1,
-      partCount: uploadPaths.length,
-      ingestionRunId,
-      creator,
-      tags,
+    await updatePackageWithMetadata(stub.id, {
+      files: entries,
      previewData,
      previewMsgId,
-      sourceCaption: archiveSet.parts[0].caption ?? null,
-      replyToMessageId: archiveSet.parts[0].replyToMessageId ?? null,
-      files: entries,
    });
-    counters.zipsIngested++;
-
-    // Clean up any prior skip record for this archive
-    await deleteSkippedPackage(channel.id, archiveSet.parts[0].id);

    await updateRunActivity(runId, {
      currentActivity: `Ingested ${archiveName} (${entries.length} files indexed)`,
      currentStep: "complete",
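Phase 2 fills in what Phase 1 deferred. A plausible shape for updatePackageWithMetadata, again with assumed Prisma model and field names: file entries and preview land in one transaction, so a crash leaves either a bare stub (recoverable through getUploadedPackageByHash) or a fully indexed package, never a half-indexed one:

// Sketch only: the real implementation lives in the db layer, outside this diff.
export async function updatePackageWithMetadata(
  packageId: string,
  meta: {
    files: { path: string; size: bigint }[]; // entry shape is an assumption
    previewData: Buffer | null;
    previewMsgId: bigint | null;
  }
): Promise<void> {
  await db.$transaction([
    db.packageFile.createMany({
      data: meta.files.map((f) => ({ packageId, path: f.path, size: f.size })),
    }),
    db.package.update({
      where: { id: packageId },
      data: {
        previewData: meta.previewData,
        previewMsgId: meta.previewMsgId,
        indexedAt: new Date(), // assumed "metadata complete" marker
      },
    }),
  ]);
}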
@@ -1311,7 +1345,7 @@ async function processOneArchiveSet(
      "Archive ingested"
    );

-    return pkg.id;
+    return stub.id;
  } finally {
    // ALWAYS delete temp files and the set directory
    await deleteFiles([...tempPaths, ...splitPaths]);