feat: grouping phase 1 — schema, ungrouped tab, time-window grouping, hash verification

Schema:
- Add GroupingSource enum (ALBUM, MANUAL, AUTO_TIME, AUTO_PATTERN, etc.)
- Add groupingSource field to PackageGroup with backfill
- Add SystemNotification model for persistent alerts
- Add NotificationType and NotificationSeverity enums

Ungrouped staging tab:
- Add listUngroupedPackages/countUngroupedPackages queries
- Add "Ungrouped" tab to STL page showing packages without a group

Time-window auto-grouping:
- After album grouping, cluster ungrouped packages within configurable
  time window (default 5 min, AUTO_GROUP_TIME_WINDOW_MINUTES env var)
- Groups named from common filename prefix
- Groups created with groupingSource=AUTO_TIME

Hash verification after split:
- Re-hash split parts and compare to original contentHash
- Log error and create SystemNotification on mismatch
- Prevents silently uploading corrupted split parts

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-30 13:00:27 +02:00
parent 194c87a256
commit 9e78cc5d19
10 changed files with 415 additions and 5 deletions

View File

@@ -587,3 +587,24 @@ export async function linkPackagesToGroup(
data: { packageGroupId: groupId },
});
}
/**
 * Create a time-window auto-group and link the given packages to it.
 *
 * Both writes run inside a single transaction so a failure while linking
 * members cannot leave an empty orphan group behind (the original code
 * performed the create and the updateMany as two independent writes).
 *
 * @param input.sourceChannelId - channel the packages were indexed from
 * @param input.name - display name for the new group
 * @param input.packageIds - ids of the packages to attach to the group
 * @returns the id of the newly created group
 */
export async function createTimeWindowGroup(input: {
  sourceChannelId: string;
  name: string;
  packageIds: string[];
}): Promise<string> {
  return db.$transaction(async (tx) => {
    const group = await tx.packageGroup.create({
      data: {
        sourceChannelId: input.sourceChannelId,
        name: input.name,
        // Marks the group as created by time-window clustering, as opposed
        // to album/manual grouping.
        groupingSource: "AUTO_TIME",
      },
    });
    await tx.package.updateMany({
      where: { id: { in: input.packageIds } },
      data: { packageGroupId: group.id },
    });
    return group.id;
  });
}

View File

@@ -1,7 +1,8 @@
import type { Client } from "tdl";
import type { TelegramPhoto } from "./preview/match.js";
import { downloadPhotoThumbnail } from "./tdlib/download.js";
import { createOrFindPackageGroup, linkPackagesToGroup } from "./db/queries.js";
import { createOrFindPackageGroup, linkPackagesToGroup, createTimeWindowGroup } from "./db/queries.js";
import { config } from "./util/config.js";
import { childLogger } from "./util/logger.js";
import { db } from "./db/client.js";
@@ -77,3 +78,95 @@ export async function processAlbumGroups(
}
}
}
/**
 * Cluster packages from this scan cycle that remained ungrouped after album
 * grouping, using indexing-time proximity: consecutive packages (in message
 * order) whose indexedAt timestamps fall within the configured window end up
 * in one group. Disabled when the configured window is zero or negative.
 *
 * NOTE(review): clustering uses indexedAt (when the scanner saw the message),
 * not the Telegram post time — confirm this is the intended proxy.
 */
export async function processTimeWindowGroups(
  sourceChannelId: string,
  indexedPackages: IndexedPackageRef[]
): Promise<void> {
  if (config.autoGroupTimeWindowMinutes <= 0) return;

  // Of the packages indexed this cycle, keep only those still without a group.
  const candidates = await db.package.findMany({
    where: {
      id: { in: indexedPackages.map((p) => p.packageId) },
      packageGroupId: null,
    },
    orderBy: { sourceMessageId: "asc" },
    select: {
      id: true,
      fileName: true,
      sourceMessageId: true,
      indexedAt: true,
    },
  });
  if (candidates.length < 2) return;

  const windowMs = config.autoGroupTimeWindowMinutes * 60 * 1000;

  // Walk the message-ordered list and cut a new cluster whenever the
  // indexedAt gap between neighbours exceeds the window.
  type Candidate = (typeof candidates)[number];
  const clusters: Candidate[][] = [];
  let run: Candidate[] = [];
  for (const pkg of candidates) {
    const last = run[run.length - 1];
    if (
      last !== undefined &&
      Math.abs(pkg.indexedAt.getTime() - last.indexedAt.getTime()) > windowMs
    ) {
      clusters.push(run);
      run = [];
    }
    run.push(pkg);
  }
  clusters.push(run);

  // Only clusters with at least two members become groups; singletons stay
  // ungrouped for a later cycle or manual grouping.
  for (const cluster of clusters) {
    if (cluster.length < 2) continue;
    // Name the group from the shared filename prefix, falling back to the
    // first member's full filename when no usable prefix exists.
    const groupName =
      findCommonPrefix(cluster.map((p) => p.fileName)) || cluster[0].fileName;
    try {
      const groupId = await createTimeWindowGroup({
        sourceChannelId,
        name: groupName,
        packageIds: cluster.map((p) => p.id),
      });
      log.info(
        { groupId, name: groupName, memberCount: cluster.length },
        "Created time-window group"
      );
    } catch (err) {
      log.warn({ err, clusterSize: cluster.length }, "Failed to create time-window group");
    }
  }
}
/**
 * Compute the longest prefix shared by every filename in the list, then strip
 * trailing separator characters (whitespace, '-', '_', '.', '(').
 * Returns "" when no names are given or the cleaned prefix is shorter than
 * 3 characters; a single name is returned unchanged.
 */
function findCommonPrefix(names: string[]): string {
  if (names.length === 0) return "";
  if (names.length === 1) return names[0];
  // Shrink the candidate one character at a time until every name starts
  // with it; an empty candidate means there is no common prefix at all.
  let candidate = names[0];
  for (const name of names.slice(1)) {
    while (candidate.length > 0 && !name.startsWith(candidate)) {
      candidate = candidate.slice(0, candidate.length - 1);
    }
    if (candidate.length === 0) return "";
  }
  // Drop dangling separators so "Album (" becomes "Album"; reject prefixes
  // too short to be a meaningful group name.
  const cleaned = candidate.replace(/[\s\-_.(]+$/, "");
  return cleaned.length >= 3 ? cleaned : "";
}

View File

@@ -10,6 +10,8 @@ export const config = {
/** Maximum file part size for Telegram upload (in MiB). Default 1950 (under 2GB non-Premium limit).
* Set to 3900 for Premium accounts (under 4GB limit). */
maxPartSizeMB: parseInt(process.env.MAX_PART_SIZE_MB ?? "1950", 10),
/** Time window for auto-grouping ungrouped packages from the same channel (minutes). 0 = disabled. */
autoGroupTimeWindowMinutes: parseInt(process.env.AUTO_GROUP_TIME_WINDOW_MINUTES ?? "5", 10),
/** Maximum jitter added to scheduler interval (in minutes) */
jitterMinutes: 5,
/** Maximum time span for multipart archive parts (in hours). 0 = no limit. */

View File

@@ -47,7 +47,8 @@ import { readRarContents } from "./archive/rar-reader.js";
import { read7zContents } from "./archive/sevenz-reader.js";
import { byteLevelSplit, concatenateFiles } from "./archive/split.js";
import { uploadToChannel } from "./upload/channel.js";
import { processAlbumGroups, type IndexedPackageRef } from "./grouping.js";
import { processAlbumGroups, processTimeWindowGroups, type IndexedPackageRef } from "./grouping.js";
import { db } from "./db/client.js";
import type { TelegramAccount, TelegramChannel } from "@prisma/client";
import type { Client } from "tdl";
@@ -790,6 +791,9 @@ async function processArchiveSets(
indexedPackageRefs,
scanResult.photos
);
// Time-window grouping for remaining ungrouped packages
await processTimeWindowGroups(channel.id, indexedPackageRefs);
}
return maxProcessedId;
@@ -1053,6 +1057,43 @@ async function processOneArchiveSet(
uploadPaths = splitPaths;
}
// ── Hash verification after split ──
// If we split/repacked, verify the split parts hash matches the original
if (splitPaths.length > 0) {
const splitHash = await hashParts(splitPaths);
if (splitHash !== contentHash) {
accountLog.error(
{ fileName: archiveName, originalHash: contentHash, splitHash, parts: splitPaths.length },
"Hash mismatch after split — file may be corrupted"
);
// Record notification for visibility
try {
await db.systemNotification.create({
data: {
type: "HASH_MISMATCH",
severity: "ERROR",
title: `Hash mismatch after splitting ${archiveName}`,
message: `Expected ${contentHash.slice(0, 16)}… but got ${splitHash.slice(0, 16)}… after splitting into ${splitPaths.length} parts`,
context: {
fileName: archiveName,
originalHash: contentHash,
splitHash,
partCount: splitPaths.length,
sourceChannelId: channel.id,
},
},
});
} catch {
// Best-effort notification
}
throw new Error(`Hash mismatch after split for ${archiveName}: expected ${contentHash}, got ${splitHash}`);
}
accountLog.debug(
{ fileName: archiveName, hash: contentHash.slice(0, 16), parts: splitPaths.length },
"Split hash verified — matches original"
);
}
// ── Uploading ──
// Check if a prior run already uploaded this file (orphaned upload scenario:
// file reached Telegram but DB write failed or worker crashed before indexing)