feat: grouping phase 1 — schema, ungrouped tab, time-window grouping, hash verification

Schema:
- Add GroupingSource enum (ALBUM, MANUAL, AUTO_TIME, AUTO_PATTERN, etc.)
- Add groupingSource field to PackageGroup with backfill
- Add SystemNotification model for persistent alerts
- Add NotificationType and NotificationSeverity enums

Ungrouped staging tab:
- Add listUngroupedPackages/countUngroupedPackages queries
- Add "Ungrouped" tab to STL page showing packages without a group

Time-window auto-grouping:
- After album grouping, cluster ungrouped packages within configurable
  time window (default 5 min, AUTO_GROUP_TIME_WINDOW_MINUTES env var)
- Groups named from common filename prefix
- Groups created with groupingSource=AUTO_TIME

Hash verification after split:
- Re-hash split parts and compare to original contentHash
- Log error and create SystemNotification on mismatch
- Prevents silently uploading corrupted split parts

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-30 13:00:27 +02:00
parent 194c87a256
commit 9e78cc5d19
10 changed files with 415 additions and 5 deletions

View File

@@ -587,3 +587,24 @@ export async function linkPackagesToGroup(
data: { packageGroupId: groupId },
});
}
/**
 * Create a time-window auto-group and link the given packages to it.
 *
 * Both writes run inside a single transaction so a failure while linking
 * members cannot leave an empty orphan group behind (the original code
 * performed the create and the updateMany as two independent writes).
 *
 * @param input.sourceChannelId - channel the packages were indexed from
 * @param input.name - display name for the new group
 * @param input.packageIds - ids of the packages to attach to the group
 * @returns the id of the newly created group
 */
export async function createTimeWindowGroup(input: {
  sourceChannelId: string;
  name: string;
  packageIds: string[];
}): Promise<string> {
  return db.$transaction(async (tx) => {
    const group = await tx.packageGroup.create({
      data: {
        sourceChannelId: input.sourceChannelId,
        name: input.name,
        // Marks the group as created by time-window clustering, as opposed
        // to album/manual grouping.
        groupingSource: "AUTO_TIME",
      },
    });
    await tx.package.updateMany({
      where: { id: { in: input.packageIds } },
      data: { packageGroupId: group.id },
    });
    return group.id;
  });
}

View File

@@ -1,7 +1,8 @@
import type { Client } from "tdl";
import type { TelegramPhoto } from "./preview/match.js";
import { downloadPhotoThumbnail } from "./tdlib/download.js";
import { createOrFindPackageGroup, linkPackagesToGroup } from "./db/queries.js";
import { createOrFindPackageGroup, linkPackagesToGroup, createTimeWindowGroup } from "./db/queries.js";
import { config } from "./util/config.js";
import { childLogger } from "./util/logger.js";
import { db } from "./db/client.js";
@@ -77,3 +78,95 @@ export async function processAlbumGroups(
}
}
}
/**
 * Cluster packages from this scan cycle that remained ungrouped after album
 * grouping, using indexing-time proximity: consecutive packages (in message
 * order) whose indexedAt timestamps fall within the configured window end up
 * in one group. Disabled when the configured window is zero or negative.
 *
 * NOTE(review): clustering uses indexedAt (when the scanner saw the message),
 * not the Telegram post time — confirm this is the intended proxy.
 */
export async function processTimeWindowGroups(
  sourceChannelId: string,
  indexedPackages: IndexedPackageRef[]
): Promise<void> {
  if (config.autoGroupTimeWindowMinutes <= 0) return;

  // Of the packages indexed this cycle, keep only those still without a group.
  const candidates = await db.package.findMany({
    where: {
      id: { in: indexedPackages.map((p) => p.packageId) },
      packageGroupId: null,
    },
    orderBy: { sourceMessageId: "asc" },
    select: {
      id: true,
      fileName: true,
      sourceMessageId: true,
      indexedAt: true,
    },
  });
  if (candidates.length < 2) return;

  const windowMs = config.autoGroupTimeWindowMinutes * 60 * 1000;

  // Walk the message-ordered list and cut a new cluster whenever the
  // indexedAt gap between neighbours exceeds the window.
  type Candidate = (typeof candidates)[number];
  const clusters: Candidate[][] = [];
  let run: Candidate[] = [];
  for (const pkg of candidates) {
    const last = run[run.length - 1];
    if (
      last !== undefined &&
      Math.abs(pkg.indexedAt.getTime() - last.indexedAt.getTime()) > windowMs
    ) {
      clusters.push(run);
      run = [];
    }
    run.push(pkg);
  }
  clusters.push(run);

  // Only clusters with at least two members become groups; singletons stay
  // ungrouped for a later cycle or manual grouping.
  for (const cluster of clusters) {
    if (cluster.length < 2) continue;
    // Name the group from the shared filename prefix, falling back to the
    // first member's full filename when no usable prefix exists.
    const groupName =
      findCommonPrefix(cluster.map((p) => p.fileName)) || cluster[0].fileName;
    try {
      const groupId = await createTimeWindowGroup({
        sourceChannelId,
        name: groupName,
        packageIds: cluster.map((p) => p.id),
      });
      log.info(
        { groupId, name: groupName, memberCount: cluster.length },
        "Created time-window group"
      );
    } catch (err) {
      log.warn({ err, clusterSize: cluster.length }, "Failed to create time-window group");
    }
  }
}
/**
 * Compute the longest prefix shared by every filename in the list, then strip
 * trailing separator characters (whitespace, '-', '_', '.', '(').
 * Returns "" when no names are given or the cleaned prefix is shorter than
 * 3 characters; a single name is returned unchanged.
 */
function findCommonPrefix(names: string[]): string {
  if (names.length === 0) return "";
  if (names.length === 1) return names[0];
  // Shrink the candidate one character at a time until every name starts
  // with it; an empty candidate means there is no common prefix at all.
  let candidate = names[0];
  for (const name of names.slice(1)) {
    while (candidate.length > 0 && !name.startsWith(candidate)) {
      candidate = candidate.slice(0, candidate.length - 1);
    }
    if (candidate.length === 0) return "";
  }
  // Drop dangling separators so "Album (" becomes "Album"; reject prefixes
  // too short to be a meaningful group name.
  const cleaned = candidate.replace(/[\s\-_.(]+$/, "");
  return cleaned.length >= 3 ? cleaned : "";
}

View File

@@ -10,6 +10,8 @@ export const config = {
/** Maximum file part size for Telegram upload (in MiB). Default 1950 (under 2GB non-Premium limit).
* Set to 3900 for Premium accounts (under 4GB limit). */
maxPartSizeMB: parseInt(process.env.MAX_PART_SIZE_MB ?? "1950", 10),
/** Time window for auto-grouping ungrouped packages from the same channel (minutes). 0 = disabled. */
autoGroupTimeWindowMinutes: parseInt(process.env.AUTO_GROUP_TIME_WINDOW_MINUTES ?? "5", 10),
/** Maximum jitter added to scheduler interval (in minutes) */
jitterMinutes: 5,
/** Maximum time span for multipart archive parts (in hours). 0 = no limit. */

View File

@@ -47,7 +47,8 @@ import { readRarContents } from "./archive/rar-reader.js";
import { read7zContents } from "./archive/sevenz-reader.js";
import { byteLevelSplit, concatenateFiles } from "./archive/split.js";
import { uploadToChannel } from "./upload/channel.js";
import { processAlbumGroups, type IndexedPackageRef } from "./grouping.js";
import { processAlbumGroups, processTimeWindowGroups, type IndexedPackageRef } from "./grouping.js";
import { db } from "./db/client.js";
import type { TelegramAccount, TelegramChannel } from "@prisma/client";
import type { Client } from "tdl";
@@ -790,6 +791,9 @@ async function processArchiveSets(
indexedPackageRefs,
scanResult.photos
);
// Time-window grouping for remaining ungrouped packages
await processTimeWindowGroups(channel.id, indexedPackageRefs);
}
return maxProcessedId;
@@ -1053,6 +1057,43 @@ async function processOneArchiveSet(
uploadPaths = splitPaths;
}
// ── Hash verification after split ──
// If we split/repacked, verify the split parts hash matches the original
if (splitPaths.length > 0) {
const splitHash = await hashParts(splitPaths);
if (splitHash !== contentHash) {
accountLog.error(
{ fileName: archiveName, originalHash: contentHash, splitHash, parts: splitPaths.length },
"Hash mismatch after split — file may be corrupted"
);
// Record notification for visibility
try {
await db.systemNotification.create({
data: {
type: "HASH_MISMATCH",
severity: "ERROR",
title: `Hash mismatch after splitting ${archiveName}`,
message: `Expected ${contentHash.slice(0, 16)}… but got ${splitHash.slice(0, 16)}… after splitting into ${splitPaths.length} parts`,
context: {
fileName: archiveName,
originalHash: contentHash,
splitHash,
partCount: splitPaths.length,
sourceChannelId: channel.id,
},
},
});
} catch {
// Best-effort notification
}
throw new Error(`Hash mismatch after split for ${archiveName}: expected ${contentHash}, got ${splitHash}`);
}
accountLog.debug(
{ fileName: archiveName, hash: contentHash.slice(0, 16), parts: splitPaths.length },
"Split hash verified — matches original"
);
}
// ── Uploading ──
// Check if a prior run already uploaded this file (orphaned upload scenario:
// file reached Telegram but DB write failed or worker crashed before indexing)