feat: complete remaining features — training, FTS, bot groups, repair, re-tag

Manual override training (GroupingRule):
- Learn patterns from manual group creation (common filename prefix or creator)
- Apply learned rules as first auto-grouping pass (highest confidence after albums)
- GroupingRule model stores pattern, channel, signal type, confidence

Hash verification after upload:
- Re-hash upload files on disk before indexing to catch disk corruption
- Creates HASH_MISMATCH notification on discrepancy

Grouping conflict detection:
- After all grouping passes, check if grouped packages match rules from different groups
- Creates GROUPING_CONFLICT notification for manual review

Per-channel grouping flags:
- Add autoGroupEnabled boolean to TelegramChannel (default true)
- Auto-grouping passes (all except album) gated behind this flag
- Album grouping always runs as it reflects Telegram's native behavior

Full-text search (tsvector):
- Add searchVector tsvector column with GIN index and auto-update trigger
- Backfill 1870 existing packages
- FTS with ts_rank for ranked results, ILIKE fallback for short/failed queries
- Applied to both web app and bot search

Bot group awareness:
- /group <query> — view group info or search groups by name
- /sendgroup <id> — send all packages in a group to linked Telegram account

Bulk repair:
- repairPackageAction clears dest info and resets watermark for re-processing
- Repair button in notification bell for MISSING_PART and HASH_MISMATCH alerts
- /api/notifications/repair endpoint

Retroactive category re-tagging:
- When channel category changes, auto-update tags on all existing packages
- Removes old category tag, adds new one

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-11 06:11:15 +00:00 · 2026-03-30 14:34:14 +02:00
parent 7f9a03d4ee
commit f4aa9d9a2f
12 changed files with 788 additions and 25 deletions
--- a/worker/src/db/queries.ts
+++ b/worker/src/db/queries.ts
@@ -617,7 +617,7 @@ export async function createAutoGroup(input: {
  sourceChannelId: string;
  name: string;
  packageIds: string[];
-  groupingSource: "AUTO_TIME" | "AUTO_PATTERN" | "AUTO_ZIP" | "AUTO_CAPTION" | "AUTO_REPLY";
+  groupingSource: "ALBUM" | "MANUAL" | "AUTO_TIME" | "AUTO_PATTERN" | "AUTO_ZIP" | "AUTO_CAPTION" | "AUTO_REPLY";
 }): Promise<string> {
  const group = await db.packageGroup.create({
    data: {
--- a/worker/src/grouping.ts
+++ b/worker/src/grouping.ts
@@ -79,6 +79,69 @@ export async function processAlbumGroups(
  }
 }

+/**
+ * Apply learned GroupingRules (from manual overrides) to this run's ungrouped packages.
+ * Rules for the channel are tried in descending confidence. A package matches a rule when
+ * its fileName or its creator contains the rule's pattern, compared case-insensitively.
+ * When two or more still-ungrouped packages match, they are placed in a new group named
+ * after the pattern. Blank patterns are skipped because they would match every package.
+ * A failure to create one group is logged and does not stop the remaining rules.
+ */
+export async function processRuleBasedGroups(
+  sourceChannelId: string,
+  indexedPackages: IndexedPackageRef[]
+): Promise<void> {
+  const rules = await db.groupingRule.findMany({
+    where: { sourceChannelId },
+    orderBy: { confidence: "desc" },
+  });
+  if (rules.length === 0) return;
+
+  const ungrouped = await db.package.findMany({
+    where: {
+      id: { in: indexedPackages.map((p) => p.packageId) },
+      packageGroupId: null,
+    },
+    select: { id: true, fileName: true, creator: true },
+  });
+  if (ungrouped.length < 2) return;
+
+  for (const rule of rules) {
+    // Lower-case once per rule; "".includes("") is true, so a blank pattern matches everything.
+    const needle = rule.pattern.toLowerCase();
+    if (needle.trim() === "") continue;
+
+    const matches = ungrouped.filter((pkg) =>
+      pkg.fileName.toLowerCase().includes(needle) ||
+      (pkg.creator?.toLowerCase().includes(needle) ?? false)
+    );
+    if (matches.length < 2) continue;
+
+    // `ungrouped` is a snapshot taken before the loop; re-read so packages already
+    // claimed by a higher-confidence rule are not grouped twice.
+    const stillUngrouped = await db.package.findMany({
+      where: { id: { in: matches.map((m) => m.id) }, packageGroupId: null },
+      select: { id: true },
+    });
+    if (stillUngrouped.length < 2) continue;
+
+    try {
+      const groupId = await createAutoGroup({
+        sourceChannelId,
+        name: rule.pattern,
+        packageIds: stillUngrouped.map((m) => m.id),
+        groupingSource: "MANUAL",
+      });
+      log.info(
+        { groupId, ruleId: rule.id, pattern: rule.pattern, memberCount: stillUngrouped.length },
+        "Applied learned grouping rule"
+      );
+    } catch (err) {
+      log.warn({ err, ruleId: rule.id }, "Failed to apply grouping rule");
+    }
+  }
+}
+
 /**
 * After album grouping, cluster remaining ungrouped packages from the same channel
 * that were posted within a configurable time window.
@@ -525,6 +588,64 @@ function extractRootFolder(paths: string[]): string | null {
  return maxSegment;
 }

+/**
+ * Flag packages from this run whose current group disagrees with a learned GroupingRule.
+ * A conflict is reported when a grouped package's fileName contains a rule's pattern
+ * (case-insensitive) and the rule was created from a group (createdByGroupId) other than
+ * the package's current group. Groups produced by applying that same rule (groupingSource
+ * "MANUAL", named after the pattern) are not conflicts. At most one GROUPING_CONFLICT
+ * notification is created per package; a failed insert is logged and otherwise ignored.
+ */
+export async function detectGroupingConflicts(
+  sourceChannelId: string,
+  indexedPackages: IndexedPackageRef[]
+): Promise<void> {
+  const rules = await db.groupingRule.findMany({ where: { sourceChannelId } });
+  // Only rules tied to an originating group can conflict; blank patterns would match everything.
+  const candidates = rules
+    .filter((rule) => Boolean(rule.createdByGroupId) && rule.pattern.trim() !== "")
+    .map((rule) => ({ rule, needle: rule.pattern.toLowerCase() }));
+  if (candidates.length === 0) return;
+
+  const grouped = await db.package.findMany({
+    where: {
+      id: { in: indexedPackages.map((p) => p.packageId) },
+      packageGroupId: { not: null },
+    },
+    select: {
+      id: true,
+      fileName: true,
+      packageGroupId: true,
+      packageGroup: { select: { name: true, groupingSource: true } },
+    },
+  });
+
+  for (const pkg of grouped) {
+    const fileName = pkg.fileName.toLowerCase();
+    const hit = candidates.find(({ rule, needle }) =>
+      fileName.includes(needle) &&
+      rule.createdByGroupId !== pkg.packageGroupId &&
+      // A group created by applying this rule (MANUAL, named after the pattern) is not a conflict.
+      !(pkg.packageGroup?.groupingSource === "MANUAL" && pkg.packageGroup?.name === rule.pattern)
+    );
+    if (!hit) continue;
+
+    try {
+      await db.systemNotification.create({
+        data: {
+          type: "GROUPING_CONFLICT",
+          severity: "INFO",
+          title: `Potential grouping conflict: ${pkg.fileName}`,
+          message: `Grouped by ${pkg.packageGroup?.groupingSource ?? "unknown"} into "${pkg.packageGroup?.name}", but also matches rule "${hit.rule.pattern}" from a different manual group`,
+          context: { packageId: pkg.id, fileName: pkg.fileName, currentGroupId: pkg.packageGroupId, matchedRuleId: hit.rule.id, matchedPattern: hit.rule.pattern },
+        },
+      });
+    } catch (err) {
+      log.warn({ err, packageId: pkg.id, ruleId: hit.rule.id }, "Failed to record grouping conflict");
+    }
+  }
+}
+
 /**
 * Find the longest common prefix among a list of filenames,
 * trimming trailing separators and partial words.
--- a/worker/src/worker.ts
+++ b/worker/src/worker.ts
@@ -47,7 +47,7 @@ import { readRarContents } from "./archive/rar-reader.js";
 import { read7zContents } from "./archive/sevenz-reader.js";
 import { byteLevelSplit, concatenateFiles } from "./archive/split.js";
 import { uploadToChannel } from "./upload/channel.js";
-import { processAlbumGroups, processTimeWindowGroups, processPatternGroups, processCreatorGroups, processZipPathGroups, processReplyChainGroups, processCaptionGroups, type IndexedPackageRef } from "./grouping.js";
+import { processAlbumGroups, processRuleBasedGroups, processTimeWindowGroups, processPatternGroups, processCreatorGroups, processZipPathGroups, processReplyChainGroups, processCaptionGroups, detectGroupingConflicts, type IndexedPackageRef } from "./grouping.js";
 import { db } from "./db/client.js";
 import type { TelegramAccount, TelegramChannel } from "@prisma/client";
 import type { Client } from "tdl";
@@ -808,23 +808,37 @@ async function processArchiveSets(
      scanResult.photos
    );

-    // Time-window grouping for remaining ungrouped packages
-    await processTimeWindowGroups(channel.id, indexedPackageRefs);
+    // Auto-grouping passes (gated by per-channel flag)
+    const channelRecord = await db.telegramChannel.findUnique({
+      where: { id: channel.id },
+      select: { autoGroupEnabled: true },
+    });

-    // Pattern-based grouping (date patterns, project slugs)
-    await processPatternGroups(channel.id, indexedPackageRefs);
+    if (channelRecord?.autoGroupEnabled !== false) {
+      // Learned rule-based grouping (from manual overrides)
+      await processRuleBasedGroups(channel.id, indexedPackageRefs);

-    // Creator-based grouping (3+ files from same creator)
-    await processCreatorGroups(channel.id, indexedPackageRefs);
+      // Time-window grouping for remaining ungrouped packages
+      await processTimeWindowGroups(channel.id, indexedPackageRefs);

-    // ZIP path prefix grouping (shared root folder inside archives)
-    await processZipPathGroups(channel.id, indexedPackageRefs);
+      // Pattern-based grouping (date patterns, project slugs)
+      await processPatternGroups(channel.id, indexedPackageRefs);

-    // Reply chain grouping (messages replying to same root)
-    await processReplyChainGroups(channel.id, indexedPackageRefs);
+      // Creator-based grouping (3+ files from same creator)
+      await processCreatorGroups(channel.id, indexedPackageRefs);

-    // Caption fuzzy match grouping
-    await processCaptionGroups(channel.id, indexedPackageRefs);
+      // ZIP path prefix grouping (shared root folder inside archives)
+      await processZipPathGroups(channel.id, indexedPackageRefs);
+
+      // Reply chain grouping (messages replying to same root)
+      await processReplyChainGroups(channel.id, indexedPackageRefs);
+
+      // Caption fuzzy match grouping
+      await processCaptionGroups(channel.id, indexedPackageRefs);
+    }
+
+    // Check for potential grouping conflicts
+    await detectGroupingConflicts(channel.id, indexedPackageRefs);
  }

  return maxProcessedId;
@@ -1162,6 +1176,34 @@ async function processOneArchiveSet(
      );
    }

+    // ── Post-upload integrity check ──
+    // Verify the files on disk still match before we index
+    if (uploadPaths.length > 0 && !existingUpload) {
+      try {
+        const postUploadHash = await hashParts(uploadPaths);
+        if (splitPaths.length > 0) {
+          // Split files — hash should match the split hash (already verified above)
+          // No additional check needed since we verified split hash = original hash
+        } else if (postUploadHash !== contentHash) {
+          accountLog.error(
+            { fileName: archiveName, originalHash: contentHash, postUploadHash },
+            "Hash changed between hashing and upload — possible disk corruption"
+          );
+          await db.systemNotification.create({
+            data: {
+              type: "HASH_MISMATCH",
+              severity: "ERROR",
+              title: `Post-upload hash mismatch: ${archiveName}`,
+              message: `Hash changed between download and upload. Original: ${contentHash.slice(0, 16)}…, post-upload: ${postUploadHash.slice(0, 16)}…`,
+              context: { fileName: archiveName, originalHash: contentHash, postUploadHash, sourceChannelId: channel.id },
+            },
+          });
+        }
+      } catch {
+        // Best-effort — don't fail the ingestion
+      }
+    }
+
    // ── Preview thumbnail ──
    let previewData: Buffer | null = null;
    let previewMsgId: bigint | null = null;