feat: group merge, ZIP/reply/caption grouping, integrity audit

Group merge UI:
- Add mergeGroups query and mergeGroupsAction server action
- Add "Start Merge" / "Merge Here" buttons to group row actions
- Two-step UX: click Start on source, click Merge Here on target

ZIP path prefix grouping (Signal 7):
- Compare PackageFile.path root folders across ungrouped packages
- Auto-group if 2+ packages share the same dominant root folder

Reply chain grouping (Signal 6):
- Capture reply_to_message_id during channel scanning
- Group archives that reply to the same root message
- Add replyToMessageId field to Package schema

Caption fuzzy match grouping (Signal 8):
- Capture source caption during channel scanning
- Normalize captions (strip extensions, extract significant words)
- Group packages with matching normalized caption keys
- Add sourceCaption field to Package schema

Periodic integrity audit:
- Check multipart packages for completeness (parts vs destMessageIds)
- Detect orphaned indexes (destChannelId set but no destMessageId)
- Runs after each ingestion cycle, deduplicates notifications

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
commit 7f9a03d4ee (parent 2c46ab0843)
2026-03-30 14:19:36 +02:00
13 changed files with 488 additions and 3 deletions

View File

@@ -0,0 +1,3 @@
-- AlterTable: add sourceCaption and replyToMessageId to packages
ALTER TABLE "packages" ADD COLUMN "sourceCaption" TEXT;
ALTER TABLE "packages" ADD COLUMN "replyToMessageId" BIGINT;

View File

@@ -474,6 +474,8 @@ model Package {
partCount Int @default(1)
fileCount Int @default(0)
tags String[] @default([])
sourceCaption String? // Caption text from source Telegram message
replyToMessageId BigInt? // reply_to_message_id from source message (for reply chain grouping)
previewData Bytes? // JPEG thumbnail from nearby Telegram photo (stored as raw bytes)
previewMsgId BigInt? // Telegram message ID of the matched photo
packageGroupId String?

View File

@@ -1,7 +1,7 @@
"use client";
import { type ColumnDef } from "@tanstack/react-table";
-import { FileArchive, Eye, ChevronRight, Layers, Ungroup, Send, ImagePlus } from "lucide-react";
+import { FileArchive, Eye, ChevronRight, Layers, Ungroup, Send, ImagePlus, GitMerge } from "lucide-react";
import { DataTableColumnHeader } from "@/components/shared/data-table-column-header";
import { Badge } from "@/components/ui/badge";
import { Button } from "@/components/ui/button";
@@ -69,6 +69,9 @@ interface PackageColumnsProps {
onGroupPreviewUpload: (groupId: string) => void;
selectedPackages: Set<string>;
onToggleSelect: (packageId: string) => void;
mergeSourceId: string | null;
onStartMerge: (groupId: string) => void;
onCompleteMerge: (targetGroupId: string) => void;
}
export function formatBytes(bytesStr: string): string {
@@ -148,6 +151,9 @@ export function getPackageColumns({
onGroupPreviewUpload,
selectedPackages,
onToggleSelect,
mergeSourceId,
onStartMerge,
onCompleteMerge,
}: PackageColumnsProps): ColumnDef<StlTableRow, unknown>[] {
return [
{
@@ -392,6 +398,8 @@ export function getPackageColumns({
cell: ({ row }) => {
const data = row.original;
if (isGroupRow(data)) {
const isMergeSource = mergeSourceId === data.id;
const canMergeHere = mergeSourceId !== null && mergeSourceId !== data.id;
return (
<div className="flex items-center gap-0.5">
<Button
@@ -403,6 +411,26 @@ export function getPackageColumns({
>
<Send className="h-4 w-4" />
</Button>
<Button
variant="ghost"
size="icon"
className={`h-8 w-8 ${isMergeSource ? "text-amber-500 bg-amber-500/10 hover:bg-amber-500/20" : ""}`}
onClick={() => onStartMerge(data.id)}
title={isMergeSource ? "Cancel merge (this group is the merge source)" : "Start merge — mark this group as merge source"}
>
<GitMerge className="h-4 w-4" />
</Button>
{canMergeHere && (
<Button
variant="ghost"
size="icon"
className="h-8 w-8 text-primary bg-primary/10 hover:bg-primary/20"
onClick={() => onCompleteMerge(data.id)}
title="Merge source group into this group"
>
<Layers className="h-4 w-4" />
</Button>
)}
<Button
variant="ghost"
size="icon"

View File

@@ -49,6 +49,7 @@ import {
removeFromGroupAction,
sendAllInGroupAction,
updateGroupPreviewAction,
mergeGroupsAction,
} from "../actions";
interface StlTableProps {
@@ -102,6 +103,9 @@ export function StlTable({
const previewInputRef = useRef<HTMLInputElement>(null);
const [uploadGroupId, setUploadGroupId] = useState<string | null>(null);
// Group merge state
const [mergeSourceId, setMergeSourceId] = useState<string | null>(null);
const toggleGroup = useCallback((groupId: string) => {
setExpandedGroups((prev) => {
const next = new Set(prev);
@@ -340,6 +344,35 @@ export function StlTable({
[uploadGroupId, router]
);
const handleStartMerge = useCallback((groupId: string) => {
setMergeSourceId((prev) => {
if (prev === groupId) {
toast.info("Merge cancelled");
return null;
}
toast.info("Merge source selected — click the merge-here button on the target group");
return groupId;
});
}, []);
const handleMergeGroups = useCallback(
(targetGroupId: string) => {
if (!mergeSourceId) return;
const sourceId = mergeSourceId;
startTransition(async () => {
const result = await mergeGroupsAction(targetGroupId, sourceId);
if (result.success) {
toast.success("Groups merged successfully");
setMergeSourceId(null);
router.refresh();
} else {
toast.error(result.error);
}
});
},
[mergeSourceId, router]
);
const columns = getPackageColumns({
onViewFiles: (pkg) => setViewPkg(pkg),
searchTerm,
@@ -381,6 +414,9 @@ export function StlTable({
onGroupPreviewUpload: handleGroupPreviewUpload,
selectedPackages,
onToggleSelect: toggleSelect,
mergeSourceId,
onStartMerge: handleStartMerge,
onCompleteMerge: handleMergeGroups,
});
const { table } = useDataTable({ data: tableRows, columns, pageCount });

View File

@@ -10,6 +10,7 @@ import {
createManualGroup,
removePackageFromGroup,
dissolveGroup,
mergeGroups,
} from "@/lib/telegram/queries";
const ALLOWED_IMAGE_TYPES = [
@@ -435,6 +436,26 @@ export async function updateGroupPreviewAction(
}
}
export async function mergeGroupsAction(
targetGroupId: string,
sourceGroupId: string
): Promise<ActionResult> {
const session = await auth();
if (!session?.user?.id) return { success: false, error: "Unauthorized" };
if (targetGroupId === sourceGroupId) {
return { success: false, error: "Cannot merge a group with itself" };
}
try {
await mergeGroups(targetGroupId, sourceGroupId);
revalidatePath("/stls");
return { success: true, data: undefined };
} catch {
return { success: false, error: "Failed to merge groups" };
}
}
export async function sendAllInGroupAction(
groupId: string
): Promise<ActionResult> {

View File

@@ -736,3 +736,13 @@ export async function dissolveGroup(groupId: string) {
});
await prisma.packageGroup.delete({ where: { id: groupId } });
}
export async function mergeGroups(targetGroupId: string, sourceGroupId: string) {
// Move all packages from source group to target group
await prisma.package.updateMany({
where: { packageGroupId: sourceGroupId },
data: { packageGroupId: targetGroupId },
});
// Delete the now-empty source group
await prisma.packageGroup.delete({ where: { id: sourceGroupId } });
}
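
Note: the move and the delete above run as two separate statements, so a failure between them could leave an empty source group behind. A minimal atomic variant, sketched against the same prisma client (hypothetical helper, not part of this commit):

// Sketch (not in this commit): same effect as mergeGroups, but atomic.
export async function mergeGroupsAtomic(targetGroupId: string, sourceGroupId: string) {
  await prisma.$transaction([
    prisma.package.updateMany({
      where: { packageGroupId: sourceGroupId },
      data: { packageGroupId: targetGroupId },
    }),
    prisma.packageGroup.delete({ where: { id: sourceGroupId } }),
  ]);
}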

View File

@@ -11,6 +11,8 @@ export interface TelegramMessage {
fileSize: bigint;
date: Date;
mediaAlbumId?: string;
replyToMessageId?: bigint; // NEW
caption?: string; // NEW
}
export interface ArchiveSet {

worker/src/audit.ts (new file)
View File

@@ -0,0 +1,117 @@
import { db } from "./db/client.js";
import { childLogger } from "./util/logger.js";
const log = childLogger("audit");
/**
* Periodic integrity audit: checks all packages for consistency.
* Creates SystemNotification records for any issues found.
*
* Checks performed:
* 1. Multipart completeness: destMessageIds.length should match partCount
* 2. Missing destination: packages with destChannelId but no destMessageId
*/
export async function runIntegrityAudit(): Promise<{ checked: number; issues: number }> {
log.info("Starting integrity audit");
let checked = 0;
let issues = 0;
// Check 1: Multipart packages with wrong number of destination message IDs
const multipartPackages = await db.package.findMany({
where: {
isMultipart: true,
partCount: { gt: 1 },
destMessageId: { not: null },
},
select: {
id: true,
fileName: true,
partCount: true,
destMessageIds: true,
sourceChannelId: true,
sourceChannel: { select: { title: true } },
},
});
checked += multipartPackages.length;
for (const pkg of multipartPackages) {
const actualParts = pkg.destMessageIds.length;
if (actualParts > 0 && actualParts !== pkg.partCount) {
issues++;
// Check if we already have a notification for this
const existing = await db.systemNotification.findFirst({
where: {
type: "MISSING_PART",
context: { path: ["packageId"], equals: pkg.id },
},
select: { id: true },
});
if (!existing) {
await db.systemNotification.create({
data: {
type: "MISSING_PART",
severity: "WARNING",
title: `Incomplete multipart: ${pkg.fileName}`,
message: `Expected ${pkg.partCount} parts but only ${actualParts} destination message IDs stored`,
context: {
packageId: pkg.id,
fileName: pkg.fileName,
expectedParts: pkg.partCount,
actualParts,
sourceChannelId: pkg.sourceChannelId,
channelTitle: pkg.sourceChannel.title,
},
},
});
log.warn(
{ packageId: pkg.id, fileName: pkg.fileName, expected: pkg.partCount, actual: actualParts },
"Multipart package has mismatched part count"
);
}
}
}
// Check 2: Packages with dest channel but no dest message (orphaned index)
const orphanedCount = await db.package.count({
where: {
destChannelId: { not: null },
destMessageId: null,
},
});
if (orphanedCount > 0) {
issues++;
const existing = await db.systemNotification.findFirst({
where: {
type: "INTEGRITY_AUDIT",
context: { path: ["check"], equals: "orphaned_index" },
createdAt: { gte: new Date(Date.now() - 24 * 60 * 60 * 1000) },
},
select: { id: true },
});
if (!existing) {
await db.systemNotification.create({
data: {
type: "INTEGRITY_AUDIT",
severity: "INFO",
title: `${orphanedCount} packages with missing destination message`,
message: `Found ${orphanedCount} packages that have a destination channel set but no destination message ID. These may be from interrupted uploads.`,
context: {
check: "orphaned_index",
count: orphanedCount,
},
},
});
}
}
log.info({ checked, issues }, "Integrity audit complete");
return { checked, issues };
}

View File

@@ -119,6 +119,8 @@ export interface CreatePackageInput {
tags?: string[];
previewData?: Buffer | null;
previewMsgId?: bigint | null;
sourceCaption?: string | null;
replyToMessageId?: bigint | null;
files: {
path: string;
fileName: string;
@@ -150,6 +152,8 @@ export async function createPackageWithFiles(input: CreatePackageInput) {
tags: input.tags && input.tags.length > 0 ? input.tags : undefined,
previewData: input.previewData ? new Uint8Array(input.previewData) : undefined,
previewMsgId: input.previewMsgId ?? undefined,
sourceCaption: input.sourceCaption ?? undefined,
replyToMessageId: input.replyToMessageId ?? undefined,
files: {
create: input.files,
},
@@ -613,7 +617,7 @@ export async function createAutoGroup(input: {
sourceChannelId: string;
name: string;
packageIds: string[];
groupingSource: "AUTO_TIME" | "AUTO_PATTERN" | "AUTO_ZIP" | "AUTO_CAPTION";
groupingSource: "AUTO_TIME" | "AUTO_PATTERN" | "AUTO_ZIP" | "AUTO_CAPTION" | "AUTO_REPLY";
}): Promise<string> {
const group = await db.packageGroup.create({
data: {

View File

@@ -288,6 +288,243 @@ export async function processCreatorGroups(
}
}
/**
* Group ungrouped packages that share the same root folder inside their archives.
* E.g., if two packages both contain files under "ProjectX/", they're likely related.
* Only considers packages with 3+ files (to avoid false positives from flat archives).
*/
export async function processZipPathGroups(
sourceChannelId: string,
indexedPackages: IndexedPackageRef[]
): Promise<void> {
// Find ungrouped packages that have indexed files
const ungrouped = await db.package.findMany({
where: {
id: { in: indexedPackages.map((p) => p.packageId) },
packageGroupId: null,
fileCount: { gte: 3 },
},
select: {
id: true,
fileName: true,
files: {
select: { path: true },
take: 50,
},
},
});
if (ungrouped.length < 2) return;
// Extract the dominant root folder for each package
const packageRoots = new Map<string, { id: string; fileName: string }[]>();
for (const pkg of ungrouped) {
const root = extractRootFolder(pkg.files.map((f) => f.path));
if (!root) continue;
const key = root.toLowerCase();
const group = packageRoots.get(key) ?? [];
group.push({ id: pkg.id, fileName: pkg.fileName });
packageRoots.set(key, group);
}
// Create groups for roots shared by 2+ packages
for (const [root, members] of packageRoots) {
if (members.length < 2) continue;
try {
const groupId = await createAutoGroup({
sourceChannelId,
name: root,
packageIds: members.map((m) => m.id),
groupingSource: "AUTO_ZIP",
});
log.info(
{ groupId, rootFolder: root, memberCount: members.length },
"Created ZIP path prefix group"
);
} catch (err) {
log.warn({ err, rootFolder: root }, "Failed to create ZIP path group");
}
}
}
/**
* Group ungrouped packages that reply to the same root message.
* If message B and C both reply to message A, they're grouped together.
*/
export async function processReplyChainGroups(
sourceChannelId: string,
indexedPackages: IndexedPackageRef[]
): Promise<void> {
const ungrouped = await db.package.findMany({
where: {
id: { in: indexedPackages.map((p) => p.packageId) },
packageGroupId: null,
replyToMessageId: { not: null },
},
select: {
id: true,
fileName: true,
replyToMessageId: true,
},
});
if (ungrouped.length < 2) return;
// Group by replyToMessageId
const replyMap = new Map<string, typeof ungrouped>();
for (const pkg of ungrouped) {
if (!pkg.replyToMessageId) continue;
const key = pkg.replyToMessageId.toString();
const group = replyMap.get(key) ?? [];
group.push(pkg);
replyMap.set(key, group);
}
for (const [replyId, members] of replyMap) {
if (members.length < 2) continue;
const name = findCommonPrefix(members.map((m) => m.fileName)) || members[0].fileName;
try {
const groupId = await createAutoGroup({
sourceChannelId,
name,
packageIds: members.map((m) => m.id),
groupingSource: "AUTO_REPLY" as const,
});
log.info(
{ groupId, replyToMessageId: replyId, memberCount: members.length },
"Created reply-chain group"
);
} catch (err) {
log.warn({ err, replyToMessageId: replyId }, "Failed to create reply-chain group");
}
}
}
/**
* Group ungrouped packages with similar captions from the same channel.
* Uses normalized caption comparison — two captions match if they share
* the same significant words (ignoring common words and file extensions).
*/
export async function processCaptionGroups(
sourceChannelId: string,
indexedPackages: IndexedPackageRef[]
): Promise<void> {
const ungrouped = await db.package.findMany({
where: {
id: { in: indexedPackages.map((p) => p.packageId) },
packageGroupId: null,
sourceCaption: { not: null },
},
select: {
id: true,
fileName: true,
sourceCaption: true,
},
});
if (ungrouped.length < 2) return;
// Group by normalized caption key
const captionMap = new Map<string, typeof ungrouped>();
for (const pkg of ungrouped) {
if (!pkg.sourceCaption) continue;
const key = normalizeCaptionKey(pkg.sourceCaption);
if (!key) continue;
const group = captionMap.get(key) ?? [];
group.push(pkg);
captionMap.set(key, group);
}
for (const [, members] of captionMap) {
if (members.length < 2) continue;
const name = members[0].sourceCaption!.slice(0, 80);
try {
const groupId = await createAutoGroup({
sourceChannelId,
name,
packageIds: members.map((m) => m.id),
groupingSource: "AUTO_CAPTION" as const,
});
log.info(
{ groupId, memberCount: members.length },
"Created caption-match group"
);
} catch (err) {
log.warn({ err }, "Failed to create caption group");
}
}
}
/**
* Normalize a caption for grouping: lowercase, strip extensions and numbers,
* extract significant words (3+ chars), sort, and join.
* Two captions with the same key are considered a match.
*/
function normalizeCaptionKey(caption: string): string | null {
const stripped = caption
.toLowerCase()
.replace(/\.(zip|rar|7z|stl|pdf|obj|gcode)(\.\d+)?/gi, "")
.replace(/[^a-z0-9\s]/g, " ");
const words = stripped
.split(/\s+/)
.filter((w) => w.length >= 3)
.filter((w) => !["the", "and", "for", "with", "from", "part", "file", "files"].includes(w));
if (words.length < 2) return null;
return words.sort().join(" ");
}
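// Worked example (hypothetical captions, not part of this commit): both inputs
// reduce to the key "bust dragon", so their packages would be grouped together:
//   normalizeCaptionKey("Dragon_Bust.stl (Part 1 of 3)")  -> "bust dragon"
//   normalizeCaptionKey("dragon bust v2.zip")             -> "bust dragon"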
/**
* Extract the dominant root folder from a list of archive file paths.
* Returns the first path segment that appears in >50% of files.
* Returns null for flat archives or archives with no common root.
*/
function extractRootFolder(paths: string[]): string | null {
if (paths.length === 0) return null;
// Count first path segments
const segmentCounts = new Map<string, number>();
for (const p of paths) {
// Normalize separators and get first segment
const normalized = p.replace(/\\/g, "/");
const firstSlash = normalized.indexOf("/");
if (firstSlash <= 0) continue; // Skip root-level files
const segment = normalized.slice(0, firstSlash);
// Skip common noise folders
if (segment === "__MACOSX" || segment === ".DS_Store" || segment === "Thumbs.db") continue;
segmentCounts.set(segment, (segmentCounts.get(segment) ?? 0) + 1);
}
if (segmentCounts.size === 0) return null;
// Find the most common segment
let maxSegment = "";
let maxCount = 0;
for (const [seg, count] of segmentCounts) {
if (count > maxCount) {
maxSegment = seg;
maxCount = count;
}
}
// Must appear in at least half of the files and be at least 3 chars
if (maxCount < paths.length * 0.5 || maxSegment.length < 3) return null;
return maxSegment;
}
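// Worked example (hypothetical paths, not part of this commit): "ProjectX"
// appears in 2 of 3 paths, which meets the at-least-half threshold:
//   extractRootFolder(["ProjectX/body.stl", "ProjectX/head.stl", "readme.txt"])
//   -> "ProjectX"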
/**
* Find the longest common prefix among a list of filenames,
* trimming trailing separators and partial words.

View File

@@ -3,6 +3,7 @@ import { childLogger } from "./util/logger.js";
import { withTdlibMutex } from "./util/mutex.js";
import { getActiveAccounts, getPendingAccounts } from "./db/queries.js";
import { runWorkerForAccount, authenticateAccount } from "./worker.js";
import { runIntegrityAudit } from "./audit.js";
const log = childLogger("scheduler");
@@ -87,6 +88,16 @@ async function runCycle(): Promise<void> {
{ elapsed: Math.round((Date.now() - cycleStart) / 1000) },
"Ingestion cycle complete"
);
// Run integrity audit after all accounts are processed
try {
const auditResult = await runIntegrityAudit();
if (auditResult.issues > 0) {
log.info({ ...auditResult }, "Integrity audit found issues");
}
} catch (auditErr) {
log.warn({ err: auditErr }, "Integrity audit failed");
}
} catch (err) {
log.error({ err }, "Ingestion cycle failed");
} finally {

View File

@@ -39,6 +39,7 @@ interface TdMessage {
id: number;
date: number;
media_album_id?: string;
reply_to_message_id?: number;
content: {
_: string;
document?: {
@@ -216,6 +217,8 @@ export async function getChannelMessages(
fileSize: BigInt(doc.document.size),
date: new Date(msg.date * 1000),
mediaAlbumId: msg.media_album_id && msg.media_album_id !== "0" ? msg.media_album_id : undefined,
replyToMessageId: msg.reply_to_message_id ? BigInt(msg.reply_to_message_id) : undefined,
caption: msg.content?.caption?.text || undefined,
});
continue;
}
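
For reference, a sketch of the raw TDLib message the two new lines read from (values invented for illustration; reply_to_message_id matches the TdMessage interface above, and the caption follows TDLib's formattedText shape):

// Hypothetical raw message, not part of this commit:
const sample = {
  id: 51234,
  date: 1774000000,
  reply_to_message_id: 51200,                // -> replyToMessageId: 51200n
  content: {
    _: "messageDocument",
    caption: { text: "Dragon Bust part 1" },  // -> caption: "Dragon Bust part 1"
  },
};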

View File

@@ -47,7 +47,7 @@ import { readRarContents } from "./archive/rar-reader.js";
import { read7zContents } from "./archive/sevenz-reader.js";
import { byteLevelSplit, concatenateFiles } from "./archive/split.js";
import { uploadToChannel } from "./upload/channel.js";
-import { processAlbumGroups, processTimeWindowGroups, processPatternGroups, processCreatorGroups, type IndexedPackageRef } from "./grouping.js";
+import { processAlbumGroups, processTimeWindowGroups, processPatternGroups, processCreatorGroups, processZipPathGroups, processReplyChainGroups, processCaptionGroups, type IndexedPackageRef } from "./grouping.js";
import { db } from "./db/client.js";
import type { TelegramAccount, TelegramChannel } from "@prisma/client";
import type { Client } from "tdl";
@@ -816,6 +816,15 @@ async function processArchiveSets(
// Creator-based grouping (3+ files from same creator)
await processCreatorGroups(channel.id, indexedPackageRefs);
// ZIP path prefix grouping (shared root folder inside archives)
await processZipPathGroups(channel.id, indexedPackageRefs);
// Reply chain grouping (messages replying to same root)
await processReplyChainGroups(channel.id, indexedPackageRefs);
// Caption fuzzy match grouping
await processCaptionGroups(channel.id, indexedPackageRefs);
}
return maxProcessedId;
@@ -1235,6 +1244,8 @@ async function processOneArchiveSet(
tags,
previewData,
previewMsgId,
sourceCaption: archiveSet.parts[0].caption ?? null,
replyToMessageId: archiveSet.parts[0].replyToMessageId ?? null,
files: entries,
});