From 1019602eecb4a2021c4a38ce5e15806a488d2076 Mon Sep 17 00:00:00 2001 From: m3taversal Date: Wed, 25 Mar 2026 13:39:43 +0000 Subject: [PATCH] fix: transcript dump uses append-only JSONL, not full rewrite (Ganymede) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each dump was rewriting the full accumulated history — growing unbounded. Now: append-only JSONL (one line per message), only new entries since last dump. One file per chat per day. No dedup needed downstream. Also verified ARCHIVE_DIR path is correct (staging dir, not worktree). Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70> --- telegram/bot.py | 58 ++++++++++++++++++++----------------------------- 1 file changed, 24 insertions(+), 34 deletions(-) diff --git a/telegram/bot.py b/telegram/bot.py index f31d6d6..95fd69f 100644 --- a/telegram/bot.py +++ b/telegram/bot.py @@ -143,19 +143,33 @@ def _record_transcript(msg, text: str, is_bot: bool = False, transcript.append(entry) +_last_dump_index: dict[int, int] = {} # chat_id → index of last dumped message + + async def _dump_transcripts(context=None): - """Dump all chat transcripts to VPS-local JSON files. Runs every 6 hours.""" + """Append new transcript entries to per-chat JSONL files. Runs every hour. + + Append-only: each dump writes only new messages since last dump (Ganymede review). + One JSONL file per chat per day. Each line is one message. + """ if not chat_transcripts: return os.makedirs(TRANSCRIPT_DIR, exist_ok=True) now = datetime.now(timezone.utc) - period_end = now.strftime("%Y-%m-%dT%H") + today = now.strftime("%Y-%m-%d") + import json as _json for chat_id, entries in list(chat_transcripts.items()): if not entries: continue + # Only write new entries since last dump + last_idx = _last_dump_index.get(chat_id, 0) + new_entries = entries[last_idx:] + if not new_entries: + continue + # Get chat title from first entry chat_title = entries[0].get("chat_title", str(chat_id)) chat_slug = re.sub(r"[^a-z0-9]+", "-", chat_title.lower()).strip("-") or str(chat_id) @@ -164,44 +178,20 @@ async def _dump_transcripts(context=None): chat_dir = os.path.join(TRANSCRIPT_DIR, chat_slug) os.makedirs(chat_dir, exist_ok=True) - # Build transcript document - unique_users = set() - for e in entries: - if e.get("username"): - unique_users.add(e["username"]) - - doc = { - "chat_id": chat_id, - "chat_title": chat_title, - "period": { - "start": entries[0].get("ts", ""), - "end": entries[-1].get("ts", ""), - "dumped_at": now.isoformat(), - }, - "stats": { - "total_messages": len(entries), - "unique_users": len(unique_users), - "users": sorted(unique_users), - "bot_responses": sum(1 for e in entries if e.get("type") == "bot_response"), - }, - "messages": entries, - } - - filename = f"{period_end}.json" + # Append to today's JSONL file + filename = f"{today}.jsonl" filepath = os.path.join(chat_dir, filename) try: - import json as _json - with open(filepath, "w") as f: - _json.dump(doc, f, indent=2, default=str) - logger.info("Transcript dumped: %s (%d messages, %d users)", - filepath, len(entries), len(unique_users)) + with open(filepath, "a") as f: + for entry in new_entries: + f.write(_json.dumps(entry, default=str) + "\n") + _last_dump_index[chat_id] = len(entries) + logger.info("Transcript appended: %s (+%d messages, %d total)", + filepath, len(new_entries), len(entries)) except Exception as e: logger.warning("Failed to dump transcript for %s: %s", chat_slug, e) - # Don't clear — transcripts accumulate over the session. - # Each dump is the full history since last restart. - def _create_inline_source(source_text: str, user_message: str, user, msg): """Create a source file from Rio's SOURCE: tag. Verbatim user content, attributed."""