fix: transcript dump uses append-only JSONL, not full rewrite (Ganymede)

Each dump was rewriting the full accumulated history — growing unbounded.
Now: append-only JSONL (one line per message), only new entries since
last dump. One file per chat per day. No dedup needed downstream.
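Concretely, a downstream consumer can stream each daily file line by line and json.loads each line on its own; because dumps only ever append, nothing has to be deduplicated or re-read. A minimal sketch of such a reader (the helper name, directory argument, and example path are hypothetical; only the chat-slug/day.jsonl layout comes from this change):

    import json
    import pathlib

    def read_day(transcript_dir: str, chat_slug: str, day: str):
        # One JSONL file per chat per day; each line is one message dict.
        path = pathlib.Path(transcript_dir) / chat_slug / f"{day}.jsonl"
        if not path.exists():
            return
        with path.open() as f:
            for line in f:
                yield json.loads(line)

    # e.g.: for msg in read_day("/srv/bot/transcripts", "my-chat", "2026-03-25"): ...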

Also verified ARCHIVE_DIR path is correct (staging dir, not worktree).

Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
m3taversal 2026-03-25 13:39:43 +00:00
parent 66bc742979
commit 1019602eec

@@ -143,19 +143,33 @@ def _record_transcript(msg, text: str, is_bot: bool = False,
    transcript.append(entry)

_last_dump_index: dict[int, int] = {}  # chat_id → index of last dumped message

async def _dump_transcripts(context=None):
    """Dump all chat transcripts to VPS-local JSON files. Runs every 6 hours."""
    """Append new transcript entries to per-chat JSONL files. Runs every hour.

    Append-only: each dump writes only new messages since last dump (Ganymede review).
    One JSONL file per chat per day. Each line is one message.
    """
    if not chat_transcripts:
        return
    os.makedirs(TRANSCRIPT_DIR, exist_ok=True)
    now = datetime.now(timezone.utc)
    period_end = now.strftime("%Y-%m-%dT%H")
    today = now.strftime("%Y-%m-%d")
    import json as _json
    for chat_id, entries in list(chat_transcripts.items()):
        if not entries:
            continue
        # Only write new entries since last dump
        last_idx = _last_dump_index.get(chat_id, 0)
        new_entries = entries[last_idx:]
        if not new_entries:
            continue
        # Get chat title from first entry
        chat_title = entries[0].get("chat_title", str(chat_id))
        chat_slug = re.sub(r"[^a-z0-9]+", "-", chat_title.lower()).strip("-") or str(chat_id)

@@ -164,44 +178,20 @@ async def _dump_transcripts(context=None):
        chat_dir = os.path.join(TRANSCRIPT_DIR, chat_slug)
        os.makedirs(chat_dir, exist_ok=True)
        # Build transcript document
        unique_users = set()
        for e in entries:
            if e.get("username"):
                unique_users.add(e["username"])
        doc = {
            "chat_id": chat_id,
            "chat_title": chat_title,
            "period": {
                "start": entries[0].get("ts", ""),
                "end": entries[-1].get("ts", ""),
                "dumped_at": now.isoformat(),
            },
            "stats": {
                "total_messages": len(entries),
                "unique_users": len(unique_users),
                "users": sorted(unique_users),
                "bot_responses": sum(1 for e in entries if e.get("type") == "bot_response"),
            },
            "messages": entries,
        }
        filename = f"{period_end}.json"
        # Append to today's JSONL file
        filename = f"{today}.jsonl"
        filepath = os.path.join(chat_dir, filename)
        try:
            import json as _json
            with open(filepath, "w") as f:
                _json.dump(doc, f, indent=2, default=str)
            logger.info("Transcript dumped: %s (%d messages, %d users)",
                        filepath, len(entries), len(unique_users))
            with open(filepath, "a") as f:
                for entry in new_entries:
                    f.write(_json.dumps(entry, default=str) + "\n")
            _last_dump_index[chat_id] = len(entries)
            logger.info("Transcript appended: %s (+%d messages, %d total)",
                        filepath, len(new_entries), len(entries))
        except Exception as e:
            logger.warning("Failed to dump transcript for %s: %s", chat_slug, e)
    # Don't clear — transcripts accumulate over the session.
    # Each dump is the full history since last restart.

def _create_inline_source(source_text: str, user_message: str, user, msg):
    """Create a source file from Rio's SOURCE: tag. Verbatim user content, attributed."""