fix: transcript dump uses append-only JSONL, not full rewrite (Ganymede)

Each dump was rewriting the full accumulated history — growing unbounded.
Now: append-only JSONL (one line per message), only new entries since
last dump. One file per chat per day. No dedup needed downstream.
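Concretely, a downstream consumer can stream each daily file line by line and json.loads each line on its own; because dumps only ever append, nothing has to be deduplicated or re-read. A minimal sketch of such a reader (the helper name, directory argument, and example path are hypothetical; only the chat-slug/day.jsonl layout comes from this change):

    import json
    import pathlib

    def read_day(transcript_dir: str, chat_slug: str, day: str):
        # One JSONL file per chat per day; each line is one message dict.
        path = pathlib.Path(transcript_dir) / chat_slug / f"{day}.jsonl"
        if not path.exists():
            return
        with path.open() as f:
            for line in f:
                yield json.loads(line)

    # e.g.: for msg in read_day("/srv/bot/transcripts", "my-chat", "2026-03-25"): ...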

Also verified ARCHIVE_DIR path is correct (staging dir, not worktree).

Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
m3taversal 2026-03-25 13:39:43 +00:00
parent 66bc742979
commit 1019602eec

@@ -143,19 +143,33 @@ def _record_transcript(msg, text: str, is_bot: bool = False,
    transcript.append(entry)

_last_dump_index: dict[int, int] = {}  # chat_id → index of last dumped message

async def _dump_transcripts(context=None):
    """Dump all chat transcripts to VPS-local JSON files. Runs every 6 hours."""
    """Append new transcript entries to per-chat JSONL files. Runs every hour.

    Append-only: each dump writes only new messages since last dump (Ganymede review).
    One JSONL file per chat per day. Each line is one message.
    """
    if not chat_transcripts:
        return
    os.makedirs(TRANSCRIPT_DIR, exist_ok=True)
    now = datetime.now(timezone.utc)
    period_end = now.strftime("%Y-%m-%dT%H")
    today = now.strftime("%Y-%m-%d")
    import json as _json
    for chat_id, entries in list(chat_transcripts.items()):
        if not entries:
            continue
        # Only write new entries since last dump
        last_idx = _last_dump_index.get(chat_id, 0)
        new_entries = entries[last_idx:]
        if not new_entries:
            continue
        # Get chat title from first entry
        chat_title = entries[0].get("chat_title", str(chat_id))
        chat_slug = re.sub(r"[^a-z0-9]+", "-", chat_title.lower()).strip("-") or str(chat_id)

@@ -164,44 +178,20 @@ async def _dump_transcripts(context=None):
        chat_dir = os.path.join(TRANSCRIPT_DIR, chat_slug)
        os.makedirs(chat_dir, exist_ok=True)
        # Build transcript document
        unique_users = set()
        for e in entries:
            if e.get("username"):
                unique_users.add(e["username"])
        doc = {
            "chat_id": chat_id,
            "chat_title": chat_title,
            "period": {
                "start": entries[0].get("ts", ""),
                "end": entries[-1].get("ts", ""),
                "dumped_at": now.isoformat(),
            },
            "stats": {
                "total_messages": len(entries),
                "unique_users": len(unique_users),
                "users": sorted(unique_users),
                "bot_responses": sum(1 for e in entries if e.get("type") == "bot_response"),
            },
            "messages": entries,
        }
        filename = f"{period_end}.json"
        # Append to today's JSONL file
        filename = f"{today}.jsonl"
        filepath = os.path.join(chat_dir, filename)
        try:
            import json as _json
            with open(filepath, "w") as f:
                _json.dump(doc, f, indent=2, default=str)
            logger.info("Transcript dumped: %s (%d messages, %d users)",
                        filepath, len(entries), len(unique_users))
            with open(filepath, "a") as f:
                for entry in new_entries:
                    f.write(_json.dumps(entry, default=str) + "\n")
            _last_dump_index[chat_id] = len(entries)
            logger.info("Transcript appended: %s (+%d messages, %d total)",
                        filepath, len(new_entries), len(entries))
        except Exception as e:
            logger.warning("Failed to dump transcript for %s: %s", chat_slug, e)
    # Don't clear — transcripts accumulate over the session.
    # Each dump is the full history since last restart.

def _create_inline_source(source_text: str, user_message: str, user, msg):
    """Create a source file from Rio's SOURCE: tag. Verbatim user content, attributed."""