fix: transcript dump uses append-only JSONL, not full rewrite (Ganymede)
Each dump was rewriting the full accumulated history — growing unbounded. Now: append-only JSONL (one line per message), only new entries since last dump. One file per chat per day. No dedup needed downstream. Also verified ARCHIVE_DIR path is correct (staging dir, not worktree). Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
This commit is contained in:
parent
66bc742979
commit
1019602eec
1 changed file with 24 additions and 34 deletions
|
|
@@ -143,19 +143,33 @@ def _record_transcript(msg, text: str, is_bot: bool = False,
|
|||
transcript.append(entry)
|
||||
|
||||
|
||||
_last_dump_index: dict[int, int] = {}  # chat_id → count of entries already persisted (high-water mark)
|
||||
|
||||
|
||||
async def _dump_transcripts(context=None):
    """Append new transcript entries to per-chat JSONL files.

    Append-only: each dump writes only the messages added since the previous
    successful dump — one JSON object per line, one file per chat per day —
    so dump cost is proportional to new traffic, not to the whole accumulated
    history.  `_last_dump_index` holds the per-chat high-water mark.

    Args:
        context: unused; accepted so this coroutine can be registered as a
            scheduled job callback that passes a context argument.
    """
    if not chat_transcripts:
        return

    os.makedirs(TRANSCRIPT_DIR, exist_ok=True)
    now = datetime.now(timezone.utc)
    today = now.strftime("%Y-%m-%d")  # one JSONL file per chat per day

    import json as _json
    for chat_id, entries in list(chat_transcripts.items()):
        if not entries:
            continue

        # Only write entries added since the last successful dump.
        last_idx = _last_dump_index.get(chat_id, 0)
        new_entries = entries[last_idx:]
        if not new_entries:
            continue

        # Derive a filesystem-safe directory name from the chat title
        # (taken from the first entry; falls back to the numeric id).
        chat_title = entries[0].get("chat_title", str(chat_id))
        chat_slug = re.sub(r"[^a-z0-9]+", "-", chat_title.lower()).strip("-") or str(chat_id)

        chat_dir = os.path.join(TRANSCRIPT_DIR, chat_slug)
        os.makedirs(chat_dir, exist_ok=True)

        # Append to today's JSONL file for this chat.
        filepath = os.path.join(chat_dir, f"{today}.jsonl")

        try:
            # Serialize everything first, then write in a single call: if any
            # entry fails to serialize nothing is written at all, and one
            # write minimizes the window for partial lines in the file.
            payload = "".join(
                _json.dumps(entry, default=str) + "\n" for entry in new_entries
            )
            with open(filepath, "a") as f:
                f.write(payload)
            # Advance the high-water mark only after a successful write, so a
            # failed dump retries the same entries on the next run.
            _last_dump_index[chat_id] = len(entries)
            logger.info("Transcript appended: %s (+%d messages, %d total)",
                        filepath, len(new_entries), len(entries))
        except Exception as e:
            logger.warning("Failed to dump transcript for %s: %s", chat_slug, e)

    # Don't clear chat_transcripts — transcripts accumulate over the session;
    # _last_dump_index tracks what has already been persisted.
|
||||
|
||||
|
||||
def _create_inline_source(source_text: str, user_message: str, user, msg):
|
||||
"""Create a source file from Rio's SOURCE: tag. Verbatim user content, attributed."""
|
||||
|
|
|
|||
Loading…
Reference in a new issue