teleo-infrastructure/migrate-source-archive.py
m3taversal 090b1411fd epimetheus: source archive restructure — inbox/queue + inbox/archive/{domain} + inbox/null-result
- config.py: added INBOX_QUEUE, INBOX_NULL_RESULT constants
- evaluate.py: skip patterns + LIGHT tier cover all inbox/ subdirs
- llm.py: eval prompts reference inbox/ generically
- telegram/bot.py: archives to inbox/queue/
- telegram/teleo-telegram.service: ReadWritePaths expanded
- research-prompt-v2.md: paths updated to inbox/queue/
- research-prompt-leo-synthesis.md: paths updated
- migrate-source-archive.py: one-time migration script

Reviewed by: Ganymede, Rhea, Leo (all approved)

Pentagon-Agent: Epimetheus <968B2991-E2DF-4006-B962-F5B0A0CC8ACA>
2026-03-18 11:50:04 +00:00

130 lines
4.4 KiB
Python

#!/usr/bin/env python3
"""Migrate source archive from flat inbox/archive/ to organized structure.
inbox/queue/ — unprocessed sources (landing zone)
inbox/archive/{domain}/ — processed sources with extraction results
inbox/null-result/ — reviewed, nothing extractable
One-time migration. Atomic commit. Idempotent (safe to re-run).
Run from repo root:
cd /opt/teleo-eval/workspaces/main
python3 /opt/teleo-eval/pipeline/migrate-source-archive.py [--dry-run]
"""
import argparse
import glob
import os
import re
from pathlib import Path
def get_source_status(filepath: str) -> str:
    """Read the ``status`` field of a source file.

    The file is scanned for the first line that starts with ``status:`` and
    carries a value on that same line (frontmatter is expected to sit at the
    top of the file, so this is normally the frontmatter entry).

    Args:
        filepath: Path to the source markdown file.

    Returns:
        The first whitespace-delimited token of the value, with surrounding
        quote characters removed. ``"unknown"`` when the file cannot be read
        or no ``status:`` line with a value exists.
    """
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # server locale; undecodable bytes are replaced instead of causing the
        # whole file to be classified as "unknown". The context manager closes
        # the handle deterministically.
        with open(filepath, encoding="utf-8", errors="replace") as handle:
            content = handle.read()
    except (OSError, ValueError):
        # Best effort: unreadable path (or a path open() rejects) -> unknown.
        return "unknown"

    # [ \t]* instead of \s*: \s also matches newlines, which made an empty
    # "status:" line capture the key of the following line as its value.
    match = re.search(r"^status:[ \t]*(\S+)", content, re.MULTILINE)
    if match is None:
        return "unknown"
    # YAML allows quoted scalars (status: "processed"); compare on bare value.
    return match.group(1).strip("\"'") or "unknown"
def get_source_domain(filepath: str) -> str:
    """Read the ``domain`` field of a source file.

    The file is scanned for the first line that starts with ``domain:`` and
    carries a value on that same line (frontmatter is expected to sit at the
    top of the file, so this is normally the frontmatter entry).

    Args:
        filepath: Path to the source markdown file.

    Returns:
        The first whitespace-delimited token of the value, with surrounding
        quote characters removed. ``"uncategorized"`` when the file cannot be
        read or no ``domain:`` line with a value exists.
    """
    try:
        # Explicit UTF-8 keeps the result independent of the server locale;
        # undecodable bytes are replaced rather than discarding the file's
        # domain. The context manager closes the handle deterministically.
        with open(filepath, encoding="utf-8", errors="replace") as handle:
            content = handle.read()
    except (OSError, ValueError):
        # Best effort: unreadable path (or a path open() rejects).
        return "uncategorized"

    # [ \t]* instead of \s*: \s also matches newlines, which made an empty
    # "domain:" line capture the next line's key (e.g. "tags:") as the domain.
    match = re.search(r"^domain:[ \t]*(\S+)", content, re.MULTILINE)
    if match is None:
        return "uncategorized"
    # YAML allows quoted scalars (domain: "ai"); use the bare value.
    return match.group(1).strip("\"'") or "uncategorized"
def _confined_domain(domain: str) -> str:
    """Return *domain* as a relative path that stays inside the archive dir.

    Values that would resolve to the archive directory itself, to a parent
    directory, or to an absolute path are replaced with "uncategorized".
    """
    normalized = os.path.normpath(domain)
    escapes = (
        os.path.isabs(normalized)
        or normalized in (os.curdir, os.pardir)
        or normalized.startswith(os.pardir + os.sep)
    )
    return "uncategorized" if escapes else normalized


def _move_source(src: str, dest: str, dry_run: bool) -> bool:
    """Move *src* to *dest*; return False (and move nothing) if *dest* exists."""
    # os.rename silently replaces an existing destination file on POSIX, so
    # check first: a name collision must never destroy an existing source.
    if os.path.lexists(dest):
        print(f" SKIP (destination exists): {dest}")
        return False
    if not dry_run:
        os.makedirs(os.path.dirname(dest), exist_ok=True)
        os.rename(src, dest)
    return True


def migrate(repo_root: str, dry_run: bool = False):
    """Move source files from flat inbox/archive/ into the organized layout.

    Every ``*.md`` file directly inside ``<repo_root>/inbox/archive`` is routed
    by its ``status`` field:

    * ``null-result`` / ``null_result`` -> ``inbox/null-result/``
    * ``processed`` / ``enrichment``    -> ``inbox/archive/<domain>/``
    * anything else (including ``unprocessed``, ``processing`` and unknown
      statuses) -> ``inbox/queue/``

    Files whose name starts with ``_`` or ``.`` are left alone, as are files
    whose destination already exists; both are reported under "Skipped".
    A summary is printed at the end.

    Args:
        repo_root: Repository root containing the ``inbox`` directory.
        dry_run: When true, nothing is created or moved; only the summary of
            what would happen is printed.

    Returns:
        None. If ``inbox/archive`` does not exist an error line is printed and
        the function returns without doing anything.
    """
    archive_dir = os.path.join(repo_root, "inbox", "archive")
    queue_dir = os.path.join(repo_root, "inbox", "queue")
    null_dir = os.path.join(repo_root, "inbox", "null-result")

    if not os.path.isdir(archive_dir):
        print(f"ERROR: {archive_dir} not found")
        return

    # Create target directories
    if not dry_run:
        os.makedirs(queue_dir, exist_ok=True)
        os.makedirs(null_dir, exist_ok=True)

    sources = glob.glob(os.path.join(archive_dir, "*.md"))
    print(f"Found {len(sources)} source files in inbox/archive/")

    moved = {"queue": 0, "null-result": 0, "archive": {}}
    skipped = 0

    # Sorted for a deterministic processing order.
    for filepath in sorted(sources):
        filename = os.path.basename(filepath)
        if filename.startswith(("_", ".")):
            skipped += 1
            continue

        status = get_source_status(filepath)
        domain = None
        if status in ("null-result", "null_result"):
            bucket = "null-result"
            dest_dir = null_dir
        elif status in ("processed", "enrichment"):
            bucket = "archive"
            # The domain comes from file content; keep it from pointing
            # outside inbox/archive/ (e.g. "..", "/tmp").
            domain = _confined_domain(get_source_domain(filepath))
            dest_dir = os.path.join(archive_dir, domain)
        else:
            # unprocessed / processing / unknown status -> landing zone
            bucket = "queue"
            dest_dir = queue_dir

        if not _move_source(filepath, os.path.join(dest_dir, filename), dry_run):
            skipped += 1
            continue

        if bucket == "archive":
            moved["archive"][domain] = moved["archive"].get(domain, 0) + 1
        else:
            moved[bucket] += 1

    # .extraction-debug/ is intentionally NOT migrated; just tell the operator.
    debug_dir = os.path.join(archive_dir, ".extraction-debug")
    if os.path.isdir(debug_dir):
        print(" (keeping .extraction-debug/ in place)")

    archive_total = sum(moved["archive"].values())
    print(f"\n{'='*60}")
    print(f" MIGRATION {'(DRY RUN) ' if dry_run else ''}COMPLETE")
    print(f" → queue/ (unprocessed): {moved['queue']}")
    print(f" → null-result/: {moved['null-result']}")
    print(f" → archive/{{domain}}/:")
    for domain, count in sorted(moved["archive"].items()):
        print(f" {domain}: {count}")
    print(f" Archive total: {archive_total}")
    print(f" Skipped: {skipped}")
    print(f" Grand total: {moved['queue'] + moved['null-result'] + archive_total + skipped}")
    print(f"{'='*60}")
if __name__ == "__main__":
    # Command-line entry point: parse the two options and run the migration.
    cli = argparse.ArgumentParser(
        description="Migrate source archive to organized structure",
    )
    cli.add_argument(
        "--repo-root",
        default=".",
        help="Repository root",
    )
    cli.add_argument(
        "--dry-run",
        action="store_true",
    )
    options = cli.parse_args()
    migrate(options.repo_root, options.dry_run)