- config.py: added INBOX_QUEUE, INBOX_NULL_RESULT constants - evaluate.py: skip patterns + LIGHT tier cover all inbox/ subdirs - llm.py: eval prompts reference inbox/ generically - telegram/bot.py: archives to inbox/queue/ - telegram/teleo-telegram.service: ReadWritePaths expanded - research-prompt-v2.md: paths updated to inbox/queue/ - research-prompt-leo-synthesis.md: paths updated - migrate-source-archive.py: one-time migration script Reviewed by: Ganymede, Rhea, Leo (all approved) Pentagon-Agent: Epimetheus <968B2991-E2DF-4006-B962-F5B0A0CC8ACA>
130 lines
4.4 KiB
Python
130 lines
4.4 KiB
Python
#!/usr/bin/env python3
"""Migrate source archive from flat inbox/archive/ to organized structure.

inbox/queue/ — unprocessed sources (landing zone)
inbox/archive/{domain}/ — processed sources with extraction results
inbox/null-result/ — reviewed, nothing extractable

One-time migration. Atomic commit. Idempotent (safe to re-run).

Run from repo root:
    cd /opt/teleo-eval/workspaces/main
    python3 /opt/teleo-eval/pipeline/migrate-source-archive.py [--dry-run]
"""
|
|
|
|
import argparse
import glob
import os
import re
import shutil
from pathlib import Path
|
|
|
|
|
|
def get_source_status(filepath: str) -> str:
    """Read the ``status:`` field from a source file's frontmatter.

    Args:
        filepath: Path to a markdown source file.

    Returns:
        The first ``status:`` value found (e.g. "processed"), or
        "unknown" when the file is unreadable or has no status line.
    """
    try:
        # Context manager + explicit encoding: the original left the file
        # handle open and swallowed every exception with a bare Exception.
        with open(filepath, encoding="utf-8") as fh:
            content = fh.read()
    except (OSError, UnicodeDecodeError):
        return "unknown"
    match = re.search(r"^status:\s*(\S+)", content, re.MULTILINE)
    # \S+ can contain no whitespace, so no .strip() is needed.
    return match.group(1) if match else "unknown"
|
|
|
|
|
|
def get_source_domain(filepath: str) -> str:
    """Read the ``domain:`` field from a source file's frontmatter.

    Args:
        filepath: Path to a markdown source file.

    Returns:
        The first ``domain:`` value found, or "uncategorized" when the
        file is unreadable or has no domain line.
    """
    try:
        # Close the handle deterministically; the original leaked it and
        # hid all failures behind ``except Exception: pass``.
        with open(filepath, encoding="utf-8") as fh:
            content = fh.read()
    except (OSError, UnicodeDecodeError):
        return "uncategorized"
    match = re.search(r"^domain:\s*(\S+)", content, re.MULTILINE)
    # \S+ can contain no whitespace, so no .strip() is needed.
    return match.group(1) if match else "uncategorized"
|
|
|
|
|
|
def _move_file(src: str, dest: str, dry_run: bool) -> None:
    """Move *src* to *dest* unless this is a dry run.

    Uses shutil.move rather than os.rename so the migration also works
    when the destination lives on a different filesystem (os.rename
    raises OSError across devices).
    """
    if not dry_run:
        shutil.move(src, dest)


def migrate(repo_root: str, dry_run: bool = False):
    """Move source files from flat inbox/archive/ into the organized layout.

    Routing rules, keyed on each file's ``status:`` frontmatter:
      * unprocessed / processing   -> inbox/queue/
      * null-result / null_result  -> inbox/null-result/
      * processed / enrichment     -> inbox/archive/{domain}/
      * anything else              -> inbox/queue/ (treated as unprocessed)

    Files whose names start with "_" or "." are skipped in place.
    Idempotent: a second run finds no top-level *.md files left in
    inbox/archive/ (the glob does not descend into domain subdirs).

    Args:
        repo_root: Repository root containing the inbox/ tree.
        dry_run: When True, report what would move without touching disk.
    """
    archive_dir = os.path.join(repo_root, "inbox", "archive")
    queue_dir = os.path.join(repo_root, "inbox", "queue")
    null_dir = os.path.join(repo_root, "inbox", "null-result")

    if not os.path.isdir(archive_dir):
        print(f"ERROR: {archive_dir} not found")
        return

    # Create target directories
    if not dry_run:
        os.makedirs(queue_dir, exist_ok=True)
        os.makedirs(null_dir, exist_ok=True)

    sources = glob.glob(os.path.join(archive_dir, "*.md"))
    print(f"Found {len(sources)} source files in inbox/archive/")

    moved = {"queue": 0, "null-result": 0, "archive": {}}
    skipped = 0

    for filepath in sorted(sources):
        filename = os.path.basename(filepath)
        # Leave templates and hidden files (e.g. _template.md) in place.
        if filename.startswith(("_", ".")):
            skipped += 1
            continue

        status = get_source_status(filepath)

        if status in ("unprocessed", "processing"):
            # → queue/
            _move_file(filepath, os.path.join(queue_dir, filename), dry_run)
            moved["queue"] += 1

        elif status in ("null-result", "null_result"):
            # → null-result/
            _move_file(filepath, os.path.join(null_dir, filename), dry_run)
            moved["null-result"] += 1

        elif status in ("processed", "enrichment"):
            # → archive/{domain}/ — domain is only needed on this branch,
            # so the frontmatter is re-read here rather than for every file.
            domain = get_source_domain(filepath)
            domain_dir = os.path.join(archive_dir, domain)
            if not dry_run:
                os.makedirs(domain_dir, exist_ok=True)
            _move_file(filepath, os.path.join(domain_dir, filename), dry_run)
            moved["archive"][domain] = moved["archive"].get(domain, 0) + 1

        else:
            # Unknown status — treat as unprocessed → queue/
            _move_file(filepath, os.path.join(queue_dir, filename), dry_run)
            moved["queue"] += 1

    # .extraction-debug/ is intentionally left at its old location.
    debug_dir = os.path.join(archive_dir, ".extraction-debug")
    if os.path.isdir(debug_dir):
        print(" (keeping .extraction-debug/ in place)")

    archive_total = sum(moved["archive"].values())
    rule = "=" * 60
    print(f"\n{rule}")
    print(f" MIGRATION {'(DRY RUN) ' if dry_run else ''}COMPLETE")
    print(f" → queue/ (unprocessed): {moved['queue']}")
    print(f" → null-result/: {moved['null-result']}")
    print(" → archive/{domain}/:")
    for domain, count in sorted(moved["archive"].items()):
        print(f" {domain}: {count}")
    print(f" Archive total: {archive_total}")
    print(f" Skipped: {skipped}")
    print(f" Grand total: {moved['queue'] + moved['null-result'] + archive_total + skipped}")
    print(rule)
|
|
|
|
|
|
def _main() -> None:
    """CLI entry point: parse arguments and run the migration."""
    cli = argparse.ArgumentParser(
        description="Migrate source archive to organized structure"
    )
    cli.add_argument("--repo-root", default=".", help="Repository root")
    cli.add_argument("--dry-run", action="store_true")
    opts = cli.parse_args()
    migrate(opts.repo_root, opts.dry_run)


if __name__ == "__main__":
    _main()
|