#!/usr/bin/env python3
"""Migrate source archive from flat inbox/archive/ to organized structure.

inbox/queue/            — unprocessed sources (landing zone)
inbox/archive/{domain}/ — processed sources with extraction results
inbox/null-result/      — reviewed, nothing extractable

One-time migration. Atomic commit. Idempotent (safe to re-run).

Run from repo root:
    cd /opt/teleo-eval/workspaces/main
    python3 /opt/teleo-eval/pipeline/migrate-source-archive.py [--dry-run]
"""

import argparse
import glob
import os
import re
from pathlib import Path

# Leading frontmatter block: an opening "---" line, the body, a closing "---" line.
_FRONTMATTER_RE = re.compile(
    r"\A---[ \t]*\r?\n(.*?)^---[ \t]*\r?$",
    re.DOTALL | re.MULTILINE,
)

_NULL_STATUSES = ("null-result", "null_result")
_ARCHIVE_STATUSES = ("processed", "enrichment")

_DEFAULT_STATUS = "unknown"
_DEFAULT_DOMAIN = "uncategorized"


def _read_frontmatter(filepath: str) -> str:
    """Return the frontmatter text of *filepath*.

    Returns "" when the file cannot be read or decoded. When the file does not
    start with a ``---`` delimited block, the whole content is returned so
    that files without delimiters are still searched.
    """
    try:
        with open(filepath, encoding="utf-8-sig") as handle:
            content = handle.read()
    except (OSError, UnicodeError):
        # Best effort: an unreadable file falls back to the default values.
        return ""
    match = _FRONTMATTER_RE.match(content)
    return match.group(1) if match else content


def _frontmatter_field(frontmatter: str, key: str, default: str) -> str:
    """Return the first token of ``key:`` in *frontmatter*, or *default*."""
    # [ \t]* (not \s*) keeps the match on the key's own line, so an empty
    # "status:" does not swallow the first token of the following line.
    match = re.search(
        rf"^{re.escape(key)}:[ \t]*(\S+)", frontmatter, re.MULTILINE
    )
    if not match:
        return default
    value = match.group(1).strip("\"'")
    return value or default


def _safe_domain(domain: str) -> str:
    """Return *domain* if it is a single path component, else the default."""
    # The value becomes a directory name under inbox/archive/; refuse anything
    # that would nest or climb out of that directory.
    if domain in (".", "..") or "/" in domain or "\\" in domain:
        return _DEFAULT_DOMAIN
    return domain


def get_source_status(filepath: str) -> str:
    """Read status from source frontmatter.

    Returns "unknown" when the file is unreadable or has no status value.
    """
    return _frontmatter_field(
        _read_frontmatter(filepath), "status", _DEFAULT_STATUS
    )


def get_source_domain(filepath: str) -> str:
    """Read domain from source frontmatter.

    Returns "uncategorized" when the file is unreadable or has no domain value.
    """
    return _frontmatter_field(
        _read_frontmatter(filepath), "domain", _DEFAULT_DOMAIN
    )


def migrate(repo_root: str, dry_run: bool = False) -> None:
    """Move source files to organized structure.

    Every ``*.md`` file directly inside ``inbox/archive/`` is routed by its
    frontmatter status:

    * ``null-result`` / ``null_result`` -> ``inbox/null-result/``
    * ``processed`` / ``enrichment``    -> ``inbox/archive/{domain}/``
    * anything else (including unknown) -> ``inbox/queue/``

    Files whose name starts with ``_`` or ``.`` are skipped. A file whose
    destination already exists is left in place and reported instead of
    overwriting the existing file.

    Args:
        repo_root: Repository root containing the ``inbox/`` directory.
        dry_run: When true, only report what would be moved; nothing is
            created or renamed.
    """
    archive_dir = os.path.join(repo_root, "inbox", "archive")
    queue_dir = os.path.join(repo_root, "inbox", "queue")
    null_dir = os.path.join(repo_root, "inbox", "null-result")

    if not os.path.isdir(archive_dir):
        print(f"ERROR: {archive_dir} not found")
        return

    # Create target directories
    if not dry_run:
        os.makedirs(queue_dir, exist_ok=True)
        os.makedirs(null_dir, exist_ok=True)

    sources = glob.glob(os.path.join(archive_dir, "*.md"))
    print(f"Found {len(sources)} source files in inbox/archive/")

    moved = {"queue": 0, "null-result": 0, "archive": {}}
    skipped = 0
    conflicts = 0

    for filepath in sorted(sources):
        filename = os.path.basename(filepath)
        if filename.startswith(("_", ".")):
            skipped += 1
            continue

        # Read the file once and derive both fields from the same text.
        frontmatter = _read_frontmatter(filepath)
        status = _frontmatter_field(frontmatter, "status", _DEFAULT_STATUS)
        domain = None

        if status in _NULL_STATUSES:
            dest_dir = null_dir
        elif status in _ARCHIVE_STATUSES:
            domain = _safe_domain(
                _frontmatter_field(frontmatter, "domain", _DEFAULT_DOMAIN)
            )
            dest_dir = os.path.join(archive_dir, domain)
        else:
            # unprocessed, processing, or unknown status -> queue/
            dest_dir = queue_dir

        dest = os.path.join(dest_dir, filename)
        if os.path.lexists(dest):
            # os.rename would silently replace the existing file on POSIX.
            print(f" WARNING: {dest} already exists; leaving {filepath} in place")
            conflicts += 1
            continue

        if not dry_run:
            os.makedirs(dest_dir, exist_ok=True)
            os.rename(filepath, dest)

        if domain is not None:
            moved["archive"][domain] = moved["archive"].get(domain, 0) + 1
        elif dest_dir == null_dir:
            moved["null-result"] += 1
        else:
            moved["queue"] += 1

    # .extraction-debug/ is deliberately left where it is; only report it.
    debug_dir = os.path.join(archive_dir, ".extraction-debug")
    if os.path.isdir(debug_dir):
        print(" (keeping .extraction-debug/ in place)")

    archive_total = sum(moved["archive"].values())
    grand_total = (
        moved["queue"] + moved["null-result"] + archive_total + skipped + conflicts
    )

    print(f"\n{'='*60}")
    print(f" MIGRATION {'(DRY RUN) ' if dry_run else ''}COMPLETE")
    print(f" → queue/ (unprocessed): {moved['queue']}")
    print(f" → null-result/: {moved['null-result']}")
    print(f" → archive/{{domain}}/:")
    for domain_name, count in sorted(moved["archive"].items()):
        print(f" {domain_name}: {count}")
    print(f" Archive total: {archive_total}")
    print(f" Skipped: {skipped}")
    if conflicts:
        print(f" Conflicts (left in place): {conflicts}")
    print(f" Grand total: {grand_total}")
    print(f"{'='*60}")


def main() -> None:
    """Parse command-line arguments and run the migration."""
    parser = argparse.ArgumentParser(
        description="Migrate source archive to organized structure"
    )
    parser.add_argument("--repo-root", default=".", help="Repository root")
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()
    migrate(args.repo_root, args.dry_run)


if __name__ == "__main__":
    main()