teleo-infrastructure/scripts/migrate-source-archive.py
m3taversal d2aec7fee3
Some checks are pending
CI / lint-and-test (push) Waiting to run
feat: reorganize repo with clear directory boundaries and agent ownership
Move scattered root-level files into categorized directories:
- deploy/ — deployment + mirror scripts (Ship)
- scripts/ — one-off backfills + migrations (Ship)
- research/ — nightly research + prompts (Ship)
- docs/ — all operational documentation (shared)

Delete 3 dead cron scripts replaced by pipeline daemon:
- batch-extract-50.sh, evaluate-trigger.sh, extract-cron.sh

Add CODEOWNERS mapping every path to its owning agent.
Add README with directory structure, ownership table, and VPS layout.
Update deploy.sh paths to match new structure.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-14 18:20:13 +01:00

130 lines
4.4 KiB
Python

#!/usr/bin/env python3
"""Migrate source archive from flat inbox/archive/ to organized structure.
inbox/queue/ — unprocessed sources (landing zone)
inbox/archive/{domain}/ — processed sources with extraction results
inbox/null-result/ — reviewed, nothing extractable
One-time migration. Atomic commit. Idempotent (safe to re-run).
Run from repo root (or point --repo-root at it):
cd /opt/teleo-eval/workspaces/main
python3 /opt/teleo-eval/pipeline/migrate-source-archive.py [--dry-run]
"""
import argparse
import glob
import os
import re
from pathlib import Path
def get_source_status(filepath: str) -> str:
    """Read the ``status`` value from a source file's frontmatter.

    When the file opens with a ``---`` fence that is later closed by another
    ``---`` line, only the text between the two fences is searched, so a
    ``status:`` line in the document body is ignored. Files without such a
    fenced block are searched in full.

    Args:
        filepath: Path to the source file.

    Returns:
        The first whitespace-delimited token following ``status:`` on the
        same line, with surrounding quotes removed. ``"unknown"`` is returned
        when the file cannot be read, has no ``status:`` line, or the value
        is empty.
    """
    try:
        # The context manager closes the handle; the explicit encoding keeps
        # the result independent of the machine's locale.
        with open(filepath, encoding="utf-8") as handle:
            content = handle.read()
    except (OSError, ValueError):
        # Best effort: unreadable or undecodable files (UnicodeDecodeError is
        # a ValueError) are reported as "unknown" instead of aborting the run.
        return "unknown"

    if content.startswith("---"):
        closing = re.search(r"^---\s*$", content[3:], re.MULTILINE)
        if closing:
            content = content[3:3 + closing.start()]

    # [ \t]* rather than \s*: \s also matches newlines, which would make an
    # empty "status:" line pick up the first token of the following line.
    match = re.search(r"^status:[ \t]*(\S+)", content, re.MULTILINE)
    if match:
        value = match.group(1).strip("\"'")
        if value:
            return value
    return "unknown"
def get_source_domain(filepath: str) -> str:
    """Read the ``domain`` value from a source file's frontmatter.

    When the file opens with a ``---`` fence that is later closed by another
    ``---`` line, only the text between the two fences is searched, so a
    ``domain:`` line in the document body is ignored. Files without such a
    fenced block are searched in full.

    Args:
        filepath: Path to the source file.

    Returns:
        The first whitespace-delimited token following ``domain:`` on the
        same line, with surrounding quotes removed. ``"uncategorized"`` is
        returned when the file cannot be read, has no ``domain:`` line, or
        the value is empty.
    """
    try:
        # The context manager closes the handle; the explicit encoding keeps
        # the result independent of the machine's locale.
        with open(filepath, encoding="utf-8") as handle:
            content = handle.read()
    except (OSError, ValueError):
        # Best effort: unreadable or undecodable files (UnicodeDecodeError is
        # a ValueError) fall back to the default instead of aborting the run.
        return "uncategorized"

    if content.startswith("---"):
        closing = re.search(r"^---\s*$", content[3:], re.MULTILINE)
        if closing:
            content = content[3:3 + closing.start()]

    # [ \t]* rather than \s*: \s also matches newlines, which would make an
    # empty "domain:" line pick up the first token of the following line.
    match = re.search(r"^domain:[ \t]*(\S+)", content, re.MULTILINE)
    if match:
        value = match.group(1).strip("\"'")
        if value:
            return value
    return "uncategorized"
def migrate(repo_root: str, dry_run: bool = False):
    """Move source files from flat inbox/archive/ into the organized layout.

    Every top-level ``*.md`` file in ``inbox/archive/`` is routed by the
    status returned by ``get_source_status``:

    * ``processed`` / ``enrichment`` -> ``inbox/archive/{domain}/``
    * ``null-result`` / ``null_result`` -> ``inbox/null-result/``
    * anything else (``unprocessed``, ``processing``, unknown values)
      -> ``inbox/queue/``

    Files whose name starts with ``_`` or ``.`` are left in place. A file
    whose destination path already exists is also left in place, because
    ``os.rename`` would silently replace the existing file on POSIX. Both
    cases are counted as skipped.

    Args:
        repo_root: Path of the repository root that contains ``inbox/``.
        dry_run: When true, nothing on disk is changed; the files are only
            classified and the summary is printed.

    Returns:
        None. A summary is printed to stdout. If ``inbox/archive/`` does not
        exist, an error line is printed and the function returns early.
    """
    archive_dir = os.path.join(repo_root, "inbox", "archive")
    queue_dir = os.path.join(repo_root, "inbox", "queue")
    null_dir = os.path.join(repo_root, "inbox", "null-result")

    if not os.path.isdir(archive_dir):
        print(f"ERROR: {archive_dir} not found")
        return

    # Create target directories
    if not dry_run:
        os.makedirs(queue_dir, exist_ok=True)
        os.makedirs(null_dir, exist_ok=True)

    sources = glob.glob(os.path.join(archive_dir, "*.md"))
    print(f"Found {len(sources)} source files in inbox/archive/")

    moved = {"queue": 0, "null-result": 0, "archive": {}}
    skipped = 0

    for filepath in sorted(sources):
        filename = os.path.basename(filepath)
        if filename.startswith(("_", ".")):
            skipped += 1
            continue

        status = get_source_status(filepath)
        domain = None

        if status in ("processed", "enrichment"):
            # → archive/{domain}/
            domain = get_source_domain(filepath)
            # The domain comes from file content and becomes a directory
            # name; refuse values that would leave inbox/archive/.
            if domain in (".", "..") or "/" in domain or "\\" in domain:
                domain = "uncategorized"
            dest_dir = os.path.join(archive_dir, domain)
        elif status in ("null-result", "null_result"):
            # → null-result/
            dest_dir = null_dir
        else:
            # Unprocessed, processing, or unknown status → queue/
            dest_dir = queue_dir

        dest = os.path.join(dest_dir, filename)
        if os.path.exists(dest):
            # Never overwrite a file that is already at the destination.
            print(f" SKIP (destination exists): {dest}")
            skipped += 1
            continue

        if not dry_run:
            os.makedirs(dest_dir, exist_ok=True)
            os.rename(filepath, dest)

        if domain is not None:
            moved["archive"][domain] = moved["archive"].get(domain, 0) + 1
        elif dest_dir == null_dir:
            moved["null-result"] += 1
        else:
            moved["queue"] += 1

    # The .extraction-debug/ directory is not moved; only report that it stays.
    debug_dir = os.path.join(archive_dir, ".extraction-debug")
    if os.path.isdir(debug_dir):
        print(" (keeping .extraction-debug/ in place)")

    archive_total = sum(moved["archive"].values())
    print(f"\n{'='*60}")
    print(f" MIGRATION {'(DRY RUN) ' if dry_run else ''}COMPLETE")
    print(f" → queue/ (unprocessed): {moved['queue']}")
    print(f" → null-result/: {moved['null-result']}")
    print(f" → archive/{{domain}}/:")
    for domain, count in sorted(moved["archive"].items()):
        print(f" {domain}: {count}")
    print(f" Archive total: {archive_total}")
    print(f" Skipped: {skipped}")
    print(f" Grand total: {moved['queue'] + moved['null-result'] + archive_total + skipped}")
    print(f"{'='*60}")
if __name__ == "__main__":
    # Command-line entry point: read the options, then run the migration.
    cli = argparse.ArgumentParser(
        description="Migrate source archive to organized structure",
    )
    cli.add_argument("--repo-root", default=".", help="Repository root")
    cli.add_argument("--dry-run", action="store_true")
    options = cli.parse_args()
    migrate(options.repo_root, dry_run=options.dry_run)