Some checks are pending
CI / lint-and-test (push) Waiting to run
Move scattered root-level files into categorized directories: - deploy/ — deployment + mirror scripts (Ship) - scripts/ — one-off backfills + migrations (Ship) - research/ — nightly research + prompts (Ship) - docs/ — all operational documentation (shared) Delete 3 dead cron scripts replaced by pipeline daemon: - batch-extract-50.sh, evaluate-trigger.sh, extract-cron.sh Add CODEOWNERS mapping every path to its owning agent. Add README with directory structure, ownership table, and VPS layout. Update deploy.sh paths to match new structure. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
130 lines
4.4 KiB
Python
130 lines
4.4 KiB
Python
#!/usr/bin/env python3
"""Migrate source archive from flat inbox/archive/ to organized structure.

inbox/queue/             — unprocessed sources (landing zone)
inbox/archive/{domain}/  — processed sources with extraction results
inbox/null-result/       — reviewed, nothing extractable

One-time migration. Atomic commit. Idempotent (safe to re-run).

Run from repo root:
    cd /opt/teleo-eval/workspaces/main
    python3 /opt/teleo-eval/pipeline/migrate-source-archive.py [--dry-run]
"""
|
|
|
|
import argparse
|
|
import glob
|
|
import os
|
|
import re
|
|
from pathlib import Path
|
|
|
|
|
|
def get_source_status(filepath: str) -> str:
    """Return the ``status:`` value from a source file's frontmatter.

    Scans the file for a line beginning with ``status:`` and returns its
    first whitespace-delimited token.

    Args:
        filepath: Path to a markdown source file.

    Returns:
        The status token, or ``"unknown"`` when the file cannot be read
        or contains no ``status:`` line.
    """
    try:
        # `with` guarantees the descriptor is closed (the original bare
        # open().read() leaked it until GC). Catch only I/O and decoding
        # failures instead of every Exception.
        with open(filepath, encoding="utf-8") as fh:
            content = fh.read()
    except (OSError, UnicodeDecodeError):
        return "unknown"
    match = re.search(r"^status:\s*(\S+)", content, re.MULTILINE)
    # \S+ cannot contain whitespace, so no extra strip() is needed.
    return match.group(1) if match else "unknown"
|
|
|
|
|
|
def get_source_domain(filepath: str) -> str:
    """Return the ``domain:`` value from a source file's frontmatter.

    Scans the file for a line beginning with ``domain:`` and returns its
    first whitespace-delimited token.

    Args:
        filepath: Path to a markdown source file.

    Returns:
        The domain token, or ``"uncategorized"`` when the file cannot be
        read or contains no ``domain:`` line.
    """
    try:
        # `with` guarantees the descriptor is closed (the original bare
        # open().read() leaked it until GC). Catch only I/O and decoding
        # failures instead of every Exception.
        with open(filepath, encoding="utf-8") as fh:
            content = fh.read()
    except (OSError, UnicodeDecodeError):
        return "uncategorized"
    match = re.search(r"^domain:\s*(\S+)", content, re.MULTILINE)
    # \S+ cannot contain whitespace, so no extra strip() is needed.
    return match.group(1) if match else "uncategorized"
|
|
|
|
|
|
def migrate(repo_root: str, dry_run: bool = False) -> None:
    """Move source files from a flat inbox/archive/ into the organized layout.

    Routing rules, based on each file's frontmatter:
      * status unprocessed/processing (or unrecognized) -> inbox/queue/
      * status null-result/null_result                  -> inbox/null-result/
      * status processed/enrichment                     -> inbox/archive/{domain}/

    Idempotent: already-migrated files live in subdirectories and are not
    matched by the top-level ``*.md`` glob, so a re-run is a no-op.

    Args:
        repo_root: Repository root containing the inbox/ tree.
        dry_run: When True, report what would move without touching disk.
    """
    archive_dir = os.path.join(repo_root, "inbox", "archive")
    queue_dir = os.path.join(repo_root, "inbox", "queue")
    null_dir = os.path.join(repo_root, "inbox", "null-result")

    if not os.path.isdir(archive_dir):
        print(f"ERROR: {archive_dir} not found")
        return

    # Create target directories
    if not dry_run:
        os.makedirs(queue_dir, exist_ok=True)
        os.makedirs(null_dir, exist_ok=True)

    def _move(src: str, dest: str) -> None:
        # Honors dry_run; os.rename is atomic within a filesystem, so each
        # file migrates as an all-or-nothing step.
        if not dry_run:
            os.rename(src, dest)

    sources = glob.glob(os.path.join(archive_dir, "*.md"))
    print(f"Found {len(sources)} source files in inbox/archive/")

    moved = {"queue": 0, "null-result": 0, "archive": {}}
    skipped = 0

    for filepath in sorted(sources):
        filename = os.path.basename(filepath)
        # Leading "_" / "." marks meta or hidden files — leave them in place.
        if filename.startswith(("_", ".")):
            skipped += 1
            continue

        status = get_source_status(filepath)
        domain = get_source_domain(filepath)

        if status in ("unprocessed", "processing"):
            # → queue/
            _move(filepath, os.path.join(queue_dir, filename))
            moved["queue"] += 1

        elif status in ("null-result", "null_result"):
            # → null-result/
            _move(filepath, os.path.join(null_dir, filename))
            moved["null-result"] += 1

        elif status in ("processed", "enrichment"):
            # → archive/{domain}/
            domain_dir = os.path.join(archive_dir, domain)
            if not dry_run:
                os.makedirs(domain_dir, exist_ok=True)
            _move(filepath, os.path.join(domain_dir, filename))
            moved["archive"][domain] = moved["archive"].get(domain, 0) + 1

        else:
            # Unknown status — treat as unprocessed → queue/
            _move(filepath, os.path.join(queue_dir, filename))
            moved["queue"] += 1

    # .extraction-debug/ holds per-run scratch output; deliberately untouched.
    debug_dir = os.path.join(archive_dir, ".extraction-debug")
    if os.path.isdir(debug_dir):
        print(" (keeping .extraction-debug/ in place)")

    archive_total = sum(moved["archive"].values())
    print(f"\n{'='*60}")
    print(f" MIGRATION {'(DRY RUN) ' if dry_run else ''}COMPLETE")
    print(f" → queue/ (unprocessed): {moved['queue']}")
    print(f" → null-result/: {moved['null-result']}")
    print(" → archive/{domain}/:")
    for domain, count in sorted(moved["archive"].items()):
        print(f" {domain}: {count}")
    print(f" Archive total: {archive_total}")
    print(f" Skipped: {skipped}")
    print(f" Grand total: {moved['queue'] + moved['null-result'] + archive_total + skipped}")
    print(f"{'='*60}")
|
|
|
|
|
|
if __name__ == "__main__":
    # CLI entry point: --repo-root defaults to the current directory;
    # --dry-run reports the planned moves without touching disk.
    cli = argparse.ArgumentParser(description="Migrate source archive to organized structure")
    cli.add_argument("--repo-root", default=".", help="Repository root")
    cli.add_argument("--dry-run", action="store_true")
    opts = cli.parse_args()
    migrate(opts.repo_root, opts.dry_run)
|