teleo-infrastructure/reconcile-sources.py
m3taversal 5f554bc2de
feat: atomic extract-and-connect + stale PR monitor + response audit
Atomic extract-and-connect (lib/connect.py):
- After extraction writes claim files, each new claim is embedded via
  OpenRouter, searched against Qdrant, and top-5 neighbors (cosine > 0.55)
  are added as `related` edges in the claim's frontmatter
- Edges written on NEW claim only — avoids merge conflicts
- Cross-domain connections enabled, non-fatal on Qdrant failure
- Wired into the openrouter-extract-v2.py post-extraction step (see the sketch below)
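
A minimal sketch of the post-extraction hook, assuming the stock
qdrant-client search API; embed_text and add_frontmatter_list are
hypothetical helper names, not the actual lib/connect.py interface:

    # Sketch only: embed_text and add_frontmatter_list are hypothetical helpers.
    from qdrant_client import QdrantClient

    COSINE_THRESHOLD = 0.55  # top-5 neighbors above this score become edges

    def connect_new_claim(claim_path, claim_text, client: QdrantClient):
        """Embed a freshly written claim and record its nearest neighbors."""
        try:
            vector = embed_text(claim_text)  # embedding via OpenRouter (hypothetical helper)
            hits = client.search(
                collection_name="claims",    # assumed collection name
                query_vector=vector,
                limit=5,
                score_threshold=COSINE_THRESHOLD,
            )
            related = [hit.payload["path"] for hit in hits]
            # Write `related` edges on the NEW claim only, so concurrent
            # extractions never rewrite each other's files (no merge conflicts).
            add_frontmatter_list(claim_path, "related", related)
        except Exception:
            pass  # non-fatal on Qdrant failure, per the design above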

Stale PR monitor (lib/stale_pr.py):
- Every watchdog cycle checks open extract/* PRs
- If open >30 min AND 0 claim files → auto-close with comment
- After 2 stale closures → marks source as extraction_failed
- Wired into watchdog.py as check #6 (see the sketch below)
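
A minimal sketch of the check, assuming the GitHub CLI (gh) is on PATH;
record_stale_closure and the .md claim-file heuristic are assumptions,
not the real lib/stale_pr.py code:

    import json
    import subprocess
    from datetime import datetime, timezone

    STALE_MINUTES = 30

    def check_stale_extract_prs():
        """Close extract/* PRs that sat open >30 min without claim files."""
        prs = json.loads(subprocess.check_output(
            ["gh", "pr", "list", "--state", "open",
             "--json", "number,headRefName,createdAt,files"]))
        for pr in prs:
            if not pr["headRefName"].startswith("extract/"):
                continue
            created = datetime.fromisoformat(pr["createdAt"].replace("Z", "+00:00"))
            age_min = (datetime.now(timezone.utc) - created).total_seconds() / 60
            claim_files = [f for f in pr["files"] if f["path"].endswith(".md")]
            if age_min > STALE_MINUTES and not claim_files:
                subprocess.run(["gh", "pr", "close", str(pr["number"]),
                                "--comment", "Auto-closed: no claim files after 30 min"])
                # Hypothetical: after 2 such closures, mark the source extraction_failed.
                record_stale_closure(pr["headRefName"])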

Response audit system:
- response_audit table (migration v8), persistent audit conn in bot.py
- 90-day retention cleanup, tool_calls JSON column (see the sketch after this list)
- Confidence tag stripping, systemd ReadWritePaths for pipeline.db
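
A minimal sketch of the retention side, assuming pipeline.db is SQLite and
reducing the migration v8 schema to the columns named above; every column
except tool_calls is an assumption:

    import sqlite3

    AUDIT_DDL = """
    CREATE TABLE IF NOT EXISTS response_audit (
        id         INTEGER PRIMARY KEY AUTOINCREMENT,
        created_at TEXT NOT NULL DEFAULT (datetime('now')),
        response   TEXT,
        tool_calls TEXT  -- JSON-encoded list of tool calls
    );
    """

    def cleanup_old_audits(conn: sqlite3.Connection):
        """90-day retention cleanup, run on the persistent audit connection."""
        conn.execute("DELETE FROM response_audit "
                     "WHERE created_at < datetime('now', '-90 days')")
        conn.commit()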

Supporting infrastructure:
- reweave.py: nightly edge reconnection for orphan claims
- reconcile-sources.py: source status reconciliation
- backfill-domains.py: domain classification backfill
- ops/reconcile-source-status.sh: operational reconciliation script
- Attribution improvements, post-extract enrichments, merge improvements

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-28 22:34:20 +00:00


#!/usr/bin/env python3
"""
Reconcile archive source status and add bidirectional links.

Matches unprocessed archive sources to existing decisions, entities, and claims.
Updates status to 'processed' or 'null-result' and adds frontmatter links.

Linking pattern (Ganymede Option A — frontmatter only):
- Archive sources get `derived_items:` listing decision/entity paths
- Decisions/entities get `source_archive:` pointing to archive source path
- All paths relative to repo root

Usage:
    python3 reconcile-sources.py          # default: dry-run
    python3 reconcile-sources.py --apply  # apply changes
"""
import re
import sys
from collections import Counter, defaultdict
from pathlib import Path
from urllib.parse import urlparse

REPO_ROOT = Path("/opt/teleo-eval/workspaces/main")
ARCHIVE_DIR = REPO_ROOT / "inbox" / "archive"
DECISIONS_DIR = REPO_ROOT / "decisions"
ENTITIES_DIR = REPO_ROOT / "entities"
DOMAINS_DIR = REPO_ROOT / "domains"

DRY_RUN = "--apply" not in sys.argv

# --- YAML frontmatter helpers ---

def read_frontmatter(filepath):
    """Read file, return (frontmatter_text, body_text, raw_content)."""
    content = filepath.read_text(encoding="utf-8")
    if not content.startswith("---"):
        return None, content, content
    end = content.find("\n---", 3)
    if end == -1:
        return None, content, content
    fm = content[3:end].strip()
    body = content[end + 4:]  # skip the closing "\n---"
    return fm, body, content


def get_field(fm_text, field):
    """Get a single YAML field value from frontmatter text."""
    if fm_text is None:
        return None
    m = re.search(rf'^{field}:\s*["\']?(.+?)["\']?\s*$', fm_text, re.MULTILINE)
    return m.group(1) if m else None


def get_status(fm_text):
    return get_field(fm_text, "status")


def get_url(fm_text):
    return get_field(fm_text, "url")


def get_proposal_url(fm_text):
    return get_field(fm_text, "proposal_url")


def get_title(fm_text):
    return get_field(fm_text, "title")


def extract_hash_from_url(url):
    """Extract the proposal hash (last path segment) from a URL."""
    if not url:
        return None
    parsed = urlparse(url.strip('"').strip("'"))
    parts = [p for p in parsed.path.split("/") if p]
    if parts:
        last = parts[-1]
        # Proposal hashes are base58-like, typically 32-50 chars;
        # accept anything alphanumeric of length >= 20 to be safe.
        if len(last) >= 20 and re.match(r'^[A-Za-z0-9]+$', last):
            return last
    return None


def rel_path(filepath):
    """Get path relative to repo root."""
    return str(filepath.relative_to(REPO_ROOT))

# --- Test/spam detection ---

TEST_PATTERNS = [
    r'\btest\b', r'\btesting\b', r'\bmy-test\b', r'\bq\b$',
    r'\ba-very-unique', r'\btext-mint', r'\bsample\b',
    r'\basdf\b', r'\bfoo\b', r'\bbar\b', r'\bhello-world\b',
    r'\bgrpc-indexer\b', r'\brocks{0,2}wd\b',
    r'spending-limit', r'\btest-proposal\b',
    r'\bdummy\b',
]
TEST_RE = re.compile('|'.join(TEST_PATTERNS), re.IGNORECASE)

# Title-based patterns
TEST_TITLE_PATTERNS = [
    r'^test\b', r'^testing\b', r'^q$', r'^a$', r'^asdf',
    r'^my test', r'^sample', r'^hello',
    r'text mint ix', r'a very unique title',
    r'testing spending limit', r'testing.*grpc',
    r'my-test-proposal',
]
TEST_TITLE_RE = re.compile('|'.join(TEST_TITLE_PATTERNS), re.IGNORECASE)


def is_test_spam(filepath, fm_text):
    """Detect test/spam sources by filename or title."""
    name = filepath.stem
    if TEST_RE.search(name):
        return True
    title = get_title(fm_text) or ""
    if TEST_TITLE_RE.search(title):
        return True
    return False

# --- Build indexes ---

def build_decision_hash_index():
    """Map proposal hash → decision file path."""
    index = {}
    if not DECISIONS_DIR.exists():
        return index
    for f in DECISIONS_DIR.rglob("*.md"):
        fm, _, _ = read_frontmatter(f)
        url = get_proposal_url(fm)
        h = extract_hash_from_url(url)
        if h:
            index[h] = f
    return index


def build_entity_name_index():
    """Map normalized entity name → entity file path."""
    index = {}
    if not ENTITIES_DIR.exists():
        return index
    for f in ENTITIES_DIR.rglob("*.md"):
        # Use filename as entity name
        name = f.stem.lower().replace("-", " ").replace("_", " ")
        index[name] = f
    return index


def build_claim_source_index():
    """Map archive source slug → list of claim file paths (via wiki-links)."""
    index = defaultdict(list)
    if not DOMAINS_DIR.exists():
        return index
    for f in DOMAINS_DIR.rglob("*.md"):
        try:
            content = f.read_text(encoding="utf-8")
        except Exception:
            continue
        # Find wiki-links to archive: [[inbox/archive/...]]
        for m in re.finditer(r'\[\[inbox/archive/([^\]]+)\]\]', content):
            slug = m.group(1)
            index[slug].append(f)
    return index

# --- Frontmatter modification ---

def add_frontmatter_field(filepath, field_name, field_value):
    """Add a YAML field to frontmatter. Returns modified content or None if already present."""
    content = filepath.read_text(encoding="utf-8")
    if not content.startswith("---"):
        return None
    end = content.find("\n---", 3)
    if end == -1:
        return None
    fm = content[3:end]
    # Check if field already exists
    if re.search(rf'^{field_name}:', fm, re.MULTILINE):
        return None  # Already has this field
    # Add before closing ---
    if isinstance(field_value, list):
        lines = f"\n{field_name}:"
        for v in field_value:
            lines += f'\n - "{v}"'
        new_fm = fm.rstrip() + lines + "\n"
    else:
        new_fm = fm.rstrip() + f'\n{field_name}: "{field_value}"\n'
    return "---" + new_fm + "---" + content[end + 4:]


def set_status(filepath, new_status):
    """Change status field in frontmatter."""
    content = filepath.read_text(encoding="utf-8")
    if not content.startswith("---"):
        return None
    # Replace status field
    new_content = re.sub(
        r'^(status:\s*).*$',
        f'\\1{new_status}',
        content,
        count=1,
        flags=re.MULTILINE,
    )
    if new_content == content:
        return None
    return new_content

# --- Main reconciliation ---

def main():
    print("DRY RUN" if DRY_RUN else "APPLYING CHANGES")
    print(f"Repo root: {REPO_ROOT}")
    print()

    # Build indexes
    print("Building indexes...")
    decision_hash_idx = build_decision_hash_index()
    print(f"  Decision hash index: {len(decision_hash_idx)} entries")
    entity_name_idx = build_entity_name_index()
    print(f"  Entity name index: {len(entity_name_idx)} entries")
    claim_source_idx = build_claim_source_index()
    print(f"  Claim source index: {len(claim_source_idx)} entries")
    print()

    # Find all unprocessed archive sources
    unprocessed = []
    for f in sorted(ARCHIVE_DIR.rglob("*.md")):
        if ".extraction-debug" in str(f):
            continue
        fm, _, _ = read_frontmatter(f)
        if get_status(fm) == "unprocessed":
            unprocessed.append(f)
    print(f"Found {len(unprocessed)} unprocessed sources")
    print()

    # Categorize and match
    matched = []             # (source_path, [target_paths], [match_types])
    test_spam = []
    futardio_unmatched = []  # futardio proposals with no KB output → null-result
    genuine_backlog = []     # non-futardio sources still awaiting extraction → keep unprocessed

    def is_futardio_source(filepath):
        """Check if file is a futardio/metadao governance proposal (not research)."""
        name = filepath.name.lower()
        return "futardio" in name
    for src in unprocessed:
        fm, _, _ = read_frontmatter(src)

        # Check test/spam first
        if is_test_spam(src, fm):
            test_spam.append(src)
            continue

        targets = []
        match_types = []  # kept parallel to targets: one type per target

        # Match 1: proposal hash → decision
        url = get_url(fm)
        src_hash = extract_hash_from_url(url)
        if src_hash and src_hash in decision_hash_idx:
            targets.append(decision_hash_idx[src_hash])
            match_types.append("hash→decision")

        # Match 2: wiki-links from claims
        # Try multiple slug variants
        src_rel = rel_path(src)
        slug_no_ext = src_rel.replace("inbox/archive/", "").replace(".md", "")
        # Also try just the filename without extension
        slug_basename = src.stem
        for slug in [slug_no_ext, slug_basename]:
            if slug in claim_source_idx:
                for claim_path in claim_source_idx[slug]:
                    if claim_path not in targets:
                        targets.append(claim_path)
                        match_types.append("wiki→claim")

        # Match 3: entity name matching (for launches/fundraises)
        title = get_title(fm) or ""
        # Extract project name from title like "Futardio: ProjectName ..."
        title_match = re.match(r'Futardio:\s*(.+?)(?:\s*[-—]|\s+Launch|\s+Fundraise|$)', title, re.IGNORECASE)
        if title_match:
            project_name = title_match.group(1).strip().lower().replace("-", " ")
            if project_name in entity_name_idx:
                entity_path = entity_name_idx[project_name]
                if entity_path not in targets:
                    targets.append(entity_path)
                    match_types.append("name→entity")

        if targets:
            matched.append((src, targets, match_types))
        elif is_futardio_source(src):
            futardio_unmatched.append(src)
        else:
            genuine_backlog.append(src)
print(f"Results:")
print(f" Matched: {len(matched)}")
print(f" Test/spam: {len(test_spam)}")
print(f" Futardio unmatched (→ null-result): {len(futardio_unmatched)}")
print(f" Genuine backlog (kept unprocessed): {len(genuine_backlog)}")
print()
# Validate all link targets exist
broken_links = []
for src, targets, _ in matched:
for t in targets:
if isinstance(t, Path) and not t.exists():
broken_links.append((src, t))
if broken_links:
print(f"ERROR: {len(broken_links)} broken link targets!")
for src, target in broken_links:
print(f" {rel_path(src)}{rel_path(target)}")
if not DRY_RUN:
print("Aborting — fix broken links first.")
sys.exit(1)
# Show match samples
print("Sample matches:")
for src, targets, types in matched[:5]:
print(f" {src.name}")
for t, mt in zip(targets, types):
print(f"{rel_path(t)} ({mt})")
print()
# Show test/spam samples
if test_spam:
print(f"Test/spam samples ({len(test_spam)} total):")
for src in test_spam[:5]:
print(f" {src.name}")
print()
# Show futardio unmatched samples
if futardio_unmatched:
print(f"Futardio unmatched samples ({len(futardio_unmatched)} total):")
for src in futardio_unmatched[:10]:
print(f" {src.name}")
print()
# Show genuine backlog
if genuine_backlog:
print(f"Genuine backlog — kept unprocessed ({len(genuine_backlog)} total):")
from collections import Counter
backlog_domains = Counter()
for src in genuine_backlog:
parts = src.relative_to(ARCHIVE_DIR).parts
domain = parts[0] if len(parts) > 1 else "root"
backlog_domains[domain] += 1
for d, c in backlog_domains.most_common():
print(f" {d}: {c}")
print()
if DRY_RUN:
print("=== DRY RUN — no changes made. Use --apply to apply. ===")
return
    # --- Apply changes ---
    files_modified = 0
    links_created = 0

    # 1. Matched sources → processed + bidirectional links
    for src, targets, _ in matched:
        # Update source status
        new_content = set_status(src, "processed")
        if new_content:
            # Also add derived_items for decision/entity targets
            decision_entity_targets = [
                rel_path(t) for t in targets
                if isinstance(t, Path) and (
                    str(t).startswith(str(DECISIONS_DIR)) or
                    str(t).startswith(str(ENTITIES_DIR))
                )
            ]
            if decision_entity_targets:
                # Write the status change first, then add the field
                src.write_text(new_content, encoding="utf-8")
                linked = add_frontmatter_field(src, "derived_items", decision_entity_targets)
                if linked:
                    src.write_text(linked, encoding="utf-8")
                    links_created += len(decision_entity_targets)
            else:
                src.write_text(new_content, encoding="utf-8")
            files_modified += 1

        # Add source_archive to decision/entity targets
        src_rel = rel_path(src)
        for t in targets:
            if isinstance(t, Path) and (
                str(t).startswith(str(DECISIONS_DIR)) or
                str(t).startswith(str(ENTITIES_DIR))
            ):
                linked = add_frontmatter_field(t, "source_archive", src_rel)
                if linked:
                    t.write_text(linked, encoding="utf-8")
                    files_modified += 1
                    links_created += 1

    # 2. Test/spam → null-result
    for src in test_spam:
        new_content = set_status(src, "null-result")
        if new_content:
            src.write_text(new_content, encoding="utf-8")
            files_modified += 1

    # 3. Futardio unmatched → null-result (no extraction output, won't be re-extracted)
    for src in futardio_unmatched:
        new_content = set_status(src, "null-result")
        if new_content:
            src.write_text(new_content, encoding="utf-8")
            files_modified += 1

    # 4. Genuine backlog → KEEP unprocessed (these are real extraction targets)
    # No changes needed.

    print("\n=== APPLIED ===")
    print(f"Files modified: {files_modified}")
    print(f"Bidirectional links created: {links_created}")
    print(f"Matched → processed: {len(matched)}")
    print(f"Test/spam → null-result: {len(test_spam)}")
    print(f"Futardio unmatched → null-result: {len(futardio_unmatched)}")
    print(f"Genuine backlog → kept unprocessed: {len(genuine_backlog)}")
    # Verify: count sources still marked unprocessed
    remaining = 0
    for f in ARCHIVE_DIR.rglob("*.md"):
        if ".extraction-debug" in str(f):
            continue
        fm, _, _ = read_frontmatter(f)
        if get_status(fm) == "unprocessed":
            remaining += 1
    print(f"\nRemaining unprocessed: {remaining}")


if __name__ == "__main__":
    main()