#!/usr/bin/env python3
"""
Reconcile archive source status and add bidirectional links.

Matches unprocessed archive sources to existing decisions, entities, and
claims. Updates status to 'processed' or 'null-result' and adds
frontmatter links.

Linking pattern (Ganymede Option A — frontmatter only):
- Archive sources get `derived_items:` listing decision/entity paths
- Decisions/entities get `source_archive:` pointing to archive source path
- All paths relative to repo root

Usage:
    python3 reconcile-sources.py          # dry-run (default)
    python3 reconcile-sources.py --apply  # apply changes
"""
import re
import sys
from pathlib import Path
from urllib.parse import urlparse
from collections import Counter, defaultdict

REPO_ROOT = Path("/opt/teleo-eval/workspaces/main")
ARCHIVE_DIR = REPO_ROOT / "inbox" / "archive"
DECISIONS_DIR = REPO_ROOT / "decisions"
ENTITIES_DIR = REPO_ROOT / "entities"
DOMAINS_DIR = REPO_ROOT / "domains"
DRY_RUN = "--apply" not in sys.argv


# --- YAML frontmatter helpers ---

def read_frontmatter(filepath):
    """Read file, return (frontmatter_text, body_text, raw_content)."""
    content = filepath.read_text(encoding="utf-8")
    if not content.startswith("---"):
        return None, content, content
    end = content.find("\n---", 3)
    if end == -1:
        return None, content, content
    fm = content[3:end].strip()
    body = content[end + 4:]  # skip the closing "\n---"
    return fm, body, content


def get_field(fm_text, field):
    """Get a single YAML field value from frontmatter text."""
    if fm_text is None:
        return None
    m = re.search(rf'^{field}:\s*["\']?(.+?)["\']?\s*$', fm_text, re.MULTILINE)
    return m.group(1) if m else None


def get_status(fm_text):
    return get_field(fm_text, "status")


def get_url(fm_text):
    return get_field(fm_text, "url")


def get_proposal_url(fm_text):
    return get_field(fm_text, "proposal_url")


def get_title(fm_text):
    return get_field(fm_text, "title")


def extract_hash_from_url(url):
    """Extract the proposal hash (last path segment) from a URL."""
    if not url:
        return None
    parsed = urlparse(url.strip('"').strip("'"))
    parts = [p for p in parsed.path.split("/") if p]
    if parts:
        last = parts[-1]
        # Proposal hashes are base58-like (typically 32-50 chars);
        # require a minimum length so short slugs don't match
        if len(last) >= 20 and re.match(r'^[A-Za-z0-9]+$', last):
            return last
    return None


def rel_path(filepath):
    """Get path relative to repo root."""
    return str(filepath.relative_to(REPO_ROOT))


# --- Test/spam detection ---

TEST_PATTERNS = [
    r'\btest\b', r'\btesting\b', r'\bmy-test\b', r'\bq\b$',
    r'\ba-very-unique', r'\btext-mint', r'\bsample\b', r'\basdf\b',
    r'\bfoo\b', r'\bbar\b', r'\bhello-world\b', r'\bgrpc-indexer\b',
    r'\brocks{0,2}wd\b', r'spending-limit', r'\btest-proposal\b', r'\bdummy\b',
]
TEST_RE = re.compile('|'.join(TEST_PATTERNS), re.IGNORECASE)

# Title-based patterns
TEST_TITLE_PATTERNS = [
    r'^test\b', r'^testing\b', r'^q$', r'^a$', r'^asdf', r'^my test',
    r'^sample', r'^hello', r'text mint ix', r'a very unique title',
    r'testing spending limit', r'testing.*grpc', r'my-test-proposal',
]
TEST_TITLE_RE = re.compile('|'.join(TEST_TITLE_PATTERNS), re.IGNORECASE)


def is_test_spam(filepath, fm_text):
    """Detect test/spam sources by filename or title."""
    name = filepath.stem
    if TEST_RE.search(name):
        return True
    title = get_title(fm_text) or ""
    if TEST_TITLE_RE.search(title):
        return True
    return False


# --- Build indexes ---

def build_decision_hash_index():
    """Map proposal hash → decision file path."""
    index = {}
    if not DECISIONS_DIR.exists():
        return index
    for f in DECISIONS_DIR.rglob("*.md"):
        fm, _, _ = read_frontmatter(f)
        url = get_proposal_url(fm)
        h = extract_hash_from_url(url)
        if h:
            index[h] = f
    return index
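
# Illustration of the lookup this index enables. The URL and hash below are
# hypothetical (real values come from each decision's `proposal_url` field);
# only the last path segment matters:
#
#   url = "https://example.org/proposal/5KfQ9mW3xT7dR2pLnV8sB4cJ6hY1aZqD"
#   extract_hash_from_url(url)       -> "5KfQ9mW3xT7dR2pLnV8sB4cJ6hY1aZqD"
#   decision_hash_idx["5KfQ9m..."]   -> Path(".../decisions/<decision>.md")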


def build_entity_name_index():
    """Map normalized entity name → entity file path."""
    index = {}
    if not ENTITIES_DIR.exists():
        return index
    for f in ENTITIES_DIR.rglob("*.md"):
        # Use filename as entity name
        name = f.stem.lower().replace("-", " ").replace("_", " ")
        index[name] = f
    return index


def build_claim_source_index():
    """Map archive source slug → list of claim file paths (via wiki-links)."""
    index = defaultdict(list)
    if not DOMAINS_DIR.exists():
        return index
    for f in DOMAINS_DIR.rglob("*.md"):
        try:
            content = f.read_text(encoding="utf-8")
        except Exception:
            continue
        # Find wiki-links to archive: [[inbox/archive/...]]
        for m in re.finditer(r'\[\[inbox/archive/([^\]]+)\]\]', content):
            slug = m.group(1)
            index[slug].append(f)
    return index


# --- Frontmatter modification ---

def add_frontmatter_field(filepath, field_name, field_value):
    """Add a YAML field to frontmatter. Returns modified content or None if already present."""
    content = filepath.read_text(encoding="utf-8")
    if not content.startswith("---"):
        return None
    end = content.find("\n---", 3)
    if end == -1:
        return None
    fm = content[3:end]
    # Check if field already exists
    if re.search(rf'^{field_name}:', fm, re.MULTILINE):
        return None  # Already has this field
    # Add before closing ---
    if isinstance(field_value, list):
        lines = f"\n{field_name}:"
        for v in field_value:
            lines += f'\n  - "{v}"'
        new_fm = fm.rstrip() + lines + "\n"
    else:
        new_fm = fm.rstrip() + f'\n{field_name}: "{field_value}"\n'
    return "---" + new_fm + "---" + content[end + 4:]


def set_status(filepath, new_status):
    """Change status field in frontmatter."""
    content = filepath.read_text(encoding="utf-8")
    if not content.startswith("---"):
        return None
    # Replace the first status field
    new_content = re.sub(
        r'^(status:\s*).*$',
        f'\\1{new_status}',
        content,
        count=1,
        flags=re.MULTILINE,
    )
    if new_content == content:
        return None
    return new_content
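
# Example of the frontmatter rewrite the two helpers above perform during the
# apply phase below (file paths are hypothetical; the field names are the
# real ones this script writes). For a matched archive source:
#
#   status: unprocessed    becomes    status: processed
#
# and, appended before the closing "---":
#
#   derived_items:
#     - "decisions/example-decision.md"
#
# while each linked decision/entity gets the reverse pointer:
#
#   source_archive: "inbox/archive/example-source.md"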

# --- Main reconciliation ---

def main():
    print(f"{'DRY RUN' if DRY_RUN else 'APPLYING CHANGES'}")
    print(f"Repo root: {REPO_ROOT}")
    print()

    # Build indexes
    print("Building indexes...")
    decision_hash_idx = build_decision_hash_index()
    print(f"  Decision hash index: {len(decision_hash_idx)} entries")
    entity_name_idx = build_entity_name_index()
    print(f"  Entity name index: {len(entity_name_idx)} entries")
    claim_source_idx = build_claim_source_index()
    print(f"  Claim source index: {len(claim_source_idx)} entries")
    print()

    # Find all unprocessed archive sources
    unprocessed = []
    for f in sorted(ARCHIVE_DIR.rglob("*.md")):
        if ".extraction-debug" in str(f):
            continue
        fm, _, _ = read_frontmatter(f)
        if get_status(fm) == "unprocessed":
            unprocessed.append(f)
    print(f"Found {len(unprocessed)} unprocessed sources")
    print()

    # Categorize and match
    matched = []             # (source_path, [target_paths], match_types)
    test_spam = []
    futardio_unmatched = []  # futardio proposals with no KB output → null-result
    genuine_backlog = []     # non-futardio sources still awaiting extraction → keep unprocessed

    def is_futardio_source(filepath):
        """Check if file is a futardio/metadao governance proposal (not research)."""
        name = filepath.name.lower()
        return "futardio" in name

    for src in unprocessed:
        fm, _, _ = read_frontmatter(src)

        # Check test/spam first
        if is_test_spam(src, fm):
            test_spam.append(src)
            continue

        targets = []
        match_types = []

        # Match 1: proposal hash → decision
        url = get_url(fm)
        src_hash = extract_hash_from_url(url)
        if src_hash and src_hash in decision_hash_idx:
            targets.append(decision_hash_idx[src_hash])
            match_types.append("hash→decision")

        # Match 2: wiki-links from claims (try multiple slug variants)
        src_rel = rel_path(src)
        slug_no_ext = src_rel.replace("inbox/archive/", "").replace(".md", "")
        # Also try just the filename without extension
        slug_basename = src.stem
        for slug in [slug_no_ext, slug_basename]:
            if slug in claim_source_idx:
                for claim_path in claim_source_idx[slug]:
                    if claim_path not in targets:
                        targets.append(claim_path)
                        match_types.append("wiki→claim")  # keep types parallel to targets

        # Match 3: entity name matching (for launches/fundraises)
        title = get_title(fm) or ""
        # Extract project name from title like "Futardio: ProjectName ..."
        title_match = re.match(
            r'Futardio:\s*(.+?)(?:\s*[-—]|\s+Launch|\s+Fundraise|$)',
            title, re.IGNORECASE,
        )
        if title_match:
            project_name = title_match.group(1).strip().lower().replace("-", " ")
            if project_name in entity_name_idx:
                entity_path = entity_name_idx[project_name]
                if entity_path not in targets:
                    targets.append(entity_path)
                    match_types.append("name→entity")

        if targets:
            matched.append((src, targets, match_types))
        elif is_futardio_source(src):
            futardio_unmatched.append(src)
        else:
            genuine_backlog.append(src)

    print("Results:")
    print(f"  Matched: {len(matched)}")
    print(f"  Test/spam: {len(test_spam)}")
    print(f"  Futardio unmatched (→ null-result): {len(futardio_unmatched)}")
    print(f"  Genuine backlog (kept unprocessed): {len(genuine_backlog)}")
    print()

    # Validate all link targets exist
    broken_links = []
    for src, targets, _ in matched:
        for t in targets:
            if isinstance(t, Path) and not t.exists():
                broken_links.append((src, t))
    if broken_links:
        print(f"ERROR: {len(broken_links)} broken link targets!")
        for src, target in broken_links:
            print(f"  {rel_path(src)} → {rel_path(target)}")
        if not DRY_RUN:
            print("Aborting — fix broken links first.")
            sys.exit(1)

    # Show match samples
    print("Sample matches:")
    for src, targets, types in matched[:5]:
        print(f"  {src.name}")
        for t, mt in zip(targets, types):
            print(f"    → {rel_path(t)} ({mt})")
    print()

    # Show test/spam samples
    if test_spam:
        print(f"Test/spam samples ({len(test_spam)} total):")
        for src in test_spam[:5]:
            print(f"  {src.name}")
        print()

    # Show futardio unmatched samples
    if futardio_unmatched:
        print(f"Futardio unmatched samples ({len(futardio_unmatched)} total):")
        for src in futardio_unmatched[:10]:
            print(f"  {src.name}")
        print()

    # Show genuine backlog, grouped by top-level archive domain
    if genuine_backlog:
        print(f"Genuine backlog — kept unprocessed ({len(genuine_backlog)} total):")
        backlog_domains = Counter()
        for src in genuine_backlog:
            parts = src.relative_to(ARCHIVE_DIR).parts
            domain = parts[0] if len(parts) > 1 else "root"
            backlog_domains[domain] += 1
        for d, c in backlog_domains.most_common():
            print(f"  {d}: {c}")
        print()

    if DRY_RUN:
        print("=== DRY RUN — no changes made. Use --apply to apply. ===")
        return

    # --- Apply changes ---
    files_modified = 0
    links_created = 0
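
    # Note on the write pattern in step 1 below: set_status() returns the
    # rewritten text without touching disk, while add_frontmatter_field()
    # re-reads the file from disk. So the status change must be written out
    # first, then the file is read back to append `derived_items:`. Two
    # writes per matched source, but each operates on a consistent on-disk
    # state.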
    # 1. Matched sources → processed + bidirectional links
    for src, targets, _ in matched:
        # Update source status
        new_content = set_status(src, "processed")
        if new_content:
            # Also add derived_items for decision/entity targets
            decision_entity_targets = [
                rel_path(t) for t in targets
                if isinstance(t, Path) and (
                    str(t).startswith(str(DECISIONS_DIR))
                    or str(t).startswith(str(ENTITIES_DIR))
                )
            ]
            if decision_entity_targets:
                # Add derived_items to the already-modified content:
                # write the status change first, then add the field
                src.write_text(new_content, encoding="utf-8")
                linked = add_frontmatter_field(src, "derived_items", decision_entity_targets)
                if linked:
                    src.write_text(linked, encoding="utf-8")
                    links_created += len(decision_entity_targets)
            else:
                src.write_text(new_content, encoding="utf-8")
            files_modified += 1

        # Add source_archive to decision/entity targets
        src_rel = rel_path(src)
        for t in targets:
            if isinstance(t, Path) and (
                str(t).startswith(str(DECISIONS_DIR))
                or str(t).startswith(str(ENTITIES_DIR))
            ):
                linked = add_frontmatter_field(t, "source_archive", src_rel)
                if linked:
                    t.write_text(linked, encoding="utf-8")
                    files_modified += 1
                    links_created += 1

    # 2. Test/spam → null-result
    for src in test_spam:
        new_content = set_status(src, "null-result")
        if new_content:
            src.write_text(new_content, encoding="utf-8")
            files_modified += 1

    # 3. Futardio unmatched → null-result (no extraction output, won't be re-extracted)
    for src in futardio_unmatched:
        new_content = set_status(src, "null-result")
        if new_content:
            src.write_text(new_content, encoding="utf-8")
            files_modified += 1

    # 4. Genuine backlog → KEEP unprocessed (these are real extraction targets)
    # No changes needed

    print("\n=== APPLIED ===")
    print(f"Files modified: {files_modified}")
    print(f"Bidirectional links created: {links_created}")
    print(f"Matched → processed: {len(matched)}")
    print(f"Test/spam → null-result: {len(test_spam)}")
    print(f"Futardio unmatched → null-result: {len(futardio_unmatched)}")
    print(f"Genuine backlog → kept unprocessed: {len(genuine_backlog)}")

    # Verify
    remaining = 0
    for f in ARCHIVE_DIR.rglob("*.md"):
        if ".extraction-debug" in str(f):
            continue
        fm, _, _ = read_frontmatter(f)
        if get_status(fm) == "unprocessed":
            remaining += 1
    print(f"\nRemaining unprocessed: {remaining}")


if __name__ == "__main__":
    main()
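
# A minimal post-apply spot check, sketched as a comment (assumes the same
# repo layout as above; run separately if useful):
#
#   import re
#   from pathlib import Path
#   repo = Path("/opt/teleo-eval/workspaces/main")
#   n = sum(
#       1 for f in (repo / "decisions").rglob("*.md")
#       if re.search(r"^source_archive:", f.read_text(encoding="utf-8"),
#                    re.MULTILINE)
#   )
#   print(f"decisions with source_archive: {n}")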