#!/usr/bin/env python3
"""Extract claims from a source file — v2.

Uses lean prompt (judgment only) + deterministic post-extraction validation ($0).
Replaces the 1331-line openrouter-extract.py.

Changes from v1:
- Prompt: ~100 lines (was ~400). Mechanical rules removed — code handles them.
- Pass 2: Replaced Haiku LLM review with Python validator. $0 instead of ~$0.01/source.
- Entity enrichment: Entities enqueued to JSON queue, applied to main by batch processor.
  Extraction branches create NEW claim files only — no entity modifications on branches.
  Eliminates merge conflicts + 83% near_duplicate false positive rate.
- Fix mode: Removed. Rejected claims re-extract with feedback baked into prompt.

Usage: python3 openrouter-extract-v2.py SOURCE_FILE [--model MODEL] [--dry-run]
"""

import argparse
import csv
import glob
import json
import os
import re
import sys
from datetime import date
from pathlib import Path

import requests

# ─── Add lib/ to path for imports ──────────────────────────────────────────
# Script lives at /opt/teleo-eval/ but lib/ is at /opt/teleo-eval/pipeline/lib/.
sys.path.insert(0, str(Path(__file__).parent / "pipeline"))
sys.path.insert(0, str(Path(__file__).parent))

from lib.extraction_prompt import build_extraction_prompt
from lib.post_extract import (
    load_existing_claims_from_repo,
    validate_and_fix_claims,
    validate_and_fix_entities,
)

# ─── Source registration (Argus: pipeline funnel tracking) ─────────────────


def _source_db_conn():
    """Get connection to pipeline.db for source registration."""
    try:
        from lib import db
        return db.get_connection()
    except Exception:
        return None


def _register_source(conn, path, status, domain=None, model=None, claims_count=0, error=None):
    """Register or update a source in pipeline.db for funnel tracking."""
    if conn is None:
        return
    try:
        conn.execute(
            """INSERT INTO sources (path, status, priority, extraction_model, claims_count, created_at, updated_at)
               VALUES (?, ?, 'medium', ?, ?, datetime('now'), datetime('now'))
               ON CONFLICT(path) DO UPDATE SET
                   status = excluded.status,
                   extraction_model = COALESCE(excluded.extraction_model, extraction_model),
                   claims_count = excluded.claims_count,
                   last_error = ?,
                   updated_at = datetime('now')""",
            (path, status, model, claims_count, error),
        )
    except Exception as e:
        print(f" WARN: Source registration failed: {e}", file=sys.stderr)


# ─── Constants ──────────────────────────────────────────────────────────────

OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
DEFAULT_MODEL = "anthropic/claude-sonnet-4.5"
USAGE_CSV = "/opt/teleo-eval/logs/openrouter-usage.csv"

DOMAIN_AGENTS = {
    "internet-finance": "rio",
    "entertainment": "clay",
    "ai-alignment": "theseus",
    "health": "vida",
    "space-development": "astra",
    "grand-strategy": "leo",
    "mechanisms": "leo",
    "living-capital": "rio",
    "living-agents": "theseus",
    "teleohumanity": "leo",
    "critical-systems": "theseus",
    "collective-intelligence": "theseus",
    "teleological-economics": "rio",
    "cultural-dynamics": "clay",
    "decision-markets": "rio",
}

# ─── Helpers ────────────────────────────────────────────────────────────────


def read_file(path):
    try:
        with open(path) as f:
            return f.read()
    except FileNotFoundError:
        return ""


def get_domain_from_source(source_content):
    match = re.search(r"^domain:\s*(.+)$", source_content, re.MULTILINE)
    return match.group(1).strip() if match else None


def get_kb_index(domain):
    """Build fresh KB index for duplicate checking and wiki-link targets.
    Regenerated before each extraction (not cached from cron) so the index
    reflects the current KB state. Stale indexes cause duplicate claims and
    broken wiki links. (Leo's fix #1)
    """
    lines = []

    # Primary domain claims
    domain_dir = f"domains/{domain}"
    for f in sorted(glob.glob(os.path.join(domain_dir, "*.md"))):
        basename = os.path.basename(f)
        if not basename.startswith("_"):
            title = basename.replace(".md", "").replace("-", " ")
            lines.append(f"- {basename}: {title}")

    # Cross-domain claims from core/ and foundations/ (for wiki-link targets)
    for subdir in ["core", "foundations"]:
        for f in sorted(glob.glob(os.path.join(subdir, "**", "*.md"), recursive=True)):
            basename = os.path.basename(f)
            if not basename.startswith("_"):
                title = basename.replace(".md", "").replace("-", " ")
                lines.append(f"- {basename}: {title}")

    # Entities in this domain (for enrichment detection)
    entity_dir = f"entities/{domain}"
    for f in sorted(glob.glob(os.path.join(entity_dir, "*.md"))):
        basename = os.path.basename(f)
        if not basename.startswith("_"):
            lines.append(f"- [entity] {basename}: {basename.replace('.md', '').replace('-', ' ')}")

    if not lines:
        return "No existing claims in this domain."

    # Cap at 200 entries to keep prompt size reasonable
    if len(lines) > 200:
        total = len(lines)
        lines = lines[:200]
        lines.append(f"... and {total - 200} more (truncated)")

    return "\n".join(lines)


def call_openrouter(prompt, model, api_key):
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
        "HTTP-Referer": "https://livingip.xyz",
        "X-Title": "Teleo Codex Extraction",
    }
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.3,
        "max_tokens": 16000,
    }
    resp = requests.post(OPENROUTER_URL, headers=headers, json=payload, timeout=120)
    resp.raise_for_status()
    data = resp.json()
    content = data["choices"][0]["message"]["content"]
    usage = data.get("usage", {})
    return content, usage


def parse_response(content):
    """Parse JSON response, handling markdown fencing and truncation."""
    content = content.strip()
    if content.startswith("```"):
        content = re.sub(r"^```(?:json)?\s*\n?", "", content)
        content = re.sub(r"\n?```\s*$", "", content)
    try:
        return json.loads(content)
    except json.JSONDecodeError:
        pass

    # Fix common JSON issues
    fixed = re.sub(r",\s*([}\]])", r"\1", content)
    open_braces = fixed.count("{") - fixed.count("}")
    open_brackets = fixed.count("[") - fixed.count("]")
    fixed += "]" * max(0, open_brackets) + "}" * max(0, open_braces)
    try:
        parsed = json.loads(fixed)
        print(" WARN: Fixed malformed JSON (trailing commas or truncation)")
        return parsed
    except json.JSONDecodeError:
        pass

    # Last resort: try to salvage claims with regex
    result = {"claims": [], "enrichments": [], "entities": [], "facts": []}
    claim_pattern = r'\{"filename":\s*"([^"]+)"[^}]*"content":\s*"((?:[^"\\]|\\.)*)"\s*\}'
    for match in re.finditer(claim_pattern, content, re.DOTALL):
        filename = match.group(1)
        claim_content = match.group(2).replace("\\n", "\n").replace('\\"', '"')
        domain_match = re.search(r'"domain":\s*"([^"]+)"', match.group(0))
        result["claims"].append({
            "filename": filename,
            "domain": domain_match.group(1) if domain_match else "",
            "content": claim_content,
        })
    if result["claims"]:
        print(f" WARN: Salvaged {len(result['claims'])} claims from malformed JSON")
    return result


def reconstruct_claim_content(claim, domain, agent):
    """Build markdown content from structured claim fields (lean prompt output format)."""
    title = claim.get("title", claim.get("filename", "").replace(".md", "").replace("-", " "))
    desc = claim.get("description", "")
    conf = claim.get("confidence", "experimental")
    source = claim.get("source", f"extraction by {agent}")
    body_text = claim.get("body", desc)
    related = claim.get("related_claims", [])
    sourcer = claim.get("sourcer", "")

    # Build attribution block (v1: extractor always known, sourcer best-effort)
    attr_lines = [
        "attribution:",
        "  extractor:",
        f'    - handle: "{agent}"',
    ]
    if sourcer:
        sourcer_handle = sourcer.strip().lower().lstrip("@").replace(" ", "-")
        attr_lines.extend([
            "  sourcer:",
            f'    - handle: "{sourcer_handle}"',
            f'      context: "{source}"',
        ])

    lines = [
        "---",
        "type: claim",
        f"domain: {domain}",
        f'description: "{desc}"',
        f"confidence: {conf}",
        f'source: "{source}"',
        f"created: {date.today().isoformat()}",
        *attr_lines,
        "---",
        "",
        f"# {title}",
        "",
        body_text,
        "",
        "---",
        "",
        "Relevant Notes:",
    ]
    for r in related[:5]:
        lines.append(f"- [[{r}]]")
    lines.extend(["", "Topics:", "- [[_map]]", ""])
    return "\n".join(lines)


def update_source_file(source_path, source_content, update_info):
    """Update source file frontmatter with processing info."""
    updated = re.sub(
        r"^status:\s*.+$",
        f"status: {update_info['status']}",
        source_content,
        count=1,
        flags=re.MULTILINE,
    )
    parts = updated.split("---", 2)
    if len(parts) >= 3:
        fm = parts[1]
        fm += f"processed_by: {update_info['processed_by']}\n"
        fm += f"processed_date: {update_info['processed_date']}\n"
        if update_info.get("claims_extracted"):
            fm += f"claims_extracted: {json.dumps(update_info['claims_extracted'])}\n"
        if update_info.get("enrichments_applied"):
            fm += f"enrichments_applied: {json.dumps(update_info['enrichments_applied'])}\n"
        if update_info.get("entities_updated"):
            fm += f"entities_updated: {json.dumps(update_info['entities_updated'])}\n"
        if update_info.get("entities_enqueued"):
            fm += f"entities_enqueued: {json.dumps(update_info['entities_enqueued'])}\n"
        if update_info.get("model"):
            fm += f'extraction_model: "{update_info["model"]}"\n'
        if update_info.get("notes"):
            fm += f'extraction_notes: "{update_info["notes"]}"\n'
        updated = f"---{fm}---{parts[2]}"

    key_facts = update_info.get("key_facts", [])
    if key_facts:
        updated += "\n\n## Key Facts\n"
        for fact in key_facts:
            updated += f"- {fact}\n"

    with open(source_path, "w") as f:
        f.write(updated)


def log_usage(agent, model, source_file, usage):
    write_header = not os.path.exists(USAGE_CSV)
    with open(USAGE_CSV, "a", newline="") as f:
        writer = csv.writer(f)
        if write_header:
            writer.writerow(["date", "agent", "model", "source_file", "input_tokens", "output_tokens"])
        writer.writerow([
            date.today().isoformat(),
            agent,
            model,
            os.path.basename(source_file),
            usage.get("prompt_tokens", 0),
            usage.get("completion_tokens", 0),
        ])


# ─── Main ─────────────────────────────────────────────────────────────────


def main():
    parser = argparse.ArgumentParser(description="Extract claims via OpenRouter (v2)")
    parser.add_argument("source_file", help="Path to source file in inbox/archive/")
    parser.add_argument("--model", default=DEFAULT_MODEL, help=f"Model (default: {DEFAULT_MODEL})")
    parser.add_argument("--domain", default=None, help="Override domain")
    parser.add_argument("--dry-run", action="store_true", help="Print prompt, don't call API")
    parser.add_argument("--no-review", action="store_true",
                        help="No-op (v1 compat). Pass 2 is always the Python validator in v2.")
    parser.add_argument("--key-file", default="/opt/teleo-eval/secrets/openrouter-key")
    args = parser.parse_args()

    # Read API key
    api_key = read_file(args.key_file).strip()
    if not api_key and not args.dry_run:
        print("ERROR: No API key found", file=sys.stderr)
        sys.exit(1)

    # Read source
    source_content = read_file(args.source_file)
    if not source_content:
        print(f"ERROR: Cannot read {args.source_file}", file=sys.stderr)
        sys.exit(1)

    # Get domain and agent
    domain = args.domain or get_domain_from_source(source_content)
    if not domain:
        print(f"ERROR: No domain field in {args.source_file}", file=sys.stderr)
        sys.exit(1)
    agent = DOMAIN_AGENTS.get(domain, "leo")

    # Get KB index for dedup
    kb_index = get_kb_index(domain)

    # Load existing claims for post-extraction validation
    existing_claims = load_existing_claims_from_repo(".")

    # ── Build lean prompt ──
    # Extract rationale and intake_tier from source frontmatter (directed contribution)
    rationale = None
    intake_tier = None
    proposed_by = None
    rationale_match = re.search(r"^rationale:\s*[\"']?(.+?)[\"']?\s*$", source_content, re.MULTILINE)
    if rationale_match:
        rationale = rationale_match.group(1).strip()
    tier_match = re.search(r"^intake_tier:\s*(\S+)", source_content, re.MULTILINE)
    if tier_match:
        intake_tier = tier_match.group(1).strip()
    proposed_match = re.search(r"^proposed_by:\s*[\"']?(.+?)[\"']?\s*$", source_content, re.MULTILINE)
    if proposed_match:
        proposed_by = proposed_match.group(1).strip()

    # Set intake tier based on rationale presence
    if rationale and not intake_tier:
        intake_tier = "directed"
    elif not intake_tier:
        intake_tier = "undirected"

    if rationale:
        print(f" Directed contribution from {proposed_by or '?'}: {rationale[:80]}...")

    prompt = build_extraction_prompt(
        args.source_file,
        source_content,
        domain,
        agent,
        kb_index,
        rationale=rationale,
        intake_tier=intake_tier,
        proposed_by=proposed_by,
    )

    if args.dry_run:
        print("=== DRY RUN ===")
        print(f"Source: {args.source_file}")
        print(f"Domain: {domain}, Agent: {agent}")
        print(f"Model: {args.model}")
        print(f"Existing claims: {len(existing_claims)}")
        print(f"Prompt length: {len(prompt)} chars")
        print(f"\n=== PROMPT ===\n{prompt[:1000]}...")
        return

    print(f"Extracting from {args.source_file} via {args.model}...")
    print(f"Domain: {domain}, Agent: {agent}, Existing claims: {len(existing_claims)}")

    # Register source as extracting (Argus: pipeline funnel)
    _src_conn = _source_db_conn()
    _register_source(_src_conn, args.source_file, "extracting", domain, args.model)

    # ── Pass 1: LLM extraction ──
    try:
        content, usage = call_openrouter(prompt, args.model, api_key)
    except requests.exceptions.RequestException as e:
        _register_source(_src_conn, args.source_file, "error", domain, args.model, error=str(e))
        print(f"ERROR: API call failed: {e}", file=sys.stderr)
        sys.exit(1)

    p1_in = usage.get("prompt_tokens", "?")
    p1_out = usage.get("completion_tokens", "?")
    print(f"LLM tokens: {p1_in} in, {p1_out} out")

    result = parse_response(content)
    raw_claims = result.get("claims", [])
    enrichments = result.get("enrichments", [])
    entities = result.get("entities", [])
    facts = result.get("facts", [])
    decisions = result.get("decisions", [])
    print(f"LLM output: {len(raw_claims)} claims, {len(enrichments)} enrichments, "
          f"{len(entities)} entities, {len(decisions)} decisions, {len(facts)} facts")

    # ── Pass 2: Deterministic validation ($0) ──
    # Reconstruct content for claims that used the lean format (title/body fields instead of content)
    for claim in raw_claims:
        if "content" not in claim or not claim["content"]:
            claim["content"] = reconstruct_claim_content(claim, domain, agent)

    kept_claims, rejected_claims, claim_stats = validate_and_fix_claims(
        raw_claims, domain, agent, existing_claims,
    )
    kept_entities, rejected_entities, entity_stats = validate_and_fix_entities(
        entities, domain, existing_claims,
    )

    print(f"Validation: {claim_stats['kept']}/{claim_stats['total']} claims kept "
          f"({claim_stats['fixed']} fixed, {claim_stats['rejected']} rejected)")
    if entity_stats["total"]:
        print(f"Entities: {entity_stats['kept']}/{entity_stats['total']} kept")
    if claim_stats["rejections"]:
        print(f"Rejections: {claim_stats['rejections']}")

    # ── Write claim files ──
    domain_dir = f"domains/{domain}"
    os.makedirs(domain_dir, exist_ok=True)
    written = []
    for claim in kept_claims:
        filename = claim["filename"]
        claim_path = os.path.join(domain_dir, filename)
        if os.path.exists(claim_path):
            print(f" WARN: {claim_path} exists, skipping")
            continue
        with open(claim_path, "w") as f:
            f.write(claim["content"])
        written.append(filename)
        print(f" Wrote: {claim_path}")

    # ── Apply enrichments ──
    enriched = []
    for enr in enrichments:
        target = enr.get("target_file", "")
        evidence = enr.get("evidence", "")
        enr_type = enr.get("type", "confirm")
        source_ref = enr.get("source_ref", os.path.basename(args.source_file))
        if not target or not evidence:
            continue
        target_path = os.path.join(domain_dir, target)
        if not os.path.exists(target_path):
            print(f" WARN: Enrichment target {target_path} not found, skipping")
            continue

        existing_content = read_file(target_path)
        source_slug = os.path.basename(args.source_file).replace(".md", "")
        enrichment_block = (
            f"\n\n### Additional Evidence ({enr_type})\n"
            f"*Source: [[{source_slug}]] | Added: {date.today().isoformat()}*\n\n"
            f"{evidence}\n"
        )

        # Insert enrichment before "Relevant Notes:" or "Topics:" section.
        # Do NOT split on "---" — it matches frontmatter delimiters and corrupts YAML
        # when files lack a body separator. (Leo: root cause of PRs #1504, #1509)
        # Two tiers only (Ganymede: tier 2 delimiter counting dropped — horizontal rule edge case)
        notes_match = re.search(r'\n(?:#{0,3}\s*)?(?:[Rr]elevant [Nn]otes|[Tt]opics)\s*:?', existing_content)
        if notes_match:
            insert_pos = notes_match.start()
            updated = existing_content[:insert_pos] + enrichment_block + existing_content[insert_pos:]
        else:
            # No anchor found — append to end (always safe)
            updated = existing_content.rstrip() + enrichment_block + "\n"

        with open(target_path, "w") as f:
            f.write(updated)
        enriched.append(target)
        print(f" Enriched: {target_path} ({enr_type})")

    # ── Enqueue entities (NOT written to branch — applied to main by batch) ──
    # Entity enrichments on branches cause merge conflicts because 20+ PRs
    # modify the same entity file (futardio.md, metadao.md). Enqueuing to a
    # JSON queue eliminates this: branches only create NEW claim files, entity
    # updates are applied to main by entity_batch.py. (Leo's #1 fix)
    entities_enqueued = []
    for ent in kept_entities:
        try:
            from lib.entity_queue import enqueue
            entry_id = enqueue(ent, args.source_file, agent)
            entities_enqueued.append(ent["filename"])
            print(f" Entity enqueued: {ent['filename']} ({ent.get('action', '?')}) → queue:{entry_id}")
        except Exception as e:
            # No fallback — fail loudly if queue unavailable. Direct writes to branches
            # defeat the entire queue architecture. (Ganymede review)
            print(f" ERROR: Failed to enqueue entity {ent.get('filename', '?')}: {e}", file=sys.stderr)

    # ── Write decision files + enqueue parent timeline entries ──
    decisions_written = []
    for dec in decisions:
        filename = dec.get("filename", "")
        dec_domain = dec.get("domain", domain)
        content = dec.get("content", "")
        parent = dec.get("parent_entity", "")
        parent_timeline = dec.get("parent_timeline_entry", "")
        if not filename:
            continue

        # Write decision file to branch (goes through PR eval like claims)
        if content:
            dec_dir = os.path.join("decisions", dec_domain)
            os.makedirs(dec_dir, exist_ok=True)
            dec_path = os.path.join(dec_dir, filename)
            if not os.path.exists(dec_path):
                with open(dec_path, "w") as f:
                    f.write(content)
                decisions_written.append(filename)
                print(f" Decision written: {dec_path}")

        # Enqueue parent entity timeline entry (applied to main by entity_batch)
        if parent and parent_timeline:
            try:
                from lib.entity_queue import enqueue
                entry_id = enqueue({
                    "filename": parent,
                    "domain": dec_domain,
                    "action": "update",
                    "timeline_entry": parent_timeline,
                }, args.source_file, agent)
                print(f" Decision → parent timeline: {parent} (queue:{entry_id})")
            except Exception as e:
                print(f" WARN: Failed to enqueue parent timeline for {parent}: {e}", file=sys.stderr)

    if decisions_written:
        print(f" Decisions: {len(decisions_written)} written")

    # ── Update source file ──
    if written or decisions_written:
        status = "processed"
    elif enriched or entities_enqueued:
        status = "enrichment"
    else:
        status = "null-result"

    source_update = {
        "status": status,
        "processed_by": agent,
        "processed_date": date.today().isoformat(),
        "claims_extracted": written,
        "model": args.model,
    }
    if enriched:
        source_update["enrichments_applied"] = enriched
    if entities_enqueued:
        source_update["entities_enqueued"] = entities_enqueued
    if facts:
        source_update["key_facts"] = facts
    if not written and not enriched and not entities_enqueued:
        source_update["notes"] = (
            f"LLM returned {len(raw_claims)} claims, "
            f"{claim_stats['rejected']} rejected by validator"
        )
    update_source_file(args.source_file, source_content, source_update)
    print(f" Updated: {args.source_file} → status: {status}")

    # Register final status (Argus: pipeline funnel)
    db_status = "extracted" if status == "processed" else ("null_result" if status == "null-result" else status)
    _register_source(_src_conn, args.source_file, db_status, domain, args.model, len(written))

    # ── Save debug info for rejected claims ──
    if rejected_claims:
        debug_dir = os.path.join(os.path.dirname(args.source_file) or ".", ".extraction-debug")
        os.makedirs(debug_dir, exist_ok=True)
        debug_path = os.path.join(debug_dir, os.path.basename(args.source_file).replace(".md", ".json"))
        with open(debug_path, "w") as f:
            json.dump({
                "rejected_claims": [
                    {"filename": c.get("filename"), "issues": c.get("issues", [])}
                    for c in rejected_claims
                ],
                "validation_stats": claim_stats,
                "model": args.model,
                "date": date.today().isoformat(),
            }, f, indent=2)
        print(f" Debug: {debug_path}")

    # ── Log usage ──
    log_usage(agent, args.model, args.source_file, usage)

    # ── Summary ──
    print(f"\n{'='*60}")
    print(" EXTRACTION COMPLETE (v2)")
    print(f" Source: {args.source_file}")
    print(f" Agent: {agent}")
    print(f" Model: {args.model} ({p1_in} in / {p1_out} out)")
    print(" Pass 2: Python validator ($0)")
    print(f" Claims: {len(written)} written, {claim_stats['rejected']} rejected, {claim_stats['fixed']} auto-fixed")
    print(f" Enrichments: {len(enriched)} applied")
    if entities_enqueued:
        print(f" Entities: {len(entities_enqueued)} enqueued (applied by batch on main)")
    if facts:
        print(f" Facts: {len(facts)} stored in source notes")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()