feat: extract-time connection + post-merge reciprocal edges
Some checks are pending
CI / lint-and-test (push) Waiting to run

Two-part fix for 58% orphan ratio:

1. Prompt-time prior art: Qdrant lookup before extraction injects
   existing claims as connection candidates. LLM classifies edges
   as supports/challenges/related. reconstruct_claim_content writes
   typed edges in frontmatter.

2. Post-merge reciprocal edges: _reciprocal_edges() runs after
   cherry-pick merge, reads new claims' outgoing edges, writes
   reciprocal edges on target files. Ensures every new claim has
   incoming links.

Files: lib/extraction_prompt.py, lib/merge.py, openrouter-extract-v2.py
Tests: 214 passed (3 failures + 3 errors pre-existing)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
m3taversal 2026-04-04 15:25:31 +01:00
parent 84cb001dd6
commit be010e666a
3 changed files with 271 additions and 4 deletions

View file

@ -27,6 +27,7 @@ def build_extraction_prompt(
rationale: str | None = None,
intake_tier: str | None = None,
proposed_by: str | None = None,
prior_art: list[dict] | None = None,
) -> str:
"""Build the lean extraction prompt.
@ -40,6 +41,9 @@ def build_extraction_prompt(
rationale: Contributor's natural-language thesis about the source (optional)
intake_tier: undirected | directed | challenge (optional)
proposed_by: Contributor handle who submitted the source (optional)
prior_art: Qdrant search results — existing claims semantically similar to this source.
Each dict has: claim_title, claim_path, description, score.
Injected as connection candidates for extract-time linking.
Returns:
The complete prompt string
@ -72,6 +76,27 @@ Set `contributor_thesis_extractable: true` if you extracted the contributor's th
else:
contributor_directive = ""
# Build connection candidates section (if prior art found via Qdrant)
if prior_art:
    pa_lines = [
        "\n## Connection Candidates (semantically similar existing claims)\n",
        "These existing claims are topically related to this source. For each NEW claim you extract,",
        "check this list and specify connections in the `connections` array.\n",
    ]
    # Cap at 10 candidates to keep the prompt lean.
    for i, pa in enumerate(prior_art[:10], 1):
        title = pa.get("claim_title", "untitled")
        path = pa.get("claim_path", "")
        desc = pa.get("description", "")
        score = pa.get("score", 0)
        # Slug the LLM should use as a `connections.target` value; fall back
        # to the title when the payload carries no path.
        filename = path.rsplit("/", 1)[-1].removesuffix(".md") if path else title
        # Bug fix: interpolate the computed filename (was a literal "(unknown)",
        # leaving the variable unused and the LLM without a usable target slug).
        pa_lines.append(f"{i}. **{title}** (`{filename}`, similarity: {score:.2f})")
        if desc:
            pa_lines.append(f"   {desc}")
        pa_lines.append("")
    connection_candidates = "\n".join(pa_lines)
else:
    connection_candidates = ""
return f"""You are {agent}, extracting knowledge from a source for TeleoHumanity's collective knowledge base.
## Your Task
@ -136,7 +161,7 @@ Single source = experimental at most. Pitch rhetoric or marketing copy = specula
**File:** {source_file}
{source_content}
{contributor_directive}
{contributor_directive}{connection_candidates}
## KB Index (existing claims — check for duplicates and enrichment targets)
{kb_index}
@ -157,6 +182,13 @@ Return valid JSON. The post-processor handles frontmatter formatting, wiki links
"source": "author/org, key evidence reference",
"body": "Argument with evidence. Cite specific data, quotes, studies from the source. Explain WHY the claim is supported. This must be a real argument, not a restatement of the title.",
"related_claims": ["existing-claim-stem-from-kb-index"],
"connections": [
{{
"target": "existing-claim-filename-from-connection-candidates-or-kb-index",
"relationship": "supports|challenges|related",
"reason": "One sentence: WHY does this claim support/challenge/relate to the target?"
}}
],
"scope": "structural|functional|causal|correlational",
"sourcer": "handle or name of the original author/source (e.g., @theiaresearch, Pine Analytics)"
}}
@ -206,8 +238,9 @@ Return valid JSON. The post-processor handles frontmatter formatting, wiki links
3. **Facts are not claims.** Individual data points go in `facts`. Only generalized patterns from multiple data points become claims.
4. **Proposals are entities, not claims.** A governance proposal, token launch, or funding event is structured data (entity). Only extract a claim if the event reveals a novel mechanism insight that generalizes beyond this specific case.
5. **Scope your claims.** Say whether you're claiming a structural, functional, causal, or correlational relationship.
6. **OPSEC.** Never extract specific dollar amounts, valuations, equity percentages, or deal terms for LivingIP/Teleo. General market data is fine.
7. **Read the Agent Notes.** If the source has "Agent Notes" or "Curator Notes" sections, they contain context about why this source matters.
6. **Connect your claims.** For every new claim, check the Connection Candidates list. If a candidate is related, add it to the `connections` array with the relationship type and a one-sentence reason. Use `supports` when your claim provides evidence for the target, `challenges` when it contradicts, `related` only as a last resort. Unconnected claims are orphans connect them at birth.
7. **OPSEC.** Never extract specific dollar amounts, valuations, equity percentages, or deal terms for LivingIP/Teleo. General market data is fine.
8. **Read the Agent Notes.** If the source has "Agent Notes" or "Curator Notes" sections, they contain context about why this source matters.
Return valid JSON only. No markdown fencing, no explanation outside the JSON.
"""

View file

@ -1102,6 +1102,165 @@ async def _embed_merged_claims(main_sha: str, branch_sha: str):
logger.exception("embed: post-merge embedding failed (non-fatal)")
async def _reciprocal_edges(main_sha: str, branch_sha: str):
    """Write reciprocal edges onto existing claims after a PR merges.

    A freshly merged claim A that declares `supports: [B]` in its frontmatter
    causes B to receive a matching `supports: [A]` entry, giving A an incoming
    link so it does not start life as an orphan. The same edge type is
    mirrored for every field (a bidirectional "supports" means both claims
    back each other's argument), matching reweave behavior.

    Runs on main after the cherry-pick merge. Non-fatal — orphans are
    recoverable. Only newly added files (diff-filter=A) are processed,
    never modified ones.
    """
    EDGE_FIELDS = ("supports", "challenges", "related")
    try:
        # Newly added files between pre-merge main and the branch tip.
        rc, diff_out = await _git(
            "diff", "--name-only", "--diff-filter=A",
            main_sha, branch_sha,
            cwd=str(config.MAIN_WORKTREE),
            timeout=10,
        )
        if rc != 0:
            logger.warning("reciprocal_edges: diff failed (rc=%d), skipping", rc)
            return

        claim_dirs = {"domains/", "core/", "foundations/"}

        def _is_claim(path: str) -> bool:
            # Claims are .md files under the claim dirs, excluding
            # underscore-prefixed files, entities, and decisions.
            return (
                path.endswith(".md")
                and any(path.startswith(d) for d in claim_dirs)
                and not path.split("/")[-1].startswith("_")
                and "/entities/" not in path
                and "/decisions/" not in path
            )

        new_claims = [f for f in diff_out.strip().split("\n") if _is_claim(f)]
        if not new_claims:
            return

        reciprocals_added = 0
        for claim_path in new_claims:
            full_path = config.MAIN_WORKTREE / claim_path
            if not full_path.exists():
                continue
            try:
                content = full_path.read_text()
            except Exception:
                continue
            fm, raw_fm, body = _parse_yaml_frontmatter(content)
            if fm is None:
                continue
            # The new claim's slug (filename without .md).
            claim_slug = claim_path.rsplit("/", 1)[-1].replace(".md", "")
            # Mirror each outgoing edge of the new claim onto its target file.
            for field in EDGE_FIELDS:
                targets = fm.get(field, [])
                if isinstance(targets, str):
                    targets = [targets]
                if not isinstance(targets, list):
                    continue
                for target_slug in targets:
                    target_slug = str(target_slug).strip()
                    if not target_slug:
                        continue
                    target_file = _find_claim_file(target_slug)
                    if target_file is None:
                        continue
                    # Target gains field: [claim_slug] — the reciprocal edge.
                    if _add_edge_to_file(target_file, field, claim_slug):
                        reciprocals_added += 1

        if reciprocals_added > 0:
            # Commit the reciprocal edges on main.
            await _git("add", "-A", cwd=str(config.MAIN_WORKTREE))
            rc, out = await _git(
                "commit", "-m", f"reciprocal edges: {reciprocals_added} edges from {len(new_claims)} new claims",
                cwd=str(config.MAIN_WORKTREE),
            )
            if rc == 0:
                logger.info("reciprocal_edges: %d edges added across %d new claims", reciprocals_added, len(new_claims))
            else:
                logger.warning("reciprocal_edges: commit failed: %s", out[:200])
    except Exception:
        logger.exception("reciprocal_edges: failed (non-fatal)")
def _find_claim_file(slug: str) -> "Path | None":
    """Find a claim file on disk by its slug.

    Searches domains/, core/, and foundations/ (recursively) under the main
    worktree and returns the first ``{slug}.md`` whose filename does not start
    with an underscore, or ``None`` when no match exists.
    """
    # Fix: dropped the unused `from pathlib import Path as _Path` — the alias
    # was never referenced (the return annotation is a string, not evaluated).
    worktree = config.MAIN_WORKTREE
    for search_dir in ("domains", "core", "foundations"):
        base = worktree / search_dir
        if not base.is_dir():
            continue
        # Direct filename match anywhere under this tree.
        for md in base.rglob(f"{slug}.md"):
            if not md.name.startswith("_"):
                return md
    return None
def _add_edge_to_file(file_path, edge_type: str, target_slug: str) -> bool:
    """Add a single edge to a file's frontmatter.

    Returns True when the file was rewritten, False when the edge already
    exists (case-insensitive), the frontmatter cannot be parsed, or any
    I/O step fails — this is a best-effort, non-fatal operation.
    """
    try:
        original = file_path.read_text()
    except Exception:
        return False
    fm, raw_fm, body = _parse_yaml_frontmatter(original)
    if fm is None:
        return False

    def _as_list(value):
        # Frontmatter edge values may be a scalar string, a list, or junk.
        if isinstance(value, str):
            return [value]
        return list(value) if isinstance(value, list) else []

    # Dedup: bail out if the target is already linked (case-insensitive).
    wanted = target_slug.lower()
    if any(str(entry).strip().lower() == wanted for entry in _as_list(fm.get(edge_type, []))):
        return False

    # Carry over every edge field unchanged, then append to the target one.
    merged_edges = {field: _as_list(fm.get(field, [])) for field in REWEAVE_EDGE_FIELDS}
    merged_edges.setdefault(edge_type, []).append(target_slug)

    # Serialize using the same string-surgery approach as reweave.
    new_fm = _serialize_edge_fields(raw_fm, merged_edges)
    separator = "" if body.startswith("\n") else "\n"
    rewritten = f"---\n{new_fm}{separator}{body}"
    try:
        file_path.write_text(rewritten)
    except Exception:
        return False
    return True
def _archive_source_for_pr(branch: str, domain: str, merged: bool = True):
"""Move source from queue/ to archive/{domain}/ after PR merge or close.
@ -1320,6 +1479,10 @@ async def _merge_domain_queue(conn, domain: str) -> tuple[int, int]:
# Embed new/changed claims into Qdrant (non-fatal)
await _embed_merged_claims(main_sha, branch_sha)
# Add reciprocal edges on existing claims (non-fatal)
# New claim A with supports:[B] → add supports:[A] on B's frontmatter
await _reciprocal_edges(main_sha, branch_sha)
# Delete remote branch immediately (Ganymede Q4)
await _delete_remote_branch(branch)

View file

@ -42,6 +42,40 @@ from lib.post_extract import (
)
from lib.connect import connect_new_claims
# --- Prior art lookup (extract-time connection) ---
def _find_prior_art(source_title: str, source_body: str, limit: int = 10) -> list[dict]:
"""Search Qdrant for existing claims similar to this source.
Uses source title + first 500 chars of body as the search query.
Returns list of {claim_title, claim_path, description, score} dicts.
Non-fatal returns empty list on any failure.
"""
try:
from lib.search import embed_query, search_qdrant
except ImportError:
return []
query = f"{source_title} {source_body[:500]}".strip()
if len(query) < 20:
return []
vector = embed_query(query)
if vector is None:
return []
hits = search_qdrant(vector, limit=limit, score_threshold=0.55)
results = []
for hit in hits:
payload = hit.get("payload", {})
results.append({
"claim_title": payload.get("claim_title", ""),
"claim_path": payload.get("claim_path", ""),
"description": payload.get("description", ""),
"score": hit.get("score", 0),
})
return results
# ─── Source registration (Argus: pipeline funnel tracking) ─────────────────
def _source_db_conn():
@ -225,6 +259,7 @@ def reconstruct_claim_content(claim, domain, agent):
source = claim.get("source", f"extraction by {agent}")
body_text = claim.get("body", desc)
related = claim.get("related_claims", [])
connections = claim.get("connections", [])
sourcer = claim.get("sourcer", "")
# Build attribution block (v1: extractor always known, sourcer best-effort)
@ -241,6 +276,32 @@ def reconstruct_claim_content(claim, domain, agent):
f' context: "{source}"',
])
# Build typed edge fields from connections array
edge_fields = {"supports": [], "challenges": [], "related": []}
for conn in connections:
    target = conn.get("target", "")
    rel = conn.get("relationship", "related")
    if target and rel in edge_fields:
        # Normalize: strip a trailing .md extension if present.
        # Fix: removesuffix instead of replace — replace(".md", "") would
        # also delete ".md" appearing mid-slug, corrupting the target name.
        target = target.removesuffix(".md")
        if target not in edge_fields[rel]:
            edge_fields[rel].append(target)
# Also fold related_claims into "related" edges (backwards compat)
for r in related[:5]:
    r_clean = r.removesuffix(".md")
    if r_clean not in edge_fields["related"]:
        edge_fields["related"].append(r_clean)
# Build edge lines for frontmatter
edge_lines = []
for edge_type in ("supports", "challenges", "related"):
    targets = edge_fields[edge_type]
    if targets:
        edge_lines.append(f"{edge_type}:")
        for t in targets:
            edge_lines.append(f"  - {t}")
lines = [
"---",
"type: claim",
@ -250,6 +311,7 @@ def reconstruct_claim_content(claim, domain, agent):
f'source: "{source}"',
f"created: {date.today().isoformat()}",
*attr_lines,
*edge_lines,
"---",
"",
f"# {title}",
@ -262,7 +324,7 @@ def reconstruct_claim_content(claim, domain, agent):
]
for r in related[:5]:
lines.append(f"- [[{r}]]")
lines.extend(["", "Topics:", "- [[_map]]", ""])
lines.extend(["", "Topics:", ""])
return "\n".join(lines)
@ -378,9 +440,18 @@ def main():
if rationale:
print(f" Directed contribution from {proposed_by or '?'}: {rationale[:80]}...")
# ── Prior art lookup (extract-time connection) ──
# Search Qdrant for existing claims similar to this source.
# Injected into prompt so LLM can classify connections at extraction time.
source_title = os.path.basename(args.source_file).replace(".md", "").replace("-", " ")
prior_art = _find_prior_art(source_title, source_content)
if prior_art:
print(f" Prior art: {len(prior_art)} connection candidates (top: {prior_art[0]['claim_title'][:50]}... @ {prior_art[0]['score']:.2f})")
prompt = build_extraction_prompt(
args.source_file, source_content, domain, agent, kb_index,
rationale=rationale, intake_tier=intake_tier, proposed_by=proposed_by,
prior_art=prior_art,
)
if args.dry_run: