diff --git a/lib/extraction_prompt.py b/lib/extraction_prompt.py index 406b16c..d432327 100644 --- a/lib/extraction_prompt.py +++ b/lib/extraction_prompt.py @@ -27,6 +27,7 @@ def build_extraction_prompt( rationale: str | None = None, intake_tier: str | None = None, proposed_by: str | None = None, + prior_art: list[dict] | None = None, ) -> str: """Build the lean extraction prompt. @@ -40,6 +41,9 @@ def build_extraction_prompt( rationale: Contributor's natural-language thesis about the source (optional) intake_tier: undirected | directed | challenge (optional) proposed_by: Contributor handle who submitted the source (optional) + prior_art: Qdrant search results — existing claims semantically similar to this source. + Each dict has: claim_title, claim_path, description, score. + Injected as connection candidates for extract-time linking. Returns: The complete prompt string @@ -72,6 +76,27 @@ Set `contributor_thesis_extractable: true` if you extracted the contributor's th else: contributor_directive = "" + # Build connection candidates section (if prior art found via Qdrant) + if prior_art: + pa_lines = [ + "\n## Connection Candidates (semantically similar existing claims)\n", + "These existing claims are topically related to this source. For each NEW claim you extract,", + "check this list and specify connections in the `connections` array.\n", + ] + for i, pa in enumerate(prior_art[:10], 1): + title = pa.get("claim_title", "untitled") + path = pa.get("claim_path", "") + desc = pa.get("description", "") + score = pa.get("score", 0) + filename = path.rsplit("/", 1)[-1].replace(".md", "") if path else title + pa_lines.append(f"{i}. **{title}** (`{filename}`, similarity: {score:.2f})") + if desc: + pa_lines.append(f" {desc}") + pa_lines.append("") + connection_candidates = "\n".join(pa_lines) + else: + connection_candidates = "" + return f"""You are {agent}, extracting knowledge from a source for TeleoHumanity's collective knowledge base. 
## Your Task @@ -136,7 +161,7 @@ Single source = experimental at most. Pitch rhetoric or marketing copy = specula **File:** {source_file} {source_content} -{contributor_directive} +{contributor_directive}{connection_candidates} ## KB Index (existing claims — check for duplicates and enrichment targets) {kb_index} @@ -157,6 +182,13 @@ Return valid JSON. The post-processor handles frontmatter formatting, wiki links "source": "author/org, key evidence reference", "body": "Argument with evidence. Cite specific data, quotes, studies from the source. Explain WHY the claim is supported. This must be a real argument, not a restatement of the title.", "related_claims": ["existing-claim-stem-from-kb-index"], + "connections": [ + {{ + "target": "existing-claim-filename-from-connection-candidates-or-kb-index", + "relationship": "supports|challenges|related", + "reason": "One sentence: WHY does this claim support/challenge/relate to the target?" + }} + ], "scope": "structural|functional|causal|correlational", "sourcer": "handle or name of the original author/source (e.g., @theiaresearch, Pine Analytics)" }} @@ -206,8 +238,9 @@ Return valid JSON. The post-processor handles frontmatter formatting, wiki links 3. **Facts are not claims.** Individual data points go in `facts`. Only generalized patterns from multiple data points become claims. 4. **Proposals are entities, not claims.** A governance proposal, token launch, or funding event is structured data (entity). Only extract a claim if the event reveals a novel mechanism insight that generalizes beyond this specific case. 5. **Scope your claims.** Say whether you're claiming a structural, functional, causal, or correlational relationship. -6. **OPSEC.** Never extract specific dollar amounts, valuations, equity percentages, or deal terms for LivingIP/Teleo. General market data is fine. -7. **Read the Agent Notes.** If the source has "Agent Notes" or "Curator Notes" sections, they contain context about why this source matters. +6. 
**Connect your claims.** For every new claim, check the Connection Candidates list. If a candidate is related, add it to the `connections` array with the relationship type and a one-sentence reason. Use `supports` when your claim provides evidence for the target, `challenges` when it contradicts, `related` only as a last resort. Unconnected claims are orphans — connect them at birth. +7. **OPSEC.** Never extract specific dollar amounts, valuations, equity percentages, or deal terms for LivingIP/Teleo. General market data is fine. +8. **Read the Agent Notes.** If the source has "Agent Notes" or "Curator Notes" sections, they contain context about why this source matters. Return valid JSON only. No markdown fencing, no explanation outside the JSON. """ diff --git a/lib/merge.py b/lib/merge.py index 866adf8..a3a0873 100644 --- a/lib/merge.py +++ b/lib/merge.py @@ -1102,6 +1102,165 @@ async def _embed_merged_claims(main_sha: str, branch_sha: str): logger.exception("embed: post-merge embedding failed (non-fatal)") +async def _reciprocal_edges(main_sha: str, branch_sha: str): + """Add reciprocal edges on existing claims after a PR merges. + + When a new claim A has `supports: [B]` in its frontmatter, B should have + `supports: [A]` added to its own frontmatter. This gives A an incoming link, + preventing it from being an orphan. + + Runs on main after cherry-pick merge. Non-fatal — orphans are recoverable. + Only processes new files (diff-filter=A), not modified files. + """ + EDGE_FIELDS = ("supports", "challenges", "related") + # Inverse mapping: if A supports B, then B is supported-by A. + # For simplicity, we use the same edge type (bidirectional "supports" means + # both claims support each other's argument). This matches reweave behavior. 
+ + try: + # Find newly added claim files + rc, diff_out = await _git( + "diff", "--name-only", "--diff-filter=A", + main_sha, branch_sha, + cwd=str(config.MAIN_WORKTREE), + timeout=10, + ) + if rc != 0: + logger.warning("reciprocal_edges: diff failed (rc=%d), skipping", rc) + return + + claim_dirs = {"domains/", "core/", "foundations/"} + new_claims = [ + f for f in diff_out.strip().split("\n") + if f.endswith(".md") + and any(f.startswith(d) for d in claim_dirs) + and not f.split("/")[-1].startswith("_") + and "/entities/" not in f + and "/decisions/" not in f + ] + + if not new_claims: + return + + reciprocals_added = 0 + for claim_path in new_claims: + full_path = config.MAIN_WORKTREE / claim_path + if not full_path.exists(): + continue + + try: + content = full_path.read_text() + except Exception: + continue + + fm, raw_fm, body = _parse_yaml_frontmatter(content) + if fm is None: + continue + + # Get the new claim's slug (filename without .md) + claim_slug = claim_path.rsplit("/", 1)[-1].replace(".md", "") + + # Collect all edge targets from this new claim + for field in EDGE_FIELDS: + targets = fm.get(field, []) + if isinstance(targets, str): + targets = [targets] + if not isinstance(targets, list): + continue + + for target_slug in targets: + target_slug = str(target_slug).strip() + if not target_slug: + continue + + # Find the target file on disk + target_file = _find_claim_file(target_slug) + if target_file is None: + continue + + # Add reciprocal edge: target now has field: [new_claim_slug] + if _add_edge_to_file(target_file, field, claim_slug): + reciprocals_added += 1 + + if reciprocals_added > 0: + # Commit the reciprocal edges + await _git("add", "-A", cwd=str(config.MAIN_WORKTREE)) + rc, out = await _git( + "commit", "-m", f"reciprocal edges: {reciprocals_added} edges from {len(new_claims)} new claims", + cwd=str(config.MAIN_WORKTREE), + ) + if rc == 0: + logger.info("reciprocal_edges: %d edges added across %d new claims", reciprocals_added, 
len(new_claims)) + else: + logger.warning("reciprocal_edges: commit failed: %s", out[:200]) + + except Exception: + logger.exception("reciprocal_edges: failed (non-fatal)") + + +def _find_claim_file(slug: str) -> "Path | None": + """Find a claim file on disk by its slug. Searches domains/, core/, foundations/.""" + from pathlib import Path as _Path + + worktree = config.MAIN_WORKTREE + for search_dir in ("domains", "core", "foundations"): + base = worktree / search_dir + if not base.is_dir(): + continue + # Direct match + for md in base.rglob(f"{slug}.md"): + if not md.name.startswith("_"): + return md + return None + + +def _add_edge_to_file(file_path, edge_type: str, target_slug: str) -> bool: + """Add a single edge to a file's frontmatter. Returns True if modified.""" + try: + content = file_path.read_text() + except Exception: + return False + + fm, raw_fm, body = _parse_yaml_frontmatter(content) + if fm is None: + return False + + # Check for existing edge (dedup) + existing = fm.get(edge_type, []) + if isinstance(existing, str): + existing = [existing] + if not isinstance(existing, list): + existing = [] + + if any(str(e).strip().lower() == target_slug.lower() for e in existing): + return False # Already exists + + # Build merged edges (all edge fields, only modifying the target one) + merged_edges = {} + for field in REWEAVE_EDGE_FIELDS: + vals = fm.get(field, []) + if isinstance(vals, str): + vals = [vals] + if not isinstance(vals, list): + vals = [] + merged_edges[field] = list(vals) + + merged_edges.setdefault(edge_type, []).append(target_slug) + + # Serialize using the same string-surgery approach as reweave + new_fm = _serialize_edge_fields(raw_fm, merged_edges) + if body.startswith("\n"): + new_content = f"---\n{new_fm}{body}" + else: + new_content = f"---\n{new_fm}\n{body}" + + try: + file_path.write_text(new_content) + return True + except Exception: + return False + + def _archive_source_for_pr(branch: str, domain: str, merged: bool = True): """Move 
source from queue/ to archive/{domain}/ after PR merge or close. @@ -1320,6 +1479,10 @@ async def _merge_domain_queue(conn, domain: str) -> tuple[int, int]: # Embed new/changed claims into Qdrant (non-fatal) await _embed_merged_claims(main_sha, branch_sha) + # Add reciprocal edges on existing claims (non-fatal) + # New claim A with supports:[B] → add supports:[A] on B's frontmatter + await _reciprocal_edges(main_sha, branch_sha) + # Delete remote branch immediately (Ganymede Q4) await _delete_remote_branch(branch) diff --git a/openrouter-extract-v2.py b/openrouter-extract-v2.py index a7e7b24..1265b82 100644 --- a/openrouter-extract-v2.py +++ b/openrouter-extract-v2.py @@ -42,6 +42,40 @@ from lib.post_extract import ( ) from lib.connect import connect_new_claims +# --- Prior art lookup (extract-time connection) --- + +def _find_prior_art(source_title: str, source_body: str, limit: int = 10) -> list[dict]: + """Search Qdrant for existing claims similar to this source. + + Uses source title + first 500 chars of body as the search query. + Returns list of {claim_title, claim_path, description, score} dicts. + Non-fatal — returns empty list on any failure. 
+ """ + try: + from lib.search import embed_query, search_qdrant + except ImportError: + return [] + + query = f"{source_title} {source_body[:500]}".strip() + if len(query) < 20: + return [] + + vector = embed_query(query) + if vector is None: + return [] + + hits = search_qdrant(vector, limit=limit, score_threshold=0.55) + results = [] + for hit in hits: + payload = hit.get("payload", {}) + results.append({ + "claim_title": payload.get("claim_title", ""), + "claim_path": payload.get("claim_path", ""), + "description": payload.get("description", ""), + "score": hit.get("score", 0), + }) + return results + # ─── Source registration (Argus: pipeline funnel tracking) ───────────────── def _source_db_conn(): @@ -225,6 +259,7 @@ def reconstruct_claim_content(claim, domain, agent): source = claim.get("source", f"extraction by {agent}") body_text = claim.get("body", desc) related = claim.get("related_claims", []) + connections = claim.get("connections", []) sourcer = claim.get("sourcer", "") # Build attribution block (v1: extractor always known, sourcer best-effort) @@ -241,6 +276,32 @@ def reconstruct_claim_content(claim, domain, agent): f' context: "{source}"', ]) + # Build typed edge fields from connections array + edge_fields = {"supports": [], "challenges": [], "related": []} + for conn in connections: + target = conn.get("target", "") + rel = conn.get("relationship", "related") + if target and rel in edge_fields: + # Normalize: strip .md extension if present + target = target.replace(".md", "") + if target not in edge_fields[rel]: + edge_fields[rel].append(target) + + # Also fold related_claims into "related" edges (backwards compat) + for r in related[:5]: + r_clean = r.replace(".md", "") + if r_clean not in edge_fields["related"]: + edge_fields["related"].append(r_clean) + + # Build edge lines for frontmatter + edge_lines = [] + for edge_type in ("supports", "challenges", "related"): + targets = edge_fields[edge_type] + if targets: + 
edge_lines.append(f"{edge_type}:") + for t in targets: + edge_lines.append(f" - {t}") + lines = [ "---", "type: claim", @@ -250,6 +311,7 @@ def reconstruct_claim_content(claim, domain, agent): f'source: "{source}"', f"created: {date.today().isoformat()}", *attr_lines, + *edge_lines, "---", "", f"# {title}", @@ -262,7 +324,7 @@ def reconstruct_claim_content(claim, domain, agent): ] for r in related[:5]: lines.append(f"- [[{r}]]") - lines.extend(["", "Topics:", "- [[_map]]", ""]) + lines.extend(["", "Topics:", ""]) return "\n".join(lines) @@ -378,9 +440,18 @@ def main(): if rationale: print(f" Directed contribution from {proposed_by or '?'}: {rationale[:80]}...") + # ── Prior art lookup (extract-time connection) ── + # Search Qdrant for existing claims similar to this source. + # Injected into prompt so LLM can classify connections at extraction time. + source_title = os.path.basename(args.source_file).replace(".md", "").replace("-", " ") + prior_art = _find_prior_art(source_title, source_content) + if prior_art: + print(f" Prior art: {len(prior_art)} connection candidates (top: {prior_art[0]['claim_title'][:50]}... @ {prior_art[0]['score']:.2f})") + prompt = build_extraction_prompt( args.source_file, source_content, domain, agent, kb_index, rationale=rationale, intake_tier=intake_tier, proposed_by=proposed_by, + prior_art=prior_art, ) if args.dry_run: