feat: extract-time connection + post-merge reciprocal edges
Some checks are pending
CI / lint-and-test (push) Waiting to run

Two-part fix for 58% orphan ratio:

1. Prompt-time prior art: Qdrant lookup before extraction injects
   existing claims as connection candidates. LLM classifies edges
   as supports/challenges/related. reconstruct_claim_content writes
   typed edges in frontmatter.

2. Post-merge reciprocal edges: _reciprocal_edges() runs after
   cherry-pick merge, reads new claims' outgoing edges, writes
   reciprocal edges on target files. Ensures every new claim has
   incoming links.

Files: lib/extraction_prompt.py, lib/merge.py, openrouter-extract-v2.py
Tests: 214 passed (3 failures + 3 errors pre-existing)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
m3taversal 2026-04-04 15:25:31 +01:00
parent 84cb001dd6
commit be010e666a
3 changed files with 271 additions and 4 deletions

View file

@ -27,6 +27,7 @@ def build_extraction_prompt(
rationale: str | None = None,
intake_tier: str | None = None,
proposed_by: str | None = None,
prior_art: list[dict] | None = None,
) -> str:
"""Build the lean extraction prompt.
@ -40,6 +41,9 @@ def build_extraction_prompt(
rationale: Contributor's natural-language thesis about the source (optional)
intake_tier: undirected | directed | challenge (optional)
proposed_by: Contributor handle who submitted the source (optional)
prior_art: Qdrant search results — existing claims semantically similar to this source.
Each dict has: claim_title, claim_path, description, score.
Injected as connection candidates for extract-time linking.
Returns:
The complete prompt string
@ -72,6 +76,27 @@ Set `contributor_thesis_extractable: true` if you extracted the contributor's th
else:
contributor_directive = ""
# Build connection candidates section (if prior art found via Qdrant)
if prior_art:
    pa_lines = [
        "\n## Connection Candidates (semantically similar existing claims)\n",
        "These existing claims are topically related to this source. For each NEW claim you extract,",
        "check this list and specify connections in the `connections` array.\n",
    ]
    # Cap at 10 candidates to keep the prompt lean.
    for i, pa in enumerate(prior_art[:10], 1):
        title = pa.get("claim_title", "untitled")
        path = pa.get("claim_path", "")
        desc = pa.get("description", "")
        score = pa.get("score", 0)
        # Slug the LLM should use as a `connections.target` value; fall back
        # to the title when the payload carries no path.
        filename = path.rsplit("/", 1)[-1].removesuffix(".md") if path else title
        # Bug fix: interpolate the computed filename (was a literal "(unknown)",
        # leaving the variable unused and the LLM without a usable target slug).
        pa_lines.append(f"{i}. **{title}** (`{filename}`, similarity: {score:.2f})")
        if desc:
            pa_lines.append(f"   {desc}")
        pa_lines.append("")
    connection_candidates = "\n".join(pa_lines)
else:
    connection_candidates = ""
return f"""You are {agent}, extracting knowledge from a source for TeleoHumanity's collective knowledge base.
## Your Task
@ -136,7 +161,7 @@ Single source = experimental at most. Pitch rhetoric or marketing copy = specula
**File:** {source_file}
{source_content}
{contributor_directive}
{contributor_directive}{connection_candidates}
## KB Index (existing claims — check for duplicates and enrichment targets)
{kb_index}
@ -157,6 +182,13 @@ Return valid JSON. The post-processor handles frontmatter formatting, wiki links
"source": "author/org, key evidence reference",
"body": "Argument with evidence. Cite specific data, quotes, studies from the source. Explain WHY the claim is supported. This must be a real argument, not a restatement of the title.",
"related_claims": ["existing-claim-stem-from-kb-index"],
"connections": [
{{
"target": "existing-claim-filename-from-connection-candidates-or-kb-index",
"relationship": "supports|challenges|related",
"reason": "One sentence: WHY does this claim support/challenge/relate to the target?"
}}
],
"scope": "structural|functional|causal|correlational",
"sourcer": "handle or name of the original author/source (e.g., @theiaresearch, Pine Analytics)"
}}
@ -206,8 +238,9 @@ Return valid JSON. The post-processor handles frontmatter formatting, wiki links
3. **Facts are not claims.** Individual data points go in `facts`. Only generalized patterns from multiple data points become claims.
4. **Proposals are entities, not claims.** A governance proposal, token launch, or funding event is structured data (entity). Only extract a claim if the event reveals a novel mechanism insight that generalizes beyond this specific case.
5. **Scope your claims.** Say whether you're claiming a structural, functional, causal, or correlational relationship.
6. **OPSEC.** Never extract specific dollar amounts, valuations, equity percentages, or deal terms for LivingIP/Teleo. General market data is fine.
7. **Read the Agent Notes.** If the source has "Agent Notes" or "Curator Notes" sections, they contain context about why this source matters.
6. **Connect your claims.** For every new claim, check the Connection Candidates list. If a candidate is related, add it to the `connections` array with the relationship type and a one-sentence reason. Use `supports` when your claim provides evidence for the target, `challenges` when it contradicts, `related` only as a last resort. Unconnected claims are orphans connect them at birth.
7. **OPSEC.** Never extract specific dollar amounts, valuations, equity percentages, or deal terms for LivingIP/Teleo. General market data is fine.
8. **Read the Agent Notes.** If the source has "Agent Notes" or "Curator Notes" sections, they contain context about why this source matters.
Return valid JSON only. No markdown fencing, no explanation outside the JSON.
"""

View file

@ -1102,6 +1102,165 @@ async def _embed_merged_claims(main_sha: str, branch_sha: str):
logger.exception("embed: post-merge embedding failed (non-fatal)")
async def _reciprocal_edges(main_sha: str, branch_sha: str):
    """Write reciprocal edges onto existing claims after a PR merges.

    A freshly merged claim A that declares `supports: [B]` in its frontmatter
    causes B to receive a matching `supports: [A]` entry, giving A an incoming
    link so it does not start life as an orphan. The same edge type is
    mirrored for every field (a bidirectional "supports" means both claims
    back each other's argument), matching reweave behavior.

    Runs on main after the cherry-pick merge. Non-fatal — orphans are
    recoverable. Only newly added files (diff-filter=A) are processed,
    never modified ones.
    """
    EDGE_FIELDS = ("supports", "challenges", "related")
    try:
        # Newly added files between pre-merge main and the branch tip.
        rc, diff_out = await _git(
            "diff", "--name-only", "--diff-filter=A",
            main_sha, branch_sha,
            cwd=str(config.MAIN_WORKTREE),
            timeout=10,
        )
        if rc != 0:
            logger.warning("reciprocal_edges: diff failed (rc=%d), skipping", rc)
            return

        claim_dirs = {"domains/", "core/", "foundations/"}

        def _is_claim(path: str) -> bool:
            # Claims are .md files under the claim dirs, excluding
            # underscore-prefixed files, entities, and decisions.
            return (
                path.endswith(".md")
                and any(path.startswith(d) for d in claim_dirs)
                and not path.split("/")[-1].startswith("_")
                and "/entities/" not in path
                and "/decisions/" not in path
            )

        new_claims = [f for f in diff_out.strip().split("\n") if _is_claim(f)]
        if not new_claims:
            return

        reciprocals_added = 0
        for claim_path in new_claims:
            full_path = config.MAIN_WORKTREE / claim_path
            if not full_path.exists():
                continue
            try:
                content = full_path.read_text()
            except Exception:
                continue
            fm, raw_fm, body = _parse_yaml_frontmatter(content)
            if fm is None:
                continue
            # The new claim's slug (filename without .md).
            claim_slug = claim_path.rsplit("/", 1)[-1].replace(".md", "")
            # Mirror each outgoing edge of the new claim onto its target file.
            for field in EDGE_FIELDS:
                targets = fm.get(field, [])
                if isinstance(targets, str):
                    targets = [targets]
                if not isinstance(targets, list):
                    continue
                for target_slug in targets:
                    target_slug = str(target_slug).strip()
                    if not target_slug:
                        continue
                    target_file = _find_claim_file(target_slug)
                    if target_file is None:
                        continue
                    # Target gains field: [claim_slug] — the reciprocal edge.
                    if _add_edge_to_file(target_file, field, claim_slug):
                        reciprocals_added += 1

        if reciprocals_added > 0:
            # Commit the reciprocal edges on main.
            await _git("add", "-A", cwd=str(config.MAIN_WORKTREE))
            rc, out = await _git(
                "commit", "-m", f"reciprocal edges: {reciprocals_added} edges from {len(new_claims)} new claims",
                cwd=str(config.MAIN_WORKTREE),
            )
            if rc == 0:
                logger.info("reciprocal_edges: %d edges added across %d new claims", reciprocals_added, len(new_claims))
            else:
                logger.warning("reciprocal_edges: commit failed: %s", out[:200])
    except Exception:
        logger.exception("reciprocal_edges: failed (non-fatal)")
def _find_claim_file(slug: str) -> "Path | None":
    """Find a claim file on disk by its slug.

    Searches domains/, core/, and foundations/ (recursively) under the main
    worktree and returns the first ``{slug}.md`` whose filename does not start
    with an underscore, or ``None`` when no match exists.
    """
    # Fix: dropped the unused `from pathlib import Path as _Path` — the alias
    # was never referenced (the return annotation is a string, not evaluated).
    worktree = config.MAIN_WORKTREE
    for search_dir in ("domains", "core", "foundations"):
        base = worktree / search_dir
        if not base.is_dir():
            continue
        # Direct filename match anywhere under this tree.
        for md in base.rglob(f"{slug}.md"):
            if not md.name.startswith("_"):
                return md
    return None
def _add_edge_to_file(file_path, edge_type: str, target_slug: str) -> bool:
    """Add a single edge to a file's frontmatter.

    Returns True when the file was rewritten, False when the edge already
    exists (case-insensitive), the frontmatter cannot be parsed, or any
    I/O step fails — this is a best-effort, non-fatal operation.
    """
    try:
        original = file_path.read_text()
    except Exception:
        return False
    fm, raw_fm, body = _parse_yaml_frontmatter(original)
    if fm is None:
        return False

    def _as_list(value):
        # Frontmatter edge values may be a scalar string, a list, or junk.
        if isinstance(value, str):
            return [value]
        return list(value) if isinstance(value, list) else []

    # Dedup: bail out if the target is already linked (case-insensitive).
    wanted = target_slug.lower()
    if any(str(entry).strip().lower() == wanted for entry in _as_list(fm.get(edge_type, []))):
        return False

    # Carry over every edge field unchanged, then append to the target one.
    merged_edges = {field: _as_list(fm.get(field, [])) for field in REWEAVE_EDGE_FIELDS}
    merged_edges.setdefault(edge_type, []).append(target_slug)

    # Serialize using the same string-surgery approach as reweave.
    new_fm = _serialize_edge_fields(raw_fm, merged_edges)
    separator = "" if body.startswith("\n") else "\n"
    rewritten = f"---\n{new_fm}{separator}{body}"
    try:
        file_path.write_text(rewritten)
    except Exception:
        return False
    return True
def _archive_source_for_pr(branch: str, domain: str, merged: bool = True):
"""Move source from queue/ to archive/{domain}/ after PR merge or close.
@ -1320,6 +1479,10 @@ async def _merge_domain_queue(conn, domain: str) -> tuple[int, int]:
# Embed new/changed claims into Qdrant (non-fatal)
await _embed_merged_claims(main_sha, branch_sha)
# Add reciprocal edges on existing claims (non-fatal)
# New claim A with supports:[B] → add supports:[A] on B's frontmatter
await _reciprocal_edges(main_sha, branch_sha)
# Delete remote branch immediately (Ganymede Q4)
await _delete_remote_branch(branch)

View file

@ -42,6 +42,40 @@ from lib.post_extract import (
)
from lib.connect import connect_new_claims
# --- Prior art lookup (extract-time connection) ---
def _find_prior_art(source_title: str, source_body: str, limit: int = 10) -> list[dict]:
"""Search Qdrant for existing claims similar to this source.
Uses source title + first 500 chars of body as the search query.
Returns list of {claim_title, claim_path, description, score} dicts.
Non-fatal returns empty list on any failure.
"""
try:
from lib.search import embed_query, search_qdrant
except ImportError:
return []
query = f"{source_title} {source_body[:500]}".strip()
if len(query) < 20:
return []
vector = embed_query(query)
if vector is None:
return []
hits = search_qdrant(vector, limit=limit, score_threshold=0.55)
results = []
for hit in hits:
payload = hit.get("payload", {})
results.append({
"claim_title": payload.get("claim_title", ""),
"claim_path": payload.get("claim_path", ""),
"description": payload.get("description", ""),
"score": hit.get("score", 0),
})
return results
# ─── Source registration (Argus: pipeline funnel tracking) ─────────────────
def _source_db_conn():
@ -225,6 +259,7 @@ def reconstruct_claim_content(claim, domain, agent):
source = claim.get("source", f"extraction by {agent}")
body_text = claim.get("body", desc)
related = claim.get("related_claims", [])
connections = claim.get("connections", [])
sourcer = claim.get("sourcer", "")
# Build attribution block (v1: extractor always known, sourcer best-effort)
@ -241,6 +276,32 @@ def reconstruct_claim_content(claim, domain, agent):
f' context: "{source}"',
])
# Build typed edge fields from connections array
edge_fields = {"supports": [], "challenges": [], "related": []}
for conn in connections:
    target = conn.get("target", "")
    rel = conn.get("relationship", "related")
    if target and rel in edge_fields:
        # Normalize: strip a trailing .md extension if present.
        # Fix: removesuffix instead of replace — replace(".md", "") would
        # also delete ".md" appearing mid-slug, corrupting the target name.
        target = target.removesuffix(".md")
        if target not in edge_fields[rel]:
            edge_fields[rel].append(target)
# Also fold related_claims into "related" edges (backwards compat)
for r in related[:5]:
    r_clean = r.removesuffix(".md")
    if r_clean not in edge_fields["related"]:
        edge_fields["related"].append(r_clean)
# Build edge lines for frontmatter
edge_lines = []
for edge_type in ("supports", "challenges", "related"):
    targets = edge_fields[edge_type]
    if targets:
        edge_lines.append(f"{edge_type}:")
        for t in targets:
            edge_lines.append(f"  - {t}")
lines = [
"---",
"type: claim",
@ -250,6 +311,7 @@ def reconstruct_claim_content(claim, domain, agent):
f'source: "{source}"',
f"created: {date.today().isoformat()}",
*attr_lines,
*edge_lines,
"---",
"",
f"# {title}",
@ -262,7 +324,7 @@ def reconstruct_claim_content(claim, domain, agent):
]
for r in related[:5]:
lines.append(f"- [[{r}]]")
lines.extend(["", "Topics:", "- [[_map]]", ""])
lines.extend(["", "Topics:", ""])
return "\n".join(lines)
@ -378,9 +440,18 @@ def main():
if rationale:
print(f" Directed contribution from {proposed_by or '?'}: {rationale[:80]}...")
# ── Prior art lookup (extract-time connection) ──
# Search Qdrant for existing claims similar to this source.
# Injected into prompt so LLM can classify connections at extraction time.
source_title = os.path.basename(args.source_file).replace(".md", "").replace("-", " ")
prior_art = _find_prior_art(source_title, source_content)
if prior_art:
print(f" Prior art: {len(prior_art)} connection candidates (top: {prior_art[0]['claim_title'][:50]}... @ {prior_art[0]['score']:.2f})")
prompt = build_extraction_prompt(
args.source_file, source_content, domain, agent, kb_index,
rationale=rationale, intake_tier=intake_tier, proposed_by=proposed_by,
prior_art=prior_art,
)
if args.dry_run: