feat: extract-time connection + post-merge reciprocal edges
Two-part fix for the 58% orphan ratio:

1. Prompt-time prior art: a Qdrant lookup before extraction injects existing
   claims into the prompt as connection candidates. The LLM classifies edges as
   supports/challenges/related, and reconstruct_claim_content writes the typed
   edges into frontmatter.

2. Post-merge reciprocal edges: _reciprocal_edges() runs after the cherry-pick
   merge, reads each new claim's outgoing edges, and writes reciprocal edges on
   the target files. This ensures every new claim has incoming links.

Files: lib/extraction_prompt.py, lib/merge.py, openrouter-extract-v2.py
Tests: 214 passed (3 failures + 3 errors pre-existing)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
parent 84cb001dd6
commit be010e666a
3 changed files with 271 additions and 4 deletions
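To make the reciprocal-edge invariant concrete, here is a toy sketch of the idea, separate from the merge.py implementation below; the claim slugs are hypothetical:

```python
# Toy model of the post-merge pass: a new claim lands with an outgoing
# `supports` edge; the pass writes the mirror edge on the target so the
# new claim gains an incoming link. Slugs are made up for illustration.
new_claim = {"slug": "claim-a", "supports": ["claim-b"]}
kb = {"claim-b": {"supports": []}}  # existing claim, edges only

for target in new_claim["supports"]:
    edges = kb[target]["supports"]
    if new_claim["slug"] not in edges:  # dedup, mirrors _add_edge_to_file
        edges.append(new_claim["slug"])

print(kb["claim-b"])  # {'supports': ['claim-a']}; claim-a now has an incoming link
```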
lib/extraction_prompt.py

@@ -27,6 +27,7 @@ def build_extraction_prompt(
     rationale: str | None = None,
     intake_tier: str | None = None,
     proposed_by: str | None = None,
+    prior_art: list[dict] | None = None,
 ) -> str:
     """Build the lean extraction prompt.
 
@@ -40,6 +41,9 @@ def build_extraction_prompt(
         rationale: Contributor's natural-language thesis about the source (optional)
         intake_tier: undirected | directed | challenge (optional)
         proposed_by: Contributor handle who submitted the source (optional)
+        prior_art: Qdrant search results — existing claims semantically similar to this source.
+            Each dict has: claim_title, claim_path, description, score.
+            Injected as connection candidates for extract-time linking.
 
     Returns:
         The complete prompt string
@@ -72,6 +76,27 @@ Set `contributor_thesis_extractable: true` if you extracted the contributor's th
     else:
         contributor_directive = ""
 
+    # Build connection candidates section (if prior art found via Qdrant)
+    if prior_art:
+        pa_lines = [
+            "\n## Connection Candidates (semantically similar existing claims)\n",
+            "These existing claims are topically related to this source. For each NEW claim you extract,",
+            "check this list and specify connections in the `connections` array.\n",
+        ]
+        for i, pa in enumerate(prior_art[:10], 1):
+            title = pa.get("claim_title", "untitled")
+            path = pa.get("claim_path", "")
+            desc = pa.get("description", "")
+            score = pa.get("score", 0)
+            filename = path.rsplit("/", 1)[-1].replace(".md", "") if path else title
+            pa_lines.append(f"{i}. **{title}** (`{filename}`, similarity: {score:.2f})")
+            if desc:
+                pa_lines.append(f" {desc}")
+            pa_lines.append("")
+        connection_candidates = "\n".join(pa_lines)
+    else:
+        connection_candidates = ""
+
     return f"""You are {agent}, extracting knowledge from a source for TeleoHumanity's collective knowledge base.
 
 ## Your Task
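As a rough illustration of the injected section, here is the same formatting loop run standalone on one fabricated hit; the title, path, description, and score are hypothetical:

```python
# Standalone rerun of the candidate-formatting loop with made-up data,
# to show the approximate shape of the prompt section it produces.
prior_art = [{
    "claim_title": "Agent memory decays without reinforcement",   # hypothetical
    "claim_path": "domains/ai-agents/agent-memory-decay.md",      # hypothetical
    "description": "Long-horizon agents lose context fidelity.",  # hypothetical
    "score": 0.71,
}]

pa_lines = []
for i, pa in enumerate(prior_art[:10], 1):
    title = pa.get("claim_title", "untitled")
    path = pa.get("claim_path", "")
    score = pa.get("score", 0)
    filename = path.rsplit("/", 1)[-1].replace(".md", "") if path else title
    pa_lines.append(f"{i}. **{title}** (`{filename}`, similarity: {score:.2f})")

print("\n".join(pa_lines))
# 1. **Agent memory decays without reinforcement** (`agent-memory-decay`, similarity: 0.71)
```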
@@ -136,7 +161,7 @@ Single source = experimental at most. Pitch rhetoric or marketing copy = specula
 **File:** {source_file}
 
 {source_content}
-{contributor_directive}
+{contributor_directive}{connection_candidates}
 ## KB Index (existing claims — check for duplicates and enrichment targets)
 
 {kb_index}
@@ -157,6 +182,13 @@ Return valid JSON. The post-processor handles frontmatter formatting, wiki links
   "source": "author/org, key evidence reference",
   "body": "Argument with evidence. Cite specific data, quotes, studies from the source. Explain WHY the claim is supported. This must be a real argument, not a restatement of the title.",
   "related_claims": ["existing-claim-stem-from-kb-index"],
+  "connections": [
+    {{
+      "target": "existing-claim-filename-from-connection-candidates-or-kb-index",
+      "relationship": "supports|challenges|related",
+      "reason": "One sentence: WHY does this claim support/challenge/relate to the target?"
+    }}
+  ],
   "scope": "structural|functional|causal|correlational",
   "sourcer": "handle or name of the original author/source (e.g., @theiaresearch, Pine Analytics)"
 }}
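For reference, a filled-in `connections` entry in the model's output might look like the following; the slug and reason are invented, only the shape comes from the schema above:

```python
# One plausible `connections` entry matching the schema above (hypothetical values).
connection = {
    "target": "agent-memory-decay",  # hypothetical existing-claim slug
    "relationship": "supports",
    "reason": "Both argue that context loss degrades multi-step agent performance.",
}
assert connection["relationship"] in {"supports", "challenges", "related"}
```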
@@ -206,8 +238,9 @@ Return valid JSON. The post-processor handles frontmatter formatting, wiki links
 3. **Facts are not claims.** Individual data points go in `facts`. Only generalized patterns from multiple data points become claims.
 4. **Proposals are entities, not claims.** A governance proposal, token launch, or funding event is structured data (entity). Only extract a claim if the event reveals a novel mechanism insight that generalizes beyond this specific case.
 5. **Scope your claims.** Say whether you're claiming a structural, functional, causal, or correlational relationship.
-6. **OPSEC.** Never extract specific dollar amounts, valuations, equity percentages, or deal terms for LivingIP/Teleo. General market data is fine.
-7. **Read the Agent Notes.** If the source has "Agent Notes" or "Curator Notes" sections, they contain context about why this source matters.
+6. **Connect your claims.** For every new claim, check the Connection Candidates list. If a candidate is related, add it to the `connections` array with the relationship type and a one-sentence reason. Use `supports` when your claim provides evidence for the target, `challenges` when it contradicts, `related` only as a last resort. Unconnected claims are orphans — connect them at birth.
+7. **OPSEC.** Never extract specific dollar amounts, valuations, equity percentages, or deal terms for LivingIP/Teleo. General market data is fine.
+8. **Read the Agent Notes.** If the source has "Agent Notes" or "Curator Notes" sections, they contain context about why this source matters.
 
 Return valid JSON only. No markdown fencing, no explanation outside the JSON.
 """
lib/merge.py (163 changed lines)
@@ -1102,6 +1102,165 @@ async def _embed_merged_claims(main_sha: str, branch_sha: str):
         logger.exception("embed: post-merge embedding failed (non-fatal)")
 
 
+async def _reciprocal_edges(main_sha: str, branch_sha: str):
+    """Add reciprocal edges on existing claims after a PR merges.
+
+    When a new claim A has `supports: [B]` in its frontmatter, B should have
+    `supports: [A]` added to its own frontmatter. This gives A an incoming link,
+    preventing it from being an orphan.
+
+    Runs on main after cherry-pick merge. Non-fatal — orphans are recoverable.
+    Only processes new files (diff-filter=A), not modified files.
+    """
+    EDGE_FIELDS = ("supports", "challenges", "related")
+    # Inverse mapping: if A supports B, then B is supported-by A.
+    # For simplicity, we use the same edge type (bidirectional "supports" means
+    # both claims support each other's argument). This matches reweave behavior.
+
+    try:
+        # Find newly added claim files
+        rc, diff_out = await _git(
+            "diff", "--name-only", "--diff-filter=A",
+            main_sha, branch_sha,
+            cwd=str(config.MAIN_WORKTREE),
+            timeout=10,
+        )
+        if rc != 0:
+            logger.warning("reciprocal_edges: diff failed (rc=%d), skipping", rc)
+            return
+
+        claim_dirs = {"domains/", "core/", "foundations/"}
+        new_claims = [
+            f for f in diff_out.strip().split("\n")
+            if f.endswith(".md")
+            and any(f.startswith(d) for d in claim_dirs)
+            and not f.split("/")[-1].startswith("_")
+            and "/entities/" not in f
+            and "/decisions/" not in f
+        ]
+
+        if not new_claims:
+            return
+
+        reciprocals_added = 0
+        for claim_path in new_claims:
+            full_path = config.MAIN_WORKTREE / claim_path
+            if not full_path.exists():
+                continue
+
+            try:
+                content = full_path.read_text()
+            except Exception:
+                continue
+
+            fm, raw_fm, body = _parse_yaml_frontmatter(content)
+            if fm is None:
+                continue
+
+            # Get the new claim's slug (filename without .md)
+            claim_slug = claim_path.rsplit("/", 1)[-1].replace(".md", "")
+
+            # Collect all edge targets from this new claim
+            for field in EDGE_FIELDS:
+                targets = fm.get(field, [])
+                if isinstance(targets, str):
+                    targets = [targets]
+                if not isinstance(targets, list):
+                    continue
+
+                for target_slug in targets:
+                    target_slug = str(target_slug).strip()
+                    if not target_slug:
+                        continue
+
+                    # Find the target file on disk
+                    target_file = _find_claim_file(target_slug)
+                    if target_file is None:
+                        continue
+
+                    # Add reciprocal edge: target now has field: [new_claim_slug]
+                    if _add_edge_to_file(target_file, field, claim_slug):
+                        reciprocals_added += 1
+
+        if reciprocals_added > 0:
+            # Commit the reciprocal edges
+            await _git("add", "-A", cwd=str(config.MAIN_WORKTREE))
+            rc, out = await _git(
+                "commit", "-m", f"reciprocal edges: {reciprocals_added} edges from {len(new_claims)} new claims",
+                cwd=str(config.MAIN_WORKTREE),
+            )
+            if rc == 0:
+                logger.info("reciprocal_edges: %d edges added across %d new claims", reciprocals_added, len(new_claims))
+            else:
+                logger.warning("reciprocal_edges: commit failed: %s", out[:200])
+
+    except Exception:
+        logger.exception("reciprocal_edges: failed (non-fatal)")
+
+
+def _find_claim_file(slug: str) -> "Path | None":
+    """Find a claim file on disk by its slug. Searches domains/, core/, foundations/."""
+    from pathlib import Path as _Path
+
+    worktree = config.MAIN_WORKTREE
+    for search_dir in ("domains", "core", "foundations"):
+        base = worktree / search_dir
+        if not base.is_dir():
+            continue
+        # Direct match
+        for md in base.rglob(f"{slug}.md"):
+            if not md.name.startswith("_"):
+                return md
+    return None
+
+
+def _add_edge_to_file(file_path, edge_type: str, target_slug: str) -> bool:
+    """Add a single edge to a file's frontmatter. Returns True if modified."""
+    try:
+        content = file_path.read_text()
+    except Exception:
+        return False
+
+    fm, raw_fm, body = _parse_yaml_frontmatter(content)
+    if fm is None:
+        return False
+
+    # Check for existing edge (dedup)
+    existing = fm.get(edge_type, [])
+    if isinstance(existing, str):
+        existing = [existing]
+    if not isinstance(existing, list):
+        existing = []
+
+    if any(str(e).strip().lower() == target_slug.lower() for e in existing):
+        return False  # Already exists
+
+    # Build merged edges (all edge fields, only modifying the target one)
+    merged_edges = {}
+    for field in REWEAVE_EDGE_FIELDS:
+        vals = fm.get(field, [])
+        if isinstance(vals, str):
+            vals = [vals]
+        if not isinstance(vals, list):
+            vals = []
+        merged_edges[field] = list(vals)
+
+    merged_edges.setdefault(edge_type, []).append(target_slug)
+
+    # Serialize using the same string-surgery approach as reweave
+    new_fm = _serialize_edge_fields(raw_fm, merged_edges)
+    if body.startswith("\n"):
+        new_content = f"---\n{new_fm}{body}"
+    else:
+        new_content = f"---\n{new_fm}\n{body}"
+
+    try:
+        file_path.write_text(new_content)
+        return True
+    except Exception:
+        return False
+
+
 def _archive_source_for_pr(branch: str, domain: str, merged: bool = True):
     """Move source from queue/ to archive/{domain}/ after PR merge or close.
 
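One detail worth calling out in `_add_edge_to_file` above is the dedup rule: an edge is skipped when the target slug is already present, compared case-insensitively after stripping whitespace. A minimal standalone rerun of that comparison:

```python
# Standalone rerun of the dedup check from _add_edge_to_file (toy data).
existing = ["Claim-B ", "claim-c"]  # edges already in the target's frontmatter
target_slug = "claim-b"

duplicate = any(str(e).strip().lower() == target_slug.lower() for e in existing)
print(duplicate)  # True: the reciprocal edge is written at most once
```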
@@ -1320,6 +1479,10 @@ async def _merge_domain_queue(conn, domain: str) -> tuple[int, int]:
     # Embed new/changed claims into Qdrant (non-fatal)
     await _embed_merged_claims(main_sha, branch_sha)
 
+    # Add reciprocal edges on existing claims (non-fatal)
+    # New claim A with supports:[B] → add supports:[A] on B's frontmatter
+    await _reciprocal_edges(main_sha, branch_sha)
+
     # Delete remote branch immediately (Ganymede Q4)
     await _delete_remote_branch(branch)
openrouter-extract-v2.py

@@ -42,6 +42,40 @@ from lib.post_extract import (
 )
 from lib.connect import connect_new_claims
 
+# --- Prior art lookup (extract-time connection) ---
+
+def _find_prior_art(source_title: str, source_body: str, limit: int = 10) -> list[dict]:
+    """Search Qdrant for existing claims similar to this source.
+
+    Uses source title + first 500 chars of body as the search query.
+    Returns list of {claim_title, claim_path, description, score} dicts.
+    Non-fatal — returns empty list on any failure.
+    """
+    try:
+        from lib.search import embed_query, search_qdrant
+    except ImportError:
+        return []
+
+    query = f"{source_title} {source_body[:500]}".strip()
+    if len(query) < 20:
+        return []
+
+    vector = embed_query(query)
+    if vector is None:
+        return []
+
+    hits = search_qdrant(vector, limit=limit, score_threshold=0.55)
+    results = []
+    for hit in hits:
+        payload = hit.get("payload", {})
+        results.append({
+            "claim_title": payload.get("claim_title", ""),
+            "claim_path": payload.get("claim_path", ""),
+            "description": payload.get("description", ""),
+            "score": hit.get("score", 0),
+        })
+    return results
+
 # ─── Source registration (Argus: pipeline funnel tracking) ─────────────────
 
 def _source_db_conn():
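The query construction is the only part of `_find_prior_art` that runs without Qdrant, so it can be sanity-checked in isolation; the title and body below are hypothetical:

```python
# How _find_prior_art builds its search query: title plus the first 500
# characters of the source body. Queries under 20 chars are skipped.
source_title = "multi agent routing"                          # hypothetical
source_body = "Routing layers dispatch tasks to agents..."    # hypothetical
query = f"{source_title} {source_body[:500]}".strip()
print(len(query) >= 20)  # True: long enough to embed and search
```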
@@ -225,6 +259,7 @@ def reconstruct_claim_content(claim, domain, agent):
     source = claim.get("source", f"extraction by {agent}")
     body_text = claim.get("body", desc)
     related = claim.get("related_claims", [])
+    connections = claim.get("connections", [])
     sourcer = claim.get("sourcer", "")
 
     # Build attribution block (v1: extractor always known, sourcer best-effort)
@@ -241,6 +276,32 @@ def reconstruct_claim_content(claim, domain, agent):
             f' context: "{source}"',
         ])
 
+    # Build typed edge fields from connections array
+    edge_fields = {"supports": [], "challenges": [], "related": []}
+    for conn in connections:
+        target = conn.get("target", "")
+        rel = conn.get("relationship", "related")
+        if target and rel in edge_fields:
+            # Normalize: strip .md extension if present
+            target = target.replace(".md", "")
+            if target not in edge_fields[rel]:
+                edge_fields[rel].append(target)
+
+    # Also fold related_claims into "related" edges (backwards compat)
+    for r in related[:5]:
+        r_clean = r.replace(".md", "")
+        if r_clean not in edge_fields["related"]:
+            edge_fields["related"].append(r_clean)
+
+    # Build edge lines for frontmatter
+    edge_lines = []
+    for edge_type in ("supports", "challenges", "related"):
+        targets = edge_fields[edge_type]
+        if targets:
+            edge_lines.append(f"{edge_type}:")
+            for t in targets:
+                edge_lines.append(f" - {t}")
+
     lines = [
         "---",
         "type: claim",
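Run standalone on a toy `connections` array, the edge-building logic above yields frontmatter lines like these; the slugs are hypothetical:

```python
# Standalone rerun of the edge-field logic with fabricated connections.
connections = [
    {"target": "agent-memory-decay.md", "relationship": "supports"},   # hypothetical
    {"target": "context-window-limits", "relationship": "challenges"}, # hypothetical
]
edge_fields = {"supports": [], "challenges": [], "related": []}
for conn in connections:
    target = conn.get("target", "")
    rel = conn.get("relationship", "related")
    if target and rel in edge_fields:
        target = target.replace(".md", "")  # normalize: drop extension
        if target not in edge_fields[rel]:
            edge_fields[rel].append(target)

edge_lines = []
for edge_type in ("supports", "challenges", "related"):
    if edge_fields[edge_type]:
        edge_lines.append(f"{edge_type}:")
        for t in edge_fields[edge_type]:
            edge_lines.append(f" - {t}")

print("\n".join(edge_lines))
# supports:
#  - agent-memory-decay
# challenges:
#  - context-window-limits
```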
@@ -250,6 +311,7 @@ def reconstruct_claim_content(claim, domain, agent):
         f'source: "{source}"',
         f"created: {date.today().isoformat()}",
         *attr_lines,
+        *edge_lines,
         "---",
         "",
         f"# {title}",
@@ -262,7 +324,7 @@ def reconstruct_claim_content(claim, domain, agent):
     ]
     for r in related[:5]:
         lines.append(f"- [[{r}]]")
-    lines.extend(["", "Topics:", "- [[_map]]", ""])
+    lines.extend(["", "Topics:", ""])
     return "\n".join(lines)
 
 
@@ -378,9 +440,18 @@ def main():
     if rationale:
         print(f" Directed contribution from {proposed_by or '?'}: {rationale[:80]}...")
 
+    # ── Prior art lookup (extract-time connection) ──
+    # Search Qdrant for existing claims similar to this source.
+    # Injected into prompt so LLM can classify connections at extraction time.
+    source_title = os.path.basename(args.source_file).replace(".md", "").replace("-", " ")
+    prior_art = _find_prior_art(source_title, source_content)
+    if prior_art:
+        print(f" Prior art: {len(prior_art)} connection candidates (top: {prior_art[0]['claim_title'][:50]}... @ {prior_art[0]['score']:.2f})")
+
     prompt = build_extraction_prompt(
         args.source_file, source_content, domain, agent, kb_index,
         rationale=rationale, intake_tier=intake_tier, proposed_by=proposed_by,
+        prior_art=prior_art,
     )
 
     if args.dry_run:
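The `source_title` fed into the lookup is just the queue filename, de-slugged. For a hypothetical queue file:

```python
import os

# How main() derives the search title from the source path (path is hypothetical).
path = "queue/ai-agents/multi-agent-routing.md"
source_title = os.path.basename(path).replace(".md", "").replace("-", " ")
print(source_title)  # "multi agent routing"
```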