feat: bidirectional source↔claim linking
Some checks are pending
CI / lint-and-test (push) Waiting to run

Forward link: claims get `sourced_from: {domain}/{source_file}` at extraction time.
Reverse link: after merge, backlink_source_claims() updates source files with
`claims_extracted:` list. All disk writes happen under async_main_worktree_lock.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
m3taversal 2026-04-21 13:00:59 +01:00
parent 05f375d775
commit 8de28d6ee0
3 changed files with 145 additions and 2 deletions

View file

@ -229,7 +229,7 @@ def _parse_extraction_json(text: str) -> dict | None:
return None
def _build_claim_content(claim: dict, agent: str, source_format: str | None = None) -> str:
def _build_claim_content(claim: dict, agent: str, source_format: str | None = None, source_file: str = "") -> str:
"""Build claim markdown file content from extraction JSON."""
today = date.today().isoformat()
domain = claim.get("domain", "")
@ -281,6 +281,8 @@ def _build_claim_content(claim: dict, agent: str, source_format: str | None = No
f"created: {today}",
f"agent: {agent}",
]
if source_file:
lines.append(f"sourced_from: {source_file}")
if scope:
lines.append(f"scope: {scope}")
if sourcer:
@ -432,7 +434,7 @@ async def _extract_one_source(
filename = Path(filename).name # Strip directory components — LLM output may contain path traversal
if not filename.endswith(".md"):
filename += ".md"
content = _build_claim_content(c, agent_lower, source_format=source_format)
content = _build_claim_content(c, agent_lower, source_format=source_format, source_file=f"{domain}/{source_file}" if domain else source_file)
claim_files.append({"filename": filename, "domain": c.get("domain", domain), "content": content})
# Build entity file contents

View file

@ -436,6 +436,7 @@ from .frontmatter import (
serialize_frontmatter,
)
from .post_merge import (
backlink_source_claims,
embed_merged_claims,
reciprocal_edges,
archive_source_for_pr,
@ -855,6 +856,12 @@ async def _merge_domain_queue(conn, domain: str) -> tuple[int, int]:
# Archive source file (closes near-duplicate loop — Ganymede review)
archive_source_for_pr(branch, domain)
# Backlink: update source files with claims_extracted refs
try:
await backlink_source_claims(main_sha, branch_sha, _git)
except Exception:
logger.exception("PR #%d: backlink_source_claims failed (non-fatal)", pr_num)
# Embed new/changed claims into Qdrant (non-fatal)
await embed_merged_claims(main_sha, branch_sha, _git)

View file

@ -13,6 +13,7 @@ import logging
import os
import re
import shutil
from pathlib import Path
from typing import Callable
from . import config
@ -295,6 +296,139 @@ async def reciprocal_edges(main_sha: str, branch_sha: str, git_fn: Callable):
logger.exception("reciprocal_edges: failed (non-fatal)")
async def backlink_source_claims(main_sha: str, branch_sha: str, git_fn: Callable):
    """After merge, update source files with claims_extracted backlinks.

    Reads ``sourced_from`` from each newly merged claim's frontmatter, locates
    the archived source file under ``inbox/archive/``, and appends the claim
    filename (without the ``.md`` extension) to that source's
    ``claims_extracted`` YAML list. Only newly added claims are considered
    (``--diff-filter=A`` between *main_sha* and *branch_sha*).

    All worktree writes, the commit, and the push are serialized under
    ``async_main_worktree_lock``. Every failure path is logged and swallowed —
    this routine is deliberately non-fatal to the merge pipeline.
    """
    try:
        # Files added between the pre-merge and post-merge commits.
        rc, diff_out = await git_fn(
            "diff", "--name-only", "--diff-filter=A",
            main_sha, branch_sha,
            cwd=str(config.MAIN_WORKTREE),
            timeout=10,
        )
        if rc != 0:
            logger.warning("backlink_source_claims: diff failed (rc=%d), skipping", rc)
            return
        # Claim files live directly under these roots; underscore-prefixed
        # files, entities/ and decisions/ subtrees are not claims.
        claim_dirs = ("domains/", "core/", "foundations/")
        new_claims = [
            f for f in diff_out.strip().split("\n")
            if f.endswith(".md")
            and f.startswith(claim_dirs)
            and not f.split("/")[-1].startswith("_")
            and "/entities/" not in f
            and "/decisions/" not in f
        ]
        if not new_claims:
            return
        # path -> rewritten content; all writes deferred until the lock below.
        modified_sources: dict[str, str] = {}
        for claim_path in new_claims:
            full_path = config.MAIN_WORKTREE / claim_path
            if not full_path.exists():
                continue
            try:
                content = full_path.read_text()
            except Exception:
                continue  # unreadable claim — best-effort, skip
            fm, raw_fm, body = parse_yaml_frontmatter(content)
            if fm is None:
                continue
            sourced_from = fm.get("sourced_from", "")
            if not sourced_from:
                continue
            source_path = config.MAIN_WORKTREE / "inbox" / "archive" / sourced_from
            if not source_path.exists():
                logger.debug("backlink_source_claims: source %s not found at %s", sourced_from, source_path)
                continue
            # FIX: removesuffix, not replace(".md", "") — replace() strips the
            # substring anywhere in the name (e.g. "api.md-notes.md" -> "api-notes").
            claim_filename = claim_path.rsplit("/", 1)[-1].removesuffix(".md")
            try:
                source_content = source_path.read_text()
            except Exception:
                continue
            source_fm, source_raw_fm, source_body = parse_yaml_frontmatter(source_content)
            if source_fm is None:
                continue
            existing_claims = source_fm.get("claims_extracted", [])
            # Normalize scalar/malformed values to a list we can append to.
            if isinstance(existing_claims, str):
                existing_claims = [existing_claims]
            if not isinstance(existing_claims, list):
                existing_claims = []
            if claim_filename in existing_claims:
                continue  # already backlinked — idempotent across re-runs
            existing_claims.append(claim_filename)
            new_block = "claims_extracted:\n" + "\n".join(f"- {c}" for c in existing_claims)
            lines = source_content.split("\n")
            # Locate the closing "---" of the frontmatter (i > 0 skips the
            # opening delimiter). FIX: all edits below are confined to the
            # frontmatter region so a "claims_extracted:" string in the body
            # can never be mistaken for the frontmatter key, and a rewritten
            # list can never splice past the closing delimiter.
            fm_end = None
            for i, line in enumerate(lines):
                if i > 0 and line.strip() == "---":
                    fm_end = i
                    break
            if fm_end is None:
                continue  # malformed frontmatter — be safe, skip
            start_idx = None
            end_idx = None
            for i in range(1, fm_end):
                if lines[i].startswith("claims_extracted:"):
                    start_idx = i
                elif start_idx is not None and not lines[i].startswith("- "):
                    end_idx = i  # first line after the existing list items
                    break
            if start_idx is None:
                # Key absent: insert the new list just before the closing ---.
                lines.insert(fm_end, new_block)
            else:
                if end_idx is None:
                    end_idx = fm_end  # list runs to the end of the frontmatter
                lines[start_idx:end_idx] = new_block.split("\n")
            modified_sources[str(source_path)] = "\n".join(lines)
            logger.info("backlink_source_claims: added %s to %s", claim_filename, sourced_from)
        if modified_sources:
            # Serialize writes + commit + push against other worktree users.
            async with async_main_worktree_lock():
                for sp, content in modified_sources.items():
                    Path(sp).write_text(content)
                    await git_fn("add", sp, cwd=str(config.MAIN_WORKTREE))
                rc, out = await git_fn(
                    "commit", "-m", f"backlink: update claims_extracted on {len(modified_sources)} source(s)",
                    cwd=str(config.MAIN_WORKTREE),
                    timeout=15,
                )
                if rc != 0:
                    logger.warning("backlink_source_claims: commit failed: %s", out[:200])
                    return
                push_rc, push_out = await git_fn(
                    "push", "origin", "main",
                    cwd=str(config.MAIN_WORKTREE),
                    timeout=30,
                )
                if push_rc == 0:
                    logger.info("backlink_source_claims: %d source(s) updated and pushed", len(modified_sources))
                else:
                    logger.warning("backlink_source_claims: push failed: %s", push_out[:200])
    except Exception:
        logger.exception("backlink_source_claims: failed (non-fatal)")
def archive_source_for_pr(branch: str, domain: str, merged: bool = True):
"""Move source from queue/ to archive/{domain}/ after PR merge or close.