feat: bidirectional source↔claim linking
Some checks are pending
CI / lint-and-test (push) Waiting to run
Some checks are pending
CI / lint-and-test (push) Waiting to run
Forward link: claims get `sourced_from: {domain}/{filename}` at extraction time.
Reverse link: after merge, backlink_source_claims() updates each source file with a
`claims_extracted:` list. All disk writes happen under async_main_worktree_lock.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
05f375d775
commit
8de28d6ee0
3 changed files with 145 additions and 2 deletions
|
|
@ -229,7 +229,7 @@ def _parse_extraction_json(text: str) -> dict | None:
|
|||
return None
|
||||
|
||||
|
||||
def _build_claim_content(claim: dict, agent: str, source_format: str | None = None) -> str:
|
||||
def _build_claim_content(claim: dict, agent: str, source_format: str | None = None, source_file: str = "") -> str:
|
||||
"""Build claim markdown file content from extraction JSON."""
|
||||
today = date.today().isoformat()
|
||||
domain = claim.get("domain", "")
|
||||
|
|
@ -281,6 +281,8 @@ def _build_claim_content(claim: dict, agent: str, source_format: str | None = No
|
|||
f"created: {today}",
|
||||
f"agent: {agent}",
|
||||
]
|
||||
if source_file:
|
||||
lines.append(f"sourced_from: {source_file}")
|
||||
if scope:
|
||||
lines.append(f"scope: {scope}")
|
||||
if sourcer:
|
||||
|
|
@ -432,7 +434,7 @@ async def _extract_one_source(
|
|||
filename = Path(filename).name # Strip directory components — LLM output may contain path traversal
|
||||
if not filename.endswith(".md"):
|
||||
filename += ".md"
|
||||
content = _build_claim_content(c, agent_lower, source_format=source_format)
|
||||
content = _build_claim_content(c, agent_lower, source_format=source_format, source_file=f"{domain}/{source_file}" if domain else source_file)
|
||||
claim_files.append({"filename": filename, "domain": c.get("domain", domain), "content": content})
|
||||
|
||||
# Build entity file contents
|
||||
|
|
|
|||
|
|
@ -436,6 +436,7 @@ from .frontmatter import (
|
|||
serialize_frontmatter,
|
||||
)
|
||||
from .post_merge import (
|
||||
backlink_source_claims,
|
||||
embed_merged_claims,
|
||||
reciprocal_edges,
|
||||
archive_source_for_pr,
|
||||
|
|
@ -855,6 +856,12 @@ async def _merge_domain_queue(conn, domain: str) -> tuple[int, int]:
|
|||
# Archive source file (closes near-duplicate loop — Ganymede review)
|
||||
archive_source_for_pr(branch, domain)
|
||||
|
||||
# Backlink: update source files with claims_extracted refs
|
||||
try:
|
||||
await backlink_source_claims(main_sha, branch_sha, _git)
|
||||
except Exception:
|
||||
logger.exception("PR #%d: backlink_source_claims failed (non-fatal)", pr_num)
|
||||
|
||||
# Embed new/changed claims into Qdrant (non-fatal)
|
||||
await embed_merged_claims(main_sha, branch_sha, _git)
|
||||
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ import logging
|
|||
import os
|
||||
import re
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import Callable
|
||||
|
||||
from . import config
|
||||
|
|
@ -295,6 +296,139 @@ async def reciprocal_edges(main_sha: str, branch_sha: str, git_fn: Callable):
|
|||
logger.exception("reciprocal_edges: failed (non-fatal)")
|
||||
|
||||
|
||||
def _is_yaml_list_item(line: str) -> bool:
    """Return True if *line* looks like a YAML block-sequence item (``- x``), indented or not."""
    stripped = line.strip()
    return stripped == "-" or stripped.startswith("- ")


def _upsert_claims_extracted(source_content: str, claims: list):
    """Return *source_content* with its frontmatter ``claims_extracted`` block set to *claims*.

    The key is only looked for inside the frontmatter (the lines before the
    first ``---`` that follows the opening line), so body text that happens to
    mention ``claims_extracted:`` is never rewritten. An existing block is
    replaced together with its list items; otherwise the block is inserted
    just before the closing delimiter.

    Returns None when no closing ``---`` delimiter is found.
    """
    lines = source_content.split("\n")
    end_idx = next(
        (i for i, line in enumerate(lines) if i > 0 and line.strip() == "---"),
        None,
    )
    if end_idx is None:
        return None

    new_block = ["claims_extracted:"] + [f"- {c}" for c in claims]
    start_idx = next(
        (i for i in range(end_idx) if lines[i].startswith("claims_extracted:")),
        None,
    )
    if start_idx is None:
        lines[end_idx:end_idx] = new_block
    else:
        # Swallow the old list items (indented or not) so none are left behind.
        stop_idx = start_idx + 1
        while stop_idx < end_idx and _is_yaml_list_item(lines[stop_idx]):
            stop_idx += 1
        lines[start_idx:stop_idx] = new_block
    return "\n".join(lines)


async def backlink_source_claims(main_sha: str, branch_sha: str, git_fn: Callable):
    """After merge, update source files with claims_extracted backlinks.

    Reads sourced_from from merged claim frontmatter, finds the source file
    under inbox/archive/, and appends the claim filename (without ``.md``) to
    its claims_extracted list. Only runs for newly added claims
    (diff-filter=A). Updated sources are written, committed and pushed while
    holding async_main_worktree_lock.

    Args:
        main_sha: Base revision of the diff.
        branch_sha: Merged revision of the diff.
        git_fn: Awaitable git runner; called with git arguments plus ``cwd``
            (and optionally ``timeout``) and awaited for an ``(rc, output)`` pair.

    Returns:
        None. All failures are logged and swallowed (non-fatal).
    """
    try:
        rc, diff_out = await git_fn(
            "diff", "--name-only", "--diff-filter=A",
            main_sha, branch_sha,
            cwd=str(config.MAIN_WORKTREE),
            timeout=10,
        )
        if rc != 0:
            logger.warning("backlink_source_claims: diff failed (rc=%d), skipping", rc)
            return

        claim_dirs = ("domains/", "core/", "foundations/")
        new_claims = [
            f for f in diff_out.strip().split("\n")
            if f.endswith(".md")
            and f.startswith(claim_dirs)
            and not f.rsplit("/", 1)[-1].startswith("_")
            and "/entities/" not in f
            and "/decisions/" not in f
        ]
        if not new_claims:
            return

        archive_root = (config.MAIN_WORKTREE / "inbox" / "archive").resolve()

        # Maps source path -> pending new content. Several claims may point at
        # the same source, so later claims must build on the pending content
        # rather than on what is still on disk.
        modified_sources = {}
        for claim_path in new_claims:
            full_path = config.MAIN_WORKTREE / claim_path
            try:
                content = full_path.read_text()
            except (OSError, UnicodeDecodeError):
                continue

            fm, _raw_fm, _body = parse_yaml_frontmatter(content)
            if fm is None:
                continue

            sourced_from = fm.get("sourced_from", "")
            if not sourced_from or not isinstance(sourced_from, str):
                continue

            # sourced_from comes from merged file content; never follow it
            # outside the archive directory (absolute paths, "..").
            source_path = config.MAIN_WORKTREE / "inbox" / "archive" / sourced_from
            if archive_root not in source_path.resolve().parents:
                logger.warning(
                    "backlink_source_claims: sourced_from %r is outside the archive, skipping",
                    sourced_from,
                )
                continue

            key = str(source_path)
            source_content = modified_sources.get(key)
            if source_content is None:
                if not source_path.exists():
                    logger.debug("backlink_source_claims: source %s not found at %s", sourced_from, source_path)
                    continue
                try:
                    source_content = source_path.read_text()
                except (OSError, UnicodeDecodeError):
                    continue

            source_fm, _source_raw_fm, _source_body = parse_yaml_frontmatter(source_content)
            if source_fm is None:
                continue

            existing_claims = source_fm.get("claims_extracted", [])
            if isinstance(existing_claims, str):
                existing_claims = [existing_claims]
            if not isinstance(existing_claims, list):
                existing_claims = []

            # Strip only the trailing extension (new_claims all end in ".md").
            claim_filename = claim_path.rsplit("/", 1)[-1][: -len(".md")]
            if claim_filename in existing_claims:
                continue

            updated = _upsert_claims_extracted(source_content, [*existing_claims, claim_filename])
            if updated is None:
                continue

            modified_sources[key] = updated
            logger.info("backlink_source_claims: added %s to %s", claim_filename, sourced_from)

        if not modified_sources:
            return

        async with async_main_worktree_lock():
            for sp, content in modified_sources.items():
                Path(sp).write_text(content)
                add_rc, add_out = await git_fn("add", sp, cwd=str(config.MAIN_WORKTREE))
                if add_rc != 0:
                    logger.warning("backlink_source_claims: git add failed for %s: %s", sp, add_out[:200])

            rc, out = await git_fn(
                "commit", "-m", f"backlink: update claims_extracted on {len(modified_sources)} source(s)",
                cwd=str(config.MAIN_WORKTREE),
                timeout=15,
            )
            if rc != 0:
                logger.warning("backlink_source_claims: commit failed: %s", out[:200])
                return

            push_rc, push_out = await git_fn(
                "push", "origin", "main",
                cwd=str(config.MAIN_WORKTREE),
                timeout=30,
            )
            if push_rc == 0:
                logger.info("backlink_source_claims: %d source(s) updated and pushed", len(modified_sources))
            else:
                logger.warning("backlink_source_claims: push failed: %s", push_out[:200])

    except Exception:
        logger.exception("backlink_source_claims: failed (non-fatal)")
|
||||
|
||||
|
||||
def archive_source_for_pr(branch: str, domain: str, merged: bool = True):
|
||||
"""Move source from queue/ to archive/{domain}/ after PR merge or close.
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue