From 8de28d6ee035cededaf25de38c79205702541200 Mon Sep 17 00:00:00 2001
From: m3taversal
Date: Tue, 21 Apr 2026 13:00:59 +0100
Subject: [PATCH] feat: bidirectional source↔claim linking
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Forward link: each claim gets a `sourced_from: {domain}/{filename}` field
at extraction time. Reverse link: after merge, backlink_source_claims()
updates source files with a `claims_extracted:` list.

All disk writes happen under async_main_worktree_lock.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 lib/extract.py    |   6 ++-
 lib/merge.py      |   7 +++
 lib/post_merge.py | 134 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 145 insertions(+), 2 deletions(-)

diff --git a/lib/extract.py b/lib/extract.py
index b54c522..4681a9f 100644
--- a/lib/extract.py
+++ b/lib/extract.py
@@ -229,7 +229,7 @@ def _parse_extraction_json(text: str) -> dict | None:
     return None
 
 
-def _build_claim_content(claim: dict, agent: str, source_format: str | None = None) -> str:
+def _build_claim_content(claim: dict, agent: str, source_format: str | None = None, source_file: str = "") -> str:
     """Build claim markdown file content from extraction JSON."""
     today = date.today().isoformat()
     domain = claim.get("domain", "")
@@ -281,6 +281,8 @@ def _build_claim_content(claim: dict, agent: str, source_format: str | None = No
         f"created: {today}",
         f"agent: {agent}",
     ]
+    if source_file:
+        lines.append(f"sourced_from: {source_file}")
     if scope:
         lines.append(f"scope: {scope}")
     if sourcer:
@@ -432,7 +434,7 @@ async def _extract_one_source(
         filename = Path(filename).name  # Strip directory components; LLM output may contain path traversal
         if not filename.endswith(".md"):
             filename += ".md"
-        content = _build_claim_content(c, agent_lower, source_format=source_format)
+        content = _build_claim_content(c, agent_lower, source_format=source_format, source_file=(f"{domain}/{source_file}" if domain else source_file))
         claim_files.append({"filename": filename, "domain": c.get("domain", domain), "content": content})
 
     # Build entity file contents
diff --git a/lib/merge.py b/lib/merge.py
index 0e4ccef..23498bd 100644
--- a/lib/merge.py
+++ b/lib/merge.py
@@ -436,6 +436,7 @@ from .frontmatter import (
     serialize_frontmatter,
 )
 from .post_merge import (
+    backlink_source_claims,
     embed_merged_claims,
     reciprocal_edges,
     archive_source_for_pr,
@@ -855,6 +856,12 @@ async def _merge_domain_queue(conn, domain: str) -> tuple[int, int]:
     # Archive source file (closes near-duplicate loop; Ganymede review)
     archive_source_for_pr(branch, domain)
 
+    # Backlink: update source files with claims_extracted refs
+    try:
+        await backlink_source_claims(main_sha, branch_sha, _git)
+    except Exception:
+        logger.exception("PR #%d: backlink_source_claims failed (non-fatal)", pr_num)
+
     # Embed new/changed claims into Qdrant (non-fatal)
     await embed_merged_claims(main_sha, branch_sha, _git)
 
diff --git a/lib/post_merge.py b/lib/post_merge.py
index 9d1cf2f..ea79f02 100644
--- a/lib/post_merge.py
+++ b/lib/post_merge.py
@@ -13,6 +13,7 @@ import logging
 import os
 import re
 import shutil
+from pathlib import Path
 from typing import Callable
 
 from . import config
@@ -295,6 +296,139 @@ async def reciprocal_edges(main_sha: str, branch_sha: str, git_fn: Callable):
         logger.exception("reciprocal_edges: failed (non-fatal)")
 
 
+async def backlink_source_claims(main_sha: str, branch_sha: str, git_fn: Callable):
+    """After merge, update source files with claims_extracted backlinks.
+
+    Reads sourced_from from merged claim frontmatter, finds the source file,
+    and appends the claim filename to its claims_extracted list.
+    Only runs for newly added claims (diff-filter=A).
+    """
+    try:
+        rc, diff_out = await git_fn(
+            "diff", "--name-only", "--diff-filter=A",
+            main_sha, branch_sha,
+            cwd=str(config.MAIN_WORKTREE),
+            timeout=10,
+        )
+        if rc != 0:
+            logger.warning("backlink_source_claims: diff failed (rc=%d), skipping", rc)
+            return
+
+        claim_dirs = {"domains/", "core/", "foundations/"}
+        new_claims = [
+            f for f in diff_out.strip().split("\n")
+            if f.endswith(".md")
+            and any(f.startswith(d) for d in claim_dirs)
+            and not f.split("/")[-1].startswith("_")
+            and "/entities/" not in f
+            and "/decisions/" not in f
+        ]
+
+        if not new_claims:
+            return
+
+        modified_sources = {}  # source path -> rewritten content; written under the lock below
+        for claim_path in new_claims:
+            full_path = config.MAIN_WORKTREE / claim_path
+            if not full_path.exists():
+                continue
+
+            try:
+                content = full_path.read_text()
+            except Exception:
+                continue
+
+            fm, raw_fm, body = parse_yaml_frontmatter(content)
+            if fm is None:
+                continue
+
+            sourced_from = fm.get("sourced_from", "")
+            if not sourced_from:
+                continue
+
+            source_path = config.MAIN_WORKTREE / "inbox" / "archive" / sourced_from
+            if not source_path.exists():
+                logger.debug("backlink_source_claims: source %s not found at %s", sourced_from, source_path)
+                continue
+
+            claim_filename = Path(claim_path).stem  # strip only the final ".md", not every occurrence
+
+            try:
+                source_content = source_path.read_text()
+            except Exception:
+                continue
+
+            source_fm, source_raw_fm, source_body = parse_yaml_frontmatter(source_content)
+            if source_fm is None:
+                continue
+
+            existing_claims = source_fm.get("claims_extracted", [])
+            if isinstance(existing_claims, str):
+                existing_claims = [existing_claims]
+            if not isinstance(existing_claims, list):
+                existing_claims = []
+
+            if claim_filename in existing_claims:
+                continue
+
+            existing_claims.append(claim_filename)
+            new_block = "claims_extracted:\n" + "\n".join(f"- {c}" for c in existing_claims)
+
+            lines = source_content.split("\n")
+            if "claims_extracted:" not in source_content:  # no list yet: insert before closing ---
+                end_idx = None
+                for i, line in enumerate(lines):
+                    if i > 0 and line.strip() == "---":
+                        end_idx = i
+                        break
+                if end_idx is None:
+                    continue
+                lines.insert(end_idx, new_block)
+            else:  # rebuild the existing key and its items in place
+                start_idx = None
+                end_idx = None
+                for i, line in enumerate(lines):
+                    if line.startswith("claims_extracted:"):
+                        start_idx = i
+                    elif start_idx is not None and not line.lstrip().startswith("- "):  # tolerate indented items
+                        end_idx = i
+                        break
+                if start_idx is None:
+                    continue
+                if end_idx is None:
+                    end_idx = len(lines)
+                lines[start_idx:end_idx] = new_block.split("\n")
+
+            modified_sources[str(source_path)] = "\n".join(lines)
+            logger.info("backlink_source_claims: added %s to %s", claim_filename, sourced_from)
+
+        if modified_sources:
+            async with async_main_worktree_lock():
+                for sp, content in modified_sources.items():
+                    Path(sp).write_text(content)
+                    await git_fn("add", sp, cwd=str(config.MAIN_WORKTREE), timeout=10)
+                rc, out = await git_fn(
+                    "commit", "-m", f"backlink: update claims_extracted on {len(modified_sources)} source(s)",
+                    cwd=str(config.MAIN_WORKTREE),
+                    timeout=15,
+                )
+                if rc == 0:
+                    push_rc, push_out = await git_fn(
+                        "push", "origin", "main",
+                        cwd=str(config.MAIN_WORKTREE),
+                        timeout=30,
+                    )
+                    if push_rc == 0:
+                        logger.info("backlink_source_claims: %d source(s) updated and pushed", len(modified_sources))
+                    else:
+                        logger.warning("backlink_source_claims: push failed: %s", push_out[:200])
+                else:
+                    logger.warning("backlink_source_claims: commit failed: %s", out[:200])
%s", out[:200]) + + except Exception: + logger.exception("backlink_source_claims: failed (non-fatal)") + + def archive_source_for_pr(branch: str, domain: str, merged: bool = True): """Move source from queue/ to archive/{domain}/ after PR merge or close.