From 8de28d6ee035cededaf25de38c79205702541200 Mon Sep 17 00:00:00 2001
From: m3taversal
Date: Tue, 21 Apr 2026 13:00:59 +0100
Subject: [PATCH] feat: bidirectional source↔claim linking
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Forward link: each claim gets a `sourced_from: {domain}/{filename}` field
at extraction time. Reverse link: after merge, backlink_source_claims()
updates source files with a `claims_extracted:` list.

All disk writes happen under async_main_worktree_lock.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 lib/extract.py    |   6 ++-
 lib/merge.py      |   7 +++
 lib/post_merge.py | 134 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 145 insertions(+), 2 deletions(-)

diff --git a/lib/extract.py b/lib/extract.py
index b54c522..4681a9f 100644
--- a/lib/extract.py
+++ b/lib/extract.py
@@ -229,7 +229,7 @@ def _parse_extraction_json(text: str) -> dict | None:
     return None
 
 
-def _build_claim_content(claim: dict, agent: str, source_format: str | None = None) -> str:
+def _build_claim_content(claim: dict, agent: str, source_format: str | None = None, source_file: str = "") -> str:
     """Build claim markdown file content from extraction JSON."""
     today = date.today().isoformat()
     domain = claim.get("domain", "")
@@ -281,6 +281,8 @@ def _build_claim_content(claim: dict, agent: str, source_format: str | None = No
         f"created: {today}",
         f"agent: {agent}",
     ]
+    if source_file:
+        lines.append(f"sourced_from: {source_file}")
     if scope:
         lines.append(f"scope: {scope}")
     if sourcer:
@@ -432,7 +434,7 @@ async def _extract_one_source(
         filename = Path(filename).name  # Strip directory components; LLM output may contain path traversal
         if not filename.endswith(".md"):
             filename += ".md"
-        content = _build_claim_content(c, agent_lower, source_format=source_format)
+        content = _build_claim_content(c, agent_lower, source_format=source_format, source_file=(f"{domain}/{source_file}" if domain else source_file))
         claim_files.append({"filename": filename, "domain": c.get("domain", domain), "content": content})
 
     # Build entity file contents
diff --git a/lib/merge.py b/lib/merge.py
index 0e4ccef..23498bd 100644
--- a/lib/merge.py
+++ b/lib/merge.py
@@ -436,6 +436,7 @@ from .frontmatter import (
     serialize_frontmatter,
 )
 from .post_merge import (
+    backlink_source_claims,
     embed_merged_claims,
     reciprocal_edges,
     archive_source_for_pr,
@@ -855,6 +856,12 @@ async def _merge_domain_queue(conn, domain: str) -> tuple[int, int]:
     # Archive source file (closes near-duplicate loop; Ganymede review)
     archive_source_for_pr(branch, domain)
 
+    # Backlink: update source files with claims_extracted refs
+    try:
+        await backlink_source_claims(main_sha, branch_sha, _git)
+    except Exception:
+        logger.exception("PR #%d: backlink_source_claims failed (non-fatal)", pr_num)
+
     # Embed new/changed claims into Qdrant (non-fatal)
     await embed_merged_claims(main_sha, branch_sha, _git)
 
diff --git a/lib/post_merge.py b/lib/post_merge.py
index 9d1cf2f..ea79f02 100644
--- a/lib/post_merge.py
+++ b/lib/post_merge.py
@@ -13,6 +13,7 @@ import logging
 import os
 import re
 import shutil
+from pathlib import Path
 from typing import Callable
 
 from . import config
@@ -295,6 +296,139 @@ async def reciprocal_edges(main_sha: str, branch_sha: str, git_fn: Callable):
         logger.exception("reciprocal_edges: failed (non-fatal)")
 
 
+async def backlink_source_claims(main_sha: str, branch_sha: str, git_fn: Callable):
+    """After merge, update source files with claims_extracted backlinks.
+
+    Reads sourced_from from merged claim frontmatter, finds the source file,
+    and appends the claim filename to its claims_extracted list.
+    Only runs for newly added claims (diff-filter=A).
+    """
+    try:
+        rc, diff_out = await git_fn(
+            "diff", "--name-only", "--diff-filter=A",
+            main_sha, branch_sha,
+            cwd=str(config.MAIN_WORKTREE),
+            timeout=10,
+        )
+        if rc != 0:
+            logger.warning("backlink_source_claims: diff failed (rc=%d), skipping", rc)
+            return
+
+        claim_dirs = {"domains/", "core/", "foundations/"}
+        new_claims = [
+            f for f in diff_out.strip().split("\n")
+            if f.endswith(".md")
+            and any(f.startswith(d) for d in claim_dirs)
+            and not f.split("/")[-1].startswith("_")
+            and "/entities/" not in f
+            and "/decisions/" not in f
+        ]
+
+        if not new_claims:
+            return
+
+        modified_sources = {}  # source path -> rewritten content; written under the lock below
+        for claim_path in new_claims:
+            full_path = config.MAIN_WORKTREE / claim_path
+            if not full_path.exists():
+                continue
+
+            try:
+                content = full_path.read_text()
+            except Exception:
+                continue
+
+            fm, raw_fm, body = parse_yaml_frontmatter(content)
+            if fm is None:
+                continue
+
+            sourced_from = fm.get("sourced_from", "")
+            if not sourced_from:
+                continue
+
+            source_path = config.MAIN_WORKTREE / "inbox" / "archive" / sourced_from
+            if not source_path.exists():
+                logger.debug("backlink_source_claims: source %s not found at %s", sourced_from, source_path)
+                continue
+
+            claim_filename = Path(claim_path).stem  # strip only the final ".md", not every occurrence
+
+            try:
+                source_content = source_path.read_text()
+            except Exception:
+                continue
+
+            source_fm, source_raw_fm, source_body = parse_yaml_frontmatter(source_content)
+            if source_fm is None:
+                continue
+
+            existing_claims = source_fm.get("claims_extracted", [])
+            if isinstance(existing_claims, str):
+                existing_claims = [existing_claims]
+            if not isinstance(existing_claims, list):
+                existing_claims = []
+
+            if claim_filename in existing_claims:
+                continue
+
+            existing_claims.append(claim_filename)
+            new_block = "claims_extracted:\n" + "\n".join(f"- {c}" for c in existing_claims)
+
+            lines = source_content.split("\n")
+            if "claims_extracted:" not in source_content:  # no list yet: insert before closing ---
+                end_idx = None
+                for i, line in enumerate(lines):
+                    if i > 0 and line.strip() == "---":
+                        end_idx = i
+                        break
+                if end_idx is None:
+                    continue
+                lines.insert(end_idx, new_block)
+            else:  # rebuild the existing key and its items in place
+                start_idx = None
+                end_idx = None
+                for i, line in enumerate(lines):
+                    if line.startswith("claims_extracted:"):
+                        start_idx = i
+                    elif start_idx is not None and not line.lstrip().startswith("- "):  # tolerate indented items
+                        end_idx = i
+                        break
+                if start_idx is None:
+                    continue
+                if end_idx is None:
+                    end_idx = len(lines)
+                lines[start_idx:end_idx] = new_block.split("\n")
+
+            modified_sources[str(source_path)] = "\n".join(lines)
+            logger.info("backlink_source_claims: added %s to %s", claim_filename, sourced_from)
+
+        if modified_sources:
+            async with async_main_worktree_lock():
+                for sp, content in modified_sources.items():
+                    Path(sp).write_text(content)
+                    await git_fn("add", sp, cwd=str(config.MAIN_WORKTREE), timeout=10)
+                rc, out = await git_fn(
+                    "commit", "-m", f"backlink: update claims_extracted on {len(modified_sources)} source(s)",
+                    cwd=str(config.MAIN_WORKTREE),
+                    timeout=15,
+                )
+                if rc == 0:
+                    push_rc, push_out = await git_fn(
+                        "push", "origin", "main",
+                        cwd=str(config.MAIN_WORKTREE),
+                        timeout=30,
+                    )
+                    if push_rc == 0:
+                        logger.info("backlink_source_claims: %d source(s) updated and pushed", len(modified_sources))
+                    else:
+                        logger.warning("backlink_source_claims: push failed: %s", push_out[:200])
+                else:
+                    logger.warning("backlink_source_claims: commit failed: %s", out[:200])
%s", out[:200]) + + except Exception: + logger.exception("backlink_source_claims: failed (non-fatal)") + + def archive_source_for_pr(branch: str, domain: str, merged: bool = True): """Move source from queue/ to archive/{domain}/ after PR merge or close.