From 687f3d3151a7a38d1fb20073b2e17568afcbb397 Mon Sep 17 00:00:00 2001
From: m3taversal <m3taversal@gmail.com>
Date: Thu, 16 Apr 2026 14:28:48 +0100
Subject: [PATCH] fix: prevent broken wiki links in extraction (226 rejections)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two changes to address the #1 rejection reason:

1. extraction_prompt.py: Explicitly tell the LLM NOT to use
   [[wiki links]] in body text — use the connections/related_claims
   JSON fields instead. Remove the misleading "post-processor handles
   wiki links" language.

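2. extract.py _get_kb_index(): Expand the KB index to include entity
   stems from entities/{domain}/ so the LLM knows which entities exist
   when building connections. Previously the index listed only domain
   claims. Claims are now also listed by file stem (without the .md
   extension), matching the entity entries.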

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 lib/extract.py           | 18 ++++++++++++++++--
 lib/extraction_prompt.py | 10 ++++++----
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/lib/extract.py b/lib/extract.py
index 378cd11..b1d06f2 100644
--- a/lib/extract.py
+++ b/lib/extract.py
@@ -101,14 +101,28 @@ def _get_kb_index(domain: str) -> str:
 
     # Fallback: build from repo
     main = config.MAIN_WORKTREE
+    sections = []
+
+    # Domain claims
     claims = []
     domain_dir = main / "domains" / domain
     if domain_dir.is_dir():
         for f in domain_dir.glob("*.md"):
             if not f.name.startswith("_"):
-                claims.append(f"- {f.name}")
+                claims.append(f"- {f.stem}")
+    sections.append(f"## Claims in domains/{domain}/\n" + "\n".join(sorted(claims)))
 
-    text = f"## Claims in domains/{domain}/\n" + "\n".join(sorted(claims))
+    # Domain entities — so the LLM knows what entities exist for connections
+    entities = []
+    entity_dir = main / "entities" / domain
+    if entity_dir.is_dir():
+        for f in entity_dir.glob("*.md"):
+            if not f.name.startswith("_"):
+                entities.append(f"- {f.stem}")
+    if entities:
+        sections.append(f"## Entities in entities/{domain}/\n" + "\n".join(sorted(entities)))
+
+    text = "\n\n".join(sections)
     _kb_index_cache[domain] = text
     return text
 
diff --git a/lib/extraction_prompt.py b/lib/extraction_prompt.py
index 8502f10..e48bb5e 100644
--- a/lib/extraction_prompt.py
+++ b/lib/extraction_prompt.py
@@ -6,7 +6,7 @@ The extraction prompt focuses on WHAT to extract:
 - Identify entity data
 - Check for duplicates against KB index
 
-Mechanical enforcement (frontmatter format, wiki links, dates, filenames)
+Mechanical enforcement (frontmatter format, dates, filenames)
 is handled by post_extract.py AFTER the LLM returns.
 
 Design principle (Leo): mechanical rules in code, judgment in prompts.
@@ -98,7 +98,7 @@ Set `contributor_thesis_extractable: true` if you extracted the contributor's th
                     "factual_discrepancy": "Check facts carefully — verify dates, numbers, and attributions against the source text.",
                     "near_duplicate": "Check the KB index more carefully — this claim may already exist. Prefer enrichment over duplication.",
                     "scope_error": "Scope claims correctly — don't mix structural, functional, and causal claims in one.",
-                    "broken_wiki_links": "Ensure wiki links reference real entities/claims in the KB.",
+                    "broken_wiki_links": "Do NOT use [[wiki links]] in body text. Use the connections and related_claims JSON fields instead.",
                 }
                 guidance = issue_guidance.get(issue, f"Address: {issue}")
                 feedback_lines.append(f"- **{issue}**: {guidance}")
@@ -281,13 +281,15 @@ Single source = experimental at most. Pitch rhetoric or marketing copy = specula
 
 {source_content}
 {conversation_section}{contributor_directive}{previous_feedback_section}{connection_candidates}
-## KB Index (existing claims — check for duplicates and enrichment targets)
+## KB Index (existing claims and entities — check for duplicates, enrichment targets, and connections)
 
 {kb_index}
 
 ## Output Format
 
-Return valid JSON. The post-processor handles frontmatter formatting, wiki links, and dates — focus on the intellectual content.
+Return valid JSON. The post-processor handles frontmatter formatting and dates — focus on the intellectual content.
+
+**Do NOT use [[wiki links]] in body text.** Express all cross-references through the `connections` and `related_claims` JSON fields instead. Inline [[links]] are stripped by the post-processor — use the structured JSON fields which capture relationship type and reason.
 
 ```json
 {{