From 687f3d3151a7a38d1fb20073b2e17568afcbb397 Mon Sep 17 00:00:00 2001 From: m3taversal Date: Thu, 16 Apr 2026 14:28:48 +0100 Subject: [PATCH] fix: prevent broken wiki links in extraction (226 rejections) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two changes to address the #1 rejection reason: 1. extraction_prompt.py: Explicitly tell LLM NOT to use [[wiki links]] in body text — use connections/related_claims JSON fields instead. Remove misleading "post-processor handles wiki links" language. 2. extract.py _get_kb_index(): Expand KB index to include entity stems from entities/{domain}/ so the LLM knows what entities exist when building connections. Previously only showed domain claims. Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/extract.py | 18 ++++++++++++++++-- lib/extraction_prompt.py | 10 ++++++---- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/lib/extract.py b/lib/extract.py index 378cd11..b1d06f2 100644 --- a/lib/extract.py +++ b/lib/extract.py @@ -101,14 +101,28 @@ def _get_kb_index(domain: str) -> str: # Fallback: build from repo main = config.MAIN_WORKTREE + sections = [] + + # Domain claims claims = [] domain_dir = main / "domains" / domain if domain_dir.is_dir(): for f in domain_dir.glob("*.md"): if not f.name.startswith("_"): - claims.append(f"- {f.name}") + claims.append(f"- {f.stem}") + sections.append(f"## Claims in domains/{domain}/\n" + "\n".join(sorted(claims))) - text = f"## Claims in domains/{domain}/\n" + "\n".join(sorted(claims)) + # Domain entities — so the LLM knows what entities exist for connections + entities = [] + entity_dir = main / "entities" / domain + if entity_dir.is_dir(): + for f in entity_dir.glob("*.md"): + if not f.name.startswith("_"): + entities.append(f"- {f.stem}") + if entities: + sections.append(f"## Entities in entities/{domain}/\n" + "\n".join(sorted(entities))) + + text = "\n\n".join(sections) _kb_index_cache[domain] = text return text diff --git a/lib/extraction_prompt.py b/lib/extraction_prompt.py index 8502f10..e48bb5e 100644 --- a/lib/extraction_prompt.py +++ b/lib/extraction_prompt.py @@ -6,7 +6,7 @@ The extraction prompt focuses on WHAT to extract: - Identify entity data - Check for duplicates against KB index -Mechanical enforcement (frontmatter format, wiki links, dates, filenames) +Mechanical enforcement (frontmatter format, dates, filenames) is handled by post_extract.py AFTER the LLM returns. Design principle (Leo): mechanical rules in code, judgment in prompts. @@ -98,7 +98,7 @@ Set `contributor_thesis_extractable: true` if you extracted the contributor's th "factual_discrepancy": "Check facts carefully — verify dates, numbers, and attributions against the source text.", "near_duplicate": "Check the KB index more carefully — this claim may already exist. Prefer enrichment over duplication.", "scope_error": "Scope claims correctly — don't mix structural, functional, and causal claims in one.", - "broken_wiki_links": "Ensure wiki links reference real entities/claims in the KB.", + "broken_wiki_links": "Do NOT use [[wiki links]] in body text. Use the connections and related_claims JSON fields instead.", } guidance = issue_guidance.get(issue, f"Address: {issue}") feedback_lines.append(f"- **{issue}**: {guidance}") @@ -281,13 +281,15 @@ Single source = experimental at most. Pitch rhetoric or marketing copy = specula {source_content} {conversation_section}{contributor_directive}{previous_feedback_section}{connection_candidates} -## KB Index (existing claims — check for duplicates and enrichment targets) +## KB Index (existing claims and entities — check for duplicates, enrichment targets, and connections) {kb_index} ## Output Format -Return valid JSON. The post-processor handles frontmatter formatting, wiki links, and dates — focus on the intellectual content. +Return valid JSON. The post-processor handles frontmatter formatting and dates — focus on the intellectual content. + +**Do NOT use [[wiki links]] in body text.** Express all cross-references through the `connections` and `related_claims` JSON fields instead. Inline [[links]] are stripped by the post-processor — use the structured JSON fields which capture relationship type and reason. ```json {{