diff --git a/lib/extract.py b/lib/extract.py
index 378cd11..b1d06f2 100644
--- a/lib/extract.py
+++ b/lib/extract.py
@@ -101,14 +101,28 @@ def _get_kb_index(domain: str) -> str:
 
     # Fallback: build from repo
     main = config.MAIN_WORKTREE
+    sections = []
+
+    # Domain claims
     claims = []
     domain_dir = main / "domains" / domain
     if domain_dir.is_dir():
         for f in domain_dir.glob("*.md"):
             if not f.name.startswith("_"):
-                claims.append(f"- {f.name}")
+                claims.append(f"- {f.stem}")
+    sections.append(f"## Claims in domains/{domain}/\n" + "\n".join(sorted(claims)))
 
-    text = f"## Claims in domains/{domain}/\n" + "\n".join(sorted(claims))
+    # Domain entities — so the LLM knows what entities exist for connections
+    entities = []
+    entity_dir = main / "entities" / domain
+    if entity_dir.is_dir():
+        for f in entity_dir.glob("*.md"):
+            if not f.name.startswith("_"):
+                entities.append(f"- {f.stem}")
+    if entities:
+        sections.append(f"## Entities in entities/{domain}/\n" + "\n".join(sorted(entities)))
+
+    text = "\n\n".join(sections)
 
     _kb_index_cache[domain] = text
     return text
diff --git a/lib/extraction_prompt.py b/lib/extraction_prompt.py
index 8502f10..e48bb5e 100644
--- a/lib/extraction_prompt.py
+++ b/lib/extraction_prompt.py
@@ -6,7 +6,7 @@ The extraction prompt focuses on WHAT to extract:
 - Identify entity data
 - Check for duplicates against KB index
 
-Mechanical enforcement (frontmatter format, wiki links, dates, filenames)
+Mechanical enforcement (frontmatter format, dates, filenames)
 is handled by post_extract.py AFTER the LLM returns.
 
 Design principle (Leo): mechanical rules in code, judgment in prompts.
@@ -98,7 +98,7 @@ Set `contributor_thesis_extractable: true` if you extracted the contributor's th
         "factual_discrepancy": "Check facts carefully — verify dates, numbers, and attributions against the source text.",
         "near_duplicate": "Check the KB index more carefully — this claim may already exist. Prefer enrichment over duplication.",
         "scope_error": "Scope claims correctly — don't mix structural, functional, and causal claims in one.",
-        "broken_wiki_links": "Ensure wiki links reference real entities/claims in the KB.",
+        "broken_wiki_links": "Do NOT use [[wiki links]] in body text. Use the connections and related_claims JSON fields instead.",
     }
     guidance = issue_guidance.get(issue, f"Address: {issue}")
     feedback_lines.append(f"- **{issue}**: {guidance}")
@@ -281,13 +281,15 @@ Single source = experimental at most. Pitch rhetoric or marketing copy = specula
{source_content}
{conversation_section}{contributor_directive}{previous_feedback_section}{connection_candidates}
 
-## KB Index (existing claims — check for duplicates and enrichment targets)
+## KB Index (existing claims and entities — check for duplicates, enrichment targets, and connections)
 
 {kb_index}
 
 ## Output Format
 
-Return valid JSON. The post-processor handles frontmatter formatting, wiki links, and dates — focus on the intellectual content.
+Return valid JSON. The post-processor handles frontmatter formatting and dates — focus on the intellectual content.
+
+**Do NOT use [[wiki links]] in body text.** Express all cross-references through the `connections` and `related_claims` JSON fields instead. Inline [[links]] are stripped by the post-processor — use the structured JSON fields which capture relationship type and reason.
 
 ```json
 {{