diff --git a/lib/extract.py b/lib/extract.py index de6a8c9..cb68682 100644 --- a/lib/extract.py +++ b/lib/extract.py @@ -320,6 +320,7 @@ async def _extract_one_source( rationale = fm.get("rationale") intake_tier = fm.get("intake_tier") proposed_by = fm.get("proposed_by") + source_format = fm.get("format") logger.info("Extracting: %s (domain: %s, agent: %s)", source_file, domain, agent_name) @@ -343,6 +344,7 @@ async def _extract_one_source( proposed_by=proposed_by, prior_art=prior_art, previous_feedback=feedback, + source_format=source_format, ) # 4. Call LLM (OpenRouter — not Claude Max CLI) diff --git a/lib/extraction_prompt.py b/lib/extraction_prompt.py index 0ddea52..797f4d8 100644 --- a/lib/extraction_prompt.py +++ b/lib/extraction_prompt.py @@ -29,6 +29,7 @@ def build_extraction_prompt( proposed_by: str | None = None, prior_art: list[dict] | None = None, previous_feedback: dict | None = None, + source_format: str | None = None, ) -> str: """Build the lean extraction prompt. @@ -45,6 +46,7 @@ def build_extraction_prompt( prior_art: Qdrant search results — existing claims semantically similar to this source. Each dict has: claim_title, claim_path, description, score. Injected as connection candidates for extract-time linking. + source_format: Source format hint. When it equals "conversation" (case-insensitive, e.g. Telegram chats), the conversation-specific extraction rules section is inserted into the prompt; any other value or None adds nothing. Returns: The complete prompt string @@ -131,6 +133,65 @@ Set `contributor_thesis_extractable: true` if you extracted the contributor's th else: connection_candidates = "" + # Build conversation extraction section (for Telegram/chat sources) + if source_format and source_format.lower() == "conversation": + conversation_section = """ +## Conversation Source — Special Extraction Rules + +This source is a **conversation between a human domain expert and an AI agent**. +The extraction rules are DIFFERENT from article sources: + +### Who said what matters + +- **The human (@m3taversal / contributor)** is the domain expert. 
Their statements carry + authority — especially corrections, pushback, and factual assertions. +- **The AI agent's responses** are secondary. They are useful for context (what was being + discussed) and for confirming when the human's correction landed (look for "you're right", + "fair point", confidence drops). + +### Corrections are the HIGHEST-VALUE content + +When the human says "that's wrong", "not true", "you're wrong", "out of date", or similar: + +1. **Extract the correction as a claim or enrichment.** The human is correcting the KB's + understanding. This is precisely what the KB needs. +2. **The correction itself IS the claim.** "Curated launches had significantly more committed + capital than permissionless launches" is a testable, disagreeable proposition — extract it. +3. **Short corrections are HIGH value, not low value.** A 15-word correction that fixes a + factual error is worth more than a 500-word article that confirms what we already know. + NEVER null-result a conversation just because the human's message is short. +4. **Map corrections to existing claims.** Search the KB index for claims that the correction + challenges. Output as an ENRICHMENT with `type: "challenge"` if the target claim exists. + +### Bot LEARNING lines are extraction hints + +When the AI agent includes a `LEARNING:` line, it's a pre-extracted correction. Use it as +a starting point — but reformulate it as a proper claim (the LEARNING line is often too +casual or too specific to the conversation context). + +### Bot CONFIDENCE drops are signals + +When the AI agent drops its confidence score after a correction, that CONFIRMS the human +was right. Low confidence (0.3-0.5) after pushback = strong signal the correction is valid. + +### Anti-circularity rule + +If the AI agent is simply reflecting the human's thesis back (restating what the human said +in different words), do NOT extract that as a claim sourced from the agent. That's circular. 
+Only extract claims that either: +- Represent the human's ORIGINAL assertion (source it to the human) +- Introduce genuinely NEW information from the agent's knowledge (source it to the agent + context) + +### Retrieval-only conversations → null_result + +If the conversation is purely a lookup request ("what is X", "give me a list of Y", +"what's the market cap of Z") with no analytical content, corrections, or novel claims, +return an empty extraction (null_result). The dividing line: did the human ASSERT something +or only ASK something? +""" + else: + conversation_section = "" + return f"""You are {agent}, extracting knowledge from a source for TeleoHumanity's collective knowledge base. ## Your Task @@ -195,7 +256,7 @@ Single source = experimental at most. Pitch rhetoric or marketing copy = specula **File:** {source_file} {source_content} -{contributor_directive}{previous_feedback_section}{connection_candidates} +{conversation_section}{contributor_directive}{previous_feedback_section}{connection_candidates} ## KB Index (existing claims — check for duplicates and enrichment targets) {kb_index}