# teleo-codex/ops/pipeline-v2/lib/extraction_prompt.py

"""Lean extraction prompt — judgment only, mechanical rules in code.

The extraction prompt focuses on WHAT to extract:
- Separate facts from claims from enrichments
- Classify confidence honestly
- Identify entity data
- Check for duplicates against KB index

Mechanical enforcement (frontmatter format, wiki links, dates, filenames)
is handled by post_extract.py AFTER the LLM returns.

Design principle (Leo): mechanical rules in code, judgment in prompts.
Epimetheus owns this module. Leo reviews changes.
"""

from datetime import date


def build_extraction_prompt(
    source_file: str,
    source_content: str,
    domain: str,
    agent: str,
    kb_index: str,
    *,
    today: str | None = None,
    rationale: str | None = None,
    intake_tier: str | None = None,
    proposed_by: str | None = None,
    prior_art: list[dict] | None = None,
    previous_feedback: dict | None = None,
) -> str:
    """Build the lean extraction prompt.

    Args:
        source_file: Path to the source being extracted
        source_content: Full text of the source
        domain: Primary domain for this source
        agent: Agent name performing extraction
        kb_index: Pre-generated KB index text (claim titles for dedup)
        today: Override date for testing (default: today)
        rationale: Contributor's natural-language thesis about the source (optional)
        intake_tier: undirected | directed | challenge (optional)
        proposed_by: Contributor handle who submitted the source (optional)
        prior_art: Qdrant search results — existing claims semantically similar to this source.
                   Each dict has: claim_title, claim_path, description, score.
                   Injected as connection candidates for extract-time linking.

    Returns:
        The complete prompt string
    """
    today = today or date.today().isoformat()

    # Build contributor directive section (if rationale provided)
    if rationale and rationale.strip():
        contributor_name = proposed_by or "a contributor"
        tier_label = intake_tier or "directed"
        contributor_directive = f"""
## Contributor Directive (intake_tier: {tier_label})

**{contributor_name}** submitted this source and said:

> {rationale.strip()}

This is an extraction directive — use it to focus your extraction:
- Extract claims that relate to the contributor's thesis
- If the source SUPPORTS their thesis, extract the supporting evidence as claims
- If the source CONTRADICTS their thesis, extract the contradiction — that's even more valuable
- Evaluate whether the contributor's own thesis is extractable as a standalone claim
  - If specific enough to disagree with and supported by the source: extract it with `source: "{contributor_name}, original analysis"`
  - If too vague or already in the KB: use it as a directive only
- If the contributor references existing claims ("I disagree with X"), identify those claims by filename from the KB index and include them in the `challenges` field
- ALSO extract anything else valuable in the source — the directive is a spotlight, not a filter

Set `contributor_thesis_extractable: true` if you extracted the contributor's thesis as a claim, `false` otherwise.
"""
    else:
        contributor_directive = ""

    # Build previous feedback section (for re-extraction after eval rejection)
    if previous_feedback:
        issues = previous_feedback.get("issues", [])
        leo_verdict = previous_feedback.get("leo", "")
        domain_verdict = previous_feedback.get("domain", "")
        feedback_lines = [
            "\n## Previous Extraction Feedback\n",
            "A previous extraction from this source was **rejected** by the evaluation pipeline.",
            "Learn from these issues and avoid repeating them:\n",
        ]
        if issues:
            for issue in issues:
                issue_guidance = {
                    "frontmatter_schema": "Fix frontmatter format — ensure all required fields are present and correctly typed.",
                    "title_overclaims": "Make titles more precise — avoid broad generalizations. The title must be specific enough to disagree with.",
                    "confidence_miscalibration": "Calibrate confidence honestly — single source = experimental at most. Don't mark speculative claims as likely.",
                    "factual_discrepancy": "Check facts carefully — verify dates, numbers, and attributions against the source text.",
                    "near_duplicate": "Check the KB index more carefully — this claim may already exist. Prefer enrichment over duplication.",
                    "scope_error": "Scope claims correctly — don't mix structural, functional, and causal claims in one.",
                    "broken_wiki_links": "Ensure wiki links reference real entities/claims in the KB.",
                }
                guidance = issue_guidance.get(issue, f"Address: {issue}")
                feedback_lines.append(f"- **{issue}**: {guidance}")
        feedback_lines.append("")
        if leo_verdict == "request_changes":
            feedback_lines.append("The lead reviewer requested changes. Extract fewer, higher-quality claims.")
        if domain_verdict == "request_changes":
            feedback_lines.append("The domain reviewer requested changes. Pay closer attention to domain-specific standards.")
        feedback_lines.append("")
        previous_feedback_section = "\n".join(feedback_lines)
    else:
        previous_feedback_section = ""

    # Build connection candidates section (if prior art found via Qdrant)
    if prior_art:
        pa_lines = [
            "\n## Connection Candidates (semantically similar existing claims)\n",
            "These existing claims are topically related to this source. For each NEW claim you extract,",
            "check this list and specify connections in the `connections` array.\n",
        ]
        for i, pa in enumerate(prior_art[:10], 1):
            title = pa.get("claim_title", "untitled")
            path = pa.get("claim_path", "")
            desc = pa.get("description", "")
            score = pa.get("score", 0)
            filename = path.rsplit("/", 1)[-1].replace(".md", "") if path else title
            pa_lines.append(f"{i}. **{title}** (`{filename}`, similarity: {score:.2f})")
            if desc:
                pa_lines.append(f"   {desc}")
        pa_lines.append("")
        connection_candidates = "\n".join(pa_lines)
    else:
        connection_candidates = ""

    return f"""You are {agent}, extracting knowledge from a source for TeleoHumanity's collective knowledge base.

## Your Task

Read the source below. Be SELECTIVE — extract only what genuinely expands the KB's understanding. Most sources produce 0-3 claims. A source that produces 5+ claims is almost certainly over-extracting.

For each insight, classify it as one of:

**CLAIM** — An arguable proposition someone could disagree with. Must name a specific mechanism.
- Good: "futarchy is manipulation-resistant because attack attempts create profitable opportunities for defenders"
- Bad: "futarchy has interesting governance properties"
- Test: "This note argues that [title]" must work as a sentence.
- MAXIMUM 3-5 claims per source. If you find more, keep only the most novel and surprising.

**ENRICHMENT** — New evidence that strengthens, challenges, or extends an existing claim in the KB.
- If an insight supports something already in the KB index below, it's an enrichment, NOT a new claim.
- Enrichment over duplication: ALWAYS prefer adding evidence to an existing claim.
- Most sources should produce more enrichments than new claims.

**ENTITY** — Factual data about a company, protocol, person, organization, or market. Not arguable.
- Entity types: company, person, protocol, organization, market (core). Domain-specific: lab, fund, token, exchange, therapy, research_program, benchmark.
- One file per entity. If the entity already exists, append a timeline entry — don't create a new file.
- New entities: raised real capital (>$10K), launched a product, or discussed by 2+ sources.
- Skip: test proposals, spam, trivial projects.
- Filing: `entities/{{domain}}/{{entity-name}}.md`

**DECISION** — A governance decision, futarchic proposal, funding vote, or policy action. Separate from entities.
- Decisions are events with terminal states (passed/failed/expired). Entities are persistent objects.
- Each significant decision gets its own file in `decisions/{{domain}}/`.
- ALSO output a timeline entry for the parent entity: `- **YYYY-MM-DD** — [[decision-filename]] Outcome: one-line summary`
- Only extract a CLAIM from a decision if it reveals a novel MECHANISM INSIGHT (~1 per 10-15 decisions).
- Routine decisions (minor budgets, operational tweaks, uncontested votes) → timeline entry on parent entity only, no decision file.
- Filing: `decisions/{{domain}}/{{parent}}-{{slug}}.md`

**FACT** — A verifiable data point no one would disagree with. Store in source notes, not as a claim.
- "Jupiter DAO vote reached 75% support" is a fact, not a claim.
- Individual data points about specific events are facts. Generalizable patterns from multiple data points are claims.

## Selectivity Rules

**Novelty gate — argument, not topic:** Before extracting a claim, check the KB index below. The question is NOT "does the KB cover this topic?" but "does the KB already make THIS SPECIFIC ARGUMENT?" A new argument in a well-covered topic IS a new claim. A new data point supporting an existing argument is an enrichment.
- New data point for existing argument → ENRICHMENT (add evidence to existing claim)
- New argument the KB doesn't have yet → CLAIM (even if the topic is well-covered)
- Same argument with different wording → ENRICHMENT (don't create near-duplicates)

**Challenge premium:** A single well-evidenced claim that challenges an existing KB position is worth more than 10 claims that confirm what we already know. Prioritize extraction of counter-evidence and boundary conditions.

**What would change an agent's mind?** Ask this for every potential claim. If the answer is "nothing — this is more evidence for what we already believe," it's an enrichment. If the answer is "this introduces a mechanism or argument we haven't considered," it's a claim.

## Confidence Calibration

Be honest about uncertainty:
- **proven**: Multiple independent confirmations, tested against challenges
- **likely**: 3+ corroborating sources with empirical data
- **experimental**: 1-2 sources with data, or strong theoretical argument
- **speculative**: Theory without data, single anecdote, or self-reported company claims

Single source = experimental at most. Pitch rhetoric or marketing copy = speculative.

## Source

**File:** {source_file}

{source_content}
{contributor_directive}{previous_feedback_section}{connection_candidates}
## KB Index (existing claims — check for duplicates and enrichment targets)

{kb_index}

## Output Format

Return valid JSON. The post-processor handles frontmatter formatting, wiki links, and dates — focus on the intellectual content.

```json
{{
  "claims": [
    {{
      "filename": "descriptive-slug-matching-the-claim.md",
      "domain": "{domain}",
      "title": "Prose claim title that is specific enough to disagree with",
      "description": "One sentence adding context beyond the title",
      "confidence": "experimental",
      "source": "author/org, key evidence reference",
      "body": "Argument with evidence. Cite specific data, quotes, studies from the source. Explain WHY the claim is supported. This must be a real argument, not a restatement of the title.",
      "related_claims": ["existing-claim-stem-from-kb-index"],
      "connections": [
        {{
          "target": "existing-claim-filename-from-connection-candidates-or-kb-index",
          "relationship": "supports|challenges|related",
          "reason": "One sentence: WHY does this claim support/challenge/relate to the target?"
        }}
      ],
      "scope": "structural|functional|causal|correlational",
      "sourcer": "handle or name of the original author/source (e.g., @theiaresearch, Pine Analytics)"
    }}
  ],
  "enrichments": [
    {{
      "target_file": "existing-claim-filename.md",
      "type": "confirm|challenge|extend",
      "evidence": "The new evidence from this source",
      "source_ref": "Brief source reference"
    }}
  ],
  "entities": [
    {{
      "filename": "entity-name.md",
      "domain": "{domain}",
      "action": "create|update",
      "entity_type": "company|person|protocol|organization|market|lab|fund|research_program",
      "content": "Full markdown for new entities. For updates, leave empty.",
      "timeline_entry": "- **YYYY-MM-DD** — Event with specifics"
    }}
  ],
  "decisions": [
    {{
      "filename": "parent-slug-decision-slug.md",
      "domain": "{domain}",
      "parent_entity": "parent-entity-filename.md",
      "status": "passed|failed|active",
      "category": "treasury|fundraise|hiring|mechanism|liquidation|grants|strategy",
      "summary": "One-sentence description of the decision",
      "content": "Full markdown for significant decisions. Empty for routine ones.",
      "parent_timeline_entry": "- **YYYY-MM-DD** — [[decision-filename]] Passed: one-line summary"
    }}
  ],
  "facts": [
    "Verifiable data points to store in source archive notes"
  ],
  "extraction_notes": "Brief summary: N claims, N enrichments, N entities, N decisions. What was most interesting.",
  "contributor_thesis_extractable": false
}}
```

## Rules

1. **Quality over quantity.** 0-3 precise claims beats 8 vague ones. If you can't name the specific mechanism in the title, don't extract it. Empty claims arrays are fine — not every source produces novel claims.
2. **Enrichment over duplication.** Check the KB index FIRST. If something similar exists, add evidence to it. New claims are only for genuinely novel propositions.
3. **Facts are not claims.** Individual data points go in `facts`. Only generalized patterns from multiple data points become claims.
4. **Proposals are entities, not claims.** A governance proposal, token launch, or funding event is structured data (entity). Only extract a claim if the event reveals a novel mechanism insight that generalizes beyond this specific case.
5. **Scope your claims.** Say whether you're claiming a structural, functional, causal, or correlational relationship.
6. **Connect your claims.** For every new claim, check the Connection Candidates list. If a candidate is related, add it to the `connections` array with the relationship type and a one-sentence reason. Use `supports` when your claim provides evidence for the target, `challenges` when it contradicts, `related` only as a last resort. Unconnected claims are orphans — connect them at birth.
7. **OPSEC.** Never extract specific dollar amounts, valuations, equity percentages, or deal terms for LivingIP/Teleo. General market data is fine.
8. **Read the Agent Notes.** If the source has "Agent Notes" or "Curator Notes" sections, they contain context about why this source matters.

Return valid JSON only. No markdown fencing, no explanation outside the JSON.
"""


def build_entity_enrichment_prompt(
    entity_file: str,
    entity_content: str,
    new_data: list[dict],
    domain: str,
) -> str:
    """Assemble the prompt that asks an agent to merge timeline entries into an entity.

    Kept apart from claim extraction (runs on main, not the extraction branch)
    to avoid merge conflicts: entity enrichments are additive timeline
    entries — commutative and auto-mergeable.

    Args:
        entity_file: Path to the entity being enriched
        entity_content: Current content of the entity file
        new_data: Timeline entries from recent extractions. Each dict is read
                  for "source" (rendered as "?" when absent) and
                  "timeline_entry" (rendered as empty when absent).
        domain: Entity domain (accepted but not referenced in the prompt text)

    Returns:
        The entity enrichment prompt string
    """
    # One two-line bullet per incoming record: where it came from, then the entry.
    bullets = []
    for record in new_data:
        origin = record.get("source", "?")
        entry = record.get("timeline_entry", "")
        bullets.append(f"- Source: {origin}\n  Entry: {entry}")
    data_points = "\n".join(bullets)

    return f"""You are a Teleo knowledge base agent. Merge these new timeline entries into an existing entity.

## Current Entity: {entity_file}

{entity_content}

## New Data Points

{data_points}

## Rules

1. Append new entries to the Timeline section in chronological order
2. Deduplicate: skip entries that describe events already in the timeline
3. Preserve all existing content — append only
4. If a new data point updates a metric (revenue, valuation, user count), add it as a new timeline entry, don't modify existing entries

Return the complete updated entity file content.
"""