Imports 67 files from VPS (/opt/teleo-eval/) into repo as the single source of truth. Previously only 8 of 67 files existed in repo — the rest were deployed directly to VPS via SCP, causing massive drift. Includes: - pipeline/lib/: 33 Python modules (daemon core, extraction, evaluation, merge, cascade, cross-domain, costs, attribution, etc.) - pipeline/: main daemon (teleo-pipeline.py), reweave.py, batch-extract-50.sh - diagnostics/: 19 files (4-page dashboard, alerting, daily digest, review queue, tier1 metrics) - agent-state/: bootstrap, lib-state, cascade inbox processor, schema - systemd/: service unit files for reference - deploy.sh: rsync-based deploy with --dry-run, syntax checks, dirty-tree gate - research-session.sh: updated with Step 8.5 digest + cascade inbox processing No new code written — all files are exact copies from VPS as of 2026-04-06. From this point forward: edit in repo, commit, then deploy.sh. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
326 lines
16 KiB
Python
326 lines
16 KiB
Python
"""Lean extraction prompt — judgment only, mechanical rules in code.
|
|
|
|
The extraction prompt focuses on WHAT to extract:
|
|
- Separate facts from claims from enrichments
|
|
- Classify confidence honestly
|
|
- Identify entity data
|
|
- Check for duplicates against KB index
|
|
|
|
Mechanical enforcement (frontmatter format, wiki links, dates, filenames)
|
|
is handled by post_extract.py AFTER the LLM returns.
|
|
|
|
Design principle (Leo): mechanical rules in code, judgment in prompts.
|
|
Epimetheus owns this module. Leo reviews changes.
|
|
"""
|
|
|
|
from datetime import date
|
|
|
|
|
|
def build_extraction_prompt(
|
|
source_file: str,
|
|
source_content: str,
|
|
domain: str,
|
|
agent: str,
|
|
kb_index: str,
|
|
*,
|
|
today: str | None = None,
|
|
rationale: str | None = None,
|
|
intake_tier: str | None = None,
|
|
proposed_by: str | None = None,
|
|
prior_art: list[dict] | None = None,
|
|
previous_feedback: dict | None = None,
|
|
) -> str:
|
|
"""Build the lean extraction prompt.
|
|
|
|
Args:
|
|
source_file: Path to the source being extracted
|
|
source_content: Full text of the source
|
|
domain: Primary domain for this source
|
|
agent: Agent name performing extraction
|
|
kb_index: Pre-generated KB index text (claim titles for dedup)
|
|
today: Override date for testing (default: today)
|
|
rationale: Contributor's natural-language thesis about the source (optional)
|
|
intake_tier: undirected | directed | challenge (optional)
|
|
proposed_by: Contributor handle who submitted the source (optional)
|
|
prior_art: Qdrant search results — existing claims semantically similar to this source.
|
|
Each dict has: claim_title, claim_path, description, score.
|
|
Injected as connection candidates for extract-time linking.
|
|
|
|
Returns:
|
|
The complete prompt string
|
|
"""
|
|
today = today or date.today().isoformat()
|
|
|
|
# Build contributor directive section (if rationale provided)
|
|
if rationale and rationale.strip():
|
|
contributor_name = proposed_by or "a contributor"
|
|
tier_label = intake_tier or "directed"
|
|
contributor_directive = f"""
|
|
## Contributor Directive (intake_tier: {tier_label})
|
|
|
|
**{contributor_name}** submitted this source and said:
|
|
|
|
> {rationale.strip()}
|
|
|
|
This is an extraction directive — use it to focus your extraction:
|
|
- Extract claims that relate to the contributor's thesis
|
|
- If the source SUPPORTS their thesis, extract the supporting evidence as claims
|
|
- If the source CONTRADICTS their thesis, extract the contradiction — that's even more valuable
|
|
- Evaluate whether the contributor's own thesis is extractable as a standalone claim
|
|
- If specific enough to disagree with and supported by the source: extract it with `source: "{contributor_name}, original analysis"`
|
|
- If too vague or already in the KB: use it as a directive only
|
|
- If the contributor references existing claims ("I disagree with X"), identify those claims by filename from the KB index and include them in the `challenges` field
|
|
- ALSO extract anything else valuable in the source — the directive is a spotlight, not a filter
|
|
|
|
Set `contributor_thesis_extractable: true` if you extracted the contributor's thesis as a claim, `false` otherwise.
|
|
"""
|
|
else:
|
|
contributor_directive = ""
|
|
|
|
# Build previous feedback section (for re-extraction after eval rejection)
|
|
if previous_feedback:
|
|
issues = previous_feedback.get("issues", [])
|
|
leo_verdict = previous_feedback.get("leo", "")
|
|
domain_verdict = previous_feedback.get("domain", "")
|
|
feedback_lines = [
|
|
"\n## Previous Extraction Feedback\n",
|
|
"A previous extraction from this source was **rejected** by the evaluation pipeline.",
|
|
"Learn from these issues and avoid repeating them:\n",
|
|
]
|
|
if issues:
|
|
for issue in issues:
|
|
issue_guidance = {
|
|
"frontmatter_schema": "Fix frontmatter format — ensure all required fields are present and correctly typed.",
|
|
"title_overclaims": "Make titles more precise — avoid broad generalizations. The title must be specific enough to disagree with.",
|
|
"confidence_miscalibration": "Calibrate confidence honestly — single source = experimental at most. Don't mark speculative claims as likely.",
|
|
"factual_discrepancy": "Check facts carefully — verify dates, numbers, and attributions against the source text.",
|
|
"near_duplicate": "Check the KB index more carefully — this claim may already exist. Prefer enrichment over duplication.",
|
|
"scope_error": "Scope claims correctly — don't mix structural, functional, and causal claims in one.",
|
|
"broken_wiki_links": "Ensure wiki links reference real entities/claims in the KB.",
|
|
}
|
|
guidance = issue_guidance.get(issue, f"Address: {issue}")
|
|
feedback_lines.append(f"- **{issue}**: {guidance}")
|
|
feedback_lines.append("")
|
|
if leo_verdict == "request_changes":
|
|
feedback_lines.append("The lead reviewer requested changes. Extract fewer, higher-quality claims.")
|
|
if domain_verdict == "request_changes":
|
|
feedback_lines.append("The domain reviewer requested changes. Pay closer attention to domain-specific standards.")
|
|
feedback_lines.append("")
|
|
previous_feedback_section = "\n".join(feedback_lines)
|
|
else:
|
|
previous_feedback_section = ""
|
|
|
|
# Build connection candidates section (if prior art found via Qdrant)
|
|
if prior_art:
|
|
pa_lines = [
|
|
"\n## Connection Candidates (semantically similar existing claims)\n",
|
|
"These existing claims are topically related to this source. For each NEW claim you extract,",
|
|
"check this list and specify connections in the `connections` array.\n",
|
|
]
|
|
for i, pa in enumerate(prior_art[:10], 1):
|
|
title = pa.get("claim_title", "untitled")
|
|
path = pa.get("claim_path", "")
|
|
desc = pa.get("description", "")
|
|
score = pa.get("score", 0)
|
|
filename = path.rsplit("/", 1)[-1].replace(".md", "") if path else title
|
|
pa_lines.append(f"{i}. **{title}** (`{filename}`, similarity: {score:.2f})")
|
|
if desc:
|
|
pa_lines.append(f" {desc}")
|
|
pa_lines.append("")
|
|
connection_candidates = "\n".join(pa_lines)
|
|
else:
|
|
connection_candidates = ""
|
|
|
|
return f"""You are {agent}, extracting knowledge from a source for TeleoHumanity's collective knowledge base.
|
|
|
|
## Your Task
|
|
|
|
Read the source below. Be SELECTIVE — extract only what genuinely expands the KB's understanding. Most sources produce 0-3 claims. A source that produces 5+ claims is almost certainly over-extracting.
|
|
|
|
For each insight, classify it as one of:
|
|
|
|
**CLAIM** — An arguable proposition someone could disagree with. Must name a specific mechanism.
|
|
- Good: "futarchy is manipulation-resistant because attack attempts create profitable opportunities for defenders"
|
|
- Bad: "futarchy has interesting governance properties"
|
|
- Test: "This note argues that [title]" must work as a sentence.
|
|
- MAXIMUM 3-5 claims per source. If you find more, keep only the most novel and surprising.
|
|
|
|
**ENRICHMENT** — New evidence that strengthens, challenges, or extends an existing claim in the KB.
|
|
- If an insight supports something already in the KB index below, it's an enrichment, NOT a new claim.
|
|
- Enrichment over duplication: ALWAYS prefer adding evidence to an existing claim.
|
|
- Most sources should produce more enrichments than new claims.
|
|
|
|
**ENTITY** — Factual data about a company, protocol, person, organization, or market. Not arguable.
|
|
- Entity types: company, person, protocol, organization, market (core). Domain-specific: lab, fund, token, exchange, therapy, research_program, benchmark.
|
|
- One file per entity. If the entity already exists, append a timeline entry — don't create a new file.
|
|
- New entities: raised real capital (>$10K), launched a product, or discussed by 2+ sources.
|
|
- Skip: test proposals, spam, trivial projects.
|
|
- Filing: `entities/{{domain}}/{{entity-name}}.md`
|
|
|
|
**DECISION** — A governance decision, futarchic proposal, funding vote, or policy action. Separate from entities.
|
|
- Decisions are events with terminal states (passed/failed/expired). Entities are persistent objects.
|
|
- Each significant decision gets its own file in `decisions/{{domain}}/`.
|
|
- ALSO output a timeline entry for the parent entity: `- **YYYY-MM-DD** — [[decision-filename]] Outcome: one-line summary`
|
|
- Only extract a CLAIM from a decision if it reveals a novel MECHANISM INSIGHT (~1 per 10-15 decisions).
|
|
- Routine decisions (minor budgets, operational tweaks, uncontested votes) → timeline entry on parent entity only, no decision file.
|
|
- Filing: `decisions/{{domain}}/{{parent}}-{{slug}}.md`
|
|
|
|
**FACT** — A verifiable data point no one would disagree with. Store in source notes, not as a claim.
|
|
- "Jupiter DAO vote reached 75% support" is a fact, not a claim.
|
|
- Individual data points about specific events are facts. Generalizable patterns from multiple data points are claims.
|
|
|
|
## Selectivity Rules
|
|
|
|
**Novelty gate — argument, not topic:** Before extracting a claim, check the KB index below. The question is NOT "does the KB cover this topic?" but "does the KB already make THIS SPECIFIC ARGUMENT?" A new argument in a well-covered topic IS a new claim. A new data point supporting an existing argument is an enrichment.
|
|
- New data point for existing argument → ENRICHMENT (add evidence to existing claim)
|
|
- New argument the KB doesn't have yet → CLAIM (even if the topic is well-covered)
|
|
- Same argument with different wording → ENRICHMENT (don't create near-duplicates)
|
|
|
|
**Challenge premium:** A single well-evidenced claim that challenges an existing KB position is worth more than 10 claims that confirm what we already know. Prioritize extraction of counter-evidence and boundary conditions.
|
|
|
|
**What would change an agent's mind?** Ask this for every potential claim. If the answer is "nothing — this is more evidence for what we already believe," it's an enrichment. If the answer is "this introduces a mechanism or argument we haven't considered," it's a claim.
|
|
|
|
## Confidence Calibration
|
|
|
|
Be honest about uncertainty:
|
|
- **proven**: Multiple independent confirmations, tested against challenges
|
|
- **likely**: 3+ corroborating sources with empirical data
|
|
- **experimental**: 1-2 sources with data, or strong theoretical argument
|
|
- **speculative**: Theory without data, single anecdote, or self-reported company claims
|
|
|
|
Single source = experimental at most. Pitch rhetoric or marketing copy = speculative.
|
|
|
|
## Source
|
|
|
|
**File:** {source_file}
|
|
|
|
{source_content}
|
|
{contributor_directive}{previous_feedback_section}{connection_candidates}
|
|
## KB Index (existing claims — check for duplicates and enrichment targets)
|
|
|
|
{kb_index}
|
|
|
|
## Output Format
|
|
|
|
Return valid JSON. The post-processor handles frontmatter formatting, wiki links, and dates — focus on the intellectual content.
|
|
|
|
```json
|
|
{{
|
|
"claims": [
|
|
{{
|
|
"filename": "descriptive-slug-matching-the-claim.md",
|
|
"domain": "{domain}",
|
|
"title": "Prose claim title that is specific enough to disagree with",
|
|
"description": "One sentence adding context beyond the title",
|
|
"confidence": "experimental",
|
|
"source": "author/org, key evidence reference",
|
|
"body": "Argument with evidence. Cite specific data, quotes, studies from the source. Explain WHY the claim is supported. This must be a real argument, not a restatement of the title.",
|
|
"related_claims": ["existing-claim-stem-from-kb-index"],
|
|
"connections": [
|
|
{{
|
|
"target": "existing-claim-filename-from-connection-candidates-or-kb-index",
|
|
"relationship": "supports|challenges|related",
|
|
"reason": "One sentence: WHY does this claim support/challenge/relate to the target?"
|
|
}}
|
|
],
|
|
"scope": "structural|functional|causal|correlational",
|
|
"sourcer": "handle or name of the original author/source (e.g., @theiaresearch, Pine Analytics)"
|
|
}}
|
|
],
|
|
"enrichments": [
|
|
{{
|
|
"target_file": "existing-claim-filename.md",
|
|
"type": "confirm|challenge|extend",
|
|
"evidence": "The new evidence from this source",
|
|
"source_ref": "Brief source reference"
|
|
}}
|
|
],
|
|
"entities": [
|
|
{{
|
|
"filename": "entity-name.md",
|
|
"domain": "{domain}",
|
|
"action": "create|update",
|
|
"entity_type": "company|person|protocol|organization|market|lab|fund|research_program",
|
|
"content": "Full markdown for new entities. For updates, leave empty.",
|
|
"timeline_entry": "- **YYYY-MM-DD** — Event with specifics"
|
|
}}
|
|
],
|
|
"decisions": [
|
|
{{
|
|
"filename": "parent-slug-decision-slug.md",
|
|
"domain": "{domain}",
|
|
"parent_entity": "parent-entity-filename.md",
|
|
"status": "passed|failed|active",
|
|
"category": "treasury|fundraise|hiring|mechanism|liquidation|grants|strategy",
|
|
"summary": "One-sentence description of the decision",
|
|
"content": "Full markdown for significant decisions. Empty for routine ones.",
|
|
"parent_timeline_entry": "- **YYYY-MM-DD** — [[decision-filename]] Passed: one-line summary"
|
|
}}
|
|
],
|
|
"facts": [
|
|
"Verifiable data points to store in source archive notes"
|
|
],
|
|
"extraction_notes": "Brief summary: N claims, N enrichments, N entities, N decisions. What was most interesting.",
|
|
"contributor_thesis_extractable": false
|
|
}}
|
|
```
|
|
|
|
## Rules
|
|
|
|
1. **Quality over quantity.** 0-3 precise claims beats 8 vague ones. If you can't name the specific mechanism in the title, don't extract it. Empty claims arrays are fine — not every source produces novel claims.
|
|
2. **Enrichment over duplication.** Check the KB index FIRST. If something similar exists, add evidence to it. New claims are only for genuinely novel propositions.
|
|
3. **Facts are not claims.** Individual data points go in `facts`. Only generalized patterns from multiple data points become claims.
|
|
4. **Proposals are entities, not claims.** A governance proposal, token launch, or funding event is structured data (entity). Only extract a claim if the event reveals a novel mechanism insight that generalizes beyond this specific case.
|
|
5. **Scope your claims.** Say whether you're claiming a structural, functional, causal, or correlational relationship.
|
|
6. **Connect your claims.** For every new claim, check the Connection Candidates list. If a candidate is related, add it to the `connections` array with the relationship type and a one-sentence reason. Use `supports` when your claim provides evidence for the target, `challenges` when it contradicts, `related` only as a last resort. Unconnected claims are orphans — connect them at birth.
|
|
7. **OPSEC.** Never extract specific dollar amounts, valuations, equity percentages, or deal terms for LivingIP/Teleo. General market data is fine.
|
|
8. **Read the Agent Notes.** If the source has "Agent Notes" or "Curator Notes" sections, they contain context about why this source matters.
|
|
|
|
Return valid JSON only. No markdown fencing, no explanation outside the JSON.
|
|
"""
|
|
|
|
|
|
def build_entity_enrichment_prompt(
    entity_file: str,
    entity_content: str,
    new_data: list[dict],
    domain: str,
) -> str:
    """Build prompt for batch entity enrichment (runs on main, not extraction branch).

    Kept apart from claim extraction to avoid merge conflicts: entity
    enrichments are additive timeline entries — commutative, auto-mergeable.

    Args:
        entity_file: Path to the entity being enriched
        entity_content: Current content of the entity file
        new_data: List of timeline entries from recent extractions. Each dict is
            read for "source" (rendered as "?" when missing) and
            "timeline_entry" (rendered as empty when missing).
        domain: Entity domain (accepted but not interpolated into the prompt)

    Returns:
        Prompt for entity enrichment
    """
    # One "Source / Entry" bullet per incoming data point, in input order.
    rendered_entries = []
    for item in new_data:
        origin = item.get('source', '?')
        entry = item.get('timeline_entry', '')
        rendered_entries.append(f"- Source: {origin}\n Entry: {entry}")
    entries_text = "\n".join(rendered_entries)

    return f"""You are a Teleo knowledge base agent. Merge these new timeline entries into an existing entity.

## Current Entity: {entity_file}

{entity_content}

## New Data Points

{entries_text}

## Rules

1. Append new entries to the Timeline section in chronological order
2. Deduplicate: skip entries that describe events already in the timeline
3. Preserve all existing content — append only
4. If a new data point updates a metric (revenue, valuation, user count), add it as a new timeline entry, don't modify existing entries

Return the complete updated entity file content.
"""
|