teleo-infrastructure/tests/test_post_extract.py
m3taversal 5f554bc2de
Some checks failed
CI / lint-and-test (pull_request) Has been cancelled
feat: atomic extract-and-connect + stale PR monitor + response audit
Atomic extract-and-connect (lib/connect.py):
- After extraction writes claim files, each new claim is embedded via
  OpenRouter, searched against Qdrant, and top-5 neighbors (cosine > 0.55)
  are added as `related` edges in the claim's frontmatter
- Edges written on NEW claim only — avoids merge conflicts
- Cross-domain connections enabled, non-fatal on Qdrant failure
- Wired into openrouter-extract-v2.py post-extraction step

Stale PR monitor (lib/stale_pr.py):
- Every watchdog cycle checks open extract/* PRs
- If open >30 min AND 0 claim files → auto-close with comment
- After 2 stale closures → marks source as extraction_failed
- Wired into watchdog.py as check #6

Response audit system:
- response_audit table (migration v8), persistent audit conn in bot.py
- 90-day retention cleanup, tool_calls JSON column
- Confidence tag stripping, systemd ReadWritePaths for pipeline.db

Supporting infrastructure:
- reweave.py: nightly edge reconnection for orphan claims
- reconcile-sources.py: source status reconciliation
- backfill-domains.py: domain classification backfill
- ops/reconcile-source-status.sh: operational reconciliation script
- Attribution improvements, post-extract enrichments, merge improvements

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-28 22:34:20 +00:00

614 lines
19 KiB
Python

"""Tests for post-extraction validator — the $0 mechanical quality gate.
Tests cover the fixers and validators that catch 73% of eval rejections:
- Frontmatter fixing (missing fields, wrong dates, invalid values)
- Wiki link stripping (broken links → plain text)
- Title validation (proposition check, word count)
- Duplicate detection (SequenceMatcher threshold)
- Entity validation (schema, decision_market fields)
- The full validate_and_fix_claims pipeline
"""
import pytest
from datetime import date
from lib.post_extract import (
parse_frontmatter,
fix_frontmatter,
fix_wiki_links,
fix_trailing_newline,
fix_h1_title_match,
validate_claim,
validate_and_fix_claims,
validate_and_fix_entities,
)
# ─── Fixtures ──────────────────────────────────────────────────────────────

# A fully well-formed claim file: complete frontmatter (type, domain,
# description, confidence, source, created), a propositional multi-word H1,
# a substantive body, and wiki links that all resolve against the
# `existing_claims` fixture below. `created` is templated to today's date so
# fix_frontmatter sees no date drift and applies zero fixes.
VALID_CLAIM = """---
type: claim
domain: internet-finance
description: "MetaDAO futarchy implementation demonstrates limited volume in uncontested decisions"
confidence: experimental
source: "Pine Analytics, Q4 2025 report"
created: {today}
---
# MetaDAO futarchy implementation shows limited trading volume in uncontested decisions
Analysis of MetaDAO proposal markets shows that uncontested decisions attract
minimal trading volume. When proposals have clear consensus (>80% pass rate),
conditional token markets see <$1000 in volume. This suggests futarchy's
information aggregation mechanism is most valuable when outcomes are uncertain.
Evidence from Pine Analytics Q4 2025 report shows 15 proposals with >80%
pass rate averaged $340 in total volume, while 3 contested proposals
averaged $45,000.
---
Relevant Notes:
- [[metadao]]
- [[futarchy-adoption-faces-friction]]
Topics:
- [[_map]]
""".format(today=date.today().isoformat())

# A claim missing most required frontmatter fields (description, confidence,
# source, created) — exercises fix_frontmatter's field-backfilling path.
MISSING_FIELDS_CLAIM = """---
type: claim
domain: internet-finance
---
# Some claim title that is specific enough to argue about meaningfully
Body text here.
"""

# A well-formed entity file (entity_type: company) including the `## Timeline`
# section that entity updates append to. Entities are exempt from the
# claim-title proposition checks (see TestValidateClaim).
ENTITY_CONTENT = """---
type: entity
entity_type: company
name: "MetaDAO"
domain: internet-finance
description: "Futarchy governance platform on Solana"
status: active
tracked_by: rio
---
# MetaDAO
Overview of MetaDAO.
## Timeline
- **2024-01-01** — Launch of Autocrat v0.1
"""
# Claim-file stems the validator treats as already present in the vault;
# used for both wiki-link resolution and near-duplicate detection.
_KNOWN_STEMS = (
    "metadao",
    "futarchy-adoption-faces-friction",
    "coin-price-is-the-fairest-objective-function-for-asset-futarchy",
    "futarchy-is-manipulation-resistant-because-attack-attempts-create-profitable-opportunities-for-defenders",
    "_map",
)


@pytest.fixture
def existing_claims():
    """Sample existing claim stems for dedup/link checking."""
    return set(_KNOWN_STEMS)
# ─── parse_frontmatter ────────────────────────────────────────────────────
class TestParseFrontmatter:
    """parse_frontmatter splits a markdown document into (yaml_dict, body)."""

    def test_valid_frontmatter(self):
        meta, remainder = parse_frontmatter(VALID_CLAIM)
        assert meta is not None
        assert meta["type"] == "claim"
        assert meta["domain"] == "internet-finance"
        assert "# MetaDAO" in remainder

    def test_no_frontmatter(self):
        meta, remainder = parse_frontmatter("# Just a title\n\nSome body.")
        assert meta is None
        assert "Just a title" in remainder

    def test_empty_frontmatter(self):
        meta, _body = parse_frontmatter("---\n---\nBody")
        # An empty YAML block may come back as None or {} depending on loader.
        assert meta in (None, {})
# ─── fix_frontmatter ──────────────────────────────────────────────────────
class TestFixFrontmatter:
    """fix_frontmatter(content, domain, agent) backfills/repairs frontmatter.

    It returns (fixed_content, fix_tags); an empty tag list means the
    frontmatter was already compliant. Tags are substring-matched here
    (e.g. "added_created", "fixed_confidence").
    """

    def test_no_fixes_needed(self):
        # Fully-populated claim with today's date — must pass through untouched.
        fixed, fixes = fix_frontmatter(VALID_CLAIM, "internet-finance", "rio")
        assert len(fixes) == 0

    def test_missing_created_date(self):
        content = MISSING_FIELDS_CLAIM
        fixed, fixes = fix_frontmatter(content, "internet-finance", "rio")
        # Several fields are absent; at least created/confidence get backfilled.
        assert any("added_created" in f or "added_confidence" in f for f in fixes)
        fm, _ = parse_frontmatter(fixed)
        # The backfilled `created` must be today's date.
        assert fm["created"] == date.today().isoformat()

    def test_wrong_created_date(self):
        # `created` present but stale — the fixer overwrites it with today.
        content = """---
type: claim
domain: internet-finance
description: "test"
confidence: experimental
source: "test"
created: 2025-01-15
---
# test claim that is long enough to pass validation checks
Body.
"""
        fixed, fixes = fix_frontmatter(content, "internet-finance", "rio")
        assert any("set_created" in f for f in fixes)
        fm, _ = parse_frontmatter(fixed)
        assert fm["created"] == date.today().isoformat()

    def test_invalid_confidence(self):
        # "probable" is not in the allowed confidence vocabulary — the fixer
        # normalizes it to the safe default "experimental".
        content = """---
type: claim
domain: internet-finance
description: "test"
confidence: probable
source: "test"
created: 2026-03-15
---
# test claim body
Body.
"""
        fixed, fixes = fix_frontmatter(content, "internet-finance", "rio")
        assert any("fixed_confidence" in f for f in fixes)
        fm, _ = parse_frontmatter(fixed)
        assert fm["confidence"] == "experimental"

    def test_missing_domain_uses_provided(self):
        # No `domain` key in frontmatter — the domain passed by the caller
        # (here "health") is filled in.
        content = """---
type: claim
description: "test"
confidence: experimental
source: "test"
created: 2026-03-15
---
# test claim
Body.
"""
        fixed, fixes = fix_frontmatter(content, "health", "vida")
        assert any("fixed_domain" in f for f in fixes)
        fm, _ = parse_frontmatter(fixed)
        assert fm["domain"] == "health"
# ─── fix_wiki_links ───────────────────────────────────────────────────────
class TestFixWikiLinks:
    """fix_wiki_links strips [[links]] whose target stem is unknown,
    keeping the link text as plain prose."""

    def test_valid_links_preserved(self, existing_claims):
        text = "See [[metadao]] and [[_map]] for context."
        result, applied = fix_wiki_links(text, existing_claims)
        assert "[[metadao]]" in result
        assert "[[_map]]" in result
        assert not applied

    def test_broken_links_stripped(self, existing_claims):
        result, applied = fix_wiki_links(
            "See [[nonexistent-claim]] for details.", existing_claims
        )
        assert "[[nonexistent-claim]]" not in result
        # The link text itself survives as plain prose.
        assert "nonexistent-claim" in result
        assert len(applied) == 1

    def test_mixed_links(self, existing_claims):
        result, applied = fix_wiki_links(
            "Both [[metadao]] and [[invented-link]] are relevant.", existing_claims
        )
        # Known target untouched; unknown target unwrapped, text retained.
        assert "[[metadao]]" in result
        assert "[[invented-link]]" not in result
        assert "invented-link" in result
        assert len(applied) == 1
# ─── fix_trailing_newline ─────────────────────────────────────────────────
class TestFixTrailingNewline:
    """fix_trailing_newline appends a final newline when one is missing.

    The original tests only checked `endswith` / fix count, so a fixer that
    mangled the content while appending the newline would still pass; these
    now pin the exact output.
    """

    def test_adds_newline(self):
        fixed, fixes = fix_trailing_newline("content without newline")
        # Must append exactly one newline and leave the content untouched.
        assert fixed == "content without newline\n"
        assert len(fixes) == 1

    def test_already_has_newline(self):
        fixed, fixes = fix_trailing_newline("content with newline\n")
        # Content must pass through byte-identical with no fix recorded.
        assert fixed == "content with newline\n"
        assert len(fixes) == 0

    def test_empty_content_does_not_crash(self):
        # Edge case: empty input must not raise (guards against e.g. an
        # IndexError from inspecting content[-1]).
        fixed, fixes = fix_trailing_newline("")
        assert isinstance(fixed, str)
# ─── validate_claim ───────────────────────────────────────────────────────
class TestValidateClaim:
    """validate_claim(filename, content, existing_claims) -> list of issues.

    An empty list means the claim passes the mechanical gate. Issue strings
    are substring-matched in these tests (e.g. "missing_field",
    "near_duplicate", "opsec").
    """

    def test_valid_claim_passes(self, existing_claims):
        issues = validate_claim(
            "metadao-futarchy-shows-limited-volume.md",
            VALID_CLAIM,
            existing_claims,
        )
        assert len(issues) == 0

    def test_no_frontmatter_fails(self, existing_claims):
        issues = validate_claim("test.md", "# Just text\n\nNo frontmatter.", existing_claims)
        assert "no_frontmatter" in issues

    def test_missing_required_fields(self, existing_claims):
        # Only `type` present — missing required fields must be flagged.
        content = """---
type: claim
---
# test
Body.
"""
        issues = validate_claim("test-claim.md", content, existing_claims)
        assert any("missing_field" in i for i in issues)

    def test_short_title_flagged(self, existing_claims):
        # A one-word H1 cannot be a proposition — trips the word-count check.
        content = """---
type: claim
domain: internet-finance
description: "test description"
confidence: experimental
source: "test source"
created: 2026-03-15
---
# short
Body content here.
"""
        issues = validate_claim("short.md", content, existing_claims)
        assert any("title_too_few_words" in i for i in issues)

    def test_near_duplicate_detected(self, existing_claims):
        # Title nearly identical to existing "futarchy-adoption-faces-friction"
        # — similarity should exceed the SequenceMatcher dedup threshold.
        content = """---
type: claim
domain: internet-finance
description: "test"
confidence: experimental
source: "test"
created: 2026-03-15
---
# futarchy adoption faces friction barriers
Body content with enough text to pass body validation minimum length checks here.
"""
        issues = validate_claim(
            "futarchy-adoption-faces-friction-barriers.md",
            content,
            existing_claims,
        )
        assert any("near_duplicate" in i for i in issues)

    def test_opsec_flags_internal_deal_terms(self, existing_claims):
        # Specific raise/valuation figures sourced from an internal memo must
        # trip the opsec check.
        content = """---
type: claim
domain: internet-finance
description: "LivingIP raised $5M at a $50M valuation in the seed round"
confidence: experimental
source: "internal memo"
created: 2026-03-15
---
# LivingIP raised five million dollars at a fifty million dollar valuation
The deal terms show LivingIP secured $5M from investors at a $50M valuation.
---
Relevant Notes:
- [[_map]]
"""
        issues = validate_claim(
            "livingip-raised-five-million-at-fifty-million-valuation.md",
            content, existing_claims,
        )
        assert any("opsec" in i for i in issues)

    def test_opsec_allows_general_market_data(self, existing_claims):
        # Public on-chain figures are fine — the opsec check must not
        # over-trigger on dollar amounts alone.
        content = """---
type: claim
domain: internet-finance
description: "MetaDAO treasury holds $2M in reserves"
confidence: experimental
source: "on-chain data"
created: 2026-03-15
---
# MetaDAO treasury holds two million dollars in reserves based on on chain data analysis
On-chain analysis shows the MetaDAO treasury holds approximately $2M across
SOL and USDC positions, providing sufficient runway for operations.
---
Relevant Notes:
- [[metadao]]
"""
        issues = validate_claim(
            "metadao-treasury-holds-two-million-in-reserves.md",
            content, existing_claims,
        )
        assert not any("opsec" in i for i in issues)

    def test_short_title_with_verb_still_fails_under_4_words(self, existing_claims):
        """Even with a verb, titles under 4 words should fail."""
        content = """---
type: claim
domain: internet-finance
description: "test"
confidence: experimental
source: "test"
created: 2026-03-15
---
# futarchy works
Body content here with enough text to pass validation.
"""
        issues = validate_claim("futarchy-works.md", content, existing_claims)
        assert any("title_too_few_words" in i for i in issues)

    def test_entity_skips_title_check(self, existing_claims):
        # Entity files (type: entity) are named after the entity, not a
        # proposition — title checks must not apply to them.
        issues = validate_claim("metadao.md", ENTITY_CONTENT, existing_claims)
        # Entities should NOT fail on short title or proposition check
        assert not any("title" in i for i in issues)
# ─── validate_and_fix_claims (integration) ────────────────────────────────
class TestValidateAndFixClaims:
    """End-to-end pipeline: fix what is fixable, reject what is not.

    validate_and_fix_claims(claims, domain, agent, existing) returns
    (kept, rejected, stats) where stats carries "total"/"kept"/"fixed"/
    "rejected" counters. Each claim is a dict with filename/domain/content.
    """

    def test_valid_claims_pass_through(self, existing_claims):
        claims = [{
            "filename": "test-claim-about-futarchy-governance-mechanism-design.md",
            "domain": "internet-finance",
            "content": VALID_CLAIM,
        }]
        kept, rejected, stats = validate_and_fix_claims(
            claims, "internet-finance", "rio", existing_claims
        )
        assert len(kept) == 1
        assert len(rejected) == 0
        assert stats["kept"] == 1

    def test_fixable_claims_get_fixed(self, existing_claims):
        claims = [{
            "filename": "test-claim-about-something-important-in-finance.md",
            "domain": "internet-finance",
            "content": MISSING_FIELDS_CLAIM,
        }]
        kept, rejected, stats = validate_and_fix_claims(
            claims, "internet-finance", "rio", existing_claims
        )
        # Should be fixed (added missing fields) and kept, OR rejected if body too thin
        assert stats["total"] == 1
        # The fixer adds missing confidence, created, etc.
        assert stats["fixed"] > 0 or stats["rejected"] > 0

    def test_empty_claims_rejected(self, existing_claims):
        # Degenerate input: no filename, no content — must land in rejected.
        claims = [{"filename": "", "domain": "internet-finance", "content": ""}]
        kept, rejected, stats = validate_and_fix_claims(
            claims, "internet-finance", "rio", existing_claims
        )
        assert len(rejected) == 1
        assert stats["rejected"] == 1

    def test_intra_batch_dedup(self, existing_claims):
        """Claims within same batch should not flag each other as duplicates."""
        claims = [
            {
                "filename": "first-claim-about-novel-mechanism.md",
                "domain": "internet-finance",
                "content": """---
type: claim
domain: internet-finance
description: "First novel claim"
confidence: experimental
source: "test"
created: {today}
---
# first claim about novel mechanism design in futarchy governance
Argument with sufficient body content to pass validation checks for minimum length.
---
Relevant Notes:
- [[_map]]
""".format(today=date.today().isoformat()),
            },
            {
                "filename": "second-claim-about-different-mechanism.md",
                "domain": "internet-finance",
                "content": """---
type: claim
domain: internet-finance
description: "Second different claim"
confidence: experimental
source: "test"
created: {today}
---
# second claim about different mechanism in token economics
Different argument with sufficient body content for a completely separate claim.
---
Relevant Notes:
- [[_map]]
""".format(today=date.today().isoformat()),
            },
        ]
        kept, rejected, stats = validate_and_fix_claims(
            claims, "internet-finance", "rio", existing_claims
        )
        # Both claims are novel relative to existing_claims and distinct from
        # each other — neither should be rejected as a near-duplicate.
        assert len(kept) == 2
# ─── validate_and_fix_entities ────────────────────────────────────────────
class TestValidateAndFixEntities:
    """validate_and_fix_entities(entities, domain, existing) gates entity files.

    Returns (kept, rejected, stats); stats["issues"] collects the rejection
    reasons, which these tests substring-match (e.g. "missing_entity_type",
    "dm_missing").
    """

    def test_valid_entity_passes(self):
        entities = [{
            "filename": "metadao.md",
            "domain": "internet-finance",
            "action": "create",
            "entity_type": "company",
            "content": ENTITY_CONTENT,
        }]
        kept, rejected, stats = validate_and_fix_entities(
            entities, "internet-finance", set()
        )
        assert len(kept) == 1

    def test_missing_entity_type_rejected(self):
        # Frontmatter lacks `entity_type` even though the payload declares one
        # — the schema check must reject it.
        entities = [{
            "filename": "bad-entity.md",
            "domain": "internet-finance",
            "action": "create",
            "entity_type": "company",
            "content": """---
type: entity
domain: internet-finance
description: "test"
---
# Bad entity
""",
        }]
        kept, rejected, stats = validate_and_fix_entities(
            entities, "internet-finance", set()
        )
        assert len(rejected) == 1
        assert any("missing_entity_type" in i for i in stats["issues"])

    def test_update_without_timeline_rejected(self):
        # An `update` action with an empty timeline_entry has nothing to
        # apply and must be rejected.
        entities = [{
            "filename": "metadao.md",
            "domain": "internet-finance",
            "action": "update",
            "entity_type": "company",
            "content": "",
            "timeline_entry": "",
        }]
        kept, rejected, stats = validate_and_fix_entities(
            entities, "internet-finance", set()
        )
        assert len(rejected) == 1

    def test_decision_market_missing_fields(self):
        # decision_market entities require extra fields beyond the base
        # entity schema; this payload omits them.
        entities = [{
            "filename": "metadao-test-proposal.md",
            "domain": "internet-finance",
            "action": "create",
            "entity_type": "decision_market",
            "content": """---
type: entity
entity_type: decision_market
name: "MetaDAO: Test Proposal"
domain: internet-finance
description: "Test"
---
# MetaDAO: Test Proposal
""",
        }]
        kept, rejected, stats = validate_and_fix_entities(
            entities, "internet-finance", set()
        )
        assert len(rejected) == 1
        assert any("dm_missing" in i for i in stats["issues"])
# ─── _yaml_line dict handling (attribution round-trip) ──────────────────
class TestYamlLineDict:
    """Verify _yaml_line produces valid YAML for nested dicts (attribution block)."""

    @staticmethod
    def _roundtrip(frontmatter, body):
        # Serialize via _rebuild_content, then re-parse: the round-trip under test.
        from lib.post_extract import _rebuild_content, parse_frontmatter
        return parse_frontmatter(_rebuild_content(frontmatter, body))

    def test_attribution_round_trip(self):
        """Attribution dict → _yaml_line → parse_frontmatter should survive."""
        frontmatter = {
            "type": "claim",
            "domain": "ai-alignment",
            "description": "Test claim for round-trip",
            "confidence": "experimental",
            "source": "unit test",
            "created": "2026-03-28",
            "attribution": {
                "extractor": [{"handle": "rio", "agent_id": "760F7FE7"}],
                "sourcer": [{"handle": "someone", "context": "test source"}],
                "challenger": [],
                "synthesizer": [],
                "reviewer": [],
            },
        }
        parsed_fm, parsed_body = self._roundtrip(
            frontmatter, "# Test claim for attribution round-trip\n\nBody text."
        )
        assert parsed_fm is not None
        # Attribution must survive as a dict, not a string
        attr = parsed_fm.get("attribution")
        assert isinstance(attr, dict), f"attribution is {type(attr)}, expected dict"
        assert attr["extractor"][0]["handle"] == "rio"
        assert attr["sourcer"][0]["handle"] == "someone"

    def test_empty_attribution_roles(self):
        """Empty role lists should serialize as [] and survive round-trip."""
        frontmatter = {
            "type": "claim",
            "domain": "ai-alignment",
            "description": "Test",
            "confidence": "experimental",
            "source": "test",
            "created": "2026-03-28",
            "attribution": {
                "extractor": [{"handle": "leo"}],
                "sourcer": [],
                "challenger": [],
                "synthesizer": [],
                "reviewer": [],
            },
        }
        parsed_fm, _ = self._roundtrip(
            frontmatter, "# Test claim with empty roles\n\nBody."
        )
        assert parsed_fm is not None
        attr = parsed_fm.get("attribution")
        assert isinstance(attr, dict)
        assert attr["extractor"][0]["handle"] == "leo"
        # Depending on the emitter, an empty role list may round-trip as []
        # or be dropped entirely — both are acceptable.
        assert attr.get("sourcer") == [] or attr.get("sourcer") is None