"""Tests for post-extraction validator — the $0 mechanical quality gate.

Tests cover the fixers and validators that catch 73% of eval rejections:
- Frontmatter fixing (missing fields, wrong dates, invalid values)
- Wiki link stripping (broken links → plain text)
- Title validation (proposition check, word count)
- Duplicate detection (SequenceMatcher threshold)
- Entity validation (schema, decision_market fields)
- The full validate_and_fix_claims pipeline
"""

from datetime import date

import pytest

from lib.post_extract import (
    parse_frontmatter,
    fix_frontmatter,
    fix_wiki_links,
    fix_trailing_newline,
    fix_h1_title_match,
    validate_claim,
    validate_and_fix_claims,
    validate_and_fix_entities,
)


# ─── Fixtures ──────────────────────────────────────────────────────────────

# A complete claim whose `created` date is today; the tests below expect it to
# need no fixes and to raise no validation issues.
VALID_CLAIM = """---
type: claim
domain: internet-finance
description: "MetaDAO futarchy implementation demonstrates limited volume in uncontested decisions"
confidence: experimental
source: "Pine Analytics, Q4 2025 report"
created: {today}
---

# MetaDAO futarchy implementation shows limited trading volume in uncontested decisions

Analysis of MetaDAO proposal markets shows that uncontested decisions attract minimal trading volume. When proposals have clear consensus (>80% pass rate), conditional token markets see <$1000 in volume. This suggests futarchy's information aggregation mechanism is most valuable when outcomes are uncertain.

Evidence from Pine Analytics Q4 2025 report shows 15 proposals with >80% pass rate averaged $340 in total volume, while 3 contested proposals averaged $45,000.

---

Relevant Notes:
- [[metadao]]
- [[futarchy-adoption-faces-friction]]

Topics:
- [[_map]]
""".format(today=date.today().isoformat())

# A claim carrying only `type` and `domain` in its frontmatter.
MISSING_FIELDS_CLAIM = """---
type: claim
domain: internet-finance
---

# Some claim title that is specific enough to argue about meaningfully

Body text here.
"""

# A company entity with a short H1 title and a single timeline entry.
ENTITY_CONTENT = """---
type: entity
entity_type: company
name: "MetaDAO"
domain: internet-finance
description: "Futarchy governance platform on Solana"
status: active
tracked_by: rio
---

# MetaDAO

Overview of MetaDAO.

## Timeline

- **2024-01-01** — Launch of Autocrat v0.1
"""


@pytest.fixture
def existing_claims():
    """Sample existing claim stems for dedup/link checking."""
    return {
        "metadao",
        "futarchy-adoption-faces-friction",
        "coin-price-is-the-fairest-objective-function-for-asset-futarchy",
        "futarchy-is-manipulation-resistant-because-attack-attempts-create-profitable-opportunities-for-defenders",
        "_map",
    }


# ─── parse_frontmatter ────────────────────────────────────────────────────

class TestParseFrontmatter:
    """parse_frontmatter returns a (frontmatter, body) pair."""

    def test_valid_frontmatter(self):
        """Frontmatter keys are parsed and the markdown body is returned."""
        fm, body = parse_frontmatter(VALID_CLAIM)
        assert fm is not None
        assert fm["type"] == "claim"
        assert fm["domain"] == "internet-finance"
        assert "# MetaDAO" in body

    def test_no_frontmatter(self):
        """Content without a frontmatter fence yields None plus the text."""
        fm, body = parse_frontmatter("# Just a title\n\nSome body.")
        assert fm is None
        assert "Just a title" in body

    def test_empty_frontmatter(self):
        """An empty fence pair may parse to either None or an empty dict."""
        fm, _ = parse_frontmatter("---\n---\nBody")
        # Empty YAML → None
        assert fm is None or fm == {}


# ─── fix_frontmatter ──────────────────────────────────────────────────────

class TestFixFrontmatter:
    """fix_frontmatter returns (fixed_content, list_of_fix_labels)."""

    def test_no_fixes_needed(self):
        """A complete, current claim produces no fix labels."""
        _, fixes = fix_frontmatter(VALID_CLAIM, "internet-finance", "rio")
        assert len(fixes) == 0

    def test_missing_created_date(self):
        """A claim without `created` ends up dated today."""
        content = MISSING_FIELDS_CLAIM
        fixed, fixes = fix_frontmatter(content, "internet-finance", "rio")
        # Either label is accepted here; the parsed value below is the check
        # that actually pins the `created` date.
        assert any("added_created" in f or "added_confidence" in f for f in fixes)
        fm, _ = parse_frontmatter(fixed)
        assert fm["created"] == date.today().isoformat()

    def test_wrong_created_date(self):
        """A stale `created` date is replaced with today's date."""
        content = """---
type: claim
domain: internet-finance
description: "test"
confidence: experimental
source: "test"
created: 2025-01-15
---

# test claim that is long enough to pass validation checks

Body.
"""
        fixed, fixes = fix_frontmatter(content, "internet-finance", "rio")
        assert any("set_created" in f for f in fixes)
        fm, _ = parse_frontmatter(fixed)
        assert fm["created"] == date.today().isoformat()

    def test_invalid_confidence(self):
        """The confidence value "probable" is rewritten to "experimental"."""
        content = """---
type: claim
domain: internet-finance
description: "test"
confidence: probable
source: "test"
created: 2026-03-15
---

# test claim body

Body.
"""
        fixed, fixes = fix_frontmatter(content, "internet-finance", "rio")
        assert any("fixed_confidence" in f for f in fixes)
        fm, _ = parse_frontmatter(fixed)
        assert fm["confidence"] == "experimental"

    def test_missing_domain_uses_provided(self):
        """A missing `domain` is filled from the domain argument."""
        content = """---
type: claim
description: "test"
confidence: experimental
source: "test"
created: 2026-03-15
---

# test claim

Body.
"""
        fixed, fixes = fix_frontmatter(content, "health", "vida")
        assert any("fixed_domain" in f for f in fixes)
        fm, _ = parse_frontmatter(fixed)
        assert fm["domain"] == "health"


# ─── fix_wiki_links ───────────────────────────────────────────────────────

class TestFixWikiLinks:
    """fix_wiki_links unwraps [[links]] whose target is not a known claim."""

    def test_valid_links_preserved(self, existing_claims):
        """Links to known stems stay bracketed and report no fixes."""
        content = "See [[metadao]] and [[_map]] for context."
        fixed, fixes = fix_wiki_links(content, existing_claims)
        assert "[[metadao]]" in fixed
        assert "[[_map]]" in fixed
        assert len(fixes) == 0

    def test_broken_links_stripped(self, existing_claims):
        """An unknown link loses its brackets but keeps its text."""
        content = "See [[nonexistent-claim]] for details."
        fixed, fixes = fix_wiki_links(content, existing_claims)
        assert "[[nonexistent-claim]]" not in fixed
        assert "nonexistent-claim" in fixed  # Text kept
        assert len(fixes) == 1

    def test_mixed_links(self, existing_claims):
        """Only the unknown link is unwrapped when both kinds are present."""
        content = "Both [[metadao]] and [[invented-link]] are relevant."
        fixed, fixes = fix_wiki_links(content, existing_claims)
        assert "[[metadao]]" in fixed
        assert "[[invented-link]]" not in fixed
        assert "invented-link" in fixed
        assert len(fixes) == 1


# ─── fix_trailing_newline ─────────────────────────────────────────────────

class TestFixTrailingNewline:
    """fix_trailing_newline ensures content ends with a newline."""

    def test_adds_newline(self):
        """Content lacking a final newline gets one, with one fix reported."""
        fixed, fixes = fix_trailing_newline("content without newline")
        assert fixed.endswith("\n")
        assert len(fixes) == 1

    def test_already_has_newline(self):
        """Content already ending in a newline reports no fixes."""
        _, fixes = fix_trailing_newline("content with newline\n")
        assert len(fixes) == 0


# ─── validate_claim ───────────────────────────────────────────────────────

class TestValidateClaim:
    """validate_claim returns a collection of issue labels (empty when clean)."""

    def test_valid_claim_passes(self, existing_claims):
        """The reference claim raises no issues."""
        issues = validate_claim(
            "metadao-futarchy-shows-limited-volume.md",
            VALID_CLAIM,
            existing_claims,
        )
        assert len(issues) == 0

    def test_no_frontmatter_fails(self, existing_claims):
        """Plain markdown without frontmatter is flagged `no_frontmatter`."""
        issues = validate_claim(
            "test.md", "# Just text\n\nNo frontmatter.", existing_claims
        )
        assert "no_frontmatter" in issues

    def test_missing_required_fields(self, existing_claims):
        """Frontmatter holding only `type` yields a `missing_field` issue."""
        content = """---
type: claim
---

# test

Body.
"""
        issues = validate_claim("test-claim.md", content, existing_claims)
        assert any("missing_field" in i for i in issues)

    def test_short_title_flagged(self, existing_claims):
        """A one-word title is flagged `title_too_few_words`."""
        content = """---
type: claim
domain: internet-finance
description: "test description"
confidence: experimental
source: "test source"
created: 2026-03-15
---

# short

Body content here.
"""
        issues = validate_claim("short.md", content, existing_claims)
        assert any("title_too_few_words" in i for i in issues)

    def test_near_duplicate_detected(self, existing_claims):
        """A title one word away from an existing stem is a near duplicate."""
        # Title nearly identical to existing "futarchy-adoption-faces-friction"
        content = """---
type: claim
domain: internet-finance
description: "test"
confidence: experimental
source: "test"
created: 2026-03-15
---

# futarchy adoption faces friction barriers

Body content with enough text to pass body validation minimum length checks here.
"""
        issues = validate_claim(
            "futarchy-adoption-faces-friction-barriers.md",
            content,
            existing_claims,
        )
        assert any("near_duplicate" in i for i in issues)

    def test_opsec_flags_internal_deal_terms(self, existing_claims):
        """A claim stating LivingIP raise/valuation figures is flagged opsec."""
        content = """---
type: claim
domain: internet-finance
description: "LivingIP raised $5M at a $50M valuation in the seed round"
confidence: experimental
source: "internal memo"
created: 2026-03-15
---

# LivingIP raised five million dollars at a fifty million dollar valuation

The deal terms show LivingIP secured $5M from investors at a $50M valuation.

---

Relevant Notes:
- [[_map]]
"""
        issues = validate_claim(
            "livingip-raised-five-million-at-fifty-million-valuation.md",
            content,
            existing_claims,
        )
        assert any("opsec" in i for i in issues)

    def test_opsec_allows_general_market_data(self, existing_claims):
        """Dollar figures about MetaDAO's treasury do not trigger opsec."""
        content = """---
type: claim
domain: internet-finance
description: "MetaDAO treasury holds $2M in reserves"
confidence: experimental
source: "on-chain data"
created: 2026-03-15
---

# MetaDAO treasury holds two million dollars in reserves based on on chain data analysis

On-chain analysis shows the MetaDAO treasury holds approximately $2M across SOL and USDC positions, providing sufficient runway for operations.

---

Relevant Notes:
- [[metadao]]
"""
        issues = validate_claim(
            "metadao-treasury-holds-two-million-in-reserves.md",
            content,
            existing_claims,
        )
        assert not any("opsec" in i for i in issues)

    def test_short_title_with_verb_still_fails_under_4_words(self, existing_claims):
        """Even with a verb, titles under 4 words should fail."""
        content = """---
type: claim
domain: internet-finance
description: "test"
confidence: experimental
source: "test"
created: 2026-03-15
---

# futarchy works

Body content here with enough text to pass validation.
"""
        issues = validate_claim("futarchy-works.md", content, existing_claims)
        assert any("title_too_few_words" in i for i in issues)

    def test_entity_skips_title_check(self, existing_claims):
        """Entity files raise no title-related issues despite a short H1."""
        issues = validate_claim("metadao.md", ENTITY_CONTENT, existing_claims)
        # Entities should NOT fail on short title or proposition check
        assert not any("title" in i for i in issues)


# ─── validate_and_fix_claims (integration) ────────────────────────────────

class TestValidateAndFixClaims:
    """validate_and_fix_claims returns (kept, rejected, stats) for a batch."""

    def test_valid_claims_pass_through(self, existing_claims):
        """A clean claim is kept and counted in stats."""
        claims = [{
            "filename": "test-claim-about-futarchy-governance-mechanism-design.md",
            "domain": "internet-finance",
            "content": VALID_CLAIM,
        }]
        kept, rejected, stats = validate_and_fix_claims(
            claims, "internet-finance", "rio", existing_claims
        )
        assert len(kept) == 1
        assert len(rejected) == 0
        assert stats["kept"] == 1

    def test_fixable_claims_get_fixed(self, existing_claims):
        """A claim with missing fields is either fixed or rejected."""
        claims = [{
            "filename": "test-claim-about-something-important-in-finance.md",
            "domain": "internet-finance",
            "content": MISSING_FIELDS_CLAIM,
        }]
        _, _, stats = validate_and_fix_claims(
            claims, "internet-finance", "rio", existing_claims
        )
        # Should be fixed (added missing fields) and kept, OR rejected if body too thin
        assert stats["total"] == 1
        # The fixer adds missing confidence, created, etc.
        assert stats["fixed"] > 0 or stats["rejected"] > 0

    def test_empty_claims_rejected(self, existing_claims):
        """A claim with empty filename and content is rejected."""
        claims = [{"filename": "", "domain": "internet-finance", "content": ""}]
        _, rejected, stats = validate_and_fix_claims(
            claims, "internet-finance", "rio", existing_claims
        )
        assert len(rejected) == 1
        assert stats["rejected"] == 1

    def test_intra_batch_dedup(self, existing_claims):
        """Claims within same batch should not flag each other as duplicates."""
        claims = [
            {
                "filename": "first-claim-about-novel-mechanism.md",
                "domain": "internet-finance",
                "content": """---
type: claim
domain: internet-finance
description: "First novel claim"
confidence: experimental
source: "test"
created: {today}
---

# first claim about novel mechanism design in futarchy governance

Argument with sufficient body content to pass validation checks for minimum length.

---

Relevant Notes:
- [[_map]]
""".format(today=date.today().isoformat()),
            },
            {
                "filename": "second-claim-about-different-mechanism.md",
                "domain": "internet-finance",
                "content": """---
type: claim
domain: internet-finance
description: "Second different claim"
confidence: experimental
source: "test"
created: {today}
---

# second claim about different mechanism in token economics

Different argument with sufficient body content for a completely separate claim.

---

Relevant Notes:
- [[_map]]
""".format(today=date.today().isoformat()),
            },
        ]
        kept, _, _ = validate_and_fix_claims(
            claims, "internet-finance", "rio", existing_claims
        )
        assert len(kept) == 2


# ─── validate_and_fix_entities ────────────────────────────────────────────

class TestValidateAndFixEntities:
    """validate_and_fix_entities returns (kept, rejected, stats) for a batch."""

    def test_valid_entity_passes(self):
        """A well-formed company entity is kept."""
        entities = [{
            "filename": "metadao.md",
            "domain": "internet-finance",
            "action": "create",
            "entity_type": "company",
            "content": ENTITY_CONTENT,
        }]
        kept, _, _ = validate_and_fix_entities(
            entities, "internet-finance", set()
        )
        assert len(kept) == 1

    def test_missing_entity_type_rejected(self):
        """Frontmatter without `entity_type` is rejected and reported."""
        entities = [{
            "filename": "bad-entity.md",
            "domain": "internet-finance",
            "action": "create",
            "entity_type": "company",
            "content": """---
type: entity
domain: internet-finance
description: "test"
---

# Bad entity
""",
        }]
        _, rejected, stats = validate_and_fix_entities(
            entities, "internet-finance", set()
        )
        assert len(rejected) == 1
        assert any("missing_entity_type" in i for i in stats["issues"])

    def test_update_without_timeline_rejected(self):
        """An `update` action with an empty timeline entry is rejected."""
        entities = [{
            "filename": "metadao.md",
            "domain": "internet-finance",
            "action": "update",
            "entity_type": "company",
            "content": "",
            "timeline_entry": "",
        }]
        _, rejected, _ = validate_and_fix_entities(
            entities, "internet-finance", set()
        )
        assert len(rejected) == 1

    def test_decision_market_missing_fields(self):
        """A decision_market entity with only base fields reports `dm_missing`."""
        entities = [{
            "filename": "metadao-test-proposal.md",
            "domain": "internet-finance",
            "action": "create",
            "entity_type": "decision_market",
            "content": """---
type: entity
entity_type: decision_market
name: "MetaDAO: Test Proposal"
domain: internet-finance
description: "Test"
---

# MetaDAO: Test Proposal
""",
        }]
        _, rejected, stats = validate_and_fix_entities(
            entities, "internet-finance", set()
        )
        assert len(rejected) == 1
        assert any("dm_missing" in i for i in stats["issues"])


# ─── _yaml_line dict handling (attribution round-trip) ──────────────────

class TestYamlLineDict:
    """Verify _yaml_line produces valid YAML for nested dicts (attribution block)."""

    def test_attribution_round_trip(self):
        """Attribution dict → _yaml_line → parse_frontmatter should survive."""
        # Private helper: imported locally so the module-level import list
        # stays limited to the public API. parse_frontmatter is already
        # imported at module level.
        from lib.post_extract import _rebuild_content

        fm = {
            "type": "claim",
            "domain": "ai-alignment",
            "description": "Test claim for round-trip",
            "confidence": "experimental",
            "source": "unit test",
            "created": "2026-03-28",
            "attribution": {
                "extractor": [{"handle": "rio", "agent_id": "760F7FE7"}],
                "sourcer": [{"handle": "someone", "context": "test source"}],
                "challenger": [],
                "synthesizer": [],
                "reviewer": [],
            },
        }
        body = "# Test claim for attribution round-trip\n\nBody text."
        rebuilt = _rebuild_content(fm, body)
        parsed_fm, _ = parse_frontmatter(rebuilt)
        assert parsed_fm is not None
        # Attribution must survive as a dict, not a string
        attr = parsed_fm.get("attribution")
        assert isinstance(attr, dict), f"attribution is {type(attr)}, expected dict"
        assert attr["extractor"][0]["handle"] == "rio"
        assert attr["sourcer"][0]["handle"] == "someone"

    def test_empty_attribution_roles(self):
        """Empty role lists should serialize as [] and survive round-trip."""
        from lib.post_extract import _rebuild_content

        fm = {
            "type": "claim",
            "domain": "ai-alignment",
            "description": "Test",
            "confidence": "experimental",
            "source": "test",
            "created": "2026-03-28",
            "attribution": {
                "extractor": [{"handle": "leo"}],
                "sourcer": [],
                "challenger": [],
                "synthesizer": [],
                "reviewer": [],
            },
        }
        body = "# Test claim with empty roles\n\nBody."
        rebuilt = _rebuild_content(fm, body)
        parsed_fm, _ = parse_frontmatter(rebuilt)
        assert parsed_fm is not None
        attr = parsed_fm.get("attribution")
        assert isinstance(attr, dict)
        assert attr["extractor"][0]["handle"] == "leo"
        # An empty role list may come back as [] or be dropped to None.
        assert attr.get("sourcer") == [] or attr.get("sourcer") is None