feat: content classification — domain routing + sub-tags for sources

All source creation functions now classify content by domain and
sub-topic instead of hardcoding internet-finance.

Domain routing: keyword matching (2+ hits) routes to ai-alignment,
health, space-development, entertainment. Default: internet-finance.

Sub-tags for internet-finance: futarchy, ownership-coins, defi,
governance, market-analysis, crypto-infra. Added to source frontmatter
tags array for granular filtering.

Applied to: standalone sources, inline SOURCE:/CLAIM:, conversation
archives, research archives.

Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
This commit is contained in:
m3taversal 2026-03-26 14:34:33 +00:00
parent 0854375fd0
commit b5aabe0364

View file

@ -110,6 +110,60 @@ chat_transcripts: dict[int, list[dict]] = {}
TRANSCRIPT_DIR = "/opt/teleo-eval/transcripts"
# ─── Content Classification ─────────────────────────────────────────────
# Sub-topic keywords for internet-finance sources.
# Each key is a tag name; a source receives the tag when any of its keywords matches.
_TOPIC_KEYWORDS = {
    "futarchy": [
        "futarchy", "autocrat", "conditional market", "twap", "pass/fail",
        "decision market", "futard", "metadao governance",
    ],
    "ownership-coins": [
        "ownership coin", "ico", "fundraise", "launch", "launchpad",
        "permissioned", "permissionless", "unruggable", "treasury management",
        "buyback", "token split",
    ],
    "defi": [
        "amm", "liquidity", "swap", "lending", "borrowing", "yield", "tvl",
        "dex", "lp", "staking", "vault", "protocol",
    ],
    "governance": [
        "proposal", "vote", "governance", "dao", "subcommittee",
        "treasury", "resolution", "benevolent dictator",
    ],
    "market-analysis": [
        "price", "market cap", "fdv", "oversubscribed", "committed",
        "trading", "volume", "bullish", "bearish", "thesis",
    ],
    "crypto-infra": [
        "solana", "ethereum", "base", "bridge", "wallet", "on-ramp",
        "off-ramp", "fiat", "stablecoin", "usdc",
    ],
}

# Domain keywords for non-internet-finance content.
# A domain overrides the default only when enough distinct keywords match.
_DOMAIN_KEYWORDS = {
    "ai-alignment": [
        "ai safety", "alignment", "superintelligence", "llm", "frontier model",
        "interpretability", "rlhf", "anthropic", "openai", "deepmind",
    ],
    "health": [
        "glp-1", "healthcare", "clinical", "pharma", "biotech", "fda",
        "medicare", "hospital", "diagnosis", "therapeutic",
    ],
    "space-development": [
        "spacex", "starship", "orbital", "lunar", "satellite",
        "launch cost", "rocket", "nasa", "artemis",
    ],
    "entertainment": [
        "streaming", "creator economy", "ip", "nft", "gaming",
        "content", "media", "studio", "audience",
    ],
}

# Domain returned when no other domain collects enough keyword hits.
_DEFAULT_DOMAIN = "internet-finance"

# Distinct keyword matches a domain needs before it overrides the default.
_MIN_DOMAIN_HITS = 2

# Keywords this short are acronym-like ("ip", "lp", "dex", "ico", "base") and
# turn up inside unrelated words ("tip", "help", "index", "icon", "database"),
# so they must match as a whole word instead of as a prefix.
_SHORT_KEYWORD_MAX_LEN = 4


def _has_keyword(text_lower: str, keyword: str) -> bool:
    """Return True if *keyword* occurs in *text_lower* at a word start.

    Every keyword must begin at a word boundary, so "media" does not match
    inside "immediately". Keywords longer than ``_SHORT_KEYWORD_MAX_LEN`` may
    be followed by any suffix ("launch" matches "launched"). Shorter keywords
    must also end at a word boundary, allowing only a plural "s"/"es"
    ("dao" matches "daos" but "ip" does not match "ipfs").

    Args:
        text_lower: Text to search; expected to be lower-cased already.
        keyword: Lower-case keyword or phrase.
    """
    # Function-scope import: the file's top-level import block is not part of
    # this section, so `re` is brought into scope here.
    import re

    pattern = r"(?<!\w)" + re.escape(keyword)
    if len(keyword) <= _SHORT_KEYWORD_MAX_LEN:
        pattern += r"(?:e?s)?(?!\w)"
    return re.search(pattern, text_lower) is not None


def _classify_content(text: str) -> tuple[str, list[str]]:
    """Classify content into a domain plus sub-tags based on keywords.

    The domain in ``_DOMAIN_KEYWORDS`` with the most distinct keyword matches
    wins, provided it reaches ``_MIN_DOMAIN_HITS``; ties go to the domain
    listed first. Otherwise the content stays in the default domain and gets
    one sub-tag per ``_TOPIC_KEYWORDS`` entry with at least one match.

    Args:
        text: Free-form text to classify. Empty or ``None`` is treated as
            unclassifiable.

    Returns:
        ``(domain, sub_tags)``. ``sub_tags`` is always empty for a
        non-default domain and follows ``_TOPIC_KEYWORDS`` order otherwise.
    """
    text_lower = (text or "").lower()
    if not text_lower.strip():
        return _DEFAULT_DOMAIN, []

    # Score every non-default domain so the strongest match wins, rather than
    # whichever domain happens to be listed first.
    domain_hits = {
        domain: sum(_has_keyword(text_lower, kw) for kw in keywords)
        for domain, keywords in _DOMAIN_KEYWORDS.items()
    }
    # max() returns the first maximal key, so ties keep dict order.
    best_domain = max(domain_hits, key=domain_hits.get, default=None)
    if best_domain is not None and domain_hits[best_domain] >= _MIN_DOMAIN_HITS:
        return best_domain, []

    # Default domain: attach every sub-topic with at least one keyword match.
    sub_tags = [
        tag
        for tag, keywords in _TOPIC_KEYWORDS.items()
        if any(_has_keyword(text_lower, kw) for kw in keywords)
    ]
    return _DEFAULT_DOMAIN, sub_tags
# ─── Transcript Management ──────────────────────────────────────────────
@ -210,12 +264,12 @@ source_type: telegram-contribution
title: "Source from @{username}{source_text[:80]}"
author: "@{username}"
date: {date_str}
domain: internet-finance
domain: {_classify_content(source_text + " " + user_message)[0]}
format: contribution
status: unprocessed
proposed_by: "@{username}"
contribution_type: source-submission
tags: [telegram-contribution, inline-source]
tags: {["telegram-contribution", "inline-source"] + _classify_content(source_text + " " + user_message)[1]}
---
# Source: {source_text[:100]}
@ -248,13 +302,15 @@ def _create_inline_claim(claim_text: str, user_message: str, user, msg):
if source_path.exists():
return
domain, sub_tags = _classify_content(claim_text + " " + user_message)
content = f"""---
type: source
source_type: telegram-claim
title: "Claim from @{username}{claim_text[:80]}"
author: "@{username}"
date: {date_str}
domain: internet-finance
domain: {domain}
format: claim-draft
status: unprocessed
proposed_by: "@{username}"
@ -1077,6 +1133,9 @@ def _archive_standalone_source(url: str, content: str, user):
if source_path.exists():
return
domain, sub_tags = _classify_content(content)
all_tags = ["telegram-shared", source_type] + sub_tags
source_content = f"""---
type: source
source_type: {source_type}
@ -1084,12 +1143,12 @@ title: "{author} — shared via Telegram by @{username}"
author: "{author}"
url: "{url}"
date: {date_str}
domain: internet-finance
domain: {domain}
format: {fmt}
status: unprocessed
proposed_by: "@{username}"
contribution_type: source-submission
tags: [telegram-shared, {source_type}]
tags: {all_tags}
---
# {author} — {'Article' if is_article else 'Tweet/Thread'}
@ -1198,6 +1257,8 @@ def _archive_exchange(user_text: str, rio_response: str, user, msg,
if url_content:
url_section = f"\n## Article Content (fetched)\n\n{url_content[:8000]}\n"
domain, sub_tags = _classify_content(user_text + " " + rio_response)
content = f"""---
type: source
source_type: telegram
@ -1205,7 +1266,7 @@ title: "Telegram: @{username} — {slug}"
author: "@{username}"
url: "{urls[0] if urls else ''}"
date: {date_str}
domain: internet-finance
domain: {domain}
format: conversation
status: unprocessed
priority: {priority}