feat: content classification — domain routing + sub-tags for sources

All source creation functions now classify content by domain and sub-topic instead of hardcoding internet-finance. Domain routing: keyword matching (2+ hits) routes to ai-alignment, health, space-development, entertainment. Default: internet-finance. Sub-tags for internet-finance: futarchy, ownership-coins, defi, governance, market-analysis, crypto-infra. Added to source frontmatter tags array for granular filtering. Applied to: standalone sources, inline SOURCE:/CLAIM:, conversation archives, research archives. Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
2026-03-26 14:34:33 +00:00 · 2026-03-26 14:34:33 +00:00 · b5aabe0364
commit b5aabe0364
parent 0854375fd0
1 changed files with 67 additions and 6 deletions
--- a/telegram/bot.py
+++ b/telegram/bot.py
@ -110,6 +110,60 @@ chat_transcripts: dict[int, list[dict]] = {}
 TRANSCRIPT_DIR = "/opt/teleo-eval/transcripts"


+# ─── Content Classification ─────────────────────────────────────────────
+
+# Sub-tag vocabulary for sources that stay in the default internet-finance
+# domain. Each key is a tag name and each value is the list of lowercase
+# keywords/phrases that _classify_content looks for when deciding whether to
+# attach that tag. Tags are emitted in the insertion order used here.
+_TOPIC_KEYWORDS: dict[str, list[str]] = {
+    "futarchy": [
+        "futarchy", "autocrat", "conditional market", "twap", "pass/fail",
+        "decision market", "futard", "metadao governance",
+    ],
+    "ownership-coins": [
+        "ownership coin", "ico", "fundraise", "launch", "launchpad",
+        "permissioned", "permissionless", "unruggable", "treasury management",
+        "buyback", "token split",
+    ],
+    "defi": [
+        "amm", "liquidity", "swap", "lending", "borrowing", "yield", "tvl",
+        "dex", "lp", "staking", "vault", "protocol",
+    ],
+    "governance": [
+        "proposal", "vote", "governance", "dao", "subcommittee",
+        "treasury", "resolution", "benevolent dictator",
+    ],
+    "market-analysis": [
+        "price", "market cap", "fdv", "oversubscribed", "committed",
+        "trading", "volume", "bullish", "bearish", "thesis",
+    ],
+    "crypto-infra": [
+        "solana", "ethereum", "base", "bridge", "wallet", "on-ramp",
+        "off-ramp", "fiat", "stablecoin", "usdc",
+    ],
+}
+
+# Keyword vocabulary for the domains that can override the internet-finance
+# default. Each key is a domain name and each value is the list of lowercase
+# keywords/phrases counted by _classify_content. Insertion order matters:
+# domains are checked in this order and the first one with enough hits wins.
+_DOMAIN_KEYWORDS: dict[str, list[str]] = {
+    "ai-alignment": [
+        "ai safety", "alignment", "superintelligence", "llm", "frontier model",
+        "interpretability", "rlhf", "anthropic", "openai", "deepmind",
+    ],
+    "health": [
+        "glp-1", "healthcare", "clinical", "pharma", "biotech", "fda",
+        "medicare", "hospital", "diagnosis", "therapeutic",
+    ],
+    "space-development": [
+        "spacex", "starship", "orbital", "lunar", "satellite",
+        "launch cost", "rocket", "nasa", "artemis",
+    ],
+    "entertainment": [
+        "streaming", "creator economy", "ip", "nft", "gaming",
+        "content", "media", "studio", "audience",
+    ],
+}
+
+
+def _keyword_in_text(keyword: str, text_lower: str) -> bool:
+    """Return True if *keyword* occurs in *text_lower* as a word, not a fragment.
+
+    A plain substring test lets short keywords fire inside unrelated words
+    ("ip" in "ownership", "media" in "immediately", "lp" in "help"). Here the
+    keyword must begin at a word boundary. Keywords of three characters or
+    fewer (acronym-like: "lp", "ico", "dex") must also end at a word boundary,
+    optionally after a plural "s"/"es". Longer keywords may carry any suffix,
+    so inflections such as "launched" or "proposals" still count.
+
+    Args:
+        keyword: Lowercase keyword or phrase; punctuation is matched literally.
+        text_lower: Text to search, already lower-cased by the caller.
+    """
+    import re  # local import so this block does not depend on file-level imports
+
+    pattern = r"(?<!\w)" + re.escape(keyword)
+    if len(keyword) <= 3:
+        pattern += r"(?:e?s)?(?!\w)"
+    return re.search(pattern, text_lower) is not None
+
+
+def _classify_content(text: str) -> tuple[str, list[str]]:
+    """Classify content into a domain plus sub-tags based on keywords.
+
+    Domains in _DOMAIN_KEYWORDS are checked in insertion order; the first one
+    with at least two distinct keyword hits is returned with no sub-tags.
+    Otherwise the domain is "internet-finance" and every tag in
+    _TOPIC_KEYWORDS with at least one keyword hit is returned, in insertion
+    order.
+
+    Args:
+        text: Free text to classify. Empty or None is treated as no text.
+
+    Returns:
+        (domain, sub_tags). Defaults to ("internet-finance", []) when nothing
+        matches.
+    """
+    text_lower = (text or "").lower()
+
+    # Check non-IF domains first
+    for domain, keywords in _DOMAIN_KEYWORDS.items():
+        matches = sum(1 for kw in keywords if _keyword_in_text(kw, text_lower))
+        if matches >= 2:  # Need 2+ keyword matches to override default domain
+            return domain, []
+
+    # Default to internet-finance, classify sub-topics
+    sub_tags = [
+        tag
+        for tag, keywords in _TOPIC_KEYWORDS.items()
+        if any(_keyword_in_text(kw, text_lower) for kw in keywords)
+    ]
+
+    return "internet-finance", sub_tags
+
+
 # ─── Transcript Management ──────────────────────────────────────────────


@ -210,12 +264,12 @@ source_type: telegram-contribution
 title: "Source from @{username} — {source_text[:80]}"
 author: "@{username}"
 date: {date_str}
-domain: internet-finance
+domain: {_classify_content(source_text + " " + user_message)[0]}
 format: contribution
 status: unprocessed
 proposed_by: "@{username}"
 contribution_type: source-submission
-tags: [telegram-contribution, inline-source]
+tags: {["telegram-contribution", "inline-source"] + _classify_content(source_text + " " + user_message)[1]}
 ---

 # Source: {source_text[:100]}
@ -248,13 +302,15 @@ def _create_inline_claim(claim_text: str, user_message: str, user, msg):
        if source_path.exists():
            return

+        domain, sub_tags = _classify_content(claim_text + " " + user_message)
+
        content = f"""---
 type: source
 source_type: telegram-claim
 title: "Claim from @{username} — {claim_text[:80]}"
 author: "@{username}"
 date: {date_str}
-domain: internet-finance
+domain: {domain}
 format: claim-draft
 status: unprocessed
 proposed_by: "@{username}"
@ -1077,6 +1133,9 @@ def _archive_standalone_source(url: str, content: str, user):
        if source_path.exists():
            return

+        domain, sub_tags = _classify_content(content)
+        all_tags = ["telegram-shared", source_type] + sub_tags
+
        source_content = f"""---
 type: source
 source_type: {source_type}
@ -1084,12 +1143,12 @@ title: "{author} — shared via Telegram by @{username}"
 author: "{author}"
 url: "{url}"
 date: {date_str}
-domain: internet-finance
+domain: {domain}
 format: {fmt}
 status: unprocessed
 proposed_by: "@{username}"
 contribution_type: source-submission
-tags: [telegram-shared, {source_type}]
+tags: {all_tags}
 ---

 # {author} — {'Article' if is_article else 'Tweet/Thread'}
@ -1198,6 +1257,8 @@ def _archive_exchange(user_text: str, rio_response: str, user, msg,
        if url_content:
            url_section = f"\n## Article Content (fetched)\n\n{url_content[:8000]}\n"

+        domain, sub_tags = _classify_content(user_text + " " + rio_response)
+
        content = f"""---
 type: source
 source_type: telegram
@ -1205,7 +1266,7 @@ title: "Telegram: @{username} — {slug}"
 author: "@{username}"
 url: "{urls[0] if urls else ''}"
 date: {date_str}
-domain: internet-finance
+domain: {domain}
 format: conversation
 status: unprocessed
 priority: {priority}