From b5aabe03648326a925cb943a56b8786af5b56dfb Mon Sep 17 00:00:00 2001
From: m3taversal
Date: Thu, 26 Mar 2026 14:34:33 +0000
Subject: [PATCH] =?UTF-8?q?feat:=20content=20classification=20=E2=80=94=20?=
 =?UTF-8?q?domain=20routing=20+=20sub-tags=20for=20sources?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All source creation functions now classify content by domain and sub-topic
instead of hardcoding internet-finance.

Domain routing: keyword matching (2+ hits) routes to ai-alignment, health,
space-development, entertainment. Default: internet-finance. Keywords match
as whole words or phrases (optional plural), so short terms such as "ip",
"ico" or "lp" no longer fire inside unrelated words; when several domains
qualify, the one with the most hits wins.

Sub-tags for internet-finance: futarchy, ownership-coins, defi, governance,
market-analysis, crypto-infra. Added to source frontmatter tags array for
granular filtering.

Applied to: standalone sources, inline SOURCE:/CLAIM:, conversation archives,
research archives.

Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
---
Reviewer notes (below the "---" separator, so not part of the commit message):
- This patch was re-flowed from a copy whose line breaks had been collapsed.
  Blank context lines and the indentation of context lines are a best-effort
  reconstruction; check them against telegram/bot.py at 95fd69f before
  applying. The post-image blob id on the "index" line predates the edits to
  the first hunk.
- Within the hunks shown, _create_inline_claim and _archive_exchange compute
  sub_tags but no hunk changes a "tags:" line in either function, so sub-tags
  appear to reach only the standalone and inline SOURCE: frontmatter. No hunk
  is obviously a research-archive path either -- confirm against the message.
- "tags:" is rendered from a Python list, i.e. ['a', 'b'] instead of the
  previous [a, b]; confirm the frontmatter consumers accept quoted items.
- "content" and "media" are very generic entertainment keywords; consider
  tightening the list if sources get mis-routed.

 telegram/bot.py | 91 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 85 insertions(+), 6 deletions(-)

diff --git a/telegram/bot.py b/telegram/bot.py
index 95fd69f..779a1c0 100644
--- a/telegram/bot.py
+++ b/telegram/bot.py
@@ -110,6 +110,78 @@ chat_transcripts: dict[int, list[dict]] = {}
 
 TRANSCRIPT_DIR = "/opt/teleo-eval/transcripts"
 
+# ─── Content Classification ─────────────────────────────────────────────
+
+# Sub-topic keywords for internet-finance sources
+_TOPIC_KEYWORDS = {
+    "futarchy": ["futarchy", "autocrat", "conditional market", "twap", "pass/fail",
+                 "decision market", "futard", "metadao governance"],
+    "ownership-coins": ["ownership coin", "ico", "fundraise", "launch", "launchpad",
+                        "permissioned", "permissionless", "unruggable", "treasury management",
+                        "buyback", "token split"],
+    "defi": ["amm", "liquidity", "swap", "lending", "borrowing", "yield", "tvl",
+             "dex", "lp", "staking", "vault", "protocol"],
+    "governance": ["proposal", "vote", "governance", "dao", "subcommittee",
+                   "treasury", "resolution", "benevolent dictator"],
+    "market-analysis": ["price", "market cap", "fdv", "oversubscribed", "committed",
+                        "trading", "volume", "bullish", "bearish", "thesis"],
+    "crypto-infra": ["solana", "ethereum", "base", "bridge", "wallet", "on-ramp",
+                     "off-ramp", "fiat", "stablecoin", "usdc"],
+}
+
+# Domain keywords for non-internet-finance content
+_DOMAIN_KEYWORDS = {
+    "ai-alignment": ["ai safety", "alignment", "superintelligence", "llm", "frontier model",
+                     "interpretability", "rlhf", "anthropic", "openai", "deepmind"],
+    "health": ["glp-1", "healthcare", "clinical", "pharma", "biotech", "fda",
+               "medicare", "hospital", "diagnosis", "therapeutic"],
+    "space-development": ["spacex", "starship", "orbital", "lunar", "satellite",
+                          "launch cost", "rocket", "nasa", "artemis"],
+    "entertainment": ["streaming", "creator economy", "ip", "nft", "gaming",
+                      "content", "media", "studio", "audience"],
+}
+
+# Distinct keyword hits a domain needs before it overrides the default
+_DOMAIN_MATCH_THRESHOLD = 2
+
+
+def _keyword_hits(text_lower: str, keywords: list[str]) -> int:
+    """Count the keywords that occur in ``text_lower`` as whole words/phrases.
+
+    Plain substring tests let short keywords fire inside unrelated words
+    ("ip" in "shipping", "ico" in "icon", "lp" in "help"), so every keyword
+    must be delimited by non-alphanumeric characters. A trailing plural
+    "s"/"es" is tolerated. Each keyword is counted at most once.
+    """
+    import re  # local import: the module's import block is outside this hunk
+
+    return sum(
+        1
+        for kw in keywords
+        if re.search(rf"(?<![a-z0-9]){re.escape(kw)}(?:s|es)?(?![a-z0-9])", text_lower)
+    )
+
+
+def _classify_content(text: str) -> tuple[str, list[str]]:
+    """Classify content into domain + sub-tags based on keywords.
+
+    Returns (domain, [sub-tags]). Default: internet-finance with no sub-tags.
+    Sub-tags are only produced for the default domain.
+    """
+    text_lower = (text or "").lower()
+
+    # Check non-IF domains first; on a tie the earlier _DOMAIN_KEYWORDS entry wins
+    scores = {name: _keyword_hits(text_lower, kws) for name, kws in _DOMAIN_KEYWORDS.items()}
+    best = max(scores, key=scores.get)
+    if scores[best] >= _DOMAIN_MATCH_THRESHOLD:
+        return best, []
+
+    # Default to internet-finance, classify sub-topics
+    sub_tags = [tag for tag, kws in _TOPIC_KEYWORDS.items() if _keyword_hits(text_lower, kws)]
+
+    return "internet-finance", sub_tags
+
+
 # ─── Transcript Management ──────────────────────────────────────────────
 
 
@@ -210,12 +282,12 @@ source_type: telegram-contribution
 title: "Source from @{username} — {source_text[:80]}"
 author: "@{username}"
 date: {date_str}
-domain: internet-finance
+domain: {_classify_content(source_text + " " + user_message)[0]}
 format: contribution
 status: unprocessed
 proposed_by: "@{username}"
 contribution_type: source-submission
-tags: [telegram-contribution, inline-source]
+tags: {["telegram-contribution", "inline-source"] + _classify_content(source_text + " " + user_message)[1]}
 ---
 
 # Source: {source_text[:100]}
@@ -248,13 +320,15 @@ def _create_inline_claim(claim_text: str, user_message: str, user, msg):
     if source_path.exists():
         return
 
+    domain, sub_tags = _classify_content(claim_text + " " + user_message)
+
     content = f"""---
 type: source
 source_type: telegram-claim
 title: "Claim from @{username} — {claim_text[:80]}"
 author: "@{username}"
 date: {date_str}
-domain: internet-finance
+domain: {domain}
 format: claim-draft
 status: unprocessed
 proposed_by: "@{username}"
@@ -1077,6 +1151,9 @@ def _archive_standalone_source(url: str, content: str, user):
     if source_path.exists():
         return
 
+    domain, sub_tags = _classify_content(content)
+    all_tags = ["telegram-shared", source_type] + sub_tags
+
     source_content = f"""---
 type: source
 source_type: {source_type}
@@ -1084,12 +1161,12 @@ title: "{author} — shared via Telegram by @{username}"
 author: "{author}"
 url: "{url}"
 date: {date_str}
-domain: internet-finance
+domain: {domain}
 format: {fmt}
 status: unprocessed
 proposed_by: "@{username}"
 contribution_type: source-submission
-tags: [telegram-shared, {source_type}]
+tags: {all_tags}
 ---
 
 # {author} — {'Article' if is_article else 'Tweet/Thread'}
@@ -1198,6 +1275,8 @@ def _archive_exchange(user_text: str, rio_response: str, user, msg,
     if url_content:
         url_section = f"\n## Article Content (fetched)\n\n{url_content[:8000]}\n"
 
+    domain, sub_tags = _classify_content(user_text + " " + rio_response)
+
     content = f"""---
 type: source
 source_type: telegram
@@ -1205,7 +1284,7 @@ title: "Telegram: @{username} — {slug}"
 author: "@{username}"
 url: "{urls[0] if urls else ''}"
 date: {date_str}
-domain: internet-finance
+domain: {domain}
 format: conversation
 status: unprocessed
 priority: {priority}