feat: content classification — domain routing + sub-tags for sources
All source creation functions now classify content by domain and sub-topic instead of hardcoding internet-finance. Domain routing: keyword matching (2+ hits) routes to ai-alignment, health, space-development, entertainment. Default: internet-finance. Sub-tags for internet-finance: futarchy, ownership-coins, defi, governance, market-analysis, crypto-infra. Added to source frontmatter tags array for granular filtering. Applied to: standalone sources, inline SOURCE:/CLAIM:, conversation archives, research archives. Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
This commit is contained in:
parent
0854375fd0
commit
b5aabe0364
1 changed file with 67 additions and 6 deletions
|
|
@ -110,6 +110,60 @@ chat_transcripts: dict[int, list[dict]] = {}
|
|||
TRANSCRIPT_DIR = "/opt/teleo-eval/transcripts"
|
||||
|
||||
|
||||
# ─── Content Classification ─────────────────────────────────────────────
|
||||
|
||||
# Sub-topic keywords for internet-finance sources
|
||||
_TOPIC_KEYWORDS = {
|
||||
"futarchy": ["futarchy", "autocrat", "conditional market", "twap", "pass/fail",
|
||||
"decision market", "futard", "metadao governance"],
|
||||
"ownership-coins": ["ownership coin", "ico", "fundraise", "launch", "launchpad",
|
||||
"permissioned", "permissionless", "unruggable", "treasury management",
|
||||
"buyback", "token split"],
|
||||
"defi": ["amm", "liquidity", "swap", "lending", "borrowing", "yield", "tvl",
|
||||
"dex", "lp", "staking", "vault", "protocol"],
|
||||
"governance": ["proposal", "vote", "governance", "dao", "subcommittee",
|
||||
"treasury", "resolution", "benevolent dictator"],
|
||||
"market-analysis": ["price", "market cap", "fdv", "oversubscribed", "committed",
|
||||
"trading", "volume", "bullish", "bearish", "thesis"],
|
||||
"crypto-infra": ["solana", "ethereum", "base", "bridge", "wallet", "on-ramp",
|
||||
"off-ramp", "fiat", "stablecoin", "usdc"],
|
||||
}
|
||||
|
||||
# Domain keywords for non-internet-finance content
|
||||
_DOMAIN_KEYWORDS = {
|
||||
"ai-alignment": ["ai safety", "alignment", "superintelligence", "llm", "frontier model",
|
||||
"interpretability", "rlhf", "anthropic", "openai", "deepmind"],
|
||||
"health": ["glp-1", "healthcare", "clinical", "pharma", "biotech", "fda",
|
||||
"medicare", "hospital", "diagnosis", "therapeutic"],
|
||||
"space-development": ["spacex", "starship", "orbital", "lunar", "satellite",
|
||||
"launch cost", "rocket", "nasa", "artemis"],
|
||||
"entertainment": ["streaming", "creator economy", "ip", "nft", "gaming",
|
||||
"content", "media", "studio", "audience"],
|
||||
}
|
||||
|
||||
|
||||
def _keyword_matches(text_lower: str, keyword: str) -> bool:
    """Return True if *keyword* occurs in *text_lower* on a word boundary.

    Plain substring matching produces false positives for short keywords
    ("ip" inside "ownership", "lp" inside "help", "ico" inside "silicon").
    Keywords of three characters or fewer must therefore appear as a whole
    word, optionally pluralised ("nft" / "nfts", "dex" / "dexes").  Longer
    keywords only need to *start* on a word boundary, so inflected forms
    such as "launched" or "proposals" still count.
    """
    # Imported locally so this helper does not rely on module-level imports.
    import re

    escaped = re.escape(keyword)
    if len(keyword) <= 3:
        pattern = rf"(?<!\w){escaped}(?:s|es)?(?!\w)"
    else:
        pattern = rf"(?<!\w){escaped}"
    return re.search(pattern, text_lower) is not None


def _classify_content(text: str) -> tuple[str, list[str]]:
    """Classify content into domain + sub-tags based on keywords.

    Args:
        text: Free-form content to classify. ``None`` or an empty string is
            treated as content with no keyword hits.

    Returns:
        ``(domain, sub_tags)``. A non-internet-finance domain is returned,
        with an empty sub-tag list, when at least two of its keywords match;
        if several domains qualify, the one with the most distinct keyword
        hits wins (ties go to the domain listed first in
        ``_DOMAIN_KEYWORDS``). Otherwise the domain is ``"internet-finance"``
        and ``sub_tags`` holds every ``_TOPIC_KEYWORDS`` tag with at least
        one matching keyword, in table order (possibly empty).
    """
    text_lower = (text or "").lower()

    # Check non-IF domains first; a single stray keyword must not override
    # the default domain, hence the 2-hit minimum.
    min_domain_hits = 2
    best_domain = None
    best_hits = 0
    for domain, keywords in _DOMAIN_KEYWORDS.items():
        hits = sum(1 for kw in keywords if _keyword_matches(text_lower, kw))
        # Strict ">" keeps the earliest domain on ties.
        if hits >= min_domain_hits and hits > best_hits:
            best_domain, best_hits = domain, hits
    if best_domain is not None:
        return best_domain, []

    # Default to internet-finance and collect every matching sub-topic.
    sub_tags = [
        tag
        for tag, keywords in _TOPIC_KEYWORDS.items()
        if any(_keyword_matches(text_lower, kw) for kw in keywords)
    ]
    return "internet-finance", sub_tags
|
||||
|
||||
|
||||
# ─── Transcript Management ──────────────────────────────────────────────
|
||||
|
||||
|
||||
|
|
@ -210,12 +264,12 @@ source_type: telegram-contribution
|
|||
title: "Source from @{username} — {source_text[:80]}"
|
||||
author: "@{username}"
|
||||
date: {date_str}
|
||||
domain: internet-finance
|
||||
domain: {_classify_content(source_text + " " + user_message)[0]}
|
||||
format: contribution
|
||||
status: unprocessed
|
||||
proposed_by: "@{username}"
|
||||
contribution_type: source-submission
|
||||
tags: [telegram-contribution, inline-source]
|
||||
tags: {["telegram-contribution", "inline-source"] + _classify_content(source_text + " " + user_message)[1]}
|
||||
---
|
||||
|
||||
# Source: {source_text[:100]}
|
||||
|
|
@ -248,13 +302,15 @@ def _create_inline_claim(claim_text: str, user_message: str, user, msg):
|
|||
if source_path.exists():
|
||||
return
|
||||
|
||||
domain, sub_tags = _classify_content(claim_text + " " + user_message)
|
||||
|
||||
content = f"""---
|
||||
type: source
|
||||
source_type: telegram-claim
|
||||
title: "Claim from @{username} — {claim_text[:80]}"
|
||||
author: "@{username}"
|
||||
date: {date_str}
|
||||
domain: internet-finance
|
||||
domain: {domain}
|
||||
format: claim-draft
|
||||
status: unprocessed
|
||||
proposed_by: "@{username}"
|
||||
|
|
@ -1077,6 +1133,9 @@ def _archive_standalone_source(url: str, content: str, user):
|
|||
if source_path.exists():
|
||||
return
|
||||
|
||||
domain, sub_tags = _classify_content(content)
|
||||
all_tags = ["telegram-shared", source_type] + sub_tags
|
||||
|
||||
source_content = f"""---
|
||||
type: source
|
||||
source_type: {source_type}
|
||||
|
|
@ -1084,12 +1143,12 @@ title: "{author} — shared via Telegram by @{username}"
|
|||
author: "{author}"
|
||||
url: "{url}"
|
||||
date: {date_str}
|
||||
domain: internet-finance
|
||||
domain: {domain}
|
||||
format: {fmt}
|
||||
status: unprocessed
|
||||
proposed_by: "@{username}"
|
||||
contribution_type: source-submission
|
||||
tags: [telegram-shared, {source_type}]
|
||||
tags: {all_tags}
|
||||
---
|
||||
|
||||
# {author} — {'Article' if is_article else 'Tweet/Thread'}
|
||||
|
|
@ -1198,6 +1257,8 @@ def _archive_exchange(user_text: str, rio_response: str, user, msg,
|
|||
if url_content:
|
||||
url_section = f"\n## Article Content (fetched)\n\n{url_content[:8000]}\n"
|
||||
|
||||
domain, sub_tags = _classify_content(user_text + " " + rio_response)
|
||||
|
||||
content = f"""---
|
||||
type: source
|
||||
source_type: telegram
|
||||
|
|
@ -1205,7 +1266,7 @@ title: "Telegram: @{username} — {slug}"
|
|||
author: "@{username}"
|
||||
url: "{urls[0] if urls else ''}"
|
||||
date: {date_str}
|
||||
domain: internet-finance
|
||||
domain: {domain}
|
||||
format: conversation
|
||||
status: unprocessed
|
||||
priority: {priority}
|
||||
|
|
|
|||
Loading…
Reference in a new issue