feat: author handle domain signal + conversation skip at source (Ganymede)

1. Author handle map: known X accounts (MetaDAO, Anthropic, SpaceX, etc.)
   count as one keyword match toward the domain-routing threshold.
   Lightweight; no URL parsing.

2. Conversation archives now write to a conversations/ subdir instead of
   the top-level staging dir. The cron job only moves top-level *.md files
   to the queue, so conversations never enter the extraction pipeline. The
   skip happens at write time rather than at batch-extract read time,
   eliminating wasted I/O every 15 minutes. A sketch of the cron-side
   mover follows.
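
   For context, a minimal sketch of the mover the skip relies on. The
   mover itself is not part of this diff; the ARCHIVE_DIR and QUEUE_DIR
   values and the function name are assumptions.

      from pathlib import Path

      ARCHIVE_DIR = "inbox/telegram-archives"  # assumed value, not shown in this commit
      QUEUE_DIR = "inbox/queue"                # assumed value, not shown in this commit

      def move_staged_to_queue() -> None:
          """Hypothetical cron entry point: move staged notes into the queue."""
          queue = Path(QUEUE_DIR)
          queue.mkdir(parents=True, exist_ok=True)
          # glob("*.md") is non-recursive: files under conversations/ are
          # never matched, so they never reach the extraction queue.
          for md in Path(ARCHIVE_DIR).glob("*.md"):
              md.rename(queue / md.name)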

Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
m3taversal committed 2026-03-26 14:39:15 +00:00
commit 1dfc6dcc5c (parent b5aabe0364)


@@ -142,17 +142,43 @@ _DOMAIN_KEYWORDS = {
 }
 
-def _classify_content(text: str) -> tuple[str, list[str]]:
-    """Classify content into domain + sub-tags based on keywords.
+# Author handle → domain map (Ganymede: counts as 1 keyword match)
+_AUTHOR_DOMAIN_MAP = {
+    "metadaoproject": "internet-finance",
+    "metadaofi": "internet-finance",
+    "futardio": "internet-finance",
+    "p2pdotme": "internet-finance",
+    "oxranga": "internet-finance",
+    "metanallok": "internet-finance",
+    "proph3t_": "internet-finance",
+    "01resolved": "internet-finance",
+    "anthropicai": "ai-alignment",
+    "openai": "ai-alignment",
+    "daborai": "ai-alignment",
+    "deepmind": "ai-alignment",
+    "spacex": "space-development",
+    "blaborig": "space-development",
+    "nasa": "space-development",
+}
+
+def _classify_content(text: str, author: str = "") -> tuple[str, list[str]]:
+    """Classify content into domain + sub-tags based on keywords + author.
 
     Returns (domain, [sub-tags]). Default: internet-finance with no sub-tags.
     """
     text_lower = text.lower()
+    author_lower = author.lower().lstrip("@")
+    # Author handle gives 1 keyword match toward domain threshold
+    author_domain = _AUTHOR_DOMAIN_MAP.get(author_lower, "")
 
     # Check non-IF domains first
     for domain, keywords in _DOMAIN_KEYWORDS.items():
         matches = sum(1 for kw in keywords if kw in text_lower)
-        if matches >= 2:  # Need 2+ keyword matches to override default domain
+        if author_domain == domain:
+            matches += 1  # Author signal counts as 1 match
+        if matches >= 2:
             return domain, []
 
     # Default to internet-finance, classify sub-topics
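
A quick worked example of the boost (a sketch, not part of this commit: the
keyword set below is hypothetical, and the real _DOMAIN_KEYWORDS is elided
above). One keyword alone stays below the 2-match threshold; the same text
from a mapped handle crosses it.

    # Self-contained sketch of the boosted threshold; keyword set is hypothetical.
    _DOMAIN_KEYWORDS = {"ai-alignment": ["alignment", "interpretability", "rlhf"]}
    _AUTHOR_DOMAIN_MAP = {"anthropicai": "ai-alignment"}

    def classify(text: str, author: str = "") -> str:
        text_lower = text.lower()
        author_domain = _AUTHOR_DOMAIN_MAP.get(author.lower().lstrip("@"), "")
        for domain, keywords in _DOMAIN_KEYWORDS.items():
            matches = sum(1 for kw in keywords if kw in text_lower)
            if author_domain == domain:
                matches += 1  # author handle counts as one keyword match
            if matches >= 2:
                return domain
        return "internet-finance"  # default domain

    assert classify("new alignment paper dropped") == "internet-finance"
    assert classify("new alignment paper dropped", "@AnthropicAI") == "ai-alignment"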
@@ -1234,15 +1260,19 @@ def _extract_urls(text: str) -> list[str]:
 def _archive_exchange(user_text: str, rio_response: str, user, msg,
                       url_content: str | None = None, urls: list[str] | None = None):
-    """Archive a tagged exchange to inbox/queue/ for pipeline processing."""
+    """Archive a tagged exchange. Conversations go to telegram-archives/conversations/
+    (not queue; skips extraction). Sources with URLs already have standalone files."""
     try:
         date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
         username = user.username if user else "anonymous"
         slug = re.sub(r"[^a-z0-9]+", "-", user_text[:50].lower()).strip("-")
         filename = f"{date_str}-telegram-{username}-{slug}.md"
-        archive_path = Path(ARCHIVE_DIR) / filename
-        archive_path.parent.mkdir(parents=True, exist_ok=True)
+        # Conversations go to conversations/ subdir (Ganymede: skip extraction at source).
+        # The cron only moves top-level ARCHIVE_DIR/*.md to queue; subdirs are untouched.
+        conv_dir = Path(ARCHIVE_DIR) / "conversations"
+        conv_dir.mkdir(parents=True, exist_ok=True)
+        archive_path = conv_dir / filename
 
         # Extract rationale (the user's text minus the @mention and URL)
         rationale = re.sub(r"@\w+", "", user_text).strip()
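
Resulting staging layout (a sketch; the ARCHIVE_DIR path and the filenames
are hypothetical):

    inbox/telegram-archives/                                 # assumed ARCHIVE_DIR
        2026-03-26-telegram-alice-metadao-thread.md          # top-level: cron moves to queue
        conversations/
            2026-03-26-telegram-bob-thoughts-on-futarchy.md  # subdir: cron never touches it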