feat: author handle domain signal + conversation skip at source (Ganymede)
1. Author handle map: known X accounts (MetaDAO, Anthropic, SpaceX etc.) count as 1 keyword match toward domain routing threshold. Lightweight, no URL parsing. 2. Conversation archives now write to conversations/ subdir instead of top-level staging dir. The cron only moves top-level *.md to queue, so conversations never enter the extraction pipeline. Skip happens at write time, not at batch-extract read time — eliminates wasted I/O every 15 minutes. Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
This commit is contained in:
parent
b5aabe0364
commit
1dfc6dcc5c
1 changed file with 36 additions and 6 deletions
|
|
@ -142,17 +142,43 @@ _DOMAIN_KEYWORDS = {
|
|||
}
|
||||
|
||||
|
||||
def _classify_content(text: str) -> tuple[str, list[str]]:
|
||||
"""Classify content into domain + sub-tags based on keywords.
|
||||
# Author handle → domain map (Ganymede: counts as 1 keyword match)
|
||||
_AUTHOR_DOMAIN_MAP = {
|
||||
"metadaoproject": "internet-finance",
|
||||
"metadaofi": "internet-finance",
|
||||
"futardio": "internet-finance",
|
||||
"p2pdotme": "internet-finance",
|
||||
"oxranga": "internet-finance",
|
||||
"metanallok": "internet-finance",
|
||||
"proph3t_": "internet-finance",
|
||||
"01resolved": "internet-finance",
|
||||
"anthropicai": "ai-alignment",
|
||||
"openai": "ai-alignment",
|
||||
"daborai": "ai-alignment",
|
||||
"deepmind": "ai-alignment",
|
||||
"spacex": "space-development",
|
||||
"blaborig": "space-development",
|
||||
"nasa": "space-development",
|
||||
}
|
||||
|
||||
|
||||
def _classify_content(text: str, author: str = "") -> tuple[str, list[str]]:
|
||||
"""Classify content into domain + sub-tags based on keywords + author.
|
||||
|
||||
Returns (domain, [sub-tags]). Default: internet-finance with no sub-tags.
|
||||
"""
|
||||
text_lower = text.lower()
|
||||
author_lower = author.lower().lstrip("@")
|
||||
|
||||
# Author handle gives 1 keyword match toward domain threshold
|
||||
author_domain = _AUTHOR_DOMAIN_MAP.get(author_lower, "")
|
||||
|
||||
# Check non-IF domains first
|
||||
for domain, keywords in _DOMAIN_KEYWORDS.items():
|
||||
matches = sum(1 for kw in keywords if kw in text_lower)
|
||||
if matches >= 2: # Need 2+ keyword matches to override default domain
|
||||
if author_domain == domain:
|
||||
matches += 1 # Author signal counts as 1 match
|
||||
if matches >= 2:
|
||||
return domain, []
|
||||
|
||||
# Default to internet-finance, classify sub-topics
|
||||
|
|
@ -1234,15 +1260,19 @@ def _extract_urls(text: str) -> list[str]:
|
|||
|
||||
def _archive_exchange(user_text: str, rio_response: str, user, msg,
|
||||
url_content: str | None = None, urls: list[str] | None = None):
|
||||
"""Archive a tagged exchange to inbox/queue/ for pipeline processing."""
|
||||
"""Archive a tagged exchange. Conversations go to telegram-archives/conversations/
|
||||
(not queue — skips extraction). Sources with URLs already have standalone files."""
|
||||
try:
|
||||
date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
|
||||
username = user.username if user else "anonymous"
|
||||
slug = re.sub(r"[^a-z0-9]+", "-", user_text[:50].lower()).strip("-")
|
||||
filename = f"{date_str}-telegram-{username}-{slug}.md"
|
||||
|
||||
archive_path = Path(ARCHIVE_DIR) / filename
|
||||
archive_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
# Conversations go to conversations/ subdir (Ganymede: skip extraction at source).
|
||||
# The cron only moves top-level ARCHIVE_DIR/*.md to queue — subdirs are untouched.
|
||||
conv_dir = Path(ARCHIVE_DIR) / "conversations"
|
||||
conv_dir.mkdir(parents=True, exist_ok=True)
|
||||
archive_path = conv_dir / filename
|
||||
|
||||
# Extract rationale (the user's text minus the @mention and URL)
|
||||
rationale = re.sub(r"@\w+", "", user_text).strip()
|
||||
|
|
|
|||
Loading…
Reference in a new issue