# Patch notes (free-form preamble placed before the first "diff --git" line).
#
# What this patch changes in telegram/bot.py:
#   * Adds _AUTHOR_DOMAIN_MAP: 15 author handles (lowercase, no "@") mapped to a
#     domain string (8 internet-finance, 4 ai-alignment, 3 space-development).
#   * _classify_content() gains an optional `author` argument (default ""). The
#     handle is lowercased, leading "@" characters are stripped, and it is looked
#     up in _AUTHOR_DOMAIN_MAP. Inside the _DOMAIN_KEYWORDS loop, a handle whose
#     mapped domain equals the loop's domain adds 1 to that domain's keyword
#     match count. The threshold stays at 2, so an author hit needs at least one
#     keyword hit to select the domain; an author hit alone does not.
#   * _archive_exchange() now writes to Path(ARCHIVE_DIR) / "conversations" /
#     filename instead of Path(ARCHIVE_DIR) / filename. The in-patch comment gives
#     the reason: the cron only moves top-level ARCHIVE_DIR/*.md to the queue.
#
# NOTE(review): author.lower() raises AttributeError if a caller passes None
#   instead of a str -- confirm every call site passes a string.
# NOTE(review): handles mapped to "internet-finance" presumably have no effect
#   if _DOMAIN_KEYWORDS only holds non-IF domains (see "Check non-IF domains
#   first"); _DOMAIN_KEYWORDS is not visible in this patch -- confirm.
# NOTE(review): if user.username can be None, the f-string filename will contain
#   the literal text "None" -- verify against the caller.
# NOTE(review): the line breaks and indentation below were reconstructed from
#   the hunk line counts. In the second hunk, the wrapping of the
#   _archive_exchange signature and the blank context line after `filename = ...`
#   are inferred -- confirm against the target file before applying.
# Both hunks show only part of their functions; the remaining lines of each
# function are outside this patch.
diff --git a/telegram/bot.py b/telegram/bot.py
index 779a1c0..e9907fb 100644
--- a/telegram/bot.py
+++ b/telegram/bot.py
@@ -142,17 +142,43 @@ _DOMAIN_KEYWORDS = {
 }
 
 
-def _classify_content(text: str) -> tuple[str, list[str]]:
-    """Classify content into domain + sub-tags based on keywords.
+# Author handle → domain map (Ganymede: counts as 1 keyword match)
+_AUTHOR_DOMAIN_MAP = {
+    "metadaoproject": "internet-finance",
+    "metadaofi": "internet-finance",
+    "futardio": "internet-finance",
+    "p2pdotme": "internet-finance",
+    "oxranga": "internet-finance",
+    "metanallok": "internet-finance",
+    "proph3t_": "internet-finance",
+    "01resolved": "internet-finance",
+    "anthropicai": "ai-alignment",
+    "openai": "ai-alignment",
+    "daborai": "ai-alignment",
+    "deepmind": "ai-alignment",
+    "spacex": "space-development",
+    "blaborig": "space-development",
+    "nasa": "space-development",
+}
+
+
+def _classify_content(text: str, author: str = "") -> tuple[str, list[str]]:
+    """Classify content into domain + sub-tags based on keywords + author.
 
     Returns (domain, [sub-tags]). Default: internet-finance with no sub-tags.
     """
     text_lower = text.lower()
+    author_lower = author.lower().lstrip("@")
+
+    # Author handle gives 1 keyword match toward domain threshold
+    author_domain = _AUTHOR_DOMAIN_MAP.get(author_lower, "")
 
     # Check non-IF domains first
     for domain, keywords in _DOMAIN_KEYWORDS.items():
         matches = sum(1 for kw in keywords if kw in text_lower)
-        if matches >= 2:  # Need 2+ keyword matches to override default domain
+        if author_domain == domain:
+            matches += 1  # Author signal counts as 1 match
+        if matches >= 2:
             return domain, []
 
     # Default to internet-finance, classify sub-topics
@@ -1234,15 +1260,19 @@ def _extract_urls(text: str) -> list[str]:
 
 def _archive_exchange(user_text: str, rio_response: str, user, msg,
                       url_content: str | None = None, urls: list[str] | None = None):
-    """Archive a tagged exchange to inbox/queue/ for pipeline processing."""
+    """Archive a tagged exchange. Conversations go to telegram-archives/conversations/
+    (not queue — skips extraction). Sources with URLs already have standalone files."""
     try:
         date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
         username = user.username if user else "anonymous"
         slug = re.sub(r"[^a-z0-9]+", "-", user_text[:50].lower()).strip("-")
         filename = f"{date_str}-telegram-{username}-{slug}.md"
 
-        archive_path = Path(ARCHIVE_DIR) / filename
-        archive_path.parent.mkdir(parents=True, exist_ok=True)
+        # Conversations go to conversations/ subdir (Ganymede: skip extraction at source).
+        # The cron only moves top-level ARCHIVE_DIR/*.md to queue — subdirs are untouched.
+        conv_dir = Path(ARCHIVE_DIR) / "conversations"
+        conv_dir.mkdir(parents=True, exist_ok=True)
+        archive_path = conv_dir / filename
 
         # Extract rationale (the user's text minus the @mention and URL)
         rationale = re.sub(r"@\w+", "", user_text).strip()