refine: x-tweet vs x-article source_type, 500ms rate limit (Ganymede)

- Distinguish tweets (source_type: x-tweet, format: social-media) from articles (source_type: x-article, format: article) based on content length and article marker presence
- 500ms delay between fetch_from_url calls in research path
- Keep standalone sources pure (no Rio analysis — circular dependency)

Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
2026-03-24 16:00:19 +00:00 · 2026-03-24 16:00:19 +00:00 · c2ff4996e3
commit c2ff4996e3
parent b3c635290f
1 changed files with 14 additions and 6 deletions
--- a/telegram/bot.py
+++ b/telegram/bot.py
@ -332,7 +332,9 @@ async def handle_research(msg, query: str, user, silent: bool = False):
    # Fetch full content for top tweets (not just search snippets)
    from x_client import fetch_from_url
-    for tweet in tweets[:5]:  # Top 5 by engagement
+    for i, tweet in enumerate(tweets[:5]):  # Top 5 by engagement
        if i > 0:
            await asyncio.sleep(0.5)  # Ganymede: 500ms between calls, polite to Ben's API
        url = tweet.get("url", "")
        if url:
            try:
@ -766,7 +768,8 @@ def _archive_standalone_source(url: str, content: str, user):
    Separate from the conversation archive — this is the actual article/tweet
    entering the extraction pipeline as a proper source, attributed to the
-    contributor who shared it.
+    contributor who shared it. Ganymede: keep pure (no Rio analysis), two
    source_types (x-tweet vs x-article).
    """
    try:
        username = user.username if user else "anonymous"
@ -778,6 +781,11 @@ def _archive_standalone_source(url: str, content: str, user):
        if author_match:
            author = f"@{author_match.group(1)}"
        # Distinguish tweet vs article (Ganymede: different extraction behavior)
        is_article = "--- Article Content ---" in content and len(content) > 1000
        source_type = "x-article" if is_article else "x-tweet"
        fmt = "article" if is_article else "social-media"
        slug = re.sub(r"[^a-z0-9]+", "-", f"{author}-{url.split('/')[-1][:30]}".lower()).strip("-")
        filename = f"{date_str}-tg-shared-{slug}.md"
        source_path = Path(ARCHIVE_DIR) / filename
@ -788,20 +796,20 @@ def _archive_standalone_source(url: str, content: str, user):
        source_content = f"""---
 type: source
-source_type: x-article
+source_type: {source_type}
 title: "{author} — shared via Telegram by @{username}"
 author: "{author}"
 url: "{url}"
 date: {date_str}
 domain: internet-finance
-format: article
+format: {fmt}
 status: unprocessed
 proposed_by: "@{username}"
 contribution_type: source-submission
-tags: [telegram-shared, x-article]
+tags: [telegram-shared, {source_type}]
 ---
-# {author} — Article/Thread
+# {author} — {'Article' if is_article else 'Tweet/Thread'}
 Shared by @{username} via Telegram.
 Source URL: {url}