refine: x-tweet vs x-article source_type, 500ms rate limit (Ganymede)

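- Distinguish tweets (source_type: x-tweet, format: social-media) from
  articles (source_type: x-article, format: article): content counts as an
  article only if it contains the "--- Article Content ---" marker and is
  longer than 1000 characters; everything else is treated as a tweet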
- 500ms delay between fetch_from_url calls in research path
- Keep standalone sources pure (no Rio analysis — circular dependency)

Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
This commit is contained in:
m3taversal 2026-03-24 16:00:19 +00:00
parent b3c635290f
commit c2ff4996e3

View file

@ -332,7 +332,9 @@ async def handle_research(msg, query: str, user, silent: bool = False):
# Fetch full content for top tweets (not just search snippets)
from x_client import fetch_from_url
for tweet in tweets[:5]: # Top 5 by engagement
for i, tweet in enumerate(tweets[:5]): # Top 5 by engagement
if i > 0:
await asyncio.sleep(0.5) # Ganymede: 500ms between calls, polite to Ben's API
url = tweet.get("url", "")
if url:
try:
@ -766,7 +768,8 @@ def _archive_standalone_source(url: str, content: str, user):
Separate from the conversation archive: this is the actual article/tweet
entering the extraction pipeline as a proper source, attributed to the
contributor who shared it.
contributor who shared it. Ganymede: keep pure (no Rio analysis), two
source_types (x-tweet vs x-article).
"""
try:
username = user.username if user else "anonymous"
@ -778,6 +781,11 @@ def _archive_standalone_source(url: str, content: str, user):
if author_match:
author = f"@{author_match.group(1)}"
# Distinguish tweet vs article (Ganymede: different extraction behavior)
is_article = "--- Article Content ---" in content and len(content) > 1000
source_type = "x-article" if is_article else "x-tweet"
fmt = "article" if is_article else "social-media"
slug = re.sub(r"[^a-z0-9]+", "-", f"{author}-{url.split('/')[-1][:30]}".lower()).strip("-")
filename = f"{date_str}-tg-shared-{slug}.md"
source_path = Path(ARCHIVE_DIR) / filename
@ -788,20 +796,20 @@ def _archive_standalone_source(url: str, content: str, user):
source_content = f"""---
type: source
source_type: x-article
source_type: {source_type}
title: "{author} — shared via Telegram by @{username}"
author: "{author}"
url: "{url}"
date: {date_str}
domain: internet-finance
format: article
format: {fmt}
status: unprocessed
proposed_by: "@{username}"
contribution_type: source-submission
tags: [telegram-shared, x-article]
tags: [telegram-shared, {source_type}]
---
# {author} — Article/Thread
# {author} — {'Article' if is_article else 'Tweet/Thread'}
Shared by @{username} via Telegram.
Source URL: {url}