refine: x-tweet vs x-article source_type, 500ms rate limit (Ganymede)
- Distinguish tweets (source_type: x-tweet, format: social-media) from articles (source_type: x-article, format: article) based on content length and article marker presence
- 500ms delay between fetch_from_url calls in research path
- Keep standalone sources pure (no Rio analysis — circular dependency)

Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
This commit is contained in:
parent
b3c635290f
commit
c2ff4996e3
1 changed file with 14 additions and 6 deletions
|
|
@@ -332,7 +332,9 @@ async def handle_research(msg, query: str, user, silent: bool = False):
|
|||
|
||||
# Fetch full content for top tweets (not just search snippets)
|
||||
from x_client import fetch_from_url
|
||||
for tweet in tweets[:5]: # Top 5 by engagement
|
||||
for i, tweet in enumerate(tweets[:5]): # Top 5 by engagement
|
||||
if i > 0:
|
||||
await asyncio.sleep(0.5) # Ganymede: 500ms between calls, polite to Ben's API
|
||||
url = tweet.get("url", "")
|
||||
if url:
|
||||
try:
|
||||
|
|
@@ -766,7 +768,8 @@ def _archive_standalone_source(url: str, content: str, user):
|
|||
|
||||
Separate from the conversation archive — this is the actual article/tweet
|
||||
entering the extraction pipeline as a proper source, attributed to the
|
||||
contributor who shared it.
|
||||
contributor who shared it. Ganymede: keep pure (no Rio analysis), two
|
||||
source_types (x-tweet vs x-article).
|
||||
"""
|
||||
try:
|
||||
username = user.username if user else "anonymous"
|
||||
|
|
@@ -778,6 +781,11 @@ def _archive_standalone_source(url: str, content: str, user):
|
|||
if author_match:
|
||||
author = f"@{author_match.group(1)}"
|
||||
|
||||
# Distinguish tweet vs article (Ganymede: different extraction behavior)
|
||||
is_article = "--- Article Content ---" in content and len(content) > 1000
|
||||
source_type = "x-article" if is_article else "x-tweet"
|
||||
fmt = "article" if is_article else "social-media"
|
||||
|
||||
slug = re.sub(r"[^a-z0-9]+", "-", f"{author}-{url.split('/')[-1][:30]}".lower()).strip("-")
|
||||
filename = f"{date_str}-tg-shared-{slug}.md"
|
||||
source_path = Path(ARCHIVE_DIR) / filename
|
||||
|
|
@@ -788,20 +796,20 @@ def _archive_standalone_source(url: str, content: str, user):
|
|||
|
||||
source_content = f"""---
|
||||
type: source
|
||||
source_type: x-article
|
||||
source_type: {source_type}
|
||||
title: "{author} — shared via Telegram by @{username}"
|
||||
author: "{author}"
|
||||
url: "{url}"
|
||||
date: {date_str}
|
||||
domain: internet-finance
|
||||
format: article
|
||||
format: {fmt}
|
||||
status: unprocessed
|
||||
proposed_by: "@{username}"
|
||||
contribution_type: source-submission
|
||||
tags: [telegram-shared, x-article]
|
||||
tags: [telegram-shared, {source_type}]
|
||||
---
|
||||
|
||||
# {author} — Article/Thread
|
||||
# {author} — {'Article' if is_article else 'Tweet/Thread'}
|
||||
|
||||
Shared by @{username} via Telegram.
|
||||
Source URL: {url}
|
||||
|
|
|
|||
Loading…
Reference in a new issue