refine: x-tweet vs x-article source_type, 500ms rate limit (Ganymede)
- Distinguish tweets (source_type: x-tweet, format: social-media) from articles (source_type: x-article, format: article) based on content length and the presence of the article marker
- Add a 500ms delay between fetch_from_url calls in the research path
- Keep standalone sources pure (no Rio analysis — circular dependency)

Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
This commit is contained in:
parent
b3c635290f
commit
c2ff4996e3
1 changed file with 14 additions and 6 deletions
|
|
@@ -332,7 +332,9 @@ async def handle_research(msg, query: str, user, silent: bool = False):
|
||||||
|
|
||||||
# Fetch full content for top tweets (not just search snippets)
|
# Fetch full content for top tweets (not just search snippets)
|
||||||
from x_client import fetch_from_url
|
from x_client import fetch_from_url
|
||||||
for tweet in tweets[:5]: # Top 5 by engagement
|
for i, tweet in enumerate(tweets[:5]): # Top 5 by engagement
|
||||||
|
if i > 0:
|
||||||
|
await asyncio.sleep(0.5) # Ganymede: 500ms between calls, polite to Ben's API
|
||||||
url = tweet.get("url", "")
|
url = tweet.get("url", "")
|
||||||
if url:
|
if url:
|
||||||
try:
|
try:
|
||||||
|
|
@@ -766,7 +768,8 @@ def _archive_standalone_source(url: str, content: str, user):
|
||||||
|
|
||||||
Separate from the conversation archive — this is the actual article/tweet
|
Separate from the conversation archive — this is the actual article/tweet
|
||||||
entering the extraction pipeline as a proper source, attributed to the
|
entering the extraction pipeline as a proper source, attributed to the
|
||||||
contributor who shared it.
|
contributor who shared it. Ganymede: keep pure (no Rio analysis), two
|
||||||
|
source_types (x-tweet vs x-article).
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
username = user.username if user else "anonymous"
|
username = user.username if user else "anonymous"
|
||||||
|
|
@@ -778,6 +781,11 @@ def _archive_standalone_source(url: str, content: str, user):
|
||||||
if author_match:
|
if author_match:
|
||||||
author = f"@{author_match.group(1)}"
|
author = f"@{author_match.group(1)}"
|
||||||
|
|
||||||
|
# Distinguish tweet vs article (Ganymede: different extraction behavior)
|
||||||
|
is_article = "--- Article Content ---" in content and len(content) > 1000
|
||||||
|
source_type = "x-article" if is_article else "x-tweet"
|
||||||
|
fmt = "article" if is_article else "social-media"
|
||||||
|
|
||||||
slug = re.sub(r"[^a-z0-9]+", "-", f"{author}-{url.split('/')[-1][:30]}".lower()).strip("-")
|
slug = re.sub(r"[^a-z0-9]+", "-", f"{author}-{url.split('/')[-1][:30]}".lower()).strip("-")
|
||||||
filename = f"{date_str}-tg-shared-{slug}.md"
|
filename = f"{date_str}-tg-shared-{slug}.md"
|
||||||
source_path = Path(ARCHIVE_DIR) / filename
|
source_path = Path(ARCHIVE_DIR) / filename
|
||||||
|
|
@@ -788,20 +796,20 @@ def _archive_standalone_source(url: str, content: str, user):
|
||||||
|
|
||||||
source_content = f"""---
|
source_content = f"""---
|
||||||
type: source
|
type: source
|
||||||
source_type: x-article
|
source_type: {source_type}
|
||||||
title: "{author} — shared via Telegram by @{username}"
|
title: "{author} — shared via Telegram by @{username}"
|
||||||
author: "{author}"
|
author: "{author}"
|
||||||
url: "{url}"
|
url: "{url}"
|
||||||
date: {date_str}
|
date: {date_str}
|
||||||
domain: internet-finance
|
domain: internet-finance
|
||||||
format: article
|
format: {fmt}
|
||||||
status: unprocessed
|
status: unprocessed
|
||||||
proposed_by: "@{username}"
|
proposed_by: "@{username}"
|
||||||
contribution_type: source-submission
|
contribution_type: source-submission
|
||||||
tags: [telegram-shared, x-article]
|
tags: [telegram-shared, {source_type}]
|
||||||
---
|
---
|
||||||
|
|
||||||
# {author} — Article/Thread
|
# {author} — {'Article' if is_article else 'Tweet/Thread'}
|
||||||
|
|
||||||
Shared by @{username} via Telegram.
|
Shared by @{username} via Telegram.
|
||||||
Source URL: {url}
|
Source URL: {url}
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue