fix: use x_client.fetch_from_url for X URLs in archive pipeline

_fetch_url_content was doing a raw HTTP GET on X URLs, which returns
JavaScript rather than article content. X/Twitter URLs are now routed
through Ben's API via x_client.fetch_from_url, which returns structured
article content (a contents[] array of typed blocks).

Article content gets included in the archived source file so the
extraction pipeline has the actual content, not just Rio's response.

Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
m3taversal 2026-03-24 14:12:31 +00:00
parent 76f13de681
commit 2ec4c445b1
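
The new X/Twitter branch in the diff only reads two fields of whatever
x_client.fetch_from_url returns: a top-level text string and a contents[]
list of typed blocks. Ben's API schema is not part of this commit, so the
payload below is a hedged illustration of that assumed shape, not a
documented contract; all concrete values are invented.

# Illustrative payload shape assumed by the X/Twitter branch below.
# Only "text" and "contents" (with "type"/"text" per block) are read by the
# code; every concrete value here is made up for the example.
example_payload = {
    "text": "Tweet body that linked to the article",
    "contents": [
        {"type": "header-one", "text": "Article title"},
        {"type": "unstyled", "text": "First paragraph of the article."},
        {"type": "blockquote", "text": "A quoted passage."},
        {"type": "list-item", "text": "A bullet from the article."},
    ],
}

Given that shape, the branch renders header blocks as "## ..." lines,
blockquotes as "> ...", list items as "- ...", and passes other block types
through as plain text.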


@@ -703,7 +703,49 @@ IMPORTANT: Two special tags you can append at the end of your response (after yo
async def _fetch_url_content(url: str) -> str | None:
    """Fetch article/page content from a URL for pipeline ingestion."""
    """Fetch article/page content from a URL for pipeline ingestion.

    For X/Twitter URLs, uses Ben's API (x_client.fetch_from_url) which returns
    structured article content. For other URLs, falls back to raw HTTP fetch.
    """
    # X/Twitter URLs → use x_client for structured content
    if "x.com/" in url or "twitter.com/" in url:
        try:
            from x_client import fetch_from_url
            data = await fetch_from_url(url)
            if not data:
                logger.warning("x_client returned no data for %s", url)
                return None
            # Format structured content
            parts = []
            # Tweet text
            tweet_text = data.get("text", "")
            if tweet_text:
                parts.append(tweet_text)
            # Article content (contents[] array with typed blocks)
            contents = data.get("contents", [])
            if contents:
                parts.append("\n--- Article Content ---\n")
                for block in contents:
                    block_type = block.get("type", "unstyled")
                    block_text = block.get("text", "")
                    if not block_text:
                        continue
                    if block_type in ("header-one", "header-two", "header-three"):
                        parts.append(f"\n## {block_text}\n")
                    elif block_type == "blockquote":
                        parts.append(f"> {block_text}")
                    elif block_type == "list-item":
                        parts.append(f"- {block_text}")
                    else:
                        parts.append(block_text)
            result = "\n".join(parts)
            return result[:10000] if result else None
        except Exception as e:
            logger.warning("x_client fetch failed for %s: %s", url, e)
            return None
    # Non-X URLs → raw HTTP fetch with HTML stripping
    import aiohttp
    try:
        async with aiohttp.ClientSession() as session:
@@ -711,12 +753,11 @@ async def _fetch_url_content(url: str) -> str | None:
                if resp.status >= 400:
                    return None
                html = await resp.text()
                # Strip HTML tags for plain text (basic — upgrade to readability later)
                text = re.sub(r"<script.*?</script>", "", html, flags=re.DOTALL)
                text = re.sub(r"<style.*?</style>", "", text, flags=re.DOTALL)
                text = re.sub(r"<[^>]+>", " ", text)
                text = re.sub(r"\s+", " ", text).strip()
                return text[:10000]  # Cap at 10K chars
                return text[:10000]
    except Exception as e:
        logger.warning("Failed to fetch URL %s: %s", url, e)
        return None
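
For context on how the commit message's "archived source file" fits in, here
is a minimal sketch of a caller. The archive layout and helper name are not
part of this diff (archive_url and source.json are hypothetical), and the
sketch assumes it lives in the same module as _fetch_url_content; it only
shows that the fetched text, or None on failure, is persisted with the rest
of the source record.

import asyncio
import json
from pathlib import Path

async def archive_url(url: str, archive_dir: Path) -> None:
    # Hypothetical caller sketch: fetch the content and store it in the
    # archived source record so the extraction pipeline can read it later.
    content = await _fetch_url_content(url)
    record = {"url": url, "content": content}  # content is None if the fetch failed
    archive_dir.mkdir(parents=True, exist_ok=True)
    (archive_dir / "source.json").write_text(json.dumps(record, ensure_ascii=False, indent=2))

if __name__ == "__main__":
    # Hypothetical invocation; the real pipeline drives this differently.
    asyncio.run(archive_url("https://x.com/user/status/123", Path("archive/123")))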