fix: use x_client.fetch_from_url for X URLs in archive pipeline
_fetch_url_content was doing a raw HTTP GET on X URLs, which returns JavaScript rather than the article content. X/Twitter URLs are now routed through Ben's API via x_client.fetch_from_url, which returns structured article content (a contents[] array of typed blocks). The article content is included in the archived source file, so the extraction pipeline gets the actual content, not just Rio's response.

Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
parent 76f13de681
commit 2ec4c445b1
1 changed file with 44 additions and 3 deletions
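For context on the change below: the new X path reads two fields from the x_client response, a top-level "text" field (the tweet itself) and a "contents" array of typed blocks. Here is a minimal sketch of that payload shape; the field names are inferred from the code in this diff, and the values are invented for illustration:

    # Hypothetical example of the payload consumed by the new formatter.
    # Shape inferred from the diff below; values are illustrative only.
    example_payload = {
        "text": "Tweet text introducing the article",
        "contents": [
            {"type": "header-one", "text": "Article Title"},
            {"type": "unstyled", "text": "First body paragraph."},
            {"type": "blockquote", "text": "A quoted remark."},
            {"type": "list-item", "text": "First bullet point."},
        ],
    }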
@@ -703,7 +703,49 @@ IMPORTANT: Two special tags you can append at the end of your response (after yo
 
 
 async def _fetch_url_content(url: str) -> str | None:
-    """Fetch article/page content from a URL for pipeline ingestion."""
+    """Fetch article/page content from a URL for pipeline ingestion.
+
+    For X/Twitter URLs, uses Ben's API (x_client.fetch_from_url) which returns
+    structured article content. For other URLs, falls back to raw HTTP fetch.
+    """
+    # X/Twitter URLs → use x_client for structured content
+    if "x.com/" in url or "twitter.com/" in url:
+        try:
+            from x_client import fetch_from_url
+            data = await fetch_from_url(url)
+            if not data:
+                logger.warning("x_client returned no data for %s", url)
+                return None
+            # Format structured content
+            parts = []
+            # Tweet text
+            tweet_text = data.get("text", "")
+            if tweet_text:
+                parts.append(tweet_text)
+            # Article content (contents[] array with typed blocks)
+            contents = data.get("contents", [])
+            if contents:
+                parts.append("\n--- Article Content ---\n")
+                for block in contents:
+                    block_type = block.get("type", "unstyled")
+                    block_text = block.get("text", "")
+                    if not block_text:
+                        continue
+                    if block_type in ("header-one", "header-two", "header-three"):
+                        parts.append(f"\n## {block_text}\n")
+                    elif block_type == "blockquote":
+                        parts.append(f"> {block_text}")
+                    elif block_type == "list-item":
+                        parts.append(f"- {block_text}")
+                    else:
+                        parts.append(block_text)
+            result = "\n".join(parts)
+            return result[:10000] if result else None
+        except Exception as e:
+            logger.warning("x_client fetch failed for %s: %s", url, e)
+            return None
+
+    # Non-X URLs → raw HTTP fetch with HTML stripping
     import aiohttp
     try:
         async with aiohttp.ClientSession() as session:
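Feeding a payload like the sketch above through the new formatting loop would produce roughly this archived text (the extra blank lines come from the "\n" characters embedded in the separator and header parts):

    Tweet text introducing the article

    --- Article Content ---


    ## Article Title

    First body paragraph.
    > A quoted remark.
    - First bullet point.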
@@ -711,12 +753,11 @@ async def _fetch_url_content(url: str) -> str | None:
                 if resp.status >= 400:
                     return None
                 html = await resp.text()
-                # Strip HTML tags for plain text (basic — upgrade to readability later)
                 text = re.sub(r"<script.*?</script>", "", html, flags=re.DOTALL)
                 text = re.sub(r"<style.*?</style>", "", text, flags=re.DOTALL)
                 text = re.sub(r"<[^>]+>", " ", text)
                 text = re.sub(r"\s+", " ", text).strip()
-                return text[:10000]  # Cap at 10K chars
+                return text[:10000]
     except Exception as e:
         logger.warning("Failed to fetch URL %s: %s", url, e)
         return None
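The non-X fallback keeps its regex-based tag stripping. A standalone sketch of what those four re.sub passes do, using the same patterns as above on an invented sample input:

    import re

    # Invented sample; in the real function, `html` comes from the aiohttp response.
    html = "<html><script>let x = 1;</script><p>Hello <b>world</b>!</p></html>"
    text = re.sub(r"<script.*?</script>", "", html, flags=re.DOTALL)  # drop script blocks
    text = re.sub(r"<style.*?</style>", "", text, flags=re.DOTALL)    # drop style blocks
    text = re.sub(r"<[^>]+>", " ", text)                              # replace tags with spaces
    text = re.sub(r"\s+", " ", text).strip()                          # collapse whitespace
    print(text)  # Hello world !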