fix: use x_client.fetch_from_url for X URLs in archive pipeline
_fetch_url_content was doing a raw HTTP GET on X URLs, which returns JavaScript rather than the article content. X/Twitter URLs are now routed through Ben's API via x_client.fetch_from_url, which returns structured article content (a contents[] array of typed blocks). The article content is included in the archived source file, so the extraction pipeline gets the actual content, not just Rio's response.

Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
parent 76f13de681
commit 2ec4c445b1
1 changed file with 44 additions and 3 deletions
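For context on the change below: the new X path reads two fields from the x_client response, a top-level "text" field (the tweet itself) and a "contents" array of typed blocks. Here is a minimal sketch of that payload shape; the field names are inferred from the code in this diff, and the values are invented for illustration:

    # Hypothetical example of the payload consumed by the new formatter.
    # Shape inferred from the diff below; values are illustrative only.
    example_payload = {
        "text": "Tweet text introducing the article",
        "contents": [
            {"type": "header-one", "text": "Article Title"},
            {"type": "unstyled", "text": "First body paragraph."},
            {"type": "blockquote", "text": "A quoted remark."},
            {"type": "list-item", "text": "First bullet point."},
        ],
    }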
@@ -703,7 +703,49 @@ IMPORTANT: Two special tags you can append at the end of your response (after yo
 
 
 async def _fetch_url_content(url: str) -> str | None:
-    """Fetch article/page content from a URL for pipeline ingestion."""
+    """Fetch article/page content from a URL for pipeline ingestion.
+
+    For X/Twitter URLs, uses Ben's API (x_client.fetch_from_url) which returns
+    structured article content. For other URLs, falls back to raw HTTP fetch.
+    """
+    # X/Twitter URLs → use x_client for structured content
+    if "x.com/" in url or "twitter.com/" in url:
+        try:
+            from x_client import fetch_from_url
+            data = await fetch_from_url(url)
+            if not data:
+                logger.warning("x_client returned no data for %s", url)
+                return None
+            # Format structured content
+            parts = []
+            # Tweet text
+            tweet_text = data.get("text", "")
+            if tweet_text:
+                parts.append(tweet_text)
+            # Article content (contents[] array with typed blocks)
+            contents = data.get("contents", [])
+            if contents:
+                parts.append("\n--- Article Content ---\n")
+                for block in contents:
+                    block_type = block.get("type", "unstyled")
+                    block_text = block.get("text", "")
+                    if not block_text:
+                        continue
+                    if block_type in ("header-one", "header-two", "header-three"):
+                        parts.append(f"\n## {block_text}\n")
+                    elif block_type == "blockquote":
+                        parts.append(f"> {block_text}")
+                    elif block_type == "list-item":
+                        parts.append(f"- {block_text}")
+                    else:
+                        parts.append(block_text)
+            result = "\n".join(parts)
+            return result[:10000] if result else None
+        except Exception as e:
+            logger.warning("x_client fetch failed for %s: %s", url, e)
+            return None
+
+    # Non-X URLs → raw HTTP fetch with HTML stripping
     import aiohttp
     try:
         async with aiohttp.ClientSession() as session:
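Feeding a payload like the sketch above through the new formatting loop would produce roughly this archived text (the extra blank lines come from the "\n" characters embedded in the separator and header parts):

    Tweet text introducing the article

    --- Article Content ---


    ## Article Title

    First body paragraph.
    > A quoted remark.
    - First bullet point.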
@@ -711,12 +753,11 @@ async def _fetch_url_content(url: str) -> str | None:
                 if resp.status >= 400:
                     return None
                 html = await resp.text()
-                # Strip HTML tags for plain text (basic — upgrade to readability later)
                 text = re.sub(r"<script.*?</script>", "", html, flags=re.DOTALL)
                 text = re.sub(r"<style.*?</style>", "", text, flags=re.DOTALL)
                 text = re.sub(r"<[^>]+>", " ", text)
                 text = re.sub(r"\s+", " ", text).strip()
-                return text[:10000]  # Cap at 10K chars
+                return text[:10000]
     except Exception as e:
         logger.warning("Failed to fetch URL %s: %s", url, e)
         return None
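The non-X fallback keeps its regex-based tag stripping. A standalone sketch of what those four re.sub passes do, using the same patterns as above on an invented sample input:

    import re

    # Invented sample; in the real function, `html` comes from the aiohttp response.
    html = "<html><script>let x = 1;</script><p>Hello <b>world</b>!</p></html>"
    text = re.sub(r"<script.*?</script>", "", html, flags=re.DOTALL)  # drop script blocks
    text = re.sub(r"<style.*?</style>", "", text, flags=re.DOTALL)    # drop style blocks
    text = re.sub(r"<[^>]+>", " ", text)                              # replace tags with spaces
    text = re.sub(r"\s+", " ", text).strip()                          # collapse whitespace
    print(text)  # Hello world !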