From 2ec4c445b17966a866c9c009b72bd32ddd9dce76 Mon Sep 17 00:00:00 2001
From: m3taversal
Date: Tue, 24 Mar 2026 14:12:31 +0000
Subject: [PATCH] fix: use x_client.fetch_from_url for X URLs in archive
 pipeline

_fetch_url_content was doing a raw HTTP GET on X URLs, which returns
JavaScript, not article content. Now routes X/Twitter URLs through
Ben's API via x_client.fetch_from_url, which returns structured article
content (a contents[] array of typed blocks). Article content gets
included in the archived source file so the extraction pipeline has the
actual content, not just Rio's response.

Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
---
 telegram/bot.py | 47 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 44 insertions(+), 3 deletions(-)

diff --git a/telegram/bot.py b/telegram/bot.py
index f9040ee..d65a7b6 100644
--- a/telegram/bot.py
+++ b/telegram/bot.py
@@ -703,7 +703,49 @@ IMPORTANT: Two special tags you can append at the end of your response (after yo
 
 
 async def _fetch_url_content(url: str) -> str | None:
-    """Fetch article/page content from a URL for pipeline ingestion."""
+    """Fetch article/page content from a URL for pipeline ingestion.
+
+    For X/Twitter URLs, uses Ben's API (x_client.fetch_from_url) which returns
+    structured article content. For other URLs, falls back to raw HTTP fetch.
+    """
+    # X/Twitter URLs → use x_client for structured content
+    if "x.com/" in url or "twitter.com/" in url:
+        try:
+            from x_client import fetch_from_url
+            data = await fetch_from_url(url)
+            if not data:
+                logger.warning("x_client returned no data for %s", url)
+                return None
+            # Format structured content
+            parts = []
+            # Tweet text
+            tweet_text = data.get("text", "")
+            if tweet_text:
+                parts.append(tweet_text)
+            # Article content (contents[] array with typed blocks)
+            contents = data.get("contents", [])
+            if contents:
+                parts.append("\n--- Article Content ---\n")
+                for block in contents:
+                    block_type = block.get("type", "unstyled")
+                    block_text = block.get("text", "")
+                    if not block_text:
+                        continue
+                    if block_type in ("header-one", "header-two", "header-three"):
+                        parts.append(f"\n## {block_text}\n")
+                    elif block_type == "blockquote":
+                        parts.append(f"> {block_text}")
+                    elif block_type == "list-item":
+                        parts.append(f"- {block_text}")
+                    else:
+                        parts.append(block_text)
+            result = "\n".join(parts)
+            return result[:10000] if result else None
+        except Exception as e:
+            logger.warning("x_client fetch failed for %s: %s", url, e)
+            return None
+
+    # Non-X URLs → raw HTTP fetch with HTML stripping
     import aiohttp
     try:
         async with aiohttp.ClientSession() as session:
@@ -711,12 +753,11 @@ async def _fetch_url_content(url: str) -> str | None:
                 if resp.status >= 400:
                     return None
                 html = await resp.text()
-                # Strip HTML tags for plain text (basic — upgrade to readability later)
                 text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL)
                 text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL)
                 text = re.sub(r"<[^>]+>", " ", text)
                 text = re.sub(r"\s+", " ", text).strip()
-                return text[:10000]  # Cap at 10K chars
+                return text[:10000]
     except Exception as e:
         logger.warning("Failed to fetch URL %s: %s", url, e)
         return None
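
Review note: the new X branch assumes fetch_from_url returns a dict with a
top-level "text" field plus a "contents" list of blocks carrying "type" and
"text" keys; those field names and block types are what the patched code
reads. The sketch below is self-contained and reproduces the same formatting
walk on an invented sample payload, so reviewers can see what the archived
source text would look like. File and function names here are hypothetical.

# x_payload_example.py: hypothetical sample payload; field names mirror what
# the patched _fetch_url_content reads, values are made up for illustration.
sample = {
    "text": "Thread: full write-up attached as an article.",
    "contents": [
        {"type": "header-one", "text": "Example Title"},
        {"type": "unstyled", "text": "First paragraph of the article body."},
        {"type": "blockquote", "text": "A quoted passage."},
        {"type": "list-item", "text": "A supporting point."},
    ],
}

def format_like_patch(data: dict) -> str:
    """Re-create the formatting walk from the patched _fetch_url_content."""
    parts = []
    if data.get("text"):
        parts.append(data["text"])
    contents = data.get("contents", [])
    if contents:
        parts.append("\n--- Article Content ---\n")
        for block in contents:
            block_type = block.get("type", "unstyled")
            block_text = block.get("text", "")
            if not block_text:
                continue
            if block_type in ("header-one", "header-two", "header-three"):
                parts.append(f"\n## {block_text}\n")
            elif block_type == "blockquote":
                parts.append(f"> {block_text}")
            elif block_type == "list-item":
                parts.append(f"- {block_text}")
            else:
                parts.append(block_text)
    return "\n".join(parts)

if __name__ == "__main__":
    # Same 10K cap as the patched function.
    print(format_like_patch(sample)[:10000])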
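
The non-X fallback keeps the plain strip-tags approach. A quick standalone
check of that substitution sequence, in the same order as the fallback path
(drop script and style blocks, replace remaining tags with spaces, collapse
whitespace); the input HTML below is made up.

# html_strip_check.py: sanity check for the non-X fallback stripping sequence.
import re

html = """
<html><head><style>body { color: red; }</style>
<script>console.log("tracking");</script></head>
<body><h1>Sample page</h1><p>Readable   text survives.</p></body></html>
"""

text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL)
text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL)
text = re.sub(r"<[^>]+>", " ", text)
text = re.sub(r"\s+", " ", text).strip()

print(text)  # -> "Sample page Readable text survives."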