From 2ec4c445b17966a866c9c009b72bd32ddd9dce76 Mon Sep 17 00:00:00 2001
From: m3taversal
Date: Tue, 24 Mar 2026 14:12:31 +0000
Subject: [PATCH] fix: use x_client.fetch_from_url for X URLs in archive
 pipeline

_fetch_url_content was doing a raw HTTP GET on X URLs, which returns
JavaScript, not article content. Now routes X/Twitter URLs through
Ben's API via x_client.fetch_from_url, which returns structured article
content (a contents[] array of typed blocks). Article content gets
included in the archived source file so the extraction pipeline has the
actual content, not just Rio's response.

Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
---
 telegram/bot.py | 47 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 44 insertions(+), 3 deletions(-)

diff --git a/telegram/bot.py b/telegram/bot.py
index f9040ee..d65a7b6 100644
--- a/telegram/bot.py
+++ b/telegram/bot.py
@@ -703,7 +703,49 @@ IMPORTANT: Two special tags you can append at the end of your response (after yo
 
 
 async def _fetch_url_content(url: str) -> str | None:
-    """Fetch article/page content from a URL for pipeline ingestion."""
+    """Fetch article/page content from a URL for pipeline ingestion.
+
+    For X/Twitter URLs, uses Ben's API (x_client.fetch_from_url) which returns
+    structured article content. For other URLs, falls back to raw HTTP fetch.
+    """
+    # X/Twitter URLs → use x_client for structured content
+    if "x.com/" in url or "twitter.com/" in url:
+        try:
+            from x_client import fetch_from_url
+            data = await fetch_from_url(url)
+            if not data:
+                logger.warning("x_client returned no data for %s", url)
+                return None
+            # Format structured content
+            parts = []
+            # Tweet text
+            tweet_text = data.get("text", "")
+            if tweet_text:
+                parts.append(tweet_text)
+            # Article content (contents[] array with typed blocks)
+            contents = data.get("contents", [])
+            if contents:
+                parts.append("\n--- Article Content ---\n")
+                for block in contents:
+                    block_type = block.get("type", "unstyled")
+                    block_text = block.get("text", "")
+                    if not block_text:
+                        continue
+                    if block_type in ("header-one", "header-two", "header-three"):
+                        parts.append(f"\n## {block_text}\n")
+                    elif block_type == "blockquote":
+                        parts.append(f"> {block_text}")
+                    elif block_type == "list-item":
+                        parts.append(f"- {block_text}")
+                    else:
+                        parts.append(block_text)
+            result = "\n".join(parts)
+            return result[:10000] if result else None
+        except Exception as e:
+            logger.warning("x_client fetch failed for %s: %s", url, e)
+            return None
+
+    # Non-X URLs → raw HTTP fetch with HTML stripping
     import aiohttp
     try:
         async with aiohttp.ClientSession() as session:
@@ -711,12 +753,11 @@ async def _fetch_url_content(url: str) -> str | None:
                 if resp.status >= 400:
                     return None
                 html = await resp.text()
-                # Strip HTML tags for plain text (basic — upgrade to readability later)
                 text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL)
                 text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL)
                 text = re.sub(r"<[^>]+>", " ", text)
                 text = re.sub(r"\s+", " ", text).strip()
-                return text[:10000]  # Cap at 10K chars
+                return text[:10000]
     except Exception as e:
         logger.warning("Failed to fetch URL %s: %s", url, e)
         return None
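
Review note: the new X branch assumes fetch_from_url returns a dict with a
top-level "text" field plus a "contents" list of blocks carrying "type" and
"text" keys; those field names and block types are what the patched code
reads. The sketch below is self-contained and reproduces the same formatting
walk on an invented sample payload, so reviewers can see what the archived
source text would look like. File and function names here are hypothetical.

# x_payload_example.py: hypothetical sample payload; field names mirror what
# the patched _fetch_url_content reads, values are made up for illustration.
sample = {
    "text": "Thread: full write-up attached as an article.",
    "contents": [
        {"type": "header-one", "text": "Example Title"},
        {"type": "unstyled", "text": "First paragraph of the article body."},
        {"type": "blockquote", "text": "A quoted passage."},
        {"type": "list-item", "text": "A supporting point."},
    ],
}

def format_like_patch(data: dict) -> str:
    """Re-create the formatting walk from the patched _fetch_url_content."""
    parts = []
    if data.get("text"):
        parts.append(data["text"])
    contents = data.get("contents", [])
    if contents:
        parts.append("\n--- Article Content ---\n")
        for block in contents:
            block_type = block.get("type", "unstyled")
            block_text = block.get("text", "")
            if not block_text:
                continue
            if block_type in ("header-one", "header-two", "header-three"):
                parts.append(f"\n## {block_text}\n")
            elif block_type == "blockquote":
                parts.append(f"> {block_text}")
            elif block_type == "list-item":
                parts.append(f"- {block_text}")
            else:
                parts.append(block_text)
    return "\n".join(parts)

if __name__ == "__main__":
    # Same 10K cap as the patched function.
    print(format_like_patch(sample)[:10000])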
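
The non-X fallback keeps the plain strip-tags approach. A quick standalone
check of that substitution sequence, in the same order as the fallback path
(drop script and style blocks, replace remaining tags with spaces, collapse
whitespace); the input HTML below is made up.

# html_strip_check.py: sanity check for the non-X fallback stripping sequence.
import re

html = """
<html><head><style>body { color: red; }</style>
<script>console.log("tracking");</script></head>
<body><h1>Sample page</h1><p>Readable   text survives.</p></body></html>
"""

text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL)
text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL)
text = re.sub(r"<[^>]+>", " ", text)
text = re.sub(r"\s+", " ", text).strip()

print(text)  # -> "Sample page Readable text survives."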