feat: full content fetch for research + standalone source for shared URLs

Two fixes for article ingestion:

1. Research path: top 5 search results now get full content via
   fetch_from_url before archiving. Articles get full text, not just
   search snippets. Threads get complete text.

2. URL sharing: when a user shares a URL, the bot creates a standalone source
   file (type: source, format: article) separate from the conversation
   archive. Enters extraction pipeline as proper source material,
   attributed to the TG user who shared it.

Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
This commit is contained in:
m3taversal 2026-03-24 15:57:58 +00:00
parent 8ff4784fcb
commit b3c635290f

View file

@ -330,6 +330,40 @@ async def handle_research(msg, query: str, user, silent: bool = False):
await msg.reply_text(f"No recent tweets found for '{query}'.")
return
# Fetch full content for top tweets (not just search snippets)
# NOTE(review): diff fragment of handle_research — indentation was stripped by
# the diff rendering, so nesting below is implied by the control keywords only.
from x_client import fetch_from_url
# Only the 5 highest-engagement results get the (slower) full-content fetch.
for tweet in tweets[:5]: # Top 5 by engagement
url = tweet.get("url", "")
if url:
try:
full_data = await fetch_from_url(url)
if full_data:
# Replace snippet with full text
full_text = full_data.get("text", "")
# Upgrade only when the fetched text is strictly longer than the snippet,
# so a truncated/empty fetch never replaces a good snippet.
if full_text and len(full_text) > len(tweet.get("text", "")):
tweet["text"] = full_text
# Include article content if available
contents = full_data.get("contents", [])
if contents:
article_parts = []
# Convert content blocks to markdown by block type.
# (assumes each block dict carries "text"/"type" keys — TODO confirm against x_client)
for block in contents:
block_text = block.get("text", "")
if not block_text:
continue
block_type = block.get("type", "unstyled")
if block_type in ("header-one", "header-two", "header-three"):
article_parts.append(f"\n## {block_text}\n")
elif block_type == "blockquote":
article_parts.append(f"> {block_text}")
elif block_type == "list-item":
article_parts.append(f"- {block_text}")
else:
article_parts.append(block_text)
if article_parts:
tweet["text"] += "\n\n--- Article Content ---\n" + "\n".join(article_parts)
# Best-effort enrichment: a failed fetch is logged and the original
# search snippet is left in place for this tweet.
except Exception as e:
logger.warning("Failed to fetch full content for %s: %s", url, e)
# Archive all tweets as ONE source file per research query
# (not per-tweet — one extraction PR produces claims from the best material)
try:
@ -719,11 +753,69 @@ IMPORTANT: Two special tags you can append at the end of your response (after yo
# Fetch page/article text for the first URL the user shared.
url_content = await _fetch_url_content(urls[0])
if url_content:
logger.info("Fetched %d chars from %s", len(url_content), urls[0])
# Create standalone source file for the article (separate from conversation)
# This enters the extraction pipeline as a proper source, attributed to contributor
# NOTE(review): only urls[0] gets a standalone source file; any additional
# URLs in the same message reach the pipeline via _archive_exchange only.
_archive_standalone_source(urls[0], url_content, user)
# Archive the exchange as a source for pipeline (slow path)
# NOTE(review): indentation was stripped by the diff view — unclear whether this
# call sits inside the `if url_content:` branch; confirm against the full file.
_archive_exchange(text, response, user, msg, url_content=url_content, urls=urls)
def _archive_standalone_source(url: str, content: str, user) -> None:
    """Create a standalone source file for a URL shared in Telegram.

    Separate from the conversation archive — this is the actual article/tweet
    entering the extraction pipeline as a proper source, attributed to the
    contributor who shared it.

    Args:
        url: The shared URL; x.com / twitter.com links get author extraction.
        content: Already-fetched article/page text embedded under "## Content".
        user: Telegram user object, or None. Note that ``user.username`` may
            itself be None for accounts without a public handle.

    Best-effort: any failure is logged as a warning and never raised, so
    archival problems cannot break the chat flow.
    """
    try:
        # `user` may be None, and TG accounts without a handle have
        # username=None; both fall back to "anonymous". (Previously a
        # None handle produced the literal "@None" in frontmatter/filename.)
        username = user.username if user and user.username else "anonymous"
        date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")

        # Extract the author handle from an x.com/twitter.com URL; other
        # domains keep the "unknown" placeholder.
        author = "unknown"
        author_match = re.search(r"x\.com/(\w+)/", url) or re.search(r"twitter\.com/(\w+)/", url)
        if author_match:
            author = f"@{author_match.group(1)}"

        # Slug = author + last URL path segment (capped at 30 chars),
        # lowercased, with non-alphanumeric runs collapsed to single hyphens.
        slug = re.sub(r"[^a-z0-9]+", "-", f"{author}-{url.split('/')[-1][:30]}".lower()).strip("-")
        filename = f"{date_str}-tg-shared-{slug}.md"
        source_path = Path(ARCHIVE_DIR) / filename

        # Don't overwrite if already archived (same URL shared twice on one day).
        if source_path.exists():
            return

        source_content = f"""---
type: source
source_type: x-article
title: "{author} — shared via Telegram by @{username}"
author: "{author}"
url: "{url}"
date: {date_str}
domain: internet-finance
format: article
status: unprocessed
proposed_by: "@{username}"
contribution_type: source-submission
tags: [telegram-shared, x-article]
---
# {author} — Article/Thread
Shared by @{username} via Telegram.
Source URL: {url}
## Content
{content}
"""
        # Explicit UTF-8: article text may contain non-ASCII and the platform
        # default encoding is not guaranteed (avoids UnicodeEncodeError on
        # non-UTF-8 locales).
        source_path.write_text(source_content, encoding="utf-8")
        logger.info("Standalone source archived: %s (shared by @%s)", filename, username)
    except Exception as e:
        # Best-effort archival: log and continue rather than surface to the user.
        logger.warning("Failed to archive standalone source %s: %s", url, e)
async def _fetch_url_content(url: str) -> str | None:
"""Fetch article/page content from a URL for pipeline ingestion.