feat: atomic extract-and-connect + stale PR monitor + response audit #4
1 changed file with 92 additions and 0 deletions
@@ -330,6 +330,40 @@ async def handle_research(msg, query: str, user, silent: bool = False):
         await msg.reply_text(f"No recent tweets found for '{query}'.")
         return
 
+    # Fetch full content for top tweets (not just search snippets)
+    from x_client import fetch_from_url
+    for tweet in tweets[:5]:  # Top 5 by engagement
+        url = tweet.get("url", "")
+        if url:
+            try:
+                full_data = await fetch_from_url(url)
+                if full_data:
+                    # Replace snippet with full text
+                    full_text = full_data.get("text", "")
+                    if full_text and len(full_text) > len(tweet.get("text", "")):
+                        tweet["text"] = full_text
+                    # Include article content if available
+                    contents = full_data.get("contents", [])
+                    if contents:
+                        article_parts = []
+                        for block in contents:
+                            block_text = block.get("text", "")
+                            if not block_text:
+                                continue
+                            block_type = block.get("type", "unstyled")
+                            if block_type in ("header-one", "header-two", "header-three"):
+                                article_parts.append(f"\n## {block_text}\n")
+                            elif block_type == "blockquote":
+                                article_parts.append(f"> {block_text}")
+                            elif block_type == "list-item":
+                                article_parts.append(f"- {block_text}")
+                            else:
+                                article_parts.append(block_text)
+                        if article_parts:
+                            tweet["text"] += "\n\n--- Article Content ---\n" + "\n".join(article_parts)
+            except Exception as e:
+                logger.warning("Failed to fetch full content for %s: %s", url, e)
     # Archive all tweets as ONE source file per research query
     # (not per-tweet — one extraction PR produces claims from the best material)
     try:
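Note on the payload shape: the rendering loop above assumes `fetch_from_url` returns a dict with a `text` field plus Draft.js-style `contents` blocks. The exact return shape of `x_client.fetch_from_url` is not shown in this diff, so the sketch below is an assumption inferred from the block types the loop handles:

```python
# Hypothetical payload, inferred from the loop above; not the documented
# x_client.fetch_from_url contract.
full_data = {
    "text": "Full tweet text, longer than the search snippet.",
    "contents": [
        {"type": "header-one", "text": "Why settlement matters"},
        {"type": "unstyled", "text": "Opening paragraph of the article."},
        {"type": "blockquote", "text": "A quoted line."},
        {"type": "list-item", "text": "First takeaway."},
    ],
}
# With this input, the loop appends to tweet["text"]:
#
# --- Article Content ---
#
# ## Why settlement matters
#
# Opening paragraph of the article.
# > A quoted line.
# - First takeaway.
```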
@@ -719,11 +753,69 @@ IMPORTANT: Two special tags you can append at the end of your response (after yo
         url_content = await _fetch_url_content(urls[0])
         if url_content:
             logger.info("Fetched %d chars from %s", len(url_content), urls[0])
+            # Create standalone source file for the article (separate from conversation)
+            # This enters the extraction pipeline as a proper source, attributed to contributor
+            _archive_standalone_source(urls[0], url_content, user)
 
     # Archive the exchange as a source for pipeline (slow path)
     _archive_exchange(text, response, user, msg, url_content=url_content, urls=urls)
+
+
+def _archive_standalone_source(url: str, content: str, user):
+    """Create a standalone source file for a URL shared in Telegram.
+
+    Separate from the conversation archive — this is the actual article/tweet
+    entering the extraction pipeline as a proper source, attributed to the
+    contributor who shared it.
+    """
+    try:
+        username = user.username if user else "anonymous"
+        date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
+
+        # Extract the author handle from the URL
+        author = "unknown"
+        author_match = re.search(r"x\.com/(\w+)/", url) or re.search(r"twitter\.com/(\w+)/", url)
+        if author_match:
+            author = f"@{author_match.group(1)}"
+
+        slug = re.sub(r"[^a-z0-9]+", "-", f"{author}-{url.split('/')[-1][:30]}".lower()).strip("-")
+        filename = f"{date_str}-tg-shared-{slug}.md"
+        source_path = Path(ARCHIVE_DIR) / filename
+
+        # Don't overwrite if already archived
+        if source_path.exists():
+            return
+
+        source_content = f"""---
+type: source
+source_type: x-article
+title: "{author} — shared via Telegram by @{username}"
+author: "{author}"
+url: "{url}"
+date: {date_str}
+domain: internet-finance
+format: article
+status: unprocessed
+proposed_by: "@{username}"
+contribution_type: source-submission
+tags: [telegram-shared, x-article]
+---
+
+# {author} — Article/Thread
+
+Shared by @{username} via Telegram.
+Source URL: {url}
+
+## Content
+
+{content}
+"""
+        source_path.write_text(source_content)
+        logger.info("Standalone source archived: %s (shared by @%s)", filename, username)
+    except Exception as e:
+        logger.warning("Failed to archive standalone source %s: %s", url, e)
 
 
 async def _fetch_url_content(url: str) -> str | None:
     """Fetch article/page content from a URL for pipeline ingestion.
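To exercise the new path end to end without Telegram, a quick smoke test along these lines should work; the stub user object and the import location are assumptions, since the module name is not shown in this diff:

```python
import asyncio
from types import SimpleNamespace

# Hypothetical smoke test; replace `bot` with the actual module name.
from bot import _fetch_url_content, _archive_standalone_source

async def main():
    url = "https://x.com/jane_doe/status/1234567890123456789"  # invented
    user = SimpleNamespace(username="reviewer")  # stand-in for a Telegram user
    content = await _fetch_url_content(url)
    if content:
        _archive_standalone_source(url, content, user)

asyncio.run(main())
```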