diff --git a/telegram/bot.py b/telegram/bot.py
index 6649fda..25606c4 100644
--- a/telegram/bot.py
+++ b/telegram/bot.py
@@ -330,6 +330,40 @@ async def handle_research(msg, query: str, user, silent: bool = False):
         await msg.reply_text(f"No recent tweets found for '{query}'.")
         return
 
+    # Fetch full content for top tweets (not just search snippets)
+    from x_client import fetch_from_url
+    for tweet in tweets[:5]:  # Top 5 by engagement
+        url = tweet.get("url", "")
+        if url:
+            try:
+                full_data = await fetch_from_url(url)
+                if full_data:
+                    # Replace snippet with full text
+                    full_text = full_data.get("text", "")
+                    if full_text and len(full_text) > len(tweet.get("text", "")):
+                        tweet["text"] = full_text
+                    # Include article content if available
+                    contents = full_data.get("contents", [])
+                    if contents:
+                        article_parts = []
+                        for block in contents:
+                            block_text = block.get("text", "")
+                            if not block_text:
+                                continue
+                            block_type = block.get("type", "unstyled")
+                            if block_type in ("header-one", "header-two", "header-three"):
+                                article_parts.append(f"\n## {block_text}\n")
+                            elif block_type == "blockquote":
+                                article_parts.append(f"> {block_text}")
+                            elif block_type == "list-item":
+                                article_parts.append(f"- {block_text}")
+                            else:
+                                article_parts.append(block_text)
+                        if article_parts:
+                            tweet["text"] += "\n\n--- Article Content ---\n" + "\n".join(article_parts)
+            except Exception as e:
+                logger.warning("Failed to fetch full content for %s: %s", url, e)
+
     # Archive all tweets as ONE source file per research query
     # (not per-tweet — one extraction PR produces claims from the best material)
     try:
@@ -719,11 +753,69 @@ IMPORTANT: Two special tags you can append at the end of your response (after yo
         url_content = await _fetch_url_content(urls[0])
         if url_content:
             logger.info("Fetched %d chars from %s", len(url_content), urls[0])
+            # Create standalone source file for the article (separate from conversation)
+            # This enters the extraction pipeline as a proper source, attributed to contributor
+            _archive_standalone_source(urls[0], url_content, user)
 
     # Archive the exchange as a source for pipeline (slow path)
     _archive_exchange(text, response, user, msg, url_content=url_content, urls=urls)
 
 
+def _archive_standalone_source(url: str, content: str, user):
+    """Create a standalone source file for a URL shared in Telegram.
+
+    Separate from the conversation archive — this is the actual article/tweet
+    entering the extraction pipeline as a proper source, attributed to the
+    contributor who shared it.
+    """
+    try:
+        username = user.username if user else "anonymous"
+        date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
+
+        # Extract author from URL or content
+        author = "unknown"
+        author_match = re.search(r"x\.com/(\w+)/", url) or re.search(r"twitter\.com/(\w+)/", url)
+        if author_match:
+            author = f"@{author_match.group(1)}"
+
+        slug = re.sub(r"[^a-z0-9]+", "-", f"{author}-{url.split('/')[-1][:30]}".lower()).strip("-")
+        filename = f"{date_str}-tg-shared-{slug}.md"
+        source_path = Path(ARCHIVE_DIR) / filename
+
+        # Don't overwrite if already archived
+        if source_path.exists():
+            return
+
+        source_content = f"""---
+type: source
+source_type: x-article
+title: "{author} — shared via Telegram by @{username}"
+author: "{author}"
+url: "{url}"
+date: {date_str}
+domain: internet-finance
+format: article
+status: unprocessed
+proposed_by: "@{username}"
+contribution_type: source-submission
+tags: [telegram-shared, x-article]
+---
+
+# {author} — Article/Thread
+
+Shared by @{username} via Telegram.
+Source URL: {url}
+
+## Content
+
+{content}
+"""
+        source_path.write_text(source_content)
+        logger.info("Standalone source archived: %s (shared by @%s)", filename, username)
+    except Exception as e:
+        logger.warning("Failed to archive standalone source %s: %s", url, e)
+
+
 async def _fetch_url_content(url: str) -> str | None:
     """Fetch article/page content from a URL for pipeline ingestion.