feat: full content fetch for research + standalone source for shared URLs

Two fixes for article ingestion:

1. Research path: top 5 search results now get full content via
   fetch_from_url before archiving. Articles get full text, not just
   search snippets. Threads get complete text.

2. URL sharing: when a user shares a URL, the bot creates a standalone source
   file (type: source, format: article) separate from the conversation
   archive. Enters extraction pipeline as proper source material,
   attributed to the TG user who shared it.

Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
This commit is contained in:
m3taversal 2026-03-24 15:57:58 +00:00
parent 8ff4784fcb
commit b3c635290f

View file

@ -330,6 +330,40 @@ async def handle_research(msg, query: str, user, silent: bool = False):
await msg.reply_text(f"No recent tweets found for '{query}'.")
return
# Fetch full content for top tweets (not just search snippets)
# NOTE(review): diff fragment of handle_research — indentation was stripped by
# the diff rendering, so nesting below is implied by the control keywords only.
from x_client import fetch_from_url
# Only the 5 highest-engagement results get the (slower) full-content fetch.
for tweet in tweets[:5]: # Top 5 by engagement
url = tweet.get("url", "")
if url:
try:
full_data = await fetch_from_url(url)
if full_data:
# Replace snippet with full text
full_text = full_data.get("text", "")
# Upgrade only when the fetched text is strictly longer than the snippet,
# so a truncated/empty fetch never replaces a good snippet.
if full_text and len(full_text) > len(tweet.get("text", "")):
tweet["text"] = full_text
# Include article content if available
contents = full_data.get("contents", [])
if contents:
article_parts = []
# Convert content blocks to markdown by block type.
# (assumes each block dict carries "text"/"type" keys — TODO confirm against x_client)
for block in contents:
block_text = block.get("text", "")
if not block_text:
continue
block_type = block.get("type", "unstyled")
if block_type in ("header-one", "header-two", "header-three"):
article_parts.append(f"\n## {block_text}\n")
elif block_type == "blockquote":
article_parts.append(f"> {block_text}")
elif block_type == "list-item":
article_parts.append(f"- {block_text}")
else:
article_parts.append(block_text)
if article_parts:
tweet["text"] += "\n\n--- Article Content ---\n" + "\n".join(article_parts)
# Best-effort enrichment: a failed fetch is logged and the original
# search snippet is left in place for this tweet.
except Exception as e:
logger.warning("Failed to fetch full content for %s: %s", url, e)
# Archive all tweets as ONE source file per research query
# (not per-tweet — one extraction PR produces claims from the best material)
try:
@ -719,11 +753,69 @@ IMPORTANT: Two special tags you can append at the end of your response (after yo
# Fetch page/article text for the first URL the user shared.
url_content = await _fetch_url_content(urls[0])
if url_content:
logger.info("Fetched %d chars from %s", len(url_content), urls[0])
# Create standalone source file for the article (separate from conversation)
# This enters the extraction pipeline as a proper source, attributed to contributor
# NOTE(review): only urls[0] gets a standalone source file; any additional
# URLs in the same message reach the pipeline via _archive_exchange only.
_archive_standalone_source(urls[0], url_content, user)
# Archive the exchange as a source for pipeline (slow path)
# NOTE(review): indentation was stripped by the diff view — unclear whether this
# call sits inside the `if url_content:` branch; confirm against the full file.
_archive_exchange(text, response, user, msg, url_content=url_content, urls=urls)
def _archive_standalone_source(url: str, content: str, user) -> None:
    """Create a standalone source file for a URL shared in Telegram.

    Separate from the conversation archive — this is the actual article/tweet
    entering the extraction pipeline as a proper source, attributed to the
    contributor who shared it.

    Args:
        url: The shared URL; x.com / twitter.com links get author extraction.
        content: Already-fetched article/page text embedded under "## Content".
        user: Telegram user object, or None. Note that ``user.username`` may
            itself be None for accounts without a public handle.

    Best-effort: any failure is logged as a warning and never raised, so
    archival problems cannot break the chat flow.
    """
    try:
        # `user` may be None, and TG accounts without a handle have
        # username=None; both fall back to "anonymous". (Previously a
        # None handle produced the literal "@None" in frontmatter/filename.)
        username = user.username if user and user.username else "anonymous"
        date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")

        # Extract the author handle from an x.com/twitter.com URL; other
        # domains keep the "unknown" placeholder.
        author = "unknown"
        author_match = re.search(r"x\.com/(\w+)/", url) or re.search(r"twitter\.com/(\w+)/", url)
        if author_match:
            author = f"@{author_match.group(1)}"

        # Slug = author + last URL path segment (capped at 30 chars),
        # lowercased, with non-alphanumeric runs collapsed to single hyphens.
        slug = re.sub(r"[^a-z0-9]+", "-", f"{author}-{url.split('/')[-1][:30]}".lower()).strip("-")
        filename = f"{date_str}-tg-shared-{slug}.md"
        source_path = Path(ARCHIVE_DIR) / filename

        # Don't overwrite if already archived (same URL shared twice on one day).
        if source_path.exists():
            return

        source_content = f"""---
type: source
source_type: x-article
title: "{author} — shared via Telegram by @{username}"
author: "{author}"
url: "{url}"
date: {date_str}
domain: internet-finance
format: article
status: unprocessed
proposed_by: "@{username}"
contribution_type: source-submission
tags: [telegram-shared, x-article]
---
# {author} — Article/Thread
Shared by @{username} via Telegram.
Source URL: {url}
## Content
{content}
"""
        # Explicit UTF-8: article text may contain non-ASCII and the platform
        # default encoding is not guaranteed (avoids UnicodeEncodeError on
        # non-UTF-8 locales).
        source_path.write_text(source_content, encoding="utf-8")
        logger.info("Standalone source archived: %s (shared by @%s)", filename, username)
    except Exception as e:
        # Best-effort archival: log and continue rather than surface to the user.
        logger.warning("Failed to archive standalone source %s: %s", url, e)
async def _fetch_url_content(url: str) -> str | None:
"""Fetch article/page content from a URL for pipeline ingestion.