feat: full content fetch for research + standalone source for shared URLs
Two fixes for article ingestion: 1. Research path: the top 5 search results now get full content via fetch_from_url before archiving, so articles get full text (not just search snippets) and threads get their complete text. 2. URL sharing: when a user shares a URL, the bot creates a standalone source file (type: source, format: article) separate from the conversation archive; it enters the extraction pipeline as proper source material, attributed to the TG user who shared it. Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
This commit is contained in:
parent
8ff4784fcb
commit
b3c635290f
1 changed file with 92 additions and 0 deletions
|
|
@ -330,6 +330,40 @@ async def handle_research(msg, query: str, user, silent: bool = False):
|
|||
await msg.reply_text(f"No recent tweets found for '{query}'.")
|
||||
return
|
||||
|
||||
# Fetch full content for top tweets (not just search snippets)
|
||||
from x_client import fetch_from_url
|
||||
for tweet in tweets[:5]: # Top 5 by engagement
|
||||
url = tweet.get("url", "")
|
||||
if url:
|
||||
try:
|
||||
full_data = await fetch_from_url(url)
|
||||
if full_data:
|
||||
# Replace snippet with full text
|
||||
full_text = full_data.get("text", "")
|
||||
if full_text and len(full_text) > len(tweet.get("text", "")):
|
||||
tweet["text"] = full_text
|
||||
# Include article content if available
|
||||
contents = full_data.get("contents", [])
|
||||
if contents:
|
||||
article_parts = []
|
||||
for block in contents:
|
||||
block_text = block.get("text", "")
|
||||
if not block_text:
|
||||
continue
|
||||
block_type = block.get("type", "unstyled")
|
||||
if block_type in ("header-one", "header-two", "header-three"):
|
||||
article_parts.append(f"\n## {block_text}\n")
|
||||
elif block_type == "blockquote":
|
||||
article_parts.append(f"> {block_text}")
|
||||
elif block_type == "list-item":
|
||||
article_parts.append(f"- {block_text}")
|
||||
else:
|
||||
article_parts.append(block_text)
|
||||
if article_parts:
|
||||
tweet["text"] += "\n\n--- Article Content ---\n" + "\n".join(article_parts)
|
||||
except Exception as e:
|
||||
logger.warning("Failed to fetch full content for %s: %s", url, e)
|
||||
|
||||
# Archive all tweets as ONE source file per research query
|
||||
# (not per-tweet — one extraction PR produces claims from the best material)
|
||||
try:
|
||||
|
|
@ -719,11 +753,69 @@ IMPORTANT: Two special tags you can append at the end of your response (after yo
|
|||
url_content = await _fetch_url_content(urls[0])
|
||||
if url_content:
|
||||
logger.info("Fetched %d chars from %s", len(url_content), urls[0])
|
||||
# Create standalone source file for the article (separate from conversation)
|
||||
# This enters the extraction pipeline as a proper source, attributed to contributor
|
||||
_archive_standalone_source(urls[0], url_content, user)
|
||||
|
||||
# Archive the exchange as a source for pipeline (slow path)
|
||||
_archive_exchange(text, response, user, msg, url_content=url_content, urls=urls)
|
||||
|
||||
|
||||
def _archive_standalone_source(url: str, content: str, user) -> None:
    """Create a standalone source file for a URL shared in Telegram.

    Separate from the conversation archive — this is the actual article/tweet
    entering the extraction pipeline as a proper source, attributed to the
    contributor who shared it.

    Args:
        url: The shared URL; x.com / twitter.com links get author extraction.
        content: Fetched article/page text embedded verbatim in the source file.
        user: Telegram user object, or None. NOTE(review): assumed to expose
            an optional ``username`` attribute — confirm against the caller.

    Best-effort: any failure is logged as a warning and never raised, so a
    broken archive write cannot break the reply path.
    """
    try:
        # Telegram usernames are optional: guard both a missing user object
        # and a None username, otherwise the file would say "@None".
        username = (user.username if user else None) or "anonymous"
        date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")

        # Extract the author handle from x.com / twitter.com URLs.
        author = "unknown"
        author_match = re.search(r"x\.com/(\w+)/", url) or re.search(r"twitter\.com/(\w+)/", url)
        if author_match:
            author = f"@{author_match.group(1)}"

        # Filename slug from author + trailing URL segment (status id, etc.).
        slug = re.sub(r"[^a-z0-9]+", "-", f"{author}-{url.split('/')[-1][:30]}".lower()).strip("-")
        filename = f"{date_str}-tg-shared-{slug}.md"
        source_path = Path(ARCHIVE_DIR) / filename

        # Don't overwrite if already archived (same URL shared twice).
        if source_path.exists():
            return

        source_content = f"""---
type: source
source_type: x-article
title: "{author} — shared via Telegram by @{username}"
author: "{author}"
url: "{url}"
date: {date_str}
domain: internet-finance
format: article
status: unprocessed
proposed_by: "@{username}"
contribution_type: source-submission
tags: [telegram-shared, x-article]
---

# {author} — Article/Thread

Shared by @{username} via Telegram.
Source URL: {url}

## Content

{content}
"""
        # Explicit UTF-8: article content may be non-ASCII and write_text
        # would otherwise use the locale's default encoding.
        source_path.write_text(source_content, encoding="utf-8")
        logger.info("Standalone source archived: %s (shared by @%s)", filename, username)
    except Exception as e:
        # Deliberate best-effort boundary: log and continue.
        logger.warning("Failed to archive standalone source %s: %s", url, e)
|
||||
|
||||
|
||||
async def _fetch_url_content(url: str) -> str | None:
|
||||
"""Fetch article/page content from a URL for pipeline ingestion.
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue