feat: atomic extract-and-connect + stale PR monitor + response audit #4
1 changed file with 92 additions and 0 deletions
@@ -330,6 +330,40 @@ async def handle_research(msg, query: str, user, silent: bool = False):
         await msg.reply_text(f"No recent tweets found for '{query}'.")
         return
 
+    # Fetch full content for top tweets (not just search snippets)
+    from x_client import fetch_from_url
+    for tweet in tweets[:5]:  # Top 5 by engagement
+        url = tweet.get("url", "")
+        if url:
+            try:
+                full_data = await fetch_from_url(url)
+                if full_data:
+                    # Replace snippet with full text
+                    full_text = full_data.get("text", "")
+                    if full_text and len(full_text) > len(tweet.get("text", "")):
+                        tweet["text"] = full_text
+                    # Include article content if available
+                    contents = full_data.get("contents", [])
+                    if contents:
+                        article_parts = []
+                        for block in contents:
+                            block_text = block.get("text", "")
+                            if not block_text:
+                                continue
+                            block_type = block.get("type", "unstyled")
+                            if block_type in ("header-one", "header-two", "header-three"):
+                                article_parts.append(f"\n## {block_text}\n")
+                            elif block_type == "blockquote":
+                                article_parts.append(f"> {block_text}")
+                            elif block_type == "list-item":
+                                article_parts.append(f"- {block_text}")
+                            else:
+                                article_parts.append(block_text)
+                        if article_parts:
+                            tweet["text"] += "\n\n--- Article Content ---\n" + "\n".join(article_parts)
+            except Exception as e:
+                logger.warning("Failed to fetch full content for %s: %s", url, e)
     # Archive all tweets as ONE source file per research query
     # (not per-tweet — one extraction PR produces claims from the best material)
     try:
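Note on the payload shape: the rendering loop above assumes `fetch_from_url` returns a dict with a `text` field plus Draft.js-style `contents` blocks. The exact return shape of `x_client.fetch_from_url` is not shown in this diff, so the sketch below is an assumption inferred from the block types the loop handles:

```python
# Hypothetical payload, inferred from the loop above; not the documented
# x_client.fetch_from_url contract.
full_data = {
    "text": "Full tweet text, longer than the search snippet.",
    "contents": [
        {"type": "header-one", "text": "Why settlement matters"},
        {"type": "unstyled", "text": "Opening paragraph of the article."},
        {"type": "blockquote", "text": "A quoted line."},
        {"type": "list-item", "text": "First takeaway."},
    ],
}
# With this input, the loop appends to tweet["text"]:
#
# --- Article Content ---
#
# ## Why settlement matters
#
# Opening paragraph of the article.
# > A quoted line.
# - First takeaway.
```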
@@ -719,11 +753,69 @@ IMPORTANT: Two special tags you can append at the end of your response (after yo
         url_content = await _fetch_url_content(urls[0])
         if url_content:
             logger.info("Fetched %d chars from %s", len(url_content), urls[0])
+            # Create standalone source file for the article (separate from conversation)
+            # This enters the extraction pipeline as a proper source, attributed to contributor
+            _archive_standalone_source(urls[0], url_content, user)
 
     # Archive the exchange as a source for pipeline (slow path)
     _archive_exchange(text, response, user, msg, url_content=url_content, urls=urls)
+
+
+def _archive_standalone_source(url: str, content: str, user):
+    """Create a standalone source file for a URL shared in Telegram.
+
+    Separate from the conversation archive — this is the actual article/tweet
+    entering the extraction pipeline as a proper source, attributed to the
+    contributor who shared it.
+    """
+    try:
+        username = user.username if user else "anonymous"
+        date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
+
+        # Extract the author handle from the URL
+        author = "unknown"
+        author_match = re.search(r"x\.com/(\w+)/", url) or re.search(r"twitter\.com/(\w+)/", url)
+        if author_match:
+            author = f"@{author_match.group(1)}"
+
+        slug = re.sub(r"[^a-z0-9]+", "-", f"{author}-{url.split('/')[-1][:30]}".lower()).strip("-")
+        filename = f"{date_str}-tg-shared-{slug}.md"
+        source_path = Path(ARCHIVE_DIR) / filename
+
+        # Don't overwrite if already archived
+        if source_path.exists():
+            return
+
+        source_content = f"""---
+type: source
+source_type: x-article
+title: "{author} — shared via Telegram by @{username}"
+author: "{author}"
+url: "{url}"
+date: {date_str}
+domain: internet-finance
+format: article
+status: unprocessed
+proposed_by: "@{username}"
+contribution_type: source-submission
+tags: [telegram-shared, x-article]
+---
+
+# {author} — Article/Thread
+
+Shared by @{username} via Telegram.
+Source URL: {url}
+
+## Content
+
+{content}
+"""
+        source_path.write_text(source_content)
+        logger.info("Standalone source archived: %s (shared by @%s)", filename, username)
+    except Exception as e:
+        logger.warning("Failed to archive standalone source %s: %s", url, e)
 
 
 async def _fetch_url_content(url: str) -> str | None:
     """Fetch article/page content from a URL for pipeline ingestion.
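To exercise the new path end to end without Telegram, a quick smoke test along these lines should work; the stub user object and the import location are assumptions, since the module name is not shown in this diff:

```python
import asyncio
from types import SimpleNamespace

# Hypothetical smoke test; replace `bot` with the actual module name.
from bot import _fetch_url_content, _archive_standalone_source

async def main():
    url = "https://x.com/jane_doe/status/1234567890123456789"  # invented
    user = SimpleNamespace(username="reviewer")  # stand-in for a Telegram user
    content = await _fetch_url_content(url)
    if content:
        _archive_standalone_source(url, content, user)

asyncio.run(main())
```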