epimetheus: fix article content parsing — contents[] array, not text field
The article endpoint returns the body in a "contents" array of typed blocks (unstyled, header-two, markdown, list-item, blockquote, etc.). The code was reading article.text, which is empty. It now parses all block types into readable text and also extracts engagement stats (likes, retweets, views). Fixes: the "Claude + Obsidian" article returned a title but empty text. Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
This commit is contained in:
parent
bcbe54a0a3
commit
c59db5812f
1 changed files with 65 additions and 16 deletions
|
|
@@ -151,15 +151,42 @@ async def get_article(tweet_id: str) -> Optional[dict]:
|
|||
article = data.get("article")
|
||||
if not article:
|
||||
return None
|
||||
# Article body is in "contents" array (not "text" field)
|
||||
contents = article.get("contents", [])
|
||||
text_parts = []
|
||||
for block in contents:
|
||||
block_text = block.get("text", "")
|
||||
if not block_text:
|
||||
continue
|
||||
block_type = block.get("type", "unstyled")
|
||||
if block_type.startswith("header"):
|
||||
text_parts.append(f"\n## {block_text}\n")
|
||||
elif block_type == "markdown":
|
||||
text_parts.append(block_text)
|
||||
elif block_type in ("unordered-list-item",):
|
||||
text_parts.append(f"- {block_text}")
|
||||
elif block_type in ("ordered-list-item",):
|
||||
text_parts.append(f"* {block_text}")
|
||||
elif block_type == "blockquote":
|
||||
text_parts.append(f"> {block_text}")
|
||||
else:
|
||||
text_parts.append(block_text)
|
||||
full_text = "\n".join(text_parts)
|
||||
author_data = article.get("author", {})
|
||||
likes = article.get("likeCount", 0) or 0
|
||||
retweets = article.get("retweetCount", 0) or 0
|
||||
return {
|
||||
"text": article.get("text", article.get("content", "")),
|
||||
"text": full_text,
|
||||
"title": article.get("title", ""),
|
||||
"author": article.get("author", {}).get("userName", ""),
|
||||
"author_name": article.get("author", {}).get("name", ""),
|
||||
"author_followers": article.get("author", {}).get("followers", 0),
|
||||
"author": author_data.get("userName", ""),
|
||||
"author_name": author_data.get("name", ""),
|
||||
"author_followers": author_data.get("followers", 0),
|
||||
"tweet_date": article.get("createdAt", ""),
|
||||
"is_article": True,
|
||||
"engagement": 0,
|
||||
"engagement": likes + retweets,
|
||||
"likes": likes,
|
||||
"retweets": retweets,
|
||||
"views": article.get("viewCount", 0) or 0,
|
||||
}
|
||||
except Exception as e:
|
||||
logger.warning("get_article(%s) error: %s", tweet_id, e)
|
||||
|
|
@@ -260,19 +287,41 @@ async def fetch_from_url(url: str) -> Optional[dict]:
|
|||
tweet_id = match.group(2)
|
||||
|
||||
# Try tweet first (most X URLs are tweets)
|
||||
result = await get_tweet(tweet_id)
|
||||
if result:
|
||||
result["url"] = url
|
||||
return result
|
||||
tweet_result = await get_tweet(tweet_id)
|
||||
|
||||
# Try article (X long-form posts)
|
||||
result = await get_article(tweet_id)
|
||||
if result:
|
||||
result["url"] = url
|
||||
result["author"] = result.get("author") or username
|
||||
return result
|
||||
if tweet_result:
|
||||
tweet_text = tweet_result.get("text", "").strip()
|
||||
is_just_url = tweet_text.startswith("http") and len(tweet_text.split()) <= 2
|
||||
|
||||
# Both failed — return placeholder so caller can surface the failure
|
||||
if not is_just_url:
|
||||
# Regular tweet with real content — return it
|
||||
tweet_result["url"] = url
|
||||
return tweet_result
|
||||
|
||||
# Tweet was empty/URL-only, or tweet lookup failed — try article endpoint
|
||||
article_result = await get_article(tweet_id)
|
||||
if article_result:
|
||||
article_result["url"] = url
|
||||
article_result["author"] = article_result.get("author") or username
|
||||
# Article endpoint may return title but not full text
|
||||
if article_result.get("title") and not article_result.get("text"):
|
||||
article_result["text"] = (
|
||||
f'This is an X Article titled "{article_result["title"]}" by @{username}. '
|
||||
f"The API returned the title but not the full content. "
|
||||
f"Ask the user to paste the key points so you can analyze them."
|
||||
)
|
||||
return article_result
|
||||
|
||||
# If we got the tweet but it was just a URL, return with helpful context
|
||||
if tweet_result:
|
||||
tweet_result["url"] = url
|
||||
tweet_result["text"] = (
|
||||
f"Tweet by @{username} links to content but contains no text. "
|
||||
f"This may be an X Article. Ask the user to paste the key points."
|
||||
)
|
||||
return tweet_result
|
||||
|
||||
# Everything failed
|
||||
return {
|
||||
"text": f"[Could not fetch content from @{username}]",
|
||||
"url": url,
|
||||
|
|
|
|||
Loading…
Reference in a new issue