teleo-infrastructure/telegram/x-ingest.py
m3taversal 670c50f384
Some checks are pending
CI / lint-and-test (push) Waiting to run
fix: add telegram/ and tests/ to deploy pipeline, remove hardcoded API key
deploy.sh was missing telegram/ and tests/ directories — code existed in
repo but never synced to VPS. Also removes hardcoded twitterapi.io key
from x-ingest.py (reads from secrets file like all other modules).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-20 17:15:55 +01:00

96 lines
3 KiB
Python

#!/usr/bin/env python3
"""Pull all tweets from specified X accounts and save as JSON archives."""
import json
import os
import sys
import time
import urllib.parse
import urllib.request
from pathlib import Path
# Location of the twitterapi.io key on disk — never hardcoded in this file.
API_KEY_FILE = "/opt/teleo-eval/secrets/twitterapi-io-key"


def _load_api_key():
    """Read the API key from the secrets file, exiting with status 1 if absent."""
    try:
        contents = Path(API_KEY_FILE).read_text()
    except FileNotFoundError:
        print(f"ERROR: API key not found at {API_KEY_FILE}", file=sys.stderr)
        sys.exit(1)
    return contents.strip()


API_KEY = _load_api_key()

# twitterapi.io endpoint returning a user's most recent tweets (paginated).
BASE = "https://api.twitterapi.io/twitter/user/last_tweets"
# Directory where one JSON archive per account is written.
OUT_DIR = "/opt/teleo-eval/x-archives"
# X/Twitter handles to archive — each gets its own <handle>-tweets.json file.
ACCOUNTS = [
    "m3taversal",
    "Living_IP",
    "teLEOhuman",
    "aiCLAYno",
    "futaRdIO_ai",
]
# Ensure the archive directory exists before any account is processed.
# (The `import os` that previously sat here belongs with the top-of-file imports.)
os.makedirs(OUT_DIR, exist_ok=True)
def fetch_page(username, cursor=None):
    """Fetch one page of tweets for *username* from the twitterapi.io API.

    Query parameters are built with urllib.parse.urlencode so usernames or
    pagination cursors containing reserved characters (&, =, +, spaces, ...)
    cannot corrupt the request URL.

    Returns the decoded JSON payload (a dict), or None on any network/HTTP
    error. Errors are printed rather than raised — the caller treats a None
    page as "stop paginating", matching the script's best-effort style.
    """
    params = {"userName": username}
    if cursor:
        params["cursor"] = cursor
    url = f"{BASE}?{urllib.parse.urlencode(params)}"
    req = urllib.request.Request(url, headers={"X-API-Key": API_KEY})
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            return json.loads(resp.read())
    except Exception as e:
        print(f" ERROR fetching {username}: {e}")
        return None
def pull_all_tweets(username):
    """Page through the API and return every unique tweet for *username*.

    Tweets are deduplicated by id across pages (the API can return
    overlapping pages). Pagination stops when the API reports failure,
    when no next_cursor is provided, or when a page contributes nothing
    new — the last guard prevents an endless loop on a repeating page.

    Returns a list of tweet dicts in first-seen order.
    """
    all_tweets = []
    seen_ids = set()  # ids collected so far; maintained incrementally
    cursor = None
    page = 0
    while True:
        page += 1
        print(f" Page {page} (cursor: {'yes' if cursor else 'start'})...", end=" ")
        data = fetch_page(username, cursor)
        if not data or data.get("status") != "success":
            print(f"FAILED: {data}")
            break
        payload = data.get("data", {})
        tweets = payload.get("tweets", [])
        next_cursor = payload.get("next_cursor")
        # Deduplicate against all previously seen pages. Updating the set
        # incrementally avoids rebuilding it from all_tweets each page
        # (which was O(n^2) over the whole archive).
        new_tweets = [t for t in tweets if t["id"] not in seen_ids]
        seen_ids.update(t["id"] for t in new_tweets)
        all_tweets.extend(new_tweets)
        print(f"{len(new_tweets)} new tweets (total: {len(all_tweets)})")
        if not next_cursor or not new_tweets:
            break
        cursor = next_cursor
        time.sleep(1)  # Rate limit courtesy
    return all_tweets
# Archive every configured account, then print a per-account breakdown.
for account in ACCOUNTS:
    print(f"\n=== @{account} ===")
    tweets = pull_all_tweets(account)
    # Persist the raw payload plus a count for quick inspection.
    outfile = os.path.join(OUT_DIR, f"{account}-tweets.json")
    with open(outfile, "w") as f:
        json.dump({"account": account, "tweet_count": len(tweets), "tweets": tweets}, f, indent=2)
    print(f" Saved {len(tweets)} tweets to {outfile}")
    # Quick stats. NOTE: a retweeted reply is counted in both `replies`
    # and `rts` — the buckets deliberately overlap, matching the breakdown
    # the script has always printed.
    rts = [tw for tw in tweets if tw.get("text", "").startswith("RT @")]
    replies = [tw for tw in tweets if tw.get("isReply")]
    originals = [
        tw for tw in tweets
        if not tw.get("text", "").startswith("RT @") and not tw.get("isReply")
    ]
    print(f" Breakdown: {len(originals)} original, {len(replies)} replies, {len(rts)} RTs")
    if originals:
        def _views(tw):
            # viewCount may be missing or None; treat both as zero.
            return int(tw.get("viewCount", 0) or 0)

        top_five = sorted(originals, key=_views, reverse=True)[:5]
        print(f" Top 5 by views:")
        for tw in top_five:
            snippet = tw["text"][:80].replace("\n", " ")
            print(f" {tw.get('viewCount', '?')} views | {tw.get('likeCount', '?')} likes | {snippet}...")
print("\n=== DONE ===")