teleo-codex/ops/research-session.sh

#!/bin/bash
# Run a self-directed research session for one agent.
# Usage: ./research-session.sh <agent-name>
# Example: ./research-session.sh clay
#
# What it does:
#   1. Pulls latest tweets from the agent's network accounts (X API)
#   2. Gives Claude the agent's identity, beliefs, and current KB state
#   3. Agent picks a research direction and archives sources with notes
#   4. Commits source archives to a branch, pushes, opens PR
#   5. Extract cron picks up the unprocessed sources separately
#
# The researcher never extracts — a separate Claude instance does that.
# This prevents motivated reasoning in extraction.

set -euo pipefail

# --- Configuration ---
# The agent name is the only positional argument; the script aborts with the
# usage message when it is missing.
AGENT="${1:?Usage: $0 <agent-name>}"

# AGENT is spliced into the secret, lock, log and workspace paths below, so
# reject anything that could step outside those directories (e.g. "../x")
# before any path is built. Names that pass here but are not known agents are
# still rejected later by the agent-to-domain mapping.
if [[ ! "$AGENT" =~ ^[A-Za-z0-9][A-Za-z0-9_-]*$ ]]; then
    echo "ERROR: invalid agent name '$AGENT'" >&2
    exit 1
fi

# Per-agent git checkout used for research sessions.
REPO_DIR="/opt/teleo-eval/workspaces/research-${AGENT}"
FORGEJO_URL="http://localhost:3000"
# A missing admin token or Twitter key is fatal (set -e); a missing per-agent
# token is not — it falls back to the admin token.
FORGEJO_ADMIN_TOKEN=$(cat /opt/teleo-eval/secrets/forgejo-admin-token)
AGENT_TOKEN=$(cat "/opt/teleo-eval/secrets/forgejo-${AGENT}-token" 2>/dev/null || echo "$FORGEJO_ADMIN_TOKEN")
TWITTER_API_KEY=$(cat /opt/teleo-eval/secrets/twitterapi-io-key)
CLAUDE_BIN="/home/teleo/.local/bin/claude"
LOG_DIR="/opt/teleo-eval/logs"
LOG="$LOG_DIR/research-${AGENT}.log"
LOCKFILE="/tmp/research-${AGENT}.lock"
# One branch per agent per calendar day.
DATE=$(date +%Y-%m-%d)
BRANCH="${AGENT}/research-${DATE}"
# Cache directory for raw API responses, one JSON file per account.
RAW_DIR="/opt/teleo-eval/research-raw/${AGENT}"

# log <message...>
# Appends one line to $LOG: an ISO-8601 timestamp in square brackets followed
# by all arguments joined with single spaces.
log() {
    printf '[%s] %s\n' "$(date -Iseconds)" "$*" >> "$LOG"
}

# --- Ensure directories ---
# Done first: log() appends to a file inside $LOG_DIR, so the directory has to
# exist before the first log call (including the ones in the lock handling).
mkdir -p "$RAW_DIR" "$LOG_DIR"

# --- Lock (prevent concurrent sessions for same agent) ---
# acquire_lock
# Tries to create $LOCKFILE containing this script's PID. With noclobber set,
# the redirection fails if the file already exists, so testing for the lock
# and taking it are a single step. The subshell keeps noclobber from leaking
# into the rest of the script; $$ still expands to the main script's PID there.
# Returns 0 when the lock was taken, non-zero otherwise.
acquire_lock() {
    ( set -o noclobber; echo "$$" > "$LOCKFILE" ) 2>/dev/null
}

if ! acquire_lock; then
    # "|| true": the holder may delete the file between our attempt and this
    # read; that must not abort the script under set -e.
    pid=$(cat "$LOCKFILE" 2>/dev/null || true)
    if [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null; then
        log "SKIP: research session already running for $AGENT (pid $pid)"
        exit 0
    fi
    log "WARN: stale lockfile for $AGENT, removing"
    rm -f "$LOCKFILE"
    if ! acquire_lock; then
        # Another session cleaned up the same stale lock and won the retry.
        log "SKIP: another research session for $AGENT acquired the lock first"
        exit 0
    fi
fi
# Installed only once the lock is ours, so the SKIP paths above never delete
# a lock that belongs to a running session.
trap 'rm -f "$LOCKFILE"' EXIT

log "=== Starting research session for $AGENT ==="

# --- Clone or update repo ---
# No checkout for this agent yet: create one. The admin token is passed as a
# one-off HTTP header for the clone command only.
if [[ ! -d "${REPO_DIR}/.git" ]]; then
    log "Cloning repo for $AGENT research..."
    git -c http.extraHeader="Authorization: token $FORGEJO_ADMIN_TOKEN" \
        clone "${FORGEJO_URL}/teleo/teleo-codex.git" "$REPO_DIR" >> "$LOG" 2>&1
fi

cd "$REPO_DIR"

# Later git operations authenticate through an inline credential helper stored
# in the repo's git config; the token value is expanded into it here.
git config credential.helper "!f() { echo username=m3taversal; echo password=$FORGEJO_ADMIN_TOKEN; }; f"
# Best effort: a failure to reset the remote URL is ignored.
git remote set-url origin "${FORGEJO_URL}/teleo/teleo-codex.git" 2>/dev/null || true

# Every session starts from an up-to-date main; git output goes to the log.
{
    git checkout main
    git pull --rebase
} >> "$LOG" 2>&1

# --- Map agent to domain ---
# Each known agent owns exactly one knowledge-base domain. Any other name is
# logged as an error and ends the run with status 1.
case "$AGENT" in
    rio)
        DOMAIN="internet-finance"
        ;;
    clay)
        DOMAIN="entertainment"
        ;;
    theseus)
        DOMAIN="ai-alignment"
        ;;
    vida)
        DOMAIN="health"
        ;;
    astra)
        DOMAIN="space-development"
        ;;
    leo)
        DOMAIN="grand-strategy"
        ;;
    *)
        log "ERROR: Unknown agent $AGENT"
        exit 1
        ;;
esac

# --- Tweet helpers ---

# list_network_accounts <network.json>
# Prints the username of every account whose tier is "core" or "extended",
# one per line. The path is passed as an argument instead of being spliced
# into the Python source, so characters in it cannot alter the program.
# Returns non-zero when the file cannot be read or parsed.
list_network_accounts() {
    python3 - "$1" <<'PY'
import json
import sys

with open(sys.argv[1], encoding="utf-8") as fh:
    data = json.load(fh)
for acct in data.get("accounts", []):
    if acct.get("tier") in ("core", "extended") and acct.get("username"):
        print(acct["username"])
PY
}

# format_tweets <tweets.json>
# Prints up to 20 tweets from the file as
#   [date] (N likes) text
#     URL: url
#   <blank line>
# The tweet list is taken from the top-level "tweets" key, else "data"; if
# that value is itself an object, its "tweets" member is used.
# Returns non-zero (with the reason on stderr) when the file cannot be read or
# parsed, so callers can substitute a "(failed to parse)" marker.
format_tweets() {
    PYTHONIOENCODING=utf-8 python3 - "$1" <<'PY'
import json
import sys

try:
    with open(sys.argv[1], encoding="utf-8") as fh:
        d = json.load(fh)
    tweets = d.get("tweets", d.get("data", []))
    if isinstance(tweets, dict):
        tweets = tweets.get("tweets", [])
    for t in (tweets or [])[:20]:
        text = (t.get("text") or "")[:500]
        likes = t.get("likeCount", t.get("public_metrics", {}).get("like_count", 0))
        date = t.get("createdAt", t.get("created_at", "unknown"))
        url = t.get("twitterUrl", t.get("url", ""))
        print(f"[{date}] ({likes} likes) {text}")
        print(f"  URL: {url}")
        print()
except Exception as e:
    print(f"Error reading {sys.argv[1]}: {e}", file=sys.stderr)
    sys.exit(1)
PY
}

# append_tweet_section <header> <tweets.json>
# Appends "--- <header> ---" followed by the formatted tweets to TWEET_DATA.
# Parser diagnostics go to the log; on failure the section ends with
# "(failed to parse)".
append_tweet_section() {
    TWEET_DATA="${TWEET_DATA}
--- ${1} ---
$(format_tweets "$2" 2>> "$LOG" || echo "(failed to parse)")"
}

# --- Pull tweets from agent's network ---
# Check if agent has a network file in the repo
NETWORK_FILE="agents/${AGENT}/network.json"
TWEET_DATA=""
if [ ! -f "$NETWORK_FILE" ]; then
    log "No network file at $NETWORK_FILE — agent will use KB context to decide what to research"
else
    log "Pulling tweets from ${AGENT}'s network..."
    # A broken network file is not fatal, but it is logged rather than
    # silently producing an empty account list.
    ACCOUNTS=$(list_network_accounts "$NETWORK_FILE" 2>> "$LOG") \
        || log "WARN: could not read accounts from $NETWORK_FILE"

    while IFS= read -r USERNAME; do
        USERNAME="${USERNAME#@}"
        [ -n "$USERNAME" ] || continue
        # The name becomes part of a file path and a URL; only accept the
        # characters a handle can contain.
        if [[ ! "$USERNAME" =~ ^[A-Za-z0-9_]+$ ]]; then
            log "WARN: skipping invalid username '$USERNAME' in $NETWORK_FILE"
            continue
        fi

        OUTFILE="$RAW_DIR/${USERNAME}.json"
        # Only pull if the cache is missing, empty, or older than 12 hours
        if [ ! -s "$OUTFILE" ] || [ -n "$(find "$OUTFILE" -mmin +720 2>/dev/null)" ]; then
            log "Pulling @${USERNAME}..."
            # Download to a side file and move it into place only on success.
            # -f makes HTTP errors count as failures, so an error body or a
            # truncated download never replaces (or becomes) the cached copy.
            TMPFILE="${OUTFILE}.tmp.$$"
            if curl -sf --max-time 60 \
                "https://api.twitterapi.io/twitter/user/last_tweets?userName=${USERNAME}" \
                -H "X-API-Key: ${TWITTER_API_KEY}" \
                -o "$TMPFILE" 2>/dev/null; then
                mv -f "$TMPFILE" "$OUTFILE"
            else
                rm -f "$TMPFILE"
                # Any older cached copy is left intact and still used below.
                log "WARN: Failed to pull @${USERNAME}"
            fi
            sleep 2  # Rate limit courtesy
        fi

        if [ -s "$OUTFILE" ]; then
            append_tweet_section "@${USERNAME} tweets" "$OUTFILE"
        fi
    done <<< "$ACCOUNTS"
fi

# --- Also check for any raw JSON dumps in inbox-raw ---
INBOX_RAW="/opt/teleo-eval/inbox-raw/${AGENT}"
if [ -d "$INBOX_RAW" ]; then
    RAW_DUMPS_LOGGED=0
    for RAWFILE in "$INBOX_RAW"/*.json; do
        # With no matches the glob stays literal; skip that non-file.
        [ -f "$RAWFILE" ] || continue
        if [ "$RAW_DUMPS_LOGGED" -eq 0 ]; then
            log "Found raw dumps in $INBOX_RAW"
            RAW_DUMPS_LOGGED=1
        fi
        USERNAME=$(basename "$RAWFILE" .json)
        append_tweet_section "@${USERNAME} tweets (from raw dump)" "$RAWFILE"
    done
fi

# --- Create branch ---
# Today's branch is always rebuilt from the current HEAD: a local branch left
# over from an earlier run on the same day is dropped first.
if git rev-parse --verify --quiet "refs/heads/${BRANCH}" > /dev/null; then
    git branch -D "$BRANCH" 2>/dev/null || true
fi
git checkout -b "$BRANCH" >> "$LOG" 2>&1
log "On branch $BRANCH"

# --- Build the research prompt ---
# Write tweet data to a temp file so Claude can read it.
# mktemp gives an unpredictable name, so the write below cannot be redirected
# through a file or symlink planted at a guessable /tmp path.
TWEET_FILE=$(mktemp "/tmp/research-tweets-${AGENT}.XXXXXX")
# Replaces the earlier EXIT trap: still removes the lockfile, and now also
# removes the tweet dump, on every exit path from here on.
trap 'rm -f -- "$LOCKFILE" "$TWEET_FILE"' EXIT
printf '%s\n' "$TWEET_DATA" > "$TWEET_FILE"

# Prompt text handed to the Claude CLI with -p further down. The string is
# double-quoted, so ${AGENT}, ${DOMAIN}, ${DATE}, ${TWEET_FILE} and the $(...)
# substitution in the commit trailer are expanded once, here, at assignment
# time; \" produces a literal double quote in the prompt. The sed expression
# upper-cases the first character of the agent name (\U is a GNU sed
# extension). Everything between the quotes is runtime text read by the agent.
RESEARCH_PROMPT="You are ${AGENT}, a Teleo knowledge base agent. Domain: ${DOMAIN}.

## Your Task: Self-Directed Research Session

You have ~90 minutes of compute. Use it wisely.

### Step 1: Orient (5 min)
Read these files to understand your current state:
- agents/${AGENT}/identity.md (who you are)
- agents/${AGENT}/beliefs.md (what you believe)
- agents/${AGENT}/reasoning.md (how you think)
- domains/${DOMAIN}/_map.md (your domain's current claims)

### Step 2: Review Recent Tweets (10 min)
Read ${TWEET_FILE} — these are recent tweets from accounts in your domain.
Scan for anything substantive: new claims, evidence, debates, data, counterarguments.

### Step 3: Pick ONE Research Direction (5 min)
Based on your beliefs, your domain's gaps, and what you found in the tweets, pick ONE research direction that would most advance your domain understanding. Consider:
- Gaps in your beliefs that need evidence
- Claims in the KB that new evidence might challenge
- Cross-domain connections flagged by other agents
- New developments that change the landscape

Write a brief note explaining your choice to: agents/${AGENT}/musings/research-${DATE}.md

### Step 4: Archive Sources (60 min)
For each relevant tweet/thread, create an archive file:

Path: inbox/archive/YYYY-MM-DD-{author-handle}-{brief-slug}.md

Use this frontmatter:
---
type: source
title: \"Descriptive title\"
author: \"Display Name (@handle)\"
url: https://original-url
date: YYYY-MM-DD
domain: ${DOMAIN}
secondary_domains: []
format: tweet | thread
status: unprocessed
priority: high | medium | low
tags: [topic1, topic2]
---

## Content
[Full text of tweet/thread]

## Agent Notes
**Why this matters:** [1-2 sentences]
**KB connections:** [Which existing claims relate?]
**Extraction hints:** [What claims might an extractor pull?]
**Context:** [Who is the author, what debate is this part of?]

### Rules:
- Archive EVERYTHING substantive, not just what supports your views
- Set all sources to status: unprocessed (a DIFFERENT instance will extract)
- Flag cross-domain sources with flagged_for_{agent}: [\"reason\"]
- Do NOT extract claims yourself — write good notes so the extractor can
- Check inbox/archive/ for duplicates before creating new archives
- Aim for 5-15 source archives per session

### Step 5: Commit and Push (5 min)
Stage your archives and musing, commit with:
  ${AGENT}: research session ${DATE} — {brief description of direction}

  Pentagon-Agent: $(echo ${AGENT} | sed 's/./\U&/') <HEADLESS>

Then stop. Do not open a PR — the script handles that."

# --- Run Claude research session ---
# The session is capped at 5400 seconds by timeout(1); all CLI output is
# appended to the log. Any non-zero exit (CLI failure or timeout) returns the
# checkout to main and ends the run with status 1.
log "Starting Claude research session..."
if ! timeout 5400 "$CLAUDE_BIN" -p "$RESEARCH_PROMPT" \
        --allowedTools 'Read,Write,Edit,Glob,Grep,Bash' \
        --model sonnet \
        --permission-mode bypassPermissions \
        >> "$LOG" 2>&1; then
    log "WARN: Research session failed or timed out for $AGENT"
    git checkout main >> "$LOG" 2>&1
    exit 1
fi

log "Claude session complete"

# --- Stage and commit anything Claude left uncommitted ---
# The prompt asks Claude to commit its own work, so a clean working tree does
# NOT mean nothing was produced. Whether there is something to push is
# therefore decided from the branch's commits relative to main (below), not
# from `git status` or from the wording of the latest commit message.
#
# Each path is added on its own and only if it exists: a single `git add a b`
# stages nothing at all when either pathspec is missing.
for ARCHIVE_PATH in inbox/archive "agents/${AGENT}/musings"; do
    if [ -e "$ARCHIVE_PATH" ]; then
        git add -- "$ARCHIVE_PATH" >> "$LOG" 2>&1 \
            || log "WARN: git add failed for $ARCHIVE_PATH"
    fi
done

if ! git diff --cached --quiet; then
    AGENT_UPPER=$(echo "$AGENT" | sed 's/./\U&/')
    # grep -c already prints 0 when nothing matches but exits 1; "|| true"
    # only neutralises that status instead of appending a second "0".
    SOURCE_COUNT=$(git diff --cached --name-only | grep -c "^inbox/archive/" || true)
    git commit -m "${AGENT}: research session ${DATE} — ${SOURCE_COUNT} sources archived

Pentagon-Agent: ${AGENT_UPPER} <HEADLESS>" >> "$LOG" 2>&1
fi

# --- Check for changes ---
# Commits on the research branch that main does not have, whether made by
# Claude during the session or by the fallback commit above.
COMMITS_AHEAD=$(git rev-list --count "main..HEAD")
if [ "$COMMITS_AHEAD" -eq 0 ]; then
    if [ -n "$(git status --porcelain)" ]; then
        # Files changed, but none under the archive or musings paths.
        log "No valid changes to commit"
    else
        log "No sources archived by $AGENT"
    fi
    git checkout main >> "$LOG" 2>&1
    exit 0
fi

# --- Push ---
# Force-push: the branch is rebuilt from main on every run, so a same-day
# re-run replaces what was pushed earlier under this branch name.
git push -u origin "$BRANCH" --force >> "$LOG" 2>&1
log "Pushed $BRANCH"

# --- Open PR ---
# Values are passed to Python as arguments rather than spliced into its
# source, so their content cannot break or alter the program.
PR_JSON=$(python3 - "$AGENT" "$DATE" "$DOMAIN" "$BRANCH" <<'PY'
import json
import sys

agent, date, domain, branch = sys.argv[1:5]
print(json.dumps({
    "title": f"{agent}: research session {date}",
    "body": (
        "## Self-Directed Research\n\n"
        f"Automated research session for {agent} ({domain}).\n\n"
        "Sources archived with status: unprocessed \u2014 extract cron will "
        "handle claim extraction separately.\n\n"
        "Researcher and extractor are different Claude instances to prevent "
        "motivated reasoning."
    ),
    "base": "main",
    "head": branch,
}))
PY
)

# A transport failure must not abort the script here (set -e): the branch is
# already pushed and the checkout still has to be returned to main. stderr is
# no longer merged into the response, so it stays parseable as JSON.
PR_RESULT=$(curl -s --max-time 60 -X POST "${FORGEJO_URL}/api/v1/repos/teleo/teleo-codex/pulls" \
    -H "Authorization: token $AGENT_TOKEN" \
    -H "Content-Type: application/json" \
    -d "$PR_JSON") || PR_RESULT=""

# Empty when the response is not a JSON object carrying a "number" field.
PR_NUMBER=$(printf '%s' "$PR_RESULT" | python3 -c "
import json, sys
number = json.load(sys.stdin).get('number')
print('' if number is None else number)
" 2>/dev/null || true)

if [ -n "$PR_NUMBER" ]; then
    log "PR #${PR_NUMBER} opened for ${AGENT}'s research session"
else
    # Report the outcome instead of claiming a PR was opened; the start of the
    # server response is included for diagnosis.
    log "WARN: no PR opened for $BRANCH (one may already be open, or the API call failed): ${PR_RESULT:0:300}"
fi

# --- Back to main ---
git checkout main >> "$LOG" 2>&1
log "=== Research session complete for $AGENT ==="