Compare commits

4 commits
main ... ship/metad

| Author | SHA1 | Date |
|---|---|---|
|  | 353c4a57b9 |  |
|  | dde055fdbf |  |
|  | 800d1d8b8e |  |
|  | b8fba8195f |  |

4 changed files with 522 additions and 96 deletions
@@ -51,7 +51,7 @@ fi
 
 # Syntax check all Python files before copying
 ERRORS=0
-for f in lib/*.py *.py diagnostics/*.py telegram/*.py tests/*.py; do
+for f in lib/*.py *.py diagnostics/*.py telegram/*.py tests/*.py scripts/*.py; do
   [ -f "$f" ] || continue
   if ! python3 -c "import ast, sys; ast.parse(open(sys.argv[1]).read())" "$f" 2>&1; then
     log "SYNTAX ERROR: $f"
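Both deploy scripts gate the copy on this `ast.parse` one-liner. A minimal standalone sketch of the same check, for reference only (script name and file paths are hypothetical):

```python
# Minimal sketch of the per-file syntax gate the deploy scripts run before copying.
# Equivalent to: python3 -c "import ast, sys; ast.parse(open(sys.argv[1]).read())" FILE
import ast
import sys

def syntax_ok(path: str) -> bool:
    try:
        ast.parse(open(path).read())
        return True
    except SyntaxError as exc:
        print(f"SYNTAX ERROR: {path}: {exc}", file=sys.stderr)
        return False

if __name__ == "__main__":
    # e.g. python3 syntax_gate.py lib/*.py scripts/*.py
    sys.exit(0 if all([syntax_ok(p) for p in sys.argv[1:]]) else 1)
```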
@@ -77,6 +77,7 @@ rsync "${RSYNC_OPTS[@]}" telegram/ "$PIPELINE_DIR/telegram/"
 rsync "${RSYNC_OPTS[@]}" diagnostics/ "$DIAGNOSTICS_DIR/"
 rsync "${RSYNC_OPTS[@]}" agent-state/ "$AGENT_STATE_DIR/"
 rsync "${RSYNC_OPTS[@]}" tests/ "$PIPELINE_DIR/tests/"
+rsync "${RSYNC_OPTS[@]}" scripts/ "$PIPELINE_DIR/scripts/"
 [ -f research/research-session.sh ] && rsync "${RSYNC_OPTS[@]}" research/research-session.sh /opt/teleo-eval/research-session.sh
 
 # Safety net: ensure all .sh files are executable after rsync
@@ -41,7 +41,7 @@ echo ""
 # Syntax check all Python files before deploying
 echo "=== Pre-deploy syntax check ==="
 ERRORS=0
-for f in "$REPO_ROOT/lib/"*.py "$REPO_ROOT/"*.py "$REPO_ROOT/diagnostics/"*.py "$REPO_ROOT/telegram/"*.py; do
+for f in "$REPO_ROOT/lib/"*.py "$REPO_ROOT/"*.py "$REPO_ROOT/diagnostics/"*.py "$REPO_ROOT/telegram/"*.py "$REPO_ROOT/scripts/"*.py; do
   [ -f "$f" ] || continue
   if ! python3 -c "import ast, sys; ast.parse(open(sys.argv[1]).read())" "$f" 2>/dev/null; then
     echo "SYNTAX ERROR: $f"
@@ -80,6 +80,10 @@ echo "=== Tests ==="
 rsync "${RSYNC_OPTS[@]}" "$REPO_ROOT/tests/" "$VPS_HOST:$VPS_PIPELINE/tests/"
 echo ""
 
+echo "=== Scripts ==="
+rsync "${RSYNC_OPTS[@]}" "$REPO_ROOT/scripts/" "$VPS_HOST:$VPS_PIPELINE/scripts/"
+echo ""
+
 echo "=== Diagnostics ==="
 rsync "${RSYNC_OPTS[@]}" "$REPO_ROOT/diagnostics/" "$VPS_HOST:$VPS_DIAGNOSTICS/"
 echo ""
@@ -9,16 +9,6 @@ DB_PATH = "/opt/teleo-eval/pipeline/pipeline.db"
 _cache = {"data": None, "ts": 0}
 CACHE_TTL = 60  # 1 minute — activity should feel fresh
 
-# commit_types we surface in the activity feed. `pipeline` is system
-# maintenance (reweave/fix auto-runs, zombie cleanup) and stays hidden.
-_FEED_COMMIT_TYPES = ("knowledge", "enrich", "challenge", "research", "entity", "extract", "reweave")
-
-# Source-archive slugs follow YYYY-MM-DD-publisher-topic-HASH4 — they're
-# inbox archive filenames, not claim slugs. Used as a fallback signal when
-# branch/description heuristics miss (e.g. populated descriptions that
-# happen to be source titles, not claim insights).
-_SOURCE_SLUG_PATTERN = re.compile(r"^\d{4}-\d{2}-\d{2}-.+-[a-f0-9]{4}$")
-
 
 def _get_conn():
     conn = sqlite3.connect(DB_PATH)
@@ -27,52 +17,19 @@ def _get_conn():
     return conn
 
 
-def _is_source_slug(slug):
-    return bool(slug and _SOURCE_SLUG_PATTERN.match(slug))
-
-
-def _classify_event(branch, description, commit_type, candidate_slug=None):
-    """Return one of: create | enrich | challenge | source | None.
-
-    Source-archive PRs are extract/* branches that filed a source into
-    inbox/archive/ but didn't produce a claim. Two signals classify them
-    as 'source' (defense in depth):
-      1. extract/* branch with empty description (no claim title produced)
-      2. candidate_slug matches YYYY-MM-DD-...-HASH4 (inbox filename pattern)
-    """
-    commit_type_l = (commit_type or "").lower()
-    branch = branch or ""
-    description_lower = (description or "").lower()
-    has_desc = bool(description and description.strip())
-
-    if commit_type_l not in _FEED_COMMIT_TYPES:
-        return None
-
-    # Explicit challenge signals win first.
-    if (commit_type_l == "challenge"
-            or branch.startswith("challenge/")
-            or "challenged_by" in description_lower):
-        return "challenge"
-
-    # Enrichment: reweave edge-connects, enrich/ branches, or commit_type=enrich.
-    if (commit_type_l == "enrich"
-            or branch.startswith("enrich/")
-            or branch.startswith("reweave/")):
-        return "enrich"
-
-    # Source-only: extract/* with no claim description means inbox archive
-    # landed but no domain claim was written.
-    if branch.startswith("extract/") and not has_desc:
-        return "source"
-
-    # Belt-and-suspenders: if the slug we'd surface to the frontend looks
-    # like an inbox archive filename (date-prefix-hash), treat as source
-    # regardless of branch/commit_type/description state. Catches cases
-    # where description leaked but is just a source title, not a claim.
-    if _is_source_slug(candidate_slug):
-        return "source"
-
-    # Everything else with a description is a new claim.
-    return "create"
+def _classify_event(branch, description, commit_type):
+    if commit_type != "knowledge":
+        return None
+    if branch and branch.startswith("extract/"):
+        return "create"
+    if branch and branch.startswith("reweave/"):
+        return "enrich"
+    if branch and branch.startswith("challenge/"):
+        return "challenge"
+    if description and "challenged_by" in description.lower():
+        return "challenge"
+    if branch and branch.startswith("enrich/"):
+        return "enrich"
+    return "create"
 
 
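The simplified classifier keys purely off the branch prefix and description, gated on `commit_type == "knowledge"`. A few hypothetical inputs to show the mapping it produces:

```python
# Branch names below are made up; the mapping itself is the one introduced in this diff.
assert _classify_event("extract/2026-01-01-some-source", "", "knowledge") == "create"
assert _classify_event("reweave/some-claim", "", "knowledge") == "enrich"
assert _classify_event("challenge/some-claim", "", "knowledge") == "challenge"
assert _classify_event("claims/some-claim", "challenged_by: rival-claim", "knowledge") == "challenge"
assert _classify_event("enrich/some-claim", "", "knowledge") == "enrich"
assert _classify_event("reweave/zombie-cleanup", "", "pipeline") is None  # non-knowledge commits stay out of the feed
```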
@@ -124,60 +81,33 @@ def _hot_score(challenge_count, enrich_count, signal_count, hours_since):
 def _build_events():
     conn = _get_conn()
     try:
-        placeholders = ",".join("?" * len(_FEED_COMMIT_TYPES))
-        rows = conn.execute(f"""
+        rows = conn.execute("""
             SELECT p.number, p.branch, p.domain, p.agent, p.submitted_by,
                    p.merged_at, p.description, p.commit_type, p.cost_usd,
-                   p.source_channel, p.source_path
+                   p.source_channel
             FROM prs p
             WHERE p.status = 'merged'
-              AND p.commit_type IN ({placeholders})
+              AND p.commit_type = 'knowledge'
               AND p.merged_at IS NOT NULL
             ORDER BY p.merged_at DESC
            LIMIT 2000
-        """, _FEED_COMMIT_TYPES).fetchall()
+        """).fetchall()
 
         events = []
         claim_activity = {}  # slug -> {challenges, enriches, signals, first_seen}
 
         for row in rows:
-            slugs = _extract_claim_slugs(row["description"], row["branch"])
-            candidate_slug = slugs[0] if slugs else ""
-            event_type = _classify_event(
-                row["branch"], row["description"], row["commit_type"],
-                candidate_slug=candidate_slug,
-            )
+            event_type = _classify_event(row["branch"], row["description"], row["commit_type"])
             if not event_type:
                 continue
 
             contributor = _normalize_contributor(row["submitted_by"], row["agent"])
+            slugs = _extract_claim_slugs(row["description"], row["branch"])
             merged_at = row["merged_at"] or ""
 
-            ci_map = {"create": 0.35, "enrich": 0.25, "challenge": 0.40, "source": 0.15}
+            ci_map = {"create": 0.35, "enrich": 0.25, "challenge": 0.40}
             ci_earned = ci_map.get(event_type, 0)
 
-            # Source events never carry a claim_slug — no claim was written —
-            # so the frontend can't produce a 404-ing claim link.
-            if event_type == "source":
-                summary_text = _summary_from_branch(row["branch"])
-                source_slug = (
-                    _summary_from_branch(row["branch"]).lower().replace(" ", "-")
-                    or row["branch"]
-                )
-                events.append({
-                    "type": "source",
-                    "claim_slug": "",
-                    "source_slug": source_slug,
-                    "domain": row["domain"] or "unknown",
-                    "contributor": contributor,
-                    "timestamp": merged_at,
-                    "ci_earned": round(ci_earned, 2),
-                    "summary": summary_text,
-                    "pr_number": row["number"],
-                    "source_channel": row["source_channel"] or "unknown",
-                })
-                continue
-
             for slug in slugs:
                 if slug not in claim_activity:
                     claim_activity[slug] = {
@@ -234,8 +164,8 @@ def _sort_events(events, claim_activity, sort_mode, now_ts):
             return _hot_score(ca["challenges"], ca["enriches"], ca["signals"], hours)
         events.sort(key=hot_key, reverse=True)
     elif sort_mode == "important":
-        type_rank = {"challenge": 0, "enrich": 1, "create": 2, "source": 3}
-        events.sort(key=lambda e: (type_rank.get(e["type"], 4), -len(e["summary"])))
+        type_rank = {"challenge": 0, "enrich": 1, "create": 2}
+        events.sort(key=lambda e: (type_rank.get(e["type"], 3), -len(e["summary"])))
     return events
 
 
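With the `source` type gone, the "important" sort now only ranks the three remaining event types. A quick sketch of the resulting order (event dicts here are hypothetical):

```python
# Challenges first, enrichments second, creates last; unknown types take the default rank 3.
events = [
    {"type": "create", "summary": "new claim"},
    {"type": "challenge", "summary": "counter-evidence"},
    {"type": "enrich", "summary": "edge added"},
]
type_rank = {"challenge": 0, "enrich": 1, "create": 2}
events.sort(key=lambda e: (type_rank.get(e["type"], 3), -len(e["summary"])))
assert [e["type"] for e in events] == ["challenge", "enrich", "create"]
```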
@@ -245,8 +175,6 @@ async def handle_activity_feed(request):
     sort_mode = "recent"
     domain = request.query.get("domain", "")
     contributor = request.query.get("contributor", "")
-    type_param = request.query.get("type", "")
-    type_filter = {t.strip() for t in type_param.split(",") if t.strip()} if type_param else None
     try:
         limit = min(int(request.query.get("limit", "20")), 100)
     except ValueError:
@@ -268,8 +196,6 @@ async def handle_activity_feed(request):
         filtered = [e for e in filtered if e["domain"] == domain]
     if contributor:
         filtered = [e for e in filtered if e["contributor"] == contributor]
-    if type_filter:
-        filtered = [e for e in filtered if e["type"] in type_filter]
 
     sorted_events = _sort_events(list(filtered), claim_activity, sort_mode, now)
     total = len(sorted_events)
scripts/metadao-scrape.py (Executable file, 495 lines added)

@@ -0,0 +1,495 @@
#!/usr/bin/env python3
"""metadao-scrape.py — pull active/recent proposals from metadao.fi into source markdown.

Replaces the broken futard.io GraphQL ingestion (Cloud Run → teleo-api).
metadao.fi is a Vercel-protected Next.js App Router site; direct curl is blocked
by the anti-bot challenge. A real headless browser passes the challenge cleanly,
and once cookies are issued for the context we can call /api/decode-proposal/{addr}
from inside the browser to get structured instruction data.

Discovery flow:
  1. visit / to prime Vercel cookies
  2. visit /projects, scrape distinct /projects/{slug} hrefs
  3. for each project, visit /projects/{slug}, scrape proposal addresses from DOM
  4. for each NEW proposal (basename not already in --archive-dir):
     a. visit proposal page, capture rendered prose
     b. call /api/decode-proposal/{addr} via in-browser fetch for instructions
     c. write source markdown to --output-dir

Idempotent. Skips proposals whose basename is already present in archive-dir
or output-dir. Designed to run from a systemd timer or one-shot.

Usage:
  python3 metadao-scrape.py --archive-dir /opt/teleo-eval/workspaces/main/inbox/archive \\
      --output-dir /opt/teleo-eval/workspaces/main/inbox/queue \\
      [--dry-run] [--limit 10] [--project solomon]
"""
from __future__ import annotations

import argparse
import json
import logging
import re
import sys
from datetime import date, datetime
from pathlib import Path

from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
)
log = logging.getLogger("metadao-scrape")

BASE = "https://www.metadao.fi"
USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
)


def slugify(text: str, max_len: int = 60) -> str:
    s = text.lower().strip()
    s = re.sub(r"[^a-z0-9\s-]", "", s)
    s = re.sub(r"\s+", "-", s)
    s = re.sub(r"-+", "-", s)
    return s.strip("-")[:max_len].rstrip("-")


def _yaml_str(s: str) -> str:
    """Quote-safe YAML string. JSON strings are valid YAML strings."""
    return json.dumps(s, ensure_ascii=False)


def existing_basenames(*dirs: Path) -> set[str]:
    """Collect all .md basenames (without extension) across the given dirs (recursive)."""
    seen: set[str] = set()
    for d in dirs:
        if not d.exists():
            continue
        for p in d.rglob("*.md"):
            seen.add(p.stem)
    return seen


PROP_ADDR_RE = re.compile(r"proposal_address:\s*[\"']?([A-Za-z0-9]{32,44})[\"']?")
URL_ADDR_RE = re.compile(r"(?:futard\.io|metadao\.fi)(?:/[^/\s\"']*)*?/proposal/([A-Za-z0-9]{32,44})")


def existing_proposal_addresses(*dirs: Path) -> set[str]:
    """Scan frontmatter / URLs in existing source files to collect known proposal addresses.

    Reads only the first 4KB of each file (frontmatter + URL line are at the top)
    to keep this fast on large archives.
    """
    addrs: set[str] = set()
    for d in dirs:
        if not d.exists():
            continue
        for p in d.rglob("*.md"):
            try:
                head = p.read_text(errors="replace")[:4096]
            except Exception:
                continue
            for m in PROP_ADDR_RE.finditer(head):
                addrs.add(m.group(1))
            for m in URL_ADDR_RE.finditer(head):
                addrs.add(m.group(1))
    return addrs


def list_project_slugs(page) -> list[str]:
    """Read /projects and extract distinct project slugs."""
    page.goto(f"{BASE}/projects", wait_until="domcontentloaded", timeout=30000)
    page.wait_for_timeout(1500)
    hrefs = page.evaluate(
        """() => {
            const links = Array.from(document.querySelectorAll('a[href^="/projects/"]'));
            const slugs = new Set();
            for (const a of links) {
                const m = a.getAttribute('href').match(/^\\/projects\\/([a-z0-9-]+)(?:\\/|$)/);
                if (m && m[1]) slugs.add(m[1]);
            }
            return [...slugs];
        }"""
    )
    return list(hrefs)


def get_project_metadata(page, slug: str) -> dict:
    """Visit a project page and return basic metadata + proposal addresses + card text.

    Card text typically contains 'SOLO-004 ENDED DP-00003 (MEM): The Gigabus Proposal Pass $0.64...'
    so we capture it for downstream title parsing.
    """
    url = f"{BASE}/projects/{slug}"
    page.goto(url, wait_until="domcontentloaded", timeout=30000)
    page.wait_for_timeout(1500)

    proposals = page.evaluate(
        """() => {
            const links = Array.from(document.querySelectorAll('a[href*="/proposal/"]'));
            const seen = new Set();
            const out = [];
            const TARGET_ADDR_RE = /\\/proposal\\/([A-Za-z0-9]+)/;
            for (const a of links) {
                const m = a.getAttribute('href').match(TARGET_ADDR_RE);
                if (!m) continue;
                if (seen.has(m[1])) continue;
                seen.add(m[1]);
                const addr = m[1];
                // Walk up only while the ancestor contains exactly one proposal link
                // (so we get the card, not a parent that contains all cards).
                let card = a;
                while (card.parentElement) {
                    const parent = card.parentElement;
                    const propLinks = parent.querySelectorAll('a[href*="/proposal/"]');
                    if (propLinks.length > 1) break;
                    card = parent;
                }
                out.push({
                    address: addr,
                    link_text: (a.innerText || '').trim().slice(0, 600),
                    card_text: (card.innerText || '').trim().slice(0, 1500),
                });
            }
            return out;
        }"""
    )

    # Try to read project name from h1 / title
    project_name = page.evaluate(
        """() => {
            const h = document.querySelector('h1');
            return h ? h.innerText.trim() : '';
        }"""
    ) or slug.title()

    return {"slug": slug, "name": project_name, "url": url, "proposals": proposals}


# Strict pattern: DP-NNNNN (CAT): Title — the canonical proposal heading.
DP_STRICT_RE = re.compile(r"DP-\d+\s*\([A-Z]+\)\s*[:\-]\s*[^\n\r]+", re.MULTILINE)
# Loose pattern: any line starting with DP-NNNNN followed by something.
DP_LOOSE_RE = re.compile(r"DP-\d+\s*(?:\([A-Z]+\))?\s*[:\-]?\s*[^\n\r]+", re.MULTILINE)
STAT_BLEED_RE = re.compile(
    # Stat keywords only bleed when followed by a numeric/symbolic stat token,
    # so word-only sequences like "Active Capital" or "Live Streaming Service" pass.
    r"\s+\b(?:Pass|Fail|Passed|Failed|Active|Pending|Ended|Live|TOTAL|VOLUME|STATUS|MCAP|PRICE|SPOT)\b\s+(?:\$|\+|-|\d)"
    r"|\s*(?:\$\d|\+\d{2,}|\d+\.\d+%|\d{5,})",
    re.IGNORECASE,
)


def _clean_title_candidate(line: str) -> str:
    line = line.strip()
    # Find first bleed match past offset 10. re.search returns leftmost, but the
    # DP-NNNNN digit sequence always wins first place; we want the first POST-title
    # match instead. Walk all matches and trim at the earliest one past the guard.
    for bleed in STAT_BLEED_RE.finditer(line):
        if bleed.start() > 10:
            line = line[: bleed.start()].rstrip(" :-—")
            break
    return line.strip()[:200]


def extract_dp_title(*texts: str) -> str:
    """Find the canonical 'DP-NNNNN (CAT): Title' line.

    Strategy:
      1. Try strict pattern (with parenthetical category code) across all sources.
         Take the SHORTEST hit — prose continuations of an already-correct title
         tend to be longer than the title itself.
      2. Fall back to loose pattern, longest match.
    """
    strict: list[str] = []
    loose: list[str] = []
    for t in texts:
        if not t:
            continue
        for m in DP_STRICT_RE.finditer(t):
            cleaned = _clean_title_candidate(m.group(0))
            if cleaned:
                strict.append(cleaned)
        for m in DP_LOOSE_RE.finditer(t):
            cleaned = _clean_title_candidate(m.group(0))
            if cleaned:
                loose.append(cleaned)
    if strict:
        return min(strict, key=len)
    if loose:
        return max(loose, key=len)
    return ""


def fetch_proposal(page, project_slug: str, addr: str, card_text: str = "") -> dict | None:
    """Visit proposal page, capture rendered text + decode instructions via in-browser fetch."""
    url = f"{BASE}/projects/{project_slug}/proposal/{addr}"
    log.info("fetching proposal %s/%s", project_slug, addr[:8])
    try:
        page.goto(url, wait_until="domcontentloaded", timeout=45000)
    except PWTimeout:
        log.warning("timeout loading %s — using whatever rendered", url)
    page.wait_for_timeout(2500)  # let RSC stream finish

    body_text = page.evaluate("() => document.body.innerText || ''")

    # Title preference: card_text (from project page) → body_text DP-NNNNN match → first h1/h2
    title_block = extract_dp_title(card_text, body_text)
    if not title_block:
        title_block = page.evaluate(
            """() => {
                const h = document.querySelector('h1, h2');
                return h ? h.innerText.trim() : '';
            }"""
        ) or f"proposal-{addr[:8]}"

    # Status: 'Passed' / 'Failed' / 'Active' / 'Pending'
    status = page.evaluate(
        """() => {
            const text = document.body.innerText || '';
            const m = text.match(/\\n(Passed|Failed|Active|Pending|Live|Ended)\\b/);
            return m ? m[1] : '';
        }"""
    )

    # Get the structured /api/decode-proposal data
    decoded = None
    try:
        decoded = page.evaluate(
            f"""async () => {{
                try {{
                    const r = await fetch('/api/decode-proposal/{addr}');
                    if (!r.ok) return null;
                    return await r.json();
                }} catch (e) {{ return null; }}
            }}"""
        )
    except Exception as e:
        log.debug("decode fetch failed for %s: %s", addr, e)

    return {
        "address": addr,
        "project_slug": project_slug,
        "url": url,
        "title": title_block,
        "status": status,
        "body_text": body_text,
        "decoded": decoded,
    }


def parse_dp_code(title: str) -> tuple[str, str]:
    """Parse 'DP-00003 (MEM): The Gigabus Proposal' → ('dp-00003-mem', 'The Gigabus Proposal').

    Falls back gracefully if format doesn't match.
    """
    # Match leading DP-NNNNN[space(category)]?[:]?[space]? plus the rest
    m = re.match(r"^(DP-\d+(?:\s*\([A-Z]+\))?)\s*[:\-]?\s*(.*)$", title.strip())
    if m:
        code = re.sub(r"[^a-z0-9]+", "-", m.group(1).lower()).strip("-")
        rest = m.group(2).strip()
        return code, rest
    return "", title.strip()


def build_filename(project_slug: str, proposal: dict, today: str) -> str:
    """YYYY-MM-DD-metadao-{slug}-{title-fragment}-{addr8}.md

    Embedding the address fragment makes filenames stable across runs even when
    the title isn't unique (e.g. projects that don't use DP-NNNNN naming).
    """
    title = proposal.get("title") or ""
    code, rest = parse_dp_code(title)
    parts: list[str] = []
    if code:
        parts.append(code)
    if rest:
        parts.append(slugify(rest, max_len=40))
    body_slug = "-".join(p for p in parts if p)[:60].rstrip("-")
    addr_frag = proposal["address"][:8].lower()
    if body_slug:
        return f"{today}-metadao-{project_slug}-{body_slug}-{addr_frag}.md"
    return f"{today}-metadao-{project_slug}-{addr_frag}.md"


def build_source_markdown(project: dict, proposal: dict, today: str) -> str:
    """Build the source markdown matching the existing schema."""
    title = proposal.get("title") or f"{project['name']} proposal {proposal['address'][:8]}"
    body_text = (proposal.get("body_text") or "").strip()
    decoded = proposal.get("decoded") or {}

    # Build YAML frontmatter — all free-text values escaped via _yaml_str (json.dumps).
    # project_slug is constrained to [a-z0-9-] by slugify upstream, but pass through
    # the same path for consistency.
    full_title = f"MetaDAO: {project['name']} — {title}"
    fm_lines = [
        "---",
        "type: source",
        f"title: {_yaml_str(full_title)}",
        f"author: {_yaml_str('metadao.fi')}",
        f"url: {_yaml_str(proposal['url'])}",
        f"date: {today}",
        "domain: internet-finance",
        "format: data",
        "status: unprocessed",
        f"tags: [futardio, metadao, futarchy, solana, governance, {project['slug']}]",
        "event_type: proposal",
        f"project_slug: {_yaml_str(project['slug'])}",
        f"proposal_address: {_yaml_str(proposal['address'])}",
    ]
    if proposal.get("status"):
        fm_lines.append(f"proposal_status: {_yaml_str(proposal['status'])}")
    if decoded.get("squadsProposal"):
        fm_lines.append(f"squads_proposal: {_yaml_str(decoded['squadsProposal'])}")
    if decoded.get("squadsStatus"):
        fm_lines.append(f"squads_status: {_yaml_str(decoded['squadsStatus'])}")
    fm_lines.append("---")
    fm_lines.append("")

    # Header section — quick facts
    body_md = [
        f"# {title}",
        "",
        "## Proposal Details",
        f"- Project: {project['name']} (`{project['slug']}`)",
        f"- Proposal: {title}",
        f"- Address: `{proposal['address']}`",
    ]
    if proposal.get("status"):
        body_md.append(f"- Status: {proposal['status']}")
    body_md.append(f"- URL: {proposal['url']}")

    # Proposal prose body (rendered text from the page)
    body_md.append("")
    body_md.append("## Proposal Body")
    body_md.append("")
    body_md.append(body_text or "_(no body captured)_")

    # Decoded on-chain instructions
    if decoded:
        body_md.append("")
        body_md.append("## On-chain Decoded")
        if decoded.get("squadsUrl"):
            body_md.append(f"- Squads: {decoded['squadsUrl']}")
        instrs = decoded.get("instructions") or []
        if instrs:
            body_md.append("")
            body_md.append("### Instructions")
            for i, instr in enumerate(instrs, 1):
                body_md.append(f"{i}. **{instr.get('description', instr.get('type', 'instruction'))}** ({instr.get('program', '')})")
                for f in instr.get("fields", []) or []:
                    val = f.get("fullValue") or f.get("value") or ""
                    body_md.append(f"  - {f.get('label', '')}: `{val}`")
                if instr.get("summary"):
                    body_md.append(f"  - Summary: {instr['summary']}")

    return "\n".join(fm_lines + body_md) + "\n"


def main() -> int:
    p = argparse.ArgumentParser(description="Scrape MetaDAO proposals into inbox source files")
    p.add_argument("--archive-dir", required=True, help="existing archive dir (skip if basename exists here)")
    p.add_argument("--output-dir", required=True, help="dir to write new source markdown into")
    p.add_argument("--project", help="restrict to a single project slug (default: scan all)")
    p.add_argument("--limit", type=int, default=0, help="max number of new proposals to capture (0 = unlimited)")
    p.add_argument("--dry-run", action="store_true", help="print intended writes instead of writing")
    p.add_argument("--headless", action="store_true", default=True)
    args = p.parse_args()

    archive_dir = Path(args.archive_dir).resolve()
    output_dir = Path(args.output_dir).resolve()
    seen_basenames = existing_basenames(archive_dir, output_dir)
    seen_addresses = existing_proposal_addresses(archive_dir, output_dir)
    log.info("loaded %d existing basenames + %d known proposal addresses from %s + %s",
             len(seen_basenames), len(seen_addresses), archive_dir, output_dir)

    today = date.today().isoformat()

    written: list[str] = []
    skipped_existing = 0

    with sync_playwright() as pw:
        browser = pw.chromium.launch(headless=args.headless)
        ctx = browser.new_context(user_agent=USER_AGENT)
        page = ctx.new_page()

        # Prime cookies
        log.info("priming Vercel session via homepage")
        page.goto(f"{BASE}/", wait_until="domcontentloaded", timeout=30000)
        page.wait_for_timeout(1500)

        # Discovery
        if args.project:
            project_slugs = [args.project]
        else:
            project_slugs = list_project_slugs(page)
        log.info("discovered %d project slugs: %s", len(project_slugs), project_slugs)

        for slug in project_slugs:
            try:
                project = get_project_metadata(page, slug)
            except Exception:
                log.exception("failed to read project %s", slug)
                continue
            log.info("  %s — %d proposals", slug, len(project["proposals"]))

            for prop in project["proposals"]:
                addr = prop["address"]
                # Pre-check #1: known proposal address (cheapest, no browser visit)
                if addr in seen_addresses:
                    skipped_existing += 1
                    continue
                # Pre-check #2: address fragment in an existing basename
                addr_frag = addr[:8].lower()
                if any(addr_frag in b.lower() for b in seen_basenames):
                    skipped_existing += 1
                    continue

                try:
                    proposal_data = fetch_proposal(page, slug, addr, card_text=prop.get("card_text", ""))
                except Exception:
                    log.exception("failed to fetch proposal %s/%s", slug, addr)
                    continue
                if not proposal_data:
                    continue

                # Minimum-render gate: skip partial renders rather than archiving stubs.
                # Successful captures are 20KB+; require either a real body or a DP-N title.
                body_len = len(proposal_data.get("body_text") or "")
                has_dp_match = bool(re.search(r"DP-\d+", proposal_data.get("title", "") or ""))
                if body_len < 500 and not has_dp_match:
                    log.warning("  skip (insufficient render): %s body=%dB title=%r",
                                addr, body_len, proposal_data.get("title", ""))
                    continue

                fname = build_filename(slug, proposal_data, today)

                if Path(fname).stem in seen_basenames:
                    skipped_existing += 1
                    log.info("  skip (already archived by title): %s", fname)
                    continue

                content = build_source_markdown(project, proposal_data, today)
                target = output_dir / fname
                if args.dry_run:
                    log.info("  DRY: would write %s (%d bytes)", target, len(content))
                else:
                    target.parent.mkdir(parents=True, exist_ok=True)
                    target.write_text(content)
                    log.info("  wrote %s (%d bytes)", target, len(content))
                written.append(fname)

                if args.limit and len(written) >= args.limit:
                    log.info("hit limit=%d, stopping", args.limit)
                    browser.close()
                    print(json.dumps({"written": written, "skipped_existing": skipped_existing, "dry_run": args.dry_run}))
                    return 0

        browser.close()

    print(json.dumps({"written": written, "skipped_existing": skipped_existing, "dry_run": args.dry_run}))
    return 0


if __name__ == "__main__":
    sys.exit(main())
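To make the title-to-filename pipeline concrete, here is a worked sketch using the sample card text quoted in `get_project_metadata`'s docstring; the project slug comes from the Usage example, while the date and address below are placeholders:

```python
# Values below are illustrative placeholders, not real scrape output.
card_text = "SOLO-004 ENDED DP-00003 (MEM): The Gigabus Proposal Pass $0.64"
title = extract_dp_title(card_text)
# STAT_BLEED_RE trims the trailing "Pass $0.64" stat bleed from the strict match:
assert title == "DP-00003 (MEM): The Gigabus Proposal"
assert parse_dp_code(title) == ("dp-00003-mem", "The Gigabus Proposal")
proposal = {"title": title, "address": "Addr1111111111111111111111111111111111111111"}
print(build_filename("solomon", proposal, "2026-02-03"))
# -> 2026-02-03-metadao-solomon-dp-00003-mem-the-gigabus-proposal-addr1111.md
```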