#!/usr/bin/env python3 """metadao-scrape.py — pull active/recent proposals from metadao.fi into source markdown. Replaces the broken futard.io GraphQL ingestion (Cloud Run → teleo-api). metadao.fi is a Vercel-protected Next.js App Router site; direct curl is blocked by the anti-bot challenge. A real headless browser passes the challenge cleanly, and once cookies are issued for the context we can call /api/decode-proposal/{addr} from inside the browser to get structured instruction data. Discovery flow: 1. visit / to prime Vercel cookies 2. visit /projects, scrape distinct /projects/{slug} hrefs 3. for each project, visit /projects/{slug}, scrape proposal addresses from DOM 4. for each NEW proposal (basename not already in --archive-dir): a. visit proposal page, capture rendered prose b. call /api/decode-proposal/{addr} via in-browser fetch for instructions c. write source markdown to --output-dir Idempotent. Skips proposals whose basename is already present in archive-dir or output-dir. Designed to run from a systemd timer or one-shot. Usage: python3 metadao-scrape.py --archive-dir /opt/teleo-eval/workspaces/main/inbox/archive \\ --output-dir /opt/teleo-eval/workspaces/main/inbox/queue \\ [--dry-run] [--limit 10] [--project solomon] """ from __future__ import annotations import argparse import json import logging import re import sys from datetime import date, datetime from pathlib import Path from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s", ) log = logging.getLogger("metadao-scrape") BASE = "https://www.metadao.fi" USER_AGENT = ( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36" ) def slugify(text: str, max_len: int = 60) -> str: s = text.lower().strip() s = re.sub(r"[^a-z0-9\s-]", "", s) s = re.sub(r"\s+", "-", s) s = re.sub(r"-+", "-", s) return s.strip("-")[:max_len].rstrip("-") def _yaml_str(s: str) -> str: """Quote-safe YAML string. JSON strings are valid YAML strings.""" return json.dumps(s, ensure_ascii=False) def existing_basenames(*dirs: Path) -> set[str]: """Collect all .md basenames (without extension) across the given dirs (recursive).""" seen: set[str] = set() for d in dirs: if not d.exists(): continue for p in d.rglob("*.md"): seen.add(p.stem) return seen PROP_ADDR_RE = re.compile(r"proposal_address:\s*[\"']?([A-Za-z0-9]{32,44})[\"']?") URL_ADDR_RE = re.compile(r"(?:futard\.io|metadao\.fi)(?:/[^/\s\"']*)*?/proposal/([A-Za-z0-9]{32,44})") def existing_proposal_addresses(*dirs: Path) -> set[str]: """Scan frontmatter / URLs in existing source files to collect known proposal addresses. Reads only the first 4KB of each file (frontmatter + URL line are at the top) to keep this fast on large archives. """ addrs: set[str] = set() for d in dirs: if not d.exists(): continue for p in d.rglob("*.md"): try: head = p.read_text(errors="replace")[:4096] except Exception: continue for m in PROP_ADDR_RE.finditer(head): addrs.add(m.group(1)) for m in URL_ADDR_RE.finditer(head): addrs.add(m.group(1)) return addrs def list_project_slugs(page) -> list[str]: """Read /projects and extract distinct project slugs.""" page.goto(f"{BASE}/projects", wait_until="domcontentloaded", timeout=30000) page.wait_for_timeout(1500) hrefs = page.evaluate( """() => { const links = Array.from(document.querySelectorAll('a[href^="/projects/"]')); const slugs = new Set(); for (const a of links) { const m = a.getAttribute('href').match(/^\\/projects\\/([a-z0-9-]+)(?:\\/|$)/); if (m && m[1]) slugs.add(m[1]); } return [...slugs]; }""" ) return list(hrefs) def get_project_metadata(page, slug: str) -> dict: """Visit a project page and return basic metadata + proposal addresses + card text. Card text typically contains 'SOLO-004 ENDED DP-00003 (MEM): The Gigabus Proposal Pass $0.64...' so we capture it for downstream title parsing. """ url = f"{BASE}/projects/{slug}" page.goto(url, wait_until="domcontentloaded", timeout=30000) page.wait_for_timeout(1500) proposals = page.evaluate( """() => { const links = Array.from(document.querySelectorAll('a[href*="/proposal/"]')); const seen = new Set(); const out = []; const TARGET_ADDR_RE = /\\/proposal\\/([A-Za-z0-9]+)/; for (const a of links) { const m = a.getAttribute('href').match(TARGET_ADDR_RE); if (!m) continue; if (seen.has(m[1])) continue; seen.add(m[1]); const addr = m[1]; // Walk up only while the ancestor contains exactly one proposal link // (so we get the card, not a parent that contains all cards). let card = a; while (card.parentElement) { const parent = card.parentElement; const propLinks = parent.querySelectorAll('a[href*="/proposal/"]'); if (propLinks.length > 1) break; card = parent; } out.push({ address: addr, link_text: (a.innerText || '').trim().slice(0, 600), card_text: (card.innerText || '').trim().slice(0, 1500), }); } return out; }""" ) # Try to read project name from h1 / title project_name = page.evaluate( """() => { const h = document.querySelector('h1'); return h ? h.innerText.trim() : ''; }""" ) or slug.title() return {"slug": slug, "name": project_name, "url": url, "proposals": proposals} # Strict pattern: DP-NNNNN (CAT): Title — the canonical proposal heading. DP_STRICT_RE = re.compile(r"DP-\d+\s*\([A-Z]+\)\s*[:\-]\s*[^\n\r]+", re.MULTILINE) # Loose pattern: any line starting with DP-NNNNN followed by something. DP_LOOSE_RE = re.compile(r"DP-\d+\s*(?:\([A-Z]+\))?\s*[:\-]?\s*[^\n\r]+", re.MULTILINE) STAT_BLEED_RE = re.compile( # Stat keywords only bleed when followed by a numeric/symbolic stat token, # so word-only sequences like "Active Capital" or "Live Streaming Service" pass. r"\s+\b(?:Pass|Fail|Passed|Failed|Active|Pending|Ended|Live|TOTAL|VOLUME|STATUS|MCAP|PRICE|SPOT)\b\s+(?:\$|\+|-|\d)" r"|\s*(?:\$\d|\+\d{2,}|\d+\.\d+%|\d{5,})", re.IGNORECASE, ) def _clean_title_candidate(line: str) -> str: line = line.strip() # Find first bleed match past offset 10. re.search returns leftmost, but the # DP-NNNNN digit sequence always wins first place; we want the first POST-title # match instead. Walk all matches and trim at the earliest one past the guard. for bleed in STAT_BLEED_RE.finditer(line): if bleed.start() > 10: line = line[: bleed.start()].rstrip(" :-—") break return line.strip()[:200] def extract_dp_title(*texts: str) -> str: """Find the canonical 'DP-NNNNN (CAT): Title' line. Strategy: 1. Try strict pattern (with parenthetical category code) across all sources. Take the SHORTEST hit — prose continuations of an already-correct title tend to be longer than the title itself. 2. Fall back to loose pattern, longest match. """ strict: list[str] = [] loose: list[str] = [] for t in texts: if not t: continue for m in DP_STRICT_RE.finditer(t): cleaned = _clean_title_candidate(m.group(0)) if cleaned: strict.append(cleaned) for m in DP_LOOSE_RE.finditer(t): cleaned = _clean_title_candidate(m.group(0)) if cleaned: loose.append(cleaned) if strict: return min(strict, key=len) if loose: return max(loose, key=len) return "" def fetch_proposal(page, project_slug: str, addr: str, card_text: str = "") -> dict | None: """Visit proposal page, capture rendered text + decode instructions via in-browser fetch.""" url = f"{BASE}/projects/{project_slug}/proposal/{addr}" log.info("fetching proposal %s/%s", project_slug, addr[:8]) try: page.goto(url, wait_until="domcontentloaded", timeout=45000) except PWTimeout: log.warning("timeout loading %s — using whatever rendered", url) page.wait_for_timeout(2500) # let RSC stream finish body_text = page.evaluate("() => document.body.innerText || ''") # Title preference: card_text (from project page) → body_text DP-NNNNN match → first h1/h2 title_block = extract_dp_title(card_text, body_text) if not title_block: title_block = page.evaluate( """() => { const h = document.querySelector('h1, h2'); return h ? h.innerText.trim() : ''; }""" ) or f"proposal-{addr[:8]}" # Status: 'Passed' / 'Failed' / 'Active' / 'Pending' status = page.evaluate( """() => { const text = document.body.innerText || ''; const m = text.match(/\\n(Passed|Failed|Active|Pending|Live|Ended)\\b/); return m ? m[1] : ''; }""" ) # Get the structured /api/decode-proposal data decoded = None try: decoded = page.evaluate( f"""async () => {{ try {{ const r = await fetch('/api/decode-proposal/{addr}'); if (!r.ok) return null; return await r.json(); }} catch (e) {{ return null; }} }}""" ) except Exception as e: log.debug("decode fetch failed for %s: %s", addr, e) return { "address": addr, "project_slug": project_slug, "url": url, "title": title_block, "status": status, "body_text": body_text, "decoded": decoded, } def parse_dp_code(title: str) -> tuple[str, str]: """Parse 'DP-00003 (MEM): The Gigabus Proposal' → ('dp-00003-mem', 'The Gigabus Proposal'). Falls back gracefully if format doesn't match. """ # Match leading DP-NNNNN[space(category)]?[:]?[space]? plus the rest m = re.match(r"^(DP-\d+(?:\s*\([A-Z]+\))?)\s*[:\-]?\s*(.*)$", title.strip()) if m: code = re.sub(r"[^a-z0-9]+", "-", m.group(1).lower()).strip("-") rest = m.group(2).strip() return code, rest return "", title.strip() def build_filename(project_slug: str, proposal: dict, today: str) -> str: """YYYY-MM-DD-metadao-{slug}-{title-fragment}-{addr8}.md Embedding the address fragment makes filenames stable across runs even when the title isn't unique (e.g. projects that don't use DP-NNNNN naming). """ title = proposal.get("title") or "" code, rest = parse_dp_code(title) parts: list[str] = [] if code: parts.append(code) if rest: parts.append(slugify(rest, max_len=40)) body_slug = "-".join(p for p in parts if p)[:60].rstrip("-") addr_frag = proposal["address"][:8].lower() if body_slug: return f"{today}-metadao-{project_slug}-{body_slug}-{addr_frag}.md" return f"{today}-metadao-{project_slug}-{addr_frag}.md" def build_source_markdown(project: dict, proposal: dict, today: str) -> str: """Build the source markdown matching the existing schema.""" title = proposal.get("title") or f"{project['name']} proposal {proposal['address'][:8]}" body_text = (proposal.get("body_text") or "").strip() decoded = proposal.get("decoded") or {} # Build YAML frontmatter — all free-text values escaped via _yaml_str (json.dumps). # project_slug is constrained to [a-z0-9-] by slugify upstream, but pass through # the same path for consistency. full_title = f"MetaDAO: {project['name']} — {title}" fm_lines = [ "---", "type: source", f"title: {_yaml_str(full_title)}", f"author: {_yaml_str('metadao.fi')}", f"url: {_yaml_str(proposal['url'])}", f"date: {today}", "domain: internet-finance", "format: data", "status: unprocessed", f"tags: [futardio, metadao, futarchy, solana, governance, {project['slug']}]", "event_type: proposal", f"project_slug: {_yaml_str(project['slug'])}", f"proposal_address: {_yaml_str(proposal['address'])}", ] if proposal.get("status"): fm_lines.append(f"proposal_status: {_yaml_str(proposal['status'])}") if decoded.get("squadsProposal"): fm_lines.append(f"squads_proposal: {_yaml_str(decoded['squadsProposal'])}") if decoded.get("squadsStatus"): fm_lines.append(f"squads_status: {_yaml_str(decoded['squadsStatus'])}") fm_lines.append("---") fm_lines.append("") # Header section — quick facts body_md = [ f"# {title}", "", "## Proposal Details", f"- Project: {project['name']} (`{project['slug']}`)", f"- Proposal: {title}", f"- Address: `{proposal['address']}`", ] if proposal.get("status"): body_md.append(f"- Status: {proposal['status']}") body_md.append(f"- URL: {proposal['url']}") # Proposal prose body (rendered text from the page) body_md.append("") body_md.append("## Proposal Body") body_md.append("") body_md.append(body_text or "_(no body captured)_") # Decoded on-chain instructions if decoded: body_md.append("") body_md.append("## On-chain Decoded") if decoded.get("squadsUrl"): body_md.append(f"- Squads: {decoded['squadsUrl']}") instrs = decoded.get("instructions") or [] if instrs: body_md.append("") body_md.append("### Instructions") for i, instr in enumerate(instrs, 1): body_md.append(f"{i}. **{instr.get('description', instr.get('type', 'instruction'))}** ({instr.get('program', '')})") for f in instr.get("fields", []) or []: val = f.get("fullValue") or f.get("value") or "" body_md.append(f" - {f.get('label', '')}: `{val}`") if instr.get("summary"): body_md.append(f" - Summary: {instr['summary']}") return "\n".join(fm_lines + body_md) + "\n" def main() -> int: p = argparse.ArgumentParser(description="Scrape MetaDAO proposals into inbox source files") p.add_argument("--archive-dir", required=True, help="existing archive dir (skip if basename exists here)") p.add_argument("--output-dir", required=True, help="dir to write new source markdown into") p.add_argument("--project", help="restrict to a single project slug (default: scan all)") p.add_argument("--limit", type=int, default=0, help="max number of new proposals to capture (0 = unlimited)") p.add_argument("--dry-run", action="store_true", help="print intended writes instead of writing") p.add_argument("--headless", action="store_true", default=True) args = p.parse_args() archive_dir = Path(args.archive_dir).resolve() output_dir = Path(args.output_dir).resolve() seen_basenames = existing_basenames(archive_dir, output_dir) seen_addresses = existing_proposal_addresses(archive_dir, output_dir) log.info("loaded %d existing basenames + %d known proposal addresses from %s + %s", len(seen_basenames), len(seen_addresses), archive_dir, output_dir) today = date.today().isoformat() written: list[str] = [] skipped_existing = 0 with sync_playwright() as pw: browser = pw.chromium.launch(headless=args.headless) ctx = browser.new_context(user_agent=USER_AGENT) page = ctx.new_page() # Prime cookies log.info("priming Vercel session via homepage") page.goto(f"{BASE}/", wait_until="domcontentloaded", timeout=30000) page.wait_for_timeout(1500) # Discovery if args.project: project_slugs = [args.project] else: project_slugs = list_project_slugs(page) log.info("discovered %d project slugs: %s", len(project_slugs), project_slugs) for slug in project_slugs: try: project = get_project_metadata(page, slug) except Exception: log.exception("failed to read project %s", slug) continue log.info(" %s — %d proposals", slug, len(project["proposals"])) for prop in project["proposals"]: addr = prop["address"] # Pre-check #1: known proposal address (cheapest, no browser visit) if addr in seen_addresses: skipped_existing += 1 continue # Pre-check #2: address fragment in an existing basename addr_frag = addr[:8].lower() if any(addr_frag in b.lower() for b in seen_basenames): skipped_existing += 1 continue try: proposal_data = fetch_proposal(page, slug, addr, card_text=prop.get("card_text", "")) except Exception: log.exception("failed to fetch proposal %s/%s", slug, addr) continue if not proposal_data: continue # Minimum-render gate: skip partial renders rather than archiving stubs. # Successful captures are 20KB+; require either a real body or a DP-N title. body_len = len(proposal_data.get("body_text") or "") has_dp_match = bool(re.search(r"DP-\d+", proposal_data.get("title", "") or "")) if body_len < 500 and not has_dp_match: log.warning(" skip (insufficient render): %s body=%dB title=%r", addr, body_len, proposal_data.get("title", "")) continue fname = build_filename(slug, proposal_data, today) if Path(fname).stem in seen_basenames: skipped_existing += 1 log.info(" skip (already archived by title): %s", fname) continue content = build_source_markdown(project, proposal_data, today) target = output_dir / fname if args.dry_run: log.info(" DRY: would write %s (%d bytes)", target, len(content)) else: target.parent.mkdir(parents=True, exist_ok=True) target.write_text(content) log.info(" wrote %s (%d bytes)", target, len(content)) written.append(fname) if args.limit and len(written) >= args.limit: log.info("hit limit=%d, stopping", args.limit) browser.close() print(json.dumps({"written": written, "skipped_existing": skipped_existing, "dry_run": args.dry_run})) return 0 browser.close() print(json.dumps({"written": written, "skipped_existing": skipped_existing, "dry_run": args.dry_run})) return 0 if __name__ == "__main__": sys.exit(main())