feat(ingestion): metadao.fi scraper to replace broken futard.io ingestion
Some checks are pending
CI / lint-and-test (pull_request) Waiting to run

Background:
- futard.io retired its /api/graphql endpoint between Apr 17–20
- Cloud Scheduler ingest-futard has been firing into 500s ever since
  (the AttributeError on e.url masked the real 404 for 5 days; fixed
   in living-ip/teleo-api@b8eb441 which surfaced the actual root cause)
- The ecosystem migrated to metadao.fi, which is Vercel-protected
- Direct curl is blocked by Vercel's anti-bot challenge regardless of
  headers; a real headless browser passes it cleanly

Approach:
- Playwright-driven scraper, runs as a one-shot
- Discovery: scrape /projects DOM for project slugs, then each
  /projects/{slug} for proposal addresses
- For each NEW proposal: visit page for prose body + call
  /api/decode-proposal/{addr} via in-browser fetch (bypasses challenge
  via the primed Vercel cookies in the browser context) for structured
  on-chain instructions
- Idempotent: dedup against existing proposal addresses in archive
  frontmatter AND filename basenames
- Filename embeds 8-char address fragment for stable cross-run dedup
  even on projects that don't use DP-NNNNN naming convention

Tested locally against 6 active projects (p2p-protocol, paystream,
zklsol, loyal, ranger, solomon). Captured 13 new proposals — including
the Solomon Gigabus DP-00003 that triggered this work — with proper
titles, status, on-chain instruction decoding (Squads transactions,
SPL transfers, memos), and project metadata.

Output schema matches existing futardio source files (type: source,
event_type: proposal, domain: internet-finance, status: unprocessed)
so the existing extract pipeline picks them up unchanged.

Architectural note: this script is intentionally NOT wired to systemd
yet — VPS deploy needs Playwright + Chromium system libs which require
apt sudo (currently scoped to teleo-* services only). Reviewing the
script first; deploy path is a separate decision.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
m3taversal 2026-04-25 13:09:31 +01:00
parent 3fe524dd14
commit b8fba8195f

471
scripts/metadao-scrape.py Executable file
View file

@ -0,0 +1,471 @@
#!/usr/bin/env python3
"""metadao-scrape.py — pull active/recent proposals from metadao.fi into source markdown.
Replaces the broken futard.io GraphQL ingestion (Cloud Run teleo-api).
metadao.fi is a Vercel-protected Next.js App Router site; direct curl is blocked
by the anti-bot challenge. A real headless browser passes the challenge cleanly,
and once cookies are issued for the context we can call /api/decode-proposal/{addr}
from inside the browser to get structured instruction data.
Discovery flow:
1. visit / to prime Vercel cookies
2. visit /projects, scrape distinct /projects/{slug} hrefs
3. for each project, visit /projects/{slug}, scrape proposal addresses from DOM
4. for each NEW proposal (address not already known from --archive-dir / --output-dir):
a. visit proposal page, capture rendered prose
b. call /api/decode-proposal/{addr} via in-browser fetch for instructions
c. write source markdown to --output-dir
Idempotent. Skips proposals whose address (or 8-char address fragment in a
filename) is already present in archive-dir or output-dir. Designed to run
from a systemd timer or one-shot.
Usage:
python3 metadao-scrape.py --archive-dir /opt/teleo-eval/workspaces/main/inbox/archive \\
--output-dir /opt/teleo-eval/workspaces/main/inbox/queue \\
[--dry-run] [--limit 10] [--project solomon]
"""
from __future__ import annotations
import argparse
import json
import logging
import re
import sys
from datetime import date, datetime
from pathlib import Path
from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
# Configure the root logger at import time: INFO level, "<time> <level> <message>" lines.
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)s %(message)s",
)
# Module-wide logger for this script.
log = logging.getLogger("metadao-scrape")
# Site origin that the page URLs in this script are built from.
BASE = "https://www.metadao.fi"
# Desktop-Chrome user-agent string; main() passes it to the browser context.
USER_AGENT = (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
)
def slugify(text: str, max_len: int = 60) -> str:
    """Turn free text into a lowercase, hyphen-separated slug.

    Characters other than ``a-z``, ``0-9``, whitespace and ``-`` are dropped,
    whitespace runs become a single hyphen, repeated hyphens are collapsed,
    and the result is cut to *max_len* characters with no leading or
    trailing hyphen.
    """
    slug = text.lower().strip()
    # Order matters: strip junk first, then turn whitespace into hyphens,
    # then squash any hyphen runs that produced.
    for pattern, replacement in (
        (r"[^a-z0-9\s-]", ""),
        (r"\s+", "-"),
        (r"-+", "-"),
    ):
        slug = re.sub(pattern, replacement, slug)
    trimmed = slug.strip("-")
    # Truncation can expose a hyphen at the cut point; remove it.
    return trimmed[:max_len].rstrip("-")
def existing_basenames(*dirs: Path) -> set[str]:
    """Return the stems of every ``*.md`` file found recursively under *dirs*.

    Directories that do not exist are ignored.
    """
    return {
        md_file.stem
        for root in dirs
        if root.exists()
        for md_file in root.rglob("*.md")
    }
# Frontmatter line `proposal_address: <addr>` (quotes optional, 32-44 alphanumerics).
PROP_ADDR_RE = re.compile(r"proposal_address:\s*[\"']?([A-Za-z0-9]{32,44})[\"']?")
# Proposal URLs on either host. Path segments may sit between the host and
# `proposal/` (e.g. metadao.fi/projects/<slug>/proposal/<addr>), so the prefix
# must be allowed to contain `/`.
URL_ADDR_RE = re.compile(r"(?:futard\.io|metadao\.fi)/[^\s\"']*?proposal/([A-Za-z0-9]{32,44})")

# How many leading characters of each file are scanned.
_HEAD_CHARS = 4096


def existing_proposal_addresses(*dirs: Path) -> set[str]:
    """Collect proposal addresses already recorded in source files under *dirs*.

    Every ``*.md`` file (recursive) is scanned for a ``proposal_address:``
    frontmatter line and for futard.io / metadao.fi proposal URLs. Only the
    first ``_HEAD_CHARS`` characters are read from each file, since the
    frontmatter and URL line sit at the top; this keeps the scan cheap on
    large archives.

    Directories that do not exist and files that cannot be opened are skipped.
    """
    addrs: set[str] = set()
    for d in dirs:
        if not d.exists():
            continue
        for p in d.rglob("*.md"):
            try:
                # Bounded read: never pull a whole large file into memory.
                with p.open(encoding="utf-8", errors="replace") as fh:
                    head = fh.read(_HEAD_CHARS)
            except OSError:
                # Unreadable entry (permissions, directory named *.md, ...):
                # best-effort scan, move on.
                continue
            for pattern in (PROP_ADDR_RE, URL_ADDR_RE):
                addrs.update(m.group(1) for m in pattern.finditer(head))
    return addrs
def list_project_slugs(page) -> list[str]:
    """Load the /projects index and return the distinct project slugs linked from it.

    Slugs are taken from ``<a href="/projects/...">`` links in the rendered
    DOM; the extraction itself runs inside the page.
    """
    collect_slugs_js = """() => {
const links = Array.from(document.querySelectorAll('a[href^="/projects/"]'));
const slugs = new Set();
for (const a of links) {
const m = a.getAttribute('href').match(/^\\/projects\\/([a-z0-9-]+)(?:\\/|$)/);
if (m && m[1]) slugs.add(m[1]);
}
return [...slugs];
}"""
    index_url = f"{BASE}/projects"
    page.goto(index_url, wait_until="domcontentloaded", timeout=30000)
    # Fixed pause after DOMContentLoaded before reading links.
    page.wait_for_timeout(1500)
    found = page.evaluate(collect_slugs_js)
    return list(found)
def get_project_metadata(page, slug: str) -> dict:
    """Load a project page and describe the project plus the proposals it links to.

    Returns a dict with ``slug``, ``name`` (the page's first ``<h1>`` text, or
    the title-cased slug when that is empty), ``url`` and ``proposals``. Each
    proposal entry carries ``address``, ``link_text`` and ``card_text``; the
    card text is kept because it usually holds the proposal heading
    (e.g. 'SOLO-004 ENDED DP-00003 (MEM): The Gigabus Proposal Pass $0.64...')
    that later title parsing looks for.
    """
    proposals_js = """() => {
const links = Array.from(document.querySelectorAll('a[href*="/proposal/"]'));
const seen = new Set();
const out = [];
const TARGET_ADDR_RE = /\\/proposal\\/([A-Za-z0-9]+)/;
for (const a of links) {
const m = a.getAttribute('href').match(TARGET_ADDR_RE);
if (!m) continue;
if (seen.has(m[1])) continue;
seen.add(m[1]);
const addr = m[1];
// Walk up only while the ancestor contains exactly one proposal link
// (so we get the card, not a parent that contains all cards).
let card = a;
while (card.parentElement) {
const parent = card.parentElement;
const propLinks = parent.querySelectorAll('a[href*="/proposal/"]');
if (propLinks.length > 1) break;
card = parent;
}
out.push({
address: addr,
link_text: (a.innerText || '').trim().slice(0, 600),
card_text: (card.innerText || '').trim().slice(0, 1500),
});
}
return out;
}"""
    heading_js = """() => {
const h = document.querySelector('h1');
return h ? h.innerText.trim() : '';
}"""

    project_url = f"{BASE}/projects/{slug}"
    page.goto(project_url, wait_until="domcontentloaded", timeout=30000)
    page.wait_for_timeout(1500)

    found = page.evaluate(proposals_js)
    heading = page.evaluate(heading_js)
    # No usable <h1>: derive a display name from the slug instead.
    display_name = heading if heading else slug.title()
    return {"slug": slug, "name": display_name, "url": project_url, "proposals": found}
# Strict pattern: DP-NNNNN (CAT): Title — the canonical proposal heading.
DP_STRICT_RE = re.compile(r"DP-\d+\s*\([A-Z]+\)\s*[:\-]\s*[^\n\r]+", re.MULTILINE)
# Loose pattern: any line starting with DP-NNNNN followed by something.
DP_LOOSE_RE = re.compile(r"DP-\d+\s*(?:\([A-Z]+\))?\s*[:\-]?\s*[^\n\r]+", re.MULTILINE)
# The leading "DP-NNNNN (CAT):" code of a candidate. It is skipped when looking
# for stat bleed, because "-0" / "00003" inside the code itself look like stats.
_DP_PREFIX_RE = re.compile(r"DP-\d+\s*(?:\([A-Z]+\))?\s*[:\-]?\s*")
# Card statistics that run on after the title on the same line: status words,
# prices, signed deltas, percentages, long numbers. Status words are matched as
# whole words so that e.g. "Compass" is not cut at "pass", and a sign only
# counts when it does not follow a word character (so "Phase-2" is left alone).
STAT_BLEED_RE = re.compile(
    r"\s*(?:"
    r"\b(?:Pass|Fail|Passed|Failed|Active|Pending|Ended|Live|TOTAL|VOLUME|STATUS|MCAP|PRICE|SPOT)\b"
    r"|\$|(?<!\w)[+-]\d|\d+\.\d+%|\d{4,}"
    r")",
    re.IGNORECASE,
)
# A bleed match must start after this many characters of the line.
_MIN_TITLE_CHARS = 10


def _clean_title_candidate(line: str) -> str:
    """Strip trailing card statistics from a matched 'DP-…' line.

    The line is cut at the first stat-like token that appears after the DP
    code prefix (and after at least ``_MIN_TITLE_CHARS`` characters), trailing
    separators are removed, and the result is capped at 200 characters.
    """
    line = line.strip()
    prefix = _DP_PREFIX_RE.match(line)
    # Start scanning past the code: searching from index 0 always hits the
    # "-<digit>" in "DP-0…" first, which would make the cut unreachable.
    scan_from = max(prefix.end() if prefix else 0, _MIN_TITLE_CHARS + 1)
    bleed = STAT_BLEED_RE.search(line, scan_from)
    if bleed:
        line = line[: bleed.start()].rstrip(" :-—")
    return line.strip()[:200]


def extract_dp_title(*texts: str) -> str:
    """Find the canonical 'DP-NNNNN (CAT): Title' line in any of *texts*.

    Strategy:
    1. Try the strict pattern (with parenthetical category code) across all
       sources and take the SHORTEST hit — prose continuations of an
       already-correct title tend to be longer than the title itself.
    2. Otherwise fall back to the loose pattern and take the longest match.

    Empty / ``None`` sources are skipped. Returns ``""`` when nothing matches.
    """
    strict: list[str] = []
    loose: list[str] = []
    for t in texts:
        if not t:
            continue
        for pattern, bucket in ((DP_STRICT_RE, strict), (DP_LOOSE_RE, loose)):
            for m in pattern.finditer(t):
                cleaned = _clean_title_candidate(m.group(0))
                if cleaned:
                    bucket.append(cleaned)
    if strict:
        return min(strict, key=len)
    if loose:
        return max(loose, key=len)
    return ""
def fetch_proposal(page, project_slug: str, addr: str, card_text: str = "") -> dict | None:
    """Visit a proposal page and gather its rendered text plus decoded instructions.

    Args:
        page: Playwright page whose context already holds the site cookies.
        project_slug: Project the proposal belongs to (used to build the URL).
        addr: Proposal address.
        card_text: Text of the proposal's card on the project page; preferred
            source for the title.

    Returns:
        A dict with ``address``, ``project_slug``, ``url``, ``title``,
        ``status`` (``""`` if none was found), ``body_text`` and ``decoded``
        (the JSON from ``/api/decode-proposal/{addr}``, or ``None`` when that
        call failed). The current implementation always returns a dict; the
        ``None`` in the annotation is kept for callers that check for it.

    A navigation timeout is tolerated: whatever has rendered is used.
    """
    url = f"{BASE}/projects/{project_slug}/proposal/{addr}"
    log.info("fetching proposal %s/%s", project_slug, addr[:8])
    try:
        page.goto(url, wait_until="domcontentloaded", timeout=45000)
    except PWTimeout:
        log.warning("timeout loading %s — using whatever rendered", url)
    page.wait_for_timeout(2500)  # let RSC stream finish
    body_text = page.evaluate("() => document.body.innerText || ''")

    # Title preference: card_text (from project page) → body_text DP-NNNNN match → first h1/h2
    title_block = extract_dp_title(card_text, body_text)
    if not title_block:
        title_block = page.evaluate(
            """() => {
const h = document.querySelector('h1, h2');
return h ? h.innerText.trim() : '';
}"""
        ) or f"proposal-{addr[:8]}"

    # Status: first line of the page text that starts with a known status word.
    status = page.evaluate(
        """() => {
const text = document.body.innerText || '';
const m = text.match(/\\n(Passed|Failed|Active|Pending|Live|Ended)\\b/);
return m ? m[1] : '';
}"""
    )

    # Structured /api/decode-proposal data, fetched from inside the page so the
    # request carries the browser context's cookies. The address is handed over
    # as an evaluate() argument instead of being spliced into the JS source.
    decoded = None
    try:
        decoded = page.evaluate(
            """async (addr) => {
                try {
                    const r = await fetch('/api/decode-proposal/' + encodeURIComponent(addr));
                    if (!r.ok) return null;
                    return await r.json();
                } catch (e) { return null; }
            }""",
            addr,
        )
    except Exception as e:
        # Best-effort: the prose is still worth saving without instructions,
        # but make the loss visible at the configured INFO log level.
        log.warning("decode fetch failed for %s: %s", addr, e)

    return {
        "address": addr,
        "project_slug": project_slug,
        "url": url,
        "title": title_block,
        "status": status,
        "body_text": body_text,
        "decoded": decoded,
    }
def parse_dp_code(title: str) -> tuple[str, str]:
    """Split a proposal heading into a slugged DP code and the remaining title.

    'DP-00003 (MEM): The Gigabus Proposal' → ('dp-00003-mem', 'The Gigabus Proposal').
    A title that does not start with a DP code yields ``("", stripped_title)``.
    """
    cleaned = title.strip()
    # Leading DP-NNNNN, optional "(CAT)", optional ":" or "-", then the rest.
    parsed = re.match(r"^(DP-\d+(?:\s*\([A-Z]+\))?)\s*[:\-]?\s*(.*)$", cleaned)
    if parsed is None:
        return "", cleaned
    raw_code, remainder = parsed.groups()
    code_slug = re.sub(r"[^a-z0-9]+", "-", raw_code.lower()).strip("-")
    return code_slug, remainder.strip()
def build_filename(project_slug: str, proposal: dict, today: str) -> str:
    """Compose ``YYYY-MM-DD-metadao-{slug}-{title-fragment}-{addr8}.md``.

    The title fragment is the slugged DP code followed by a slug of the rest
    of the title (capped at 60 characters overall) and is omitted when the
    title yields nothing. The lowercased first 8 characters of the address are
    always appended, so names stay distinct even when titles are not unique
    (e.g. projects that don't use DP-NNNNN naming).
    """
    code, rest = parse_dp_code(proposal.get("title") or "")
    pieces = [code] if code else []
    if rest:
        pieces.append(slugify(rest, max_len=40))
    # slugify() may return "" for a title made only of punctuation; drop it.
    title_part = "-".join(filter(None, pieces))[:60].rstrip("-")
    addr8 = proposal["address"][:8].lower()
    middle = f"{title_part}-" if title_part else ""
    return f"{today}-metadao-{project_slug}-{middle}{addr8}.md"
def build_source_markdown(project: dict, proposal: dict, today: str) -> str:
"""Build the source markdown matching the existing schema."""
title = proposal.get("title") or f"{project['name']} proposal {proposal['address'][:8]}"
body_text = (proposal.get("body_text") or "").strip()
decoded = proposal.get("decoded") or {}
# Build YAML frontmatter
fm_lines = [
"---",
"type: source",
f'title: "MetaDAO: {project["name"]}{title}"',
'author: "metadao.fi"',
f'url: "{proposal["url"]}"',
f"date: {today}",
"domain: internet-finance",
"format: data",
"status: unprocessed",
f'tags: [futardio, metadao, futarchy, solana, governance, {project["slug"]}]',
"event_type: proposal",
f'project_slug: "{project["slug"]}"',
f'proposal_address: "{proposal["address"]}"',
]
if proposal.get("status"):
fm_lines.append(f'proposal_status: "{proposal["status"]}"')
if decoded.get("squadsProposal"):
fm_lines.append(f'squads_proposal: "{decoded["squadsProposal"]}"')
if decoded.get("squadsStatus"):
fm_lines.append(f'squads_status: "{decoded["squadsStatus"]}"')
fm_lines.append("---")
fm_lines.append("")
# Header section — quick facts
body_md = [
f"# {title}",
"",
"## Proposal Details",
f"- Project: {project['name']} (`{project['slug']}`)",
f"- Proposal: {title}",
f"- Address: `{proposal['address']}`",
]
if proposal.get("status"):
body_md.append(f"- Status: {proposal['status']}")
body_md.append(f"- URL: {proposal['url']}")
# Proposal prose body (rendered text from the page)
body_md.append("")
body_md.append("## Proposal Body")
body_md.append("")
body_md.append(body_text or "_(no body captured)_")
# Decoded on-chain instructions
if decoded:
body_md.append("")
body_md.append("## On-chain Decoded")
if decoded.get("squadsUrl"):
body_md.append(f"- Squads: {decoded['squadsUrl']}")
instrs = decoded.get("instructions") or []
if instrs:
body_md.append("")
body_md.append("### Instructions")
for i, instr in enumerate(instrs, 1):
body_md.append(f"{i}. **{instr.get('description', instr.get('type', 'instruction'))}** ({instr.get('program', '')})")
for f in instr.get("fields", []) or []:
val = f.get("fullValue") or f.get("value") or ""
body_md.append(f" - {f.get('label', '')}: `{val}`")
if instr.get("summary"):
body_md.append(f" - Summary: {instr['summary']}")
return "\n".join(fm_lines + body_md) + "\n"
def main() -> int:
    """Scrape metadao.fi for proposals not yet archived and write them as source files.

    Loads the set of already-known proposal addresses and file basenames from
    ``--archive-dir`` and ``--output-dir``, walks every project (or just
    ``--project``), and for each unknown proposal fetches it and writes one
    markdown file into ``--output-dir`` (or only logs it under ``--dry-run``).
    A JSON summary with ``written``, ``skipped_existing`` and ``dry_run`` is
    printed to stdout.

    Returns:
        Process exit code (always 0; per-project and per-proposal failures
        are logged and skipped).
    """
    p = argparse.ArgumentParser(description="Scrape MetaDAO proposals into inbox source files")
    p.add_argument("--archive-dir", required=True, help="existing archive dir (skip if basename exists here)")
    p.add_argument("--output-dir", required=True, help="dir to write new source markdown into")
    p.add_argument("--project", help="restrict to a single project slug (default: scan all)")
    p.add_argument("--limit", type=int, default=0, help="max number of new proposals to capture (0 = unlimited)")
    p.add_argument("--dry-run", action="store_true", help="print intended writes instead of writing")
    p.add_argument("--headless", action="store_true", default=True, help="run the browser headless (default)")
    # --headless alone could never be switched off; this is the off switch.
    p.add_argument("--no-headless", dest="headless", action="store_false",
                   help="show the browser window (for debugging)")
    args = p.parse_args()

    archive_dir = Path(args.archive_dir).resolve()
    output_dir = Path(args.output_dir).resolve()
    seen_basenames = existing_basenames(archive_dir, output_dir)
    seen_addresses = existing_proposal_addresses(archive_dir, output_dir)
    # Lowercased once so the per-proposal fragment check does not redo it.
    seen_basenames_lower = {b.lower() for b in seen_basenames}
    log.info("loaded %d existing basenames + %d known proposal addresses from %s + %s",
             len(seen_basenames), len(seen_addresses), archive_dir, output_dir)

    today = date.today().isoformat()
    written: list[str] = []
    skipped_existing = 0
    limit_hit = False

    with sync_playwright() as pw:
        browser = pw.chromium.launch(headless=args.headless)
        try:
            ctx = browser.new_context(user_agent=USER_AGENT)
            page = ctx.new_page()

            # Prime cookies
            log.info("priming Vercel session via homepage")
            page.goto(f"{BASE}/", wait_until="domcontentloaded", timeout=30000)
            page.wait_for_timeout(1500)

            # Discovery
            if args.project:
                project_slugs = [args.project]
            else:
                project_slugs = list_project_slugs(page)
            log.info("discovered %d project slugs: %s", len(project_slugs), project_slugs)

            for slug in project_slugs:
                if limit_hit:
                    break
                try:
                    project = get_project_metadata(page, slug)
                except Exception:
                    log.exception("failed to read project %s", slug)
                    continue
                log.info(" %s: %d proposals", slug, len(project["proposals"]))

                for prop in project["proposals"]:
                    addr = prop["address"]
                    # Pre-check #1: known proposal address (cheapest, no browser visit)
                    if addr in seen_addresses:
                        skipped_existing += 1
                        continue
                    # Pre-check #2: address fragment in an existing basename
                    addr_frag = addr[:8].lower()
                    if any(addr_frag in b for b in seen_basenames_lower):
                        skipped_existing += 1
                        continue

                    try:
                        proposal_data = fetch_proposal(page, slug, addr, card_text=prop.get("card_text", ""))
                    except Exception:
                        log.exception("failed to fetch proposal %s/%s", slug, addr)
                        continue
                    if not proposal_data:
                        continue

                    fname = build_filename(slug, proposal_data, today)
                    stem = Path(fname).stem
                    if stem in seen_basenames:
                        skipped_existing += 1
                        log.info(" skip (already archived by title): %s", fname)
                        continue

                    content = build_source_markdown(project, proposal_data, today)
                    target = output_dir / fname
                    if args.dry_run:
                        log.info(" DRY: would write %s (%d bytes)", target, len(content))
                    else:
                        target.parent.mkdir(parents=True, exist_ok=True)
                        # Explicit encoding: content may hold non-ASCII text and
                        # the process locale is not guaranteed to be UTF-8.
                        target.write_text(content, encoding="utf-8")
                        log.info(" wrote %s (%d bytes)", target, len(content))
                    written.append(fname)

                    # Remember what this run produced so the same proposal is
                    # not captured twice if it shows up under another project.
                    seen_addresses.add(addr)
                    seen_basenames.add(stem)
                    seen_basenames_lower.add(stem.lower())

                    if args.limit > 0 and len(written) >= args.limit:
                        log.info("hit limit=%d, stopping", args.limit)
                        limit_hit = True
                        break
        finally:
            # Close the browser on every exit path, including exceptions.
            browser.close()

    # One summary shape regardless of how the run ended.
    print(json.dumps({"written": written, "skipped_existing": skipped_existing, "dry_run": args.dry_run}))
    return 0


if __name__ == "__main__":
    sys.exit(main())