feat(ingestion): metadao.fi scraper to replace broken futard.io ingestion
Some checks are pending
CI / lint-and-test (pull_request) Waiting to run
Some checks are pending
CI / lint-and-test (pull_request) Waiting to run
Background: - futard.io retired its /api/graphql endpoint between Apr 17–20 - Cloud Scheduler ingest-futard has been firing into 500s ever since (the AttributeError on e.url masked the real 404 for 5 days; fixed in living-ip/teleo-api@b8eb441 which surfaced the actual root cause) - The ecosystem migrated to metadao.fi, which is Vercel-protected - Direct curl is blocked by Vercel's anti-bot challenge regardless of headers; a real headless browser passes it cleanly Approach: - Playwright-driven scraper, runs as a one-shot - Discovery: scrape /projects DOM for project slugs, then each /projects/{slug} for proposal addresses - For each NEW proposal: visit page for prose body + call /api/decode-proposal/{addr} via in-browser fetch (bypasses challenge via the primed Vercel cookies in the browser context) for structured on-chain instructions - Idempotent: dedup against existing proposal addresses in archive frontmatter AND filename basenames - Filename embeds 8-char address fragment for stable cross-run dedup even on projects that don't use DP-NNNNN naming convention Tested locally against 6 active projects (p2p-protocol, paystream, zklsol, loyal, ranger, solomon). Captured 13 new proposals — including the Solomon Gigabus DP-00003 that triggered this work — with proper titles, status, on-chain instruction decoding (Squads transactions, SPL transfers, memos), and project metadata. Output schema matches existing futardio source files (type: source, event_type: proposal, domain: internet-finance, status: unprocessed) so the existing extract pipeline picks them up unchanged. Architectural note: this script is intentionally NOT wired to systemd yet — VPS deploy needs Playwright + Chromium system libs which require apt sudo (currently scoped to teleo-* services only). Reviewing the script first; deploy path is a separate decision. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
3fe524dd14
commit
b8fba8195f
1 changed files with 471 additions and 0 deletions
471
scripts/metadao-scrape.py
Executable file
471
scripts/metadao-scrape.py
Executable file
|
|
@ -0,0 +1,471 @@
|
|||
#!/usr/bin/env python3
|
||||
"""metadao-scrape.py — pull active/recent proposals from metadao.fi into source markdown.
|
||||
|
||||
Replaces the broken futard.io GraphQL ingestion (Cloud Run → teleo-api).
|
||||
metadao.fi is a Vercel-protected Next.js App Router site; direct curl is blocked
|
||||
by the anti-bot challenge. A real headless browser passes the challenge cleanly,
|
||||
and once cookies are issued for the context we can call /api/decode-proposal/{addr}
|
||||
from inside the browser to get structured instruction data.
|
||||
|
||||
Discovery flow:
|
||||
1. visit / to prime Vercel cookies
|
||||
2. visit /projects, scrape distinct /projects/{slug} hrefs
|
||||
3. for each project, visit /projects/{slug}, scrape proposal addresses from DOM
|
||||
4. for each NEW proposal (basename not already in --archive-dir):
|
||||
a. visit proposal page, capture rendered prose
|
||||
b. call /api/decode-proposal/{addr} via in-browser fetch for instructions
|
||||
c. write source markdown to --output-dir
|
||||
|
||||
Idempotent. Skips proposals whose basename is already present in archive-dir
|
||||
or output-dir. Designed to run from a systemd timer or one-shot.
|
||||
|
||||
Usage:
|
||||
python3 metadao-scrape.py --archive-dir /opt/teleo-eval/workspaces/main/inbox/archive \\
|
||||
--output-dir /opt/teleo-eval/workspaces/main/inbox/queue \\
|
||||
[--dry-run] [--limit 10] [--project solomon]
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
from datetime import date, datetime
|
||||
from pathlib import Path
|
||||
|
||||
from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
|
||||
|
||||
# Root logger setup: timestamped INFO-level lines on stderr (basicConfig default).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
)
log = logging.getLogger("metadao-scrape")

# Base URL of the scraped site; all page and API paths are joined onto this.
BASE = "https://www.metadao.fi"
# Desktop Chrome user agent for the Playwright browser context.
# NOTE(review): presumably helps the headless browser blend in with the Vercel
# anti-bot challenge — confirm whether it is actually required.
USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
)
|
||||
|
||||
|
||||
def slugify(text: str, max_len: int = 60) -> str:
    """Reduce *text* to a lowercase, hyphen-separated slug of at most *max_len* chars."""
    lowered = text.lower().strip()
    # Keep only letters, digits, whitespace and hyphens.
    lowered = re.sub(r"[^a-z0-9\s-]", "", lowered)
    # Collapse any run of whitespace and/or hyphens into a single hyphen.
    lowered = re.sub(r"[\s-]+", "-", lowered)
    # Trim edge hyphens, truncate, then trim again in case truncation cut mid-word.
    return lowered.strip("-")[:max_len].rstrip("-")
|
||||
|
||||
|
||||
def existing_basenames(*dirs: Path) -> set[str]:
    """Collect all .md basenames (without extension) across the given dirs (recursive).

    Directories that do not exist are silently skipped.
    """
    return {
        md_file.stem
        for directory in dirs
        if directory.exists()
        for md_file in directory.rglob("*.md")
    }
|
||||
|
||||
|
||||
# Frontmatter line: proposal_address: "<32-44 base58-ish chars>"
PROP_ADDR_RE = re.compile(r"proposal_address:\s*[\"']?([A-Za-z0-9]{32,44})[\"']?")
# Proposal URL on either the old (futard.io) or new (metadao.fi) domain.
# BUG FIX: the path segment before "proposal/" used [^/\s\"']*, which forbids
# slashes and therefore could never match the nested metadao.fi URLs this
# script itself writes (/projects/{slug}/proposal/{addr}). Allow slashes.
URL_ADDR_RE = re.compile(r"(?:futard\.io|metadao\.fi)/[^\s\"']*proposal/([A-Za-z0-9]{32,44})")


def existing_proposal_addresses(*dirs: Path) -> set[str]:
    """Scan frontmatter / URLs in existing source files to collect known proposal addresses.

    Reads only the first 4KB of each file (frontmatter + URL line are at the top)
    to keep this fast on large archives.
    """
    addrs: set[str] = set()
    for d in dirs:
        if not d.exists():
            continue
        for p in d.rglob("*.md"):
            try:
                # BUG FIX: previously read_text() slurped the whole file and then
                # sliced — contradicting the "first 4KB only" docstring. Read at
                # most 4096 chars instead.
                with p.open(errors="replace") as fh:
                    head = fh.read(4096)
            except Exception:
                continue  # unreadable file: skip rather than abort the scan
            for m in PROP_ADDR_RE.finditer(head):
                addrs.add(m.group(1))
            for m in URL_ADDR_RE.finditer(head):
                addrs.add(m.group(1))
    return addrs
|
||||
|
||||
|
||||
def list_project_slugs(page) -> list[str]:
    """Read /projects and extract distinct project slugs.

    ``page`` is a Playwright sync-API Page that already holds the primed
    Vercel session cookies.
    """
    page.goto(f"{BASE}/projects", wait_until="domcontentloaded", timeout=30000)
    # Fixed pause so client-side rendering can finish before we read the DOM.
    page.wait_for_timeout(1500)
    # In-page JS: collect the first path segment of every /projects/{slug} link,
    # deduplicated via a Set.
    hrefs = page.evaluate(
        """() => {
            const links = Array.from(document.querySelectorAll('a[href^="/projects/"]'));
            const slugs = new Set();
            for (const a of links) {
                const m = a.getAttribute('href').match(/^\\/projects\\/([a-z0-9-]+)(?:\\/|$)/);
                if (m && m[1]) slugs.add(m[1]);
            }
            return [...slugs];
        }"""
    )
    return list(hrefs)
|
||||
|
||||
|
||||
def get_project_metadata(page, slug: str) -> dict:
    """Visit a project page and return basic metadata + proposal addresses + card text.

    Card text typically contains 'SOLO-004 ENDED DP-00003 (MEM): The Gigabus Proposal Pass $0.64...'
    so we capture it for downstream title parsing.

    Returns a dict with keys: slug, name, url, proposals (list of dicts with
    address / link_text / card_text).
    """
    url = f"{BASE}/projects/{slug}"
    page.goto(url, wait_until="domcontentloaded", timeout=30000)
    # Fixed pause so client-side rendering can finish before we scrape the DOM.
    page.wait_for_timeout(1500)

    # In-page JS: one entry per distinct proposal address, plus the text of the
    # smallest enclosing "card" element (used later for title parsing).
    proposals = page.evaluate(
        """() => {
            const links = Array.from(document.querySelectorAll('a[href*="/proposal/"]'));
            const seen = new Set();
            const out = [];
            const TARGET_ADDR_RE = /\\/proposal\\/([A-Za-z0-9]+)/;
            for (const a of links) {
                const m = a.getAttribute('href').match(TARGET_ADDR_RE);
                if (!m) continue;
                if (seen.has(m[1])) continue;
                seen.add(m[1]);
                const addr = m[1];
                // Walk up only while the ancestor contains exactly one proposal link
                // (so we get the card, not a parent that contains all cards).
                let card = a;
                while (card.parentElement) {
                    const parent = card.parentElement;
                    const propLinks = parent.querySelectorAll('a[href*="/proposal/"]');
                    if (propLinks.length > 1) break;
                    card = parent;
                }
                out.push({
                    address: addr,
                    link_text: (a.innerText || '').trim().slice(0, 600),
                    card_text: (card.innerText || '').trim().slice(0, 1500),
                });
            }
            return out;
        }"""
    )

    # Try to read project name from h1 / title; fall back to a title-cased slug.
    project_name = page.evaluate(
        """() => {
            const h = document.querySelector('h1');
            return h ? h.innerText.trim() : '';
        }"""
    ) or slug.title()

    return {"slug": slug, "name": project_name, "url": url, "proposals": proposals}
|
||||
|
||||
|
||||
# Strict pattern: DP-NNNNN (CAT): Title — the canonical proposal heading.
DP_STRICT_RE = re.compile(r"DP-\d+\s*\([A-Z]+\)\s*[:\-]\s*[^\n\r]+", re.MULTILINE)
# Loose pattern: any line starting with DP-NNNNN followed by something.
DP_LOOSE_RE = re.compile(r"DP-\d+\s*(?:\([A-Z]+\))?\s*[:\-]?\s*[^\n\r]+", re.MULTILINE)
# Stat-ticker noise that bleeds into card text after the real title
# ("... Pass $0.64 TOTAL ...").
# BUG FIX: the keyword alternatives now require a trailing word boundary, so
# titles merely STARTING with a keyword ("Passport", "Liveness") are no longer
# truncated at that word.
STAT_BLEED_RE = re.compile(
    r"\s*(?:(?:Pass|Fail|Passed|Failed|Active|Pending|Ended|Live|TOTAL|VOLUME|STATUS|MCAP|PRICE|SPOT)\b|\$|\+\d|-\d|\d+\.\d+%|\d{4,})",
    re.IGNORECASE,
)


def _clean_title_candidate(line: str) -> str:
    """Trim stat-ticker bleed off the end of a candidate title line.

    Only truncates at a bleed match past position 10, so the DP code itself
    (whose digits/hyphen also match STAT_BLEED_RE) is never treated as bleed.
    """
    line = line.strip()
    # BUG FIX: the old code used .search() and only inspected the FIRST match.
    # "DP-00003" itself matches \d{4,} at position 3, so search() returned that,
    # the start>10 guard failed, and real bleed later in the line was never
    # trimmed. Scan all matches for the first one past the guard instead.
    for bleed in STAT_BLEED_RE.finditer(line):
        if bleed.start() > 10:  # require some title before the bleed
            line = line[: bleed.start()].rstrip(" :-—")
            break
    return line.strip()[:200]


def extract_dp_title(*texts: str) -> str:
    """Find the canonical 'DP-NNNNN (CAT): Title' line.

    Strategy:
    1. Try strict pattern (with parenthetical category code) across all sources.
       Take the SHORTEST hit — prose continuations of an already-correct title
       tend to be longer than the title itself.
    2. Fall back to loose pattern, longest match.

    Returns "" when no source contains a DP-style line.
    """
    strict: list[str] = []
    loose: list[str] = []
    for t in texts:
        if not t:
            continue
        for m in DP_STRICT_RE.finditer(t):
            cleaned = _clean_title_candidate(m.group(0))
            if cleaned:
                strict.append(cleaned)
        for m in DP_LOOSE_RE.finditer(t):
            cleaned = _clean_title_candidate(m.group(0))
            if cleaned:
                loose.append(cleaned)
    if strict:
        return min(strict, key=len)
    if loose:
        return max(loose, key=len)
    return ""
|
||||
|
||||
|
||||
def fetch_proposal(page, project_slug: str, addr: str, card_text: str = "") -> dict | None:
    """Visit proposal page, capture rendered text + decode instructions via in-browser fetch.

    ``page`` is a Playwright sync-API Page with primed Vercel cookies; ``addr``
    is the on-chain proposal address (alphanumeric, regex-validated upstream).
    Returns a dict with address/project_slug/url/title/status/body_text/decoded;
    ``decoded`` is None when the decode API call fails or returns non-2xx.
    """
    url = f"{BASE}/projects/{project_slug}/proposal/{addr}"
    log.info("fetching proposal %s/%s", project_slug, addr[:8])
    try:
        page.goto(url, wait_until="domcontentloaded", timeout=45000)
    except PWTimeout:
        # Best-effort: keep whatever has rendered rather than abandoning the proposal.
        log.warning("timeout loading %s — using whatever rendered", url)
    page.wait_for_timeout(2500)  # let RSC stream finish

    body_text = page.evaluate("() => document.body.innerText || ''")

    # Title preference: card_text (from project page) → body_text DP-NNNNN match → first h1/h2
    title_block = extract_dp_title(card_text, body_text)
    if not title_block:
        title_block = page.evaluate(
            """() => {
                const h = document.querySelector('h1, h2');
                return h ? h.innerText.trim() : '';
            }"""
        ) or f"proposal-{addr[:8]}"

    # Status: 'Passed' / 'Failed' / 'Active' / 'Pending'
    # Looks for a status word at the start of a rendered line.
    status = page.evaluate(
        """() => {
            const text = document.body.innerText || '';
            const m = text.match(/\\n(Passed|Failed|Active|Pending|Live|Ended)\\b/);
            return m ? m[1] : '';
        }"""
    )

    # Get the structured /api/decode-proposal data. Calling fetch() from inside
    # the page reuses the context's Vercel cookies, bypassing the anti-bot
    # challenge that blocks direct HTTP clients.
    decoded = None
    try:
        decoded = page.evaluate(
            f"""async () => {{
                try {{
                    const r = await fetch('/api/decode-proposal/{addr}');
                    if (!r.ok) return null;
                    return await r.json();
                }} catch (e) {{ return null; }}
            }}"""
        )
    except Exception as e:
        # Decode data is optional enrichment; the proposal is still captured.
        log.debug("decode fetch failed for %s: %s", addr, e)

    return {
        "address": addr,
        "project_slug": project_slug,
        "url": url,
        "title": title_block,
        "status": status,
        "body_text": body_text,
        "decoded": decoded,
    }
|
||||
|
||||
|
||||
def parse_dp_code(title: str) -> tuple[str, str]:
    """Parse 'DP-00003 (MEM): The Gigabus Proposal' → ('dp-00003-mem', 'The Gigabus Proposal').

    Falls back gracefully (empty code, original title) if the format doesn't match.
    """
    # Leading DP-NNNNN, optional (CATEGORY), optional ':' or '-' separator, then the rest.
    match = re.match(r"^(DP-\d+(?:\s*\([A-Z]+\))?)\s*[:\-]?\s*(.*)$", title.strip())
    if match is None:
        return "", title.strip()
    # Normalize the code part to lowercase-hyphenated form (e.g. 'dp-00003-mem').
    code = re.sub(r"[^a-z0-9]+", "-", match.group(1).lower()).strip("-")
    return code, match.group(2).strip()
|
||||
|
||||
|
||||
def build_filename(project_slug: str, proposal: dict, today: str) -> str:
    """YYYY-MM-DD-metadao-{slug}-{title-fragment}-{addr8}.md

    Embedding the address fragment makes filenames stable across runs even when
    the title isn't unique (e.g. projects that don't use DP-NNNNN naming).
    """
    code, rest = parse_dp_code(proposal.get("title") or "")
    pieces = [code]
    if rest:
        pieces.append(slugify(rest, max_len=40))
    # Join the non-empty pieces and cap the title fragment at 60 chars.
    body_slug = "-".join(piece for piece in pieces if piece)[:60].rstrip("-")
    addr_frag = proposal["address"][:8].lower()
    if not body_slug:
        return f"{today}-metadao-{project_slug}-{addr_frag}.md"
    return f"{today}-metadao-{project_slug}-{body_slug}-{addr_frag}.md"
|
||||
|
||||
|
||||
def _yaml_dq(value: str) -> str:
    """Escape a string for safe embedding in a YAML double-quoted scalar."""
    return value.replace("\\", "\\\\").replace('"', '\\"')


def build_source_markdown(project: dict, proposal: dict, today: str) -> str:
    """Build the source markdown matching the existing schema.

    ``project`` comes from get_project_metadata(); ``proposal`` from
    fetch_proposal(). Returns the full file content (YAML frontmatter + body).
    """
    title = proposal.get("title") or f"{project['name']} proposal {proposal['address'][:8]}"
    body_text = (proposal.get("body_text") or "").strip()
    decoded = proposal.get("decoded") or {}

    # Build YAML frontmatter.
    # BUG FIX: scraped titles / decoded values may contain double quotes or
    # backslashes, which would break the double-quoted YAML scalars — escape them.
    full_title = _yaml_dq(f"MetaDAO: {project['name']} — {title}")
    fm_lines = [
        "---",
        "type: source",
        f'title: "{full_title}"',
        'author: "metadao.fi"',
        f'url: "{proposal["url"]}"',
        f"date: {today}",
        "domain: internet-finance",
        "format: data",
        "status: unprocessed",
        f'tags: [futardio, metadao, futarchy, solana, governance, {project["slug"]}]',
        "event_type: proposal",
        f'project_slug: "{project["slug"]}"',
        f'proposal_address: "{proposal["address"]}"',
    ]
    if proposal.get("status"):
        fm_lines.append(f'proposal_status: "{_yaml_dq(proposal["status"])}"')
    if decoded.get("squadsProposal"):
        fm_lines.append(f'squads_proposal: "{_yaml_dq(str(decoded["squadsProposal"]))}"')
    if decoded.get("squadsStatus"):
        fm_lines.append(f'squads_status: "{_yaml_dq(str(decoded["squadsStatus"]))}"')
    fm_lines.append("---")
    fm_lines.append("")

    # Header section — quick facts
    body_md = [
        f"# {title}",
        "",
        "## Proposal Details",
        f"- Project: {project['name']} (`{project['slug']}`)",
        f"- Proposal: {title}",
        f"- Address: `{proposal['address']}`",
    ]
    if proposal.get("status"):
        body_md.append(f"- Status: {proposal['status']}")
    body_md.append(f"- URL: {proposal['url']}")

    # Proposal prose body (rendered text from the page)
    body_md.append("")
    body_md.append("## Proposal Body")
    body_md.append("")
    body_md.append(body_text or "_(no body captured)_")

    # Decoded on-chain instructions
    if decoded:
        body_md.append("")
        body_md.append("## On-chain Decoded")
        if decoded.get("squadsUrl"):
            body_md.append(f"- Squads: {decoded['squadsUrl']}")
        instrs = decoded.get("instructions") or []
        if instrs:
            body_md.append("")
            body_md.append("### Instructions")
            for i, instr in enumerate(instrs, 1):
                body_md.append(f"{i}. **{instr.get('description', instr.get('type', 'instruction'))}** ({instr.get('program', '')})")
                for f in instr.get("fields", []) or []:
                    val = f.get("fullValue") or f.get("value") or ""
                    body_md.append(f"  - {f.get('label', '')}: `{val}`")
                if instr.get("summary"):
                    body_md.append(f"  - Summary: {instr['summary']}")

    return "\n".join(fm_lines + body_md) + "\n"
|
||||
|
||||
|
||||
def main() -> int:
    """CLI entry point: discover projects, fetch new proposals, write source files.

    Prints a JSON summary ({written, skipped_existing, dry_run}) to stdout and
    returns 0 on success.
    """
    p = argparse.ArgumentParser(description="Scrape MetaDAO proposals into inbox source files")
    p.add_argument("--archive-dir", required=True, help="existing archive dir (skip if basename exists here)")
    p.add_argument("--output-dir", required=True, help="dir to write new source markdown into")
    p.add_argument("--project", help="restrict to a single project slug (default: scan all)")
    p.add_argument("--limit", type=int, default=0, help="max number of new proposals to capture (0 = unlimited)")
    p.add_argument("--dry-run", action="store_true", help="print intended writes instead of writing")
    # BUG FIX: --headless was store_true with default=True, so headless mode
    # could never be turned OFF. Keep the flag for compatibility and add
    # --headed as the off switch (both write args.headless).
    p.add_argument("--headless", action="store_true", default=True, help="run the browser headless (default)")
    p.add_argument("--headed", dest="headless", action="store_false", help="run the browser with a visible window")
    args = p.parse_args()

    archive_dir = Path(args.archive_dir).resolve()
    output_dir = Path(args.output_dir).resolve()
    # Two dedup indexes: filename stems and proposal addresses already captured.
    seen_basenames = existing_basenames(archive_dir, output_dir)
    seen_addresses = existing_proposal_addresses(archive_dir, output_dir)
    log.info("loaded %d existing basenames + %d known proposal addresses from %s + %s",
             len(seen_basenames), len(seen_addresses), archive_dir, output_dir)

    today = date.today().isoformat()

    written: list[str] = []
    skipped_existing = 0
    hit_limit = False

    with sync_playwright() as pw:
        browser = pw.chromium.launch(headless=args.headless)
        ctx = browser.new_context(user_agent=USER_AGENT)
        page = ctx.new_page()

        # Prime cookies: the homepage visit passes the Vercel anti-bot challenge
        # and stores the resulting cookies in this browser context.
        log.info("priming Vercel session via homepage")
        page.goto(f"{BASE}/", wait_until="domcontentloaded", timeout=30000)
        page.wait_for_timeout(1500)

        # Discovery
        if args.project:
            project_slugs = [args.project]
        else:
            project_slugs = list_project_slugs(page)
        log.info("discovered %d project slugs: %s", len(project_slugs), project_slugs)

        for slug in project_slugs:
            try:
                project = get_project_metadata(page, slug)
            except Exception:
                log.exception("failed to read project %s", slug)
                continue
            log.info("  %s — %d proposals", slug, len(project["proposals"]))

            for prop in project["proposals"]:
                addr = prop["address"]
                # Pre-check #1: known proposal address (cheapest, no browser visit)
                if addr in seen_addresses:
                    skipped_existing += 1
                    continue
                # Pre-check #2: address fragment in an existing basename
                addr_frag = addr[:8].lower()
                if any(addr_frag in b.lower() for b in seen_basenames):
                    skipped_existing += 1
                    continue

                try:
                    proposal_data = fetch_proposal(page, slug, addr, card_text=prop.get("card_text", ""))
                except Exception:
                    log.exception("failed to fetch proposal %s/%s", slug, addr)
                    continue
                if not proposal_data:
                    continue

                fname = build_filename(slug, proposal_data, today)

                if Path(fname).stem in seen_basenames:
                    skipped_existing += 1
                    log.info("  skip (already archived by title): %s", fname)
                    continue

                content = build_source_markdown(project, proposal_data, today)
                target = output_dir / fname
                if args.dry_run:
                    log.info("  DRY: would write %s (%d bytes)", target, len(content))
                else:
                    target.parent.mkdir(parents=True, exist_ok=True)
                    target.write_text(content)
                    log.info("  wrote %s (%d bytes)", target, len(content))
                written.append(fname)

                if args.limit and len(written) >= args.limit:
                    log.info("hit limit=%d, stopping", args.limit)
                    # BUG FIX: the old limit path duplicated browser.close() +
                    # print() and emitted a JSON summary MISSING the dry_run key.
                    # Break out instead so the single exit path below always
                    # emits a consistent summary.
                    hit_limit = True
                    break
            if hit_limit:
                break

        browser.close()

    print(json.dumps({"written": written, "skipped_existing": skipped_existing, "dry_run": args.dry_run}))
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Propagate main()'s return code as the process exit status.
    sys.exit(main())
|
||||
Loading…
Reference in a new issue