From b8fba8195f6f0f6203b5c9d5a150ea94eff47b91 Mon Sep 17 00:00:00 2001
From: m3taversal <m3taversal@gmail.com>
Date: Sat, 25 Apr 2026 13:09:31 +0100
Subject: [PATCH 1/4] feat(ingestion): metadao.fi scraper to replace broken
 futard.io ingestion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Background:
- futard.io retired its /api/graphql endpoint between Apr 17–20
- Cloud Scheduler ingest-futard has been firing into 500s ever since
  (the AttributeError on e.url masked the real 404 for 5 days; fixed
   in living-ip/teleo-api@b8eb441 which surfaced the actual root cause)
- The ecosystem migrated to metadao.fi, which is Vercel-protected
- Direct curl is blocked by Vercel's anti-bot challenge regardless of
  headers; a real headless browser passes it cleanly

Approach:
- Playwright-driven scraper, runs as a one-shot
- Discovery: scrape /projects DOM for project slugs, then each
  /projects/{slug} for proposal addresses
- For each NEW proposal: visit page for prose body + call
  /api/decode-proposal/{addr} via in-browser fetch (bypasses challenge
  via the primed Vercel cookies in the browser context) for structured
  on-chain instructions
- Idempotent: dedup against existing proposal addresses in archive
  frontmatter AND filename basenames
- Filename embeds an 8-char address fragment for stable cross-run dedup
  even on projects that don't use the DP-NNNNN naming convention

Tested locally against 6 active projects (p2p-protocol, paystream,
zklsol, loyal, ranger, solomon). Captured 13 new proposals — including
the Solomon Gigabus DP-00003 that triggered this work — with proper
titles, status, on-chain instruction decoding (Squads transactions,
SPL transfers, memos), and project metadata.

Output schema matches existing futardio source files (type: source,
event_type: proposal, domain: internet-finance, status: unprocessed)
so the existing extract pipeline picks them up unchanged.

Architectural note: this script is intentionally NOT wired to systemd
yet — VPS deploy needs Playwright + Chromium system libs which require
apt sudo (currently scoped to teleo-* services only). Reviewing the
script first; deploy path is a separate decision.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 scripts/metadao-scrape.py | 471 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 471 insertions(+)
 create mode 100755 scripts/metadao-scrape.py

diff --git a/scripts/metadao-scrape.py b/scripts/metadao-scrape.py
new file mode 100755
index 0000000..3d2c648
--- /dev/null
+++ b/scripts/metadao-scrape.py
@@ -0,0 +1,471 @@
+#!/usr/bin/env python3
+"""metadao-scrape.py — pull active/recent proposals from metadao.fi into source markdown.
+
+Replaces the broken futard.io GraphQL ingestion (Cloud Run → teleo-api).
+metadao.fi is a Vercel-protected Next.js App Router site; direct curl is blocked
+by the anti-bot challenge. A real headless browser passes the challenge cleanly,
+and once cookies are issued for the context we can call /api/decode-proposal/{addr}
+from inside the browser to get structured instruction data.
+
+Discovery flow:
+  1. visit / to prime Vercel cookies
+  2. visit /projects, scrape distinct /projects/{slug} hrefs
+  3. for each project, visit /projects/{slug}, scrape proposal addresses from DOM
+  4. for each NEW proposal (address not already known from --archive-dir / --output-dir):
+     a. visit proposal page, capture rendered prose
+     b. call /api/decode-proposal/{addr} via in-browser fetch for instructions
+     c. write source markdown to --output-dir
+
+Idempotent. Skips proposals whose address (or a filename embedding its 8-char
+prefix) is already in archive-dir or output-dir. Designed to run from a systemd timer or one-shot.
+
+Usage:
+  python3 metadao-scrape.py --archive-dir /opt/teleo-eval/workspaces/main/inbox/archive \\
+                            --output-dir /opt/teleo-eval/workspaces/main/inbox/queue \\
+                            [--dry-run] [--limit 10] [--project solomon]
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import re
+import sys
+from datetime import date, datetime
+from pathlib import Path
+
+from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s %(message)s",
+)
+log = logging.getLogger("metadao-scrape")
+
+BASE = "https://www.metadao.fi"
+USER_AGENT = (
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
+)
+
+
+def slugify(text: str, max_len: int = 60) -> str:
+    s = text.lower().strip()
+    s = re.sub(r"[^a-z0-9\s-]", "", s)
+    s = re.sub(r"\s+", "-", s)
+    s = re.sub(r"-+", "-", s)
+    return s.strip("-")[:max_len].rstrip("-")
+
+
+def existing_basenames(*dirs: Path) -> set[str]:
+    """Collect all .md basenames (without extension) across the given dirs (recursive)."""
+    seen: set[str] = set()
+    for d in dirs:
+        if not d.exists():
+            continue
+        for p in d.rglob("*.md"):
+            seen.add(p.stem)
+    return seen
+
+
+PROP_ADDR_RE = re.compile(r"proposal_address:\s*[\"']?([A-Za-z0-9]{32,44})[\"']?")
+URL_ADDR_RE = re.compile(r"(?:futard\.io|metadao\.fi)/[^/\s\"']*proposal/([A-Za-z0-9]{32,44})")
+
+
+def existing_proposal_addresses(*dirs: Path) -> set[str]:
+    """Scan frontmatter / URLs in existing source files to collect known proposal addresses.
+
+    Reads only the first 4KB of each file (frontmatter + URL line are at the top)
+    to keep this fast on large archives.
+    """
+    addrs: set[str] = set()
+    for d in dirs:
+        if not d.exists():
+            continue
+        for p in d.rglob("*.md"):
+            try:
+                head = p.read_text(errors="replace")[:4096]
+            except Exception:
+                continue
+            for m in PROP_ADDR_RE.finditer(head):
+                addrs.add(m.group(1))
+            for m in URL_ADDR_RE.finditer(head):
+                addrs.add(m.group(1))
+    return addrs
+
+
+def list_project_slugs(page) -> list[str]:
+    """Read /projects and extract distinct project slugs."""
+    page.goto(f"{BASE}/projects", wait_until="domcontentloaded", timeout=30000)
+    page.wait_for_timeout(1500)
+    hrefs = page.evaluate(
+        """() => {
+            const links = Array.from(document.querySelectorAll('a[href^="/projects/"]'));
+            const slugs = new Set();
+            for (const a of links) {
+                const m = a.getAttribute('href').match(/^\\/projects\\/([a-z0-9-]+)(?:\\/|$)/);
+                if (m && m[1]) slugs.add(m[1]);
+            }
+            return [...slugs];
+        }"""
+    )
+    return list(hrefs)
+
+
+def get_project_metadata(page, slug: str) -> dict:
+    """Visit a project page and return basic metadata + proposal addresses + card text.
+    Card text typically contains 'SOLO-004 ENDED DP-00003 (MEM): The Gigabus Proposal Pass $0.64...'
+    so we capture it for downstream title parsing.
+    """
+    url = f"{BASE}/projects/{slug}"
+    page.goto(url, wait_until="domcontentloaded", timeout=30000)
+    page.wait_for_timeout(1500)
+
+    proposals = page.evaluate(
+        """() => {
+            const links = Array.from(document.querySelectorAll('a[href*="/proposal/"]'));
+            const seen = new Set();
+            const out = [];
+            const TARGET_ADDR_RE = /\\/proposal\\/([A-Za-z0-9]+)/;
+            for (const a of links) {
+                const m = a.getAttribute('href').match(TARGET_ADDR_RE);
+                if (!m) continue;
+                if (seen.has(m[1])) continue;
+                seen.add(m[1]);
+                const addr = m[1];
+                // Walk up only while the ancestor contains exactly one proposal link
+                // (so we get the card, not a parent that contains all cards).
+                let card = a;
+                while (card.parentElement) {
+                    const parent = card.parentElement;
+                    const propLinks = parent.querySelectorAll('a[href*="/proposal/"]');
+                    if (propLinks.length > 1) break;
+                    card = parent;
+                }
+                out.push({
+                    address: addr,
+                    link_text: (a.innerText || '').trim().slice(0, 600),
+                    card_text: (card.innerText || '').trim().slice(0, 1500),
+                });
+            }
+            return out;
+        }"""
+    )
+
+    # Try to read project name from h1 / title
+    project_name = page.evaluate(
+        """() => {
+            const h = document.querySelector('h1');
+            return h ? h.innerText.trim() : '';
+        }"""
+    ) or slug.title()
+
+    return {"slug": slug, "name": project_name, "url": url, "proposals": proposals}
+
+
+# Strict pattern: DP-NNNNN (CAT): Title — the canonical proposal heading.
+DP_STRICT_RE = re.compile(r"DP-\d+\s*\([A-Z]+\)\s*[:\-]\s*[^\n\r]+", re.MULTILINE)
+# Loose pattern: any line starting with DP-NNNNN followed by something.
+DP_LOOSE_RE = re.compile(r"DP-\d+\s*(?:\([A-Z]+\))?\s*[:\-]?\s*[^\n\r]+", re.MULTILINE)
+STAT_BLEED_RE = re.compile(
+    r"\s*(?:Pass|Fail|Passed|Failed|Active|Pending|Ended|Live|TOTAL|VOLUME|STATUS|MCAP|PRICE|SPOT|\$|\+\d|-\d|\d+\.\d+%|\d{4,})",
+    re.IGNORECASE,
+)
+
+
+def _clean_title_candidate(line: str) -> str:
+    line = line.strip()
+    bleed = STAT_BLEED_RE.search(line)
+    if bleed and bleed.start() > 10:  # require some title before the bleed
+        line = line[: bleed.start()].rstrip(" :-—")
+    return line.strip()[:200]
+
+
+def extract_dp_title(*texts: str) -> str:
+    """Find the canonical 'DP-NNNNN (CAT): Title' line.
+
+    Strategy:
+      1. Try strict pattern (with parenthetical category code) across all sources.
+         Take the SHORTEST hit — prose continuations of an already-correct title
+         tend to be longer than the title itself.
+      2. Fall back to loose pattern, longest match.
+    """
+    strict: list[str] = []
+    loose: list[str] = []
+    for t in texts:
+        if not t:
+            continue
+        for m in DP_STRICT_RE.finditer(t):
+            cleaned = _clean_title_candidate(m.group(0))
+            if cleaned:
+                strict.append(cleaned)
+        for m in DP_LOOSE_RE.finditer(t):
+            cleaned = _clean_title_candidate(m.group(0))
+            if cleaned:
+                loose.append(cleaned)
+    if strict:
+        return min(strict, key=len)
+    if loose:
+        return max(loose, key=len)
+    return ""
+
+
+def fetch_proposal(page, project_slug: str, addr: str, card_text: str = "") -> dict | None:
+    """Visit proposal page, capture rendered text + decode instructions via in-browser fetch."""
+    url = f"{BASE}/projects/{project_slug}/proposal/{addr}"
+    log.info("fetching proposal %s/%s", project_slug, addr[:8])
+    try:
+        page.goto(url, wait_until="domcontentloaded", timeout=45000)
+    except PWTimeout:
+        log.warning("timeout loading %s — using whatever rendered", url)
+    page.wait_for_timeout(2500)  # let RSC stream finish
+
+    body_text = page.evaluate("() => document.body.innerText || ''")
+
+    # Title preference: card_text (from project page) → body_text DP-NNNNN match → first h1/h2
+    title_block = extract_dp_title(card_text, body_text)
+    if not title_block:
+        title_block = page.evaluate(
+            """() => {
+                const h = document.querySelector('h1, h2');
+                return h ? h.innerText.trim() : '';
+            }"""
+        ) or f"proposal-{addr[:8]}"
+
+    # Status: 'Passed' / 'Failed' / 'Active' / 'Pending'
+    status = page.evaluate(
+        """() => {
+            const text = document.body.innerText || '';
+            const m = text.match(/\\n(Passed|Failed|Active|Pending|Live|Ended)\\b/);
+            return m ? m[1] : '';
+        }"""
+    )
+
+    # Get the structured /api/decode-proposal data
+    decoded = None
+    try:
+        decoded = page.evaluate(
+            f"""async () => {{
+                try {{
+                    const r = await fetch('/api/decode-proposal/{addr}');
+                    if (!r.ok) return null;
+                    return await r.json();
+                }} catch (e) {{ return null; }}
+            }}"""
+        )
+    except Exception as e:
+        log.debug("decode fetch failed for %s: %s", addr, e)
+
+    return {
+        "address": addr,
+        "project_slug": project_slug,
+        "url": url,
+        "title": title_block,
+        "status": status,
+        "body_text": body_text,
+        "decoded": decoded,
+    }
+
+
+def parse_dp_code(title: str) -> tuple[str, str]:
+    """Parse 'DP-00003 (MEM): The Gigabus Proposal' → ('dp-00003-mem', 'The Gigabus Proposal').
+    Falls back gracefully if format doesn't match.
+    """
+    # Match leading DP-NNNNN[space(category)]?[:]?[space]? plus the rest
+    m = re.match(r"^(DP-\d+(?:\s*\([A-Z]+\))?)\s*[:\-]?\s*(.*)$", title.strip())
+    if m:
+        code = re.sub(r"[^a-z0-9]+", "-", m.group(1).lower()).strip("-")
+        rest = m.group(2).strip()
+        return code, rest
+    return "", title.strip()
+
+
+def build_filename(project_slug: str, proposal: dict, today: str) -> str:
+    """YYYY-MM-DD-metadao-{slug}-{title-fragment}-{addr8}.md
+
+    Embedding the address fragment makes filenames stable across runs even when
+    the title isn't unique (e.g. projects that don't use DP-NNNNN naming).
+    """
+    title = proposal.get("title") or ""
+    code, rest = parse_dp_code(title)
+    parts: list[str] = []
+    if code:
+        parts.append(code)
+    if rest:
+        parts.append(slugify(rest, max_len=40))
+    body_slug = "-".join(p for p in parts if p)[:60].rstrip("-")
+    addr_frag = proposal["address"][:8].lower()
+    if body_slug:
+        return f"{today}-metadao-{project_slug}-{body_slug}-{addr_frag}.md"
+    return f"{today}-metadao-{project_slug}-{addr_frag}.md"
+
+
+def build_source_markdown(project: dict, proposal: dict, today: str) -> str:
+    """Build the source markdown matching the existing schema."""
+    title = proposal.get("title") or f"{project['name']} proposal {proposal['address'][:8]}"
+    body_text = (proposal.get("body_text") or "").strip()
+    decoded = proposal.get("decoded") or {}
+
+    # Build YAML frontmatter
+    fm_lines = [
+        "---",
+        "type: source",
+        f'title: "MetaDAO: {project["name"]} — {title}"',
+        'author: "metadao.fi"',
+        f'url: "{proposal["url"]}"',
+        f"date: {today}",
+        "domain: internet-finance",
+        "format: data",
+        "status: unprocessed",
+        f'tags: [futardio, metadao, futarchy, solana, governance, {project["slug"]}]',
+        "event_type: proposal",
+        f'project_slug: "{project["slug"]}"',
+        f'proposal_address: "{proposal["address"]}"',
+    ]
+    if proposal.get("status"):
+        fm_lines.append(f'proposal_status: "{proposal["status"]}"')
+    if decoded.get("squadsProposal"):
+        fm_lines.append(f'squads_proposal: "{decoded["squadsProposal"]}"')
+    if decoded.get("squadsStatus"):
+        fm_lines.append(f'squads_status: "{decoded["squadsStatus"]}"')
+    fm_lines.append("---")
+    fm_lines.append("")
+
+    # Header section — quick facts
+    body_md = [
+        f"# {title}",
+        "",
+        "## Proposal Details",
+        f"- Project: {project['name']} (`{project['slug']}`)",
+        f"- Proposal: {title}",
+        f"- Address: `{proposal['address']}`",
+    ]
+    if proposal.get("status"):
+        body_md.append(f"- Status: {proposal['status']}")
+    body_md.append(f"- URL: {proposal['url']}")
+
+    # Proposal prose body (rendered text from the page)
+    body_md.append("")
+    body_md.append("## Proposal Body")
+    body_md.append("")
+    body_md.append(body_text or "_(no body captured)_")
+
+    # Decoded on-chain instructions
+    if decoded:
+        body_md.append("")
+        body_md.append("## On-chain Decoded")
+        if decoded.get("squadsUrl"):
+            body_md.append(f"- Squads: {decoded['squadsUrl']}")
+        instrs = decoded.get("instructions") or []
+        if instrs:
+            body_md.append("")
+            body_md.append("### Instructions")
+            for i, instr in enumerate(instrs, 1):
+                body_md.append(f"{i}. **{instr.get('description', instr.get('type', 'instruction'))}** ({instr.get('program', '')})")
+                for f in instr.get("fields", []) or []:
+                    val = f.get("fullValue") or f.get("value") or ""
+                    body_md.append(f"   - {f.get('label', '')}: `{val}`")
+                if instr.get("summary"):
+                    body_md.append(f"   - Summary: {instr['summary']}")
+
+    return "\n".join(fm_lines + body_md) + "\n"
+
+
+def main() -> int:
+    p = argparse.ArgumentParser(description="Scrape MetaDAO proposals into inbox source files")
+    p.add_argument("--archive-dir", required=True, help="existing archive dir (skip if basename exists here)")
+    p.add_argument("--output-dir", required=True, help="dir to write new source markdown into")
+    p.add_argument("--project", help="restrict to a single project slug (default: scan all)")
+    p.add_argument("--limit", type=int, default=0, help="max number of new proposals to capture (0 = unlimited)")
+    p.add_argument("--dry-run", action="store_true", help="print intended writes instead of writing")
+    p.add_argument("--headless", action="store_true", default=True)
+    args = p.parse_args()
+
+    archive_dir = Path(args.archive_dir).resolve()
+    output_dir = Path(args.output_dir).resolve()
+    seen_basenames = existing_basenames(archive_dir, output_dir)
+    seen_addresses = existing_proposal_addresses(archive_dir, output_dir)
+    log.info("loaded %d existing basenames + %d known proposal addresses from %s + %s",
+             len(seen_basenames), len(seen_addresses), archive_dir, output_dir)
+
+    today = date.today().isoformat()
+
+    written: list[str] = []
+    skipped_existing = 0
+
+    with sync_playwright() as pw:
+        browser = pw.chromium.launch(headless=args.headless)
+        ctx = browser.new_context(user_agent=USER_AGENT)
+        page = ctx.new_page()
+
+        # Prime cookies
+        log.info("priming Vercel session via homepage")
+        page.goto(f"{BASE}/", wait_until="domcontentloaded", timeout=30000)
+        page.wait_for_timeout(1500)
+
+        # Discovery
+        if args.project:
+            project_slugs = [args.project]
+        else:
+            project_slugs = list_project_slugs(page)
+        log.info("discovered %d project slugs: %s", len(project_slugs), project_slugs)
+
+        for slug in project_slugs:
+            try:
+                project = get_project_metadata(page, slug)
+            except Exception:
+                log.exception("failed to read project %s", slug)
+                continue
+            log.info(" %s — %d proposals", slug, len(project["proposals"]))
+
+            for prop in project["proposals"]:
+                addr = prop["address"]
+                # Pre-check #1: known proposal address (cheapest, no browser visit)
+                if addr in seen_addresses:
+                    skipped_existing += 1
+                    continue
+                # Pre-check #2: address fragment in an existing basename
+                addr_frag = addr[:8].lower()
+                if any(addr_frag in b.lower() for b in seen_basenames):
+                    skipped_existing += 1
+                    continue
+
+                try:
+                    proposal_data = fetch_proposal(page, slug, addr, card_text=prop.get("card_text", ""))
+                except Exception:
+                    log.exception("failed to fetch proposal %s/%s", slug, addr)
+                    continue
+                if not proposal_data:
+                    continue
+
+                fname = build_filename(slug, proposal_data, today)
+
+                if Path(fname).stem in seen_basenames:
+                    skipped_existing += 1
+                    log.info("  skip (already archived by title): %s", fname)
+                    continue
+
+                content = build_source_markdown(project, proposal_data, today)
+                target = output_dir / fname
+                if args.dry_run:
+                    log.info("  DRY: would write %s (%d bytes)", target, len(content))
+                else:
+                    target.parent.mkdir(parents=True, exist_ok=True)
+                    target.write_text(content)
+                    log.info("  wrote %s (%d bytes)", target, len(content))
+                written.append(fname)
+
+                if args.limit and len(written) >= args.limit:
+                    log.info("hit limit=%d, stopping", args.limit)
+                    browser.close()
+                    print(json.dumps({"written": written, "skipped_existing": skipped_existing}))
+                    return 0
+
+        browser.close()
+
+    print(json.dumps({"written": written, "skipped_existing": skipped_existing, "dry_run": args.dry_run}))
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
-- 
2.45.2


From 800d1d8b8e9f0667ba890e0f31ccb19ed6b5ebf0 Mon Sep 17 00:00:00 2001
From: m3taversal <m3taversal@gmail.com>
Date: Sat, 25 Apr 2026 13:19:06 +0100
Subject: [PATCH 2/4] fix(metadao-scrape): YAML escape + URL regex + dry_run
 consistency
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Ganymede review on PR #6:
- WARNING: title and project["name"] flowed unescaped into YAML, would
  corrupt frontmatter on quote-bearing inputs (e.g. 'Adopt "Conservative"
  Pricing'). New _yaml_str helper routes free-text values through
  json.dumps (JSON strings are valid YAML strings). Applied to title,
  author, url, project_slug, proposal_address, proposal_status,
  squads_proposal, squads_status.
- NIT: URL_ADDR_RE didn't match new metadao.fi URLs — pattern segment
  couldn't span /projects/{slug}/proposal/. Added (?:/[^/...]*)*? for
  variable path depth. Verified against three URL shapes.
- NIT: dry_run key was omitted from JSON output on early --limit exit
  but present on normal exit. Trivial consistency fix.
- NIT (deferred): STAT_BLEED_RE protection is accidental rather than
  designed; only matters if MetaDAO breaks DP-NNNNN naming convention.
  Per Ganymede 'optional — current behavior fine.'

Verified: URL regex matches futard.io legacy + metadao.fi new + hypothetical
no-slug shapes. YAML escape survives embedded quotes, newlines, backslashes,
em-dashes.
---
 scripts/metadao-scrape.py | 32 ++++++++++++++++++++------------
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/scripts/metadao-scrape.py b/scripts/metadao-scrape.py
index 3d2c648..7373686 100755
--- a/scripts/metadao-scrape.py
+++ b/scripts/metadao-scrape.py
@@ -57,6 +57,11 @@ def slugify(text: str, max_len: int = 60) -> str:
     return s.strip("-")[:max_len].rstrip("-")
 
 
+def _yaml_str(s: str) -> str:
+    """Quote-safe YAML string. JSON strings are valid YAML strings."""
+    return json.dumps(s, ensure_ascii=False)
+
+
 def existing_basenames(*dirs: Path) -> set[str]:
     """Collect all .md basenames (without extension) across the given dirs (recursive)."""
     seen: set[str] = set()
@@ -69,7 +74,7 @@ def existing_basenames(*dirs: Path) -> set[str]:
 
 
 PROP_ADDR_RE = re.compile(r"proposal_address:\s*[\"']?([A-Za-z0-9]{32,44})[\"']?")
-URL_ADDR_RE = re.compile(r"(?:futard\.io|metadao\.fi)/[^/\s\"']*proposal/([A-Za-z0-9]{32,44})")
+URL_ADDR_RE = re.compile(r"(?:futard\.io|metadao\.fi)(?:/[^/\s\"']*)*?/proposal/([A-Za-z0-9]{32,44})")
 
 
 def existing_proposal_addresses(*dirs: Path) -> set[str]:
@@ -306,28 +311,31 @@ def build_source_markdown(project: dict, proposal: dict, today: str) -> str:
     body_text = (proposal.get("body_text") or "").strip()
     decoded = proposal.get("decoded") or {}
 
-    # Build YAML frontmatter
+    # Build YAML frontmatter — all free-text values escaped via _yaml_str (json.dumps).
+    # project_slug is normally [a-z0-9-] (discovery regex in list_project_slugs), but
+    # --project is taken verbatim, so route it through the same escaping path.
+    full_title = f"MetaDAO: {project['name']} — {title}"
     fm_lines = [
         "---",
         "type: source",
-        f'title: "MetaDAO: {project["name"]} — {title}"',
-        'author: "metadao.fi"',
-        f'url: "{proposal["url"]}"',
+        f"title: {_yaml_str(full_title)}",
+        f"author: {_yaml_str('metadao.fi')}",
+        f"url: {_yaml_str(proposal['url'])}",
         f"date: {today}",
         "domain: internet-finance",
         "format: data",
         "status: unprocessed",
-        f'tags: [futardio, metadao, futarchy, solana, governance, {project["slug"]}]',
+        f"tags: [futardio, metadao, futarchy, solana, governance, {project['slug']}]",
         "event_type: proposal",
-        f'project_slug: "{project["slug"]}"',
-        f'proposal_address: "{proposal["address"]}"',
+        f"project_slug: {_yaml_str(project['slug'])}",
+        f"proposal_address: {_yaml_str(proposal['address'])}",
     ]
     if proposal.get("status"):
-        fm_lines.append(f'proposal_status: "{proposal["status"]}"')
+        fm_lines.append(f"proposal_status: {_yaml_str(proposal['status'])}")
     if decoded.get("squadsProposal"):
-        fm_lines.append(f'squads_proposal: "{decoded["squadsProposal"]}"')
+        fm_lines.append(f"squads_proposal: {_yaml_str(decoded['squadsProposal'])}")
     if decoded.get("squadsStatus"):
-        fm_lines.append(f'squads_status: "{decoded["squadsStatus"]}"')
+        fm_lines.append(f"squads_status: {_yaml_str(decoded['squadsStatus'])}")
     fm_lines.append("---")
     fm_lines.append("")
 
@@ -458,7 +466,7 @@ def main() -> int:
                 if args.limit and len(written) >= args.limit:
                     log.info("hit limit=%d, stopping", args.limit)
                     browser.close()
-                    print(json.dumps({"written": written, "skipped_existing": skipped_existing}))
+                    print(json.dumps({"written": written, "skipped_existing": skipped_existing, "dry_run": args.dry_run}))
                     return 0
 
         browser.close()
-- 
2.45.2


From dde055fdbf715bacafb0e92362b0a733d1e6c900 Mon Sep 17 00:00:00 2001
From: m3taversal <m3taversal@gmail.com>
Date: Sat, 25 Apr 2026 13:24:15 +0100
Subject: [PATCH 3/4] fix(metadao-scrape): STAT_BLEED word boundaries +
 min-render gate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Ganymede review on PR #6 (commit 800d1d8 → this commit):

- WARNING: STAT_BLEED_RE false-positives on common words. The original
  pattern matched standalone stat-keyword tokens, clipping legitimate
  titles like "Engage with Pantera and Active Capital" → trimmed at
  " Active". Fix: require numeric/symbolic context (\$, +, -, \d) AFTER
  the stat-word, so word-only sequences pass through unchanged.

- _clean_title_candidate now uses finditer + first-match-past-offset-10
  instead of re.search. The DP-NNNNN digit sequence always wins leftmost
  position; we want the first POST-title bleed match instead.

- NIT 3: minimum-render gate before write. Skip partial renders rather
  than archiving stubs for which downstream extraction returns nothing.
  Threshold: body < 500 chars AND no DP-N in title → skip and retry next run.

Verified 10/10 on test grid: real bleed trimmed; titles that merely contain
a stat keyword, as a whole word or inside a longer word, are preserved
(Compass, Active Capital, Live Streaming, Encompass, Activate, Passage,
Failure all pass through unchanged).

NIT 1 (--headless no-op flag) and NIT 2 (futardio tag provenance noise):
deferred — cosmetic, batch with future touch.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 scripts/metadao-scrape.py | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/scripts/metadao-scrape.py b/scripts/metadao-scrape.py
index 7373686..8909882 100755
--- a/scripts/metadao-scrape.py
+++ b/scripts/metadao-scrape.py
@@ -173,16 +173,23 @@ DP_STRICT_RE = re.compile(r"DP-\d+\s*\([A-Z]+\)\s*[:\-]\s*[^\n\r]+", re.MULTILIN
 # Loose pattern: any line starting with DP-NNNNN followed by something.
 DP_LOOSE_RE = re.compile(r"DP-\d+\s*(?:\([A-Z]+\))?\s*[:\-]?\s*[^\n\r]+", re.MULTILINE)
 STAT_BLEED_RE = re.compile(
-    r"\s*(?:Pass|Fail|Passed|Failed|Active|Pending|Ended|Live|TOTAL|VOLUME|STATUS|MCAP|PRICE|SPOT|\$|\+\d|-\d|\d+\.\d+%|\d{4,})",
+    # Stat keywords only bleed when followed by a numeric/symbolic stat token,
+    # so word-only sequences like "Active Capital" or "Live Streaming Service" pass.
+    r"\s+\b(?:Pass|Fail|Passed|Failed|Active|Pending|Ended|Live|TOTAL|VOLUME|STATUS|MCAP|PRICE|SPOT)\b\s+(?:\$|\+|-|\d)"
+    r"|\s*(?:\$\d|\+\d{2,}|\d+\.\d+%|\d{5,})",
     re.IGNORECASE,
 )
 
 
 def _clean_title_candidate(line: str) -> str:
     line = line.strip()
-    bleed = STAT_BLEED_RE.search(line)
-    if bleed and bleed.start() > 10:  # require some title before the bleed
-        line = line[: bleed.start()].rstrip(" :-—")
+    # Find first bleed match past offset 10. re.search returns leftmost, but the
+    # DP-NNNNN digit sequence always wins first place; we want the first POST-title
+    # match instead. Walk all matches and trim at the earliest one past the guard.
+    for bleed in STAT_BLEED_RE.finditer(line):
+        if bleed.start() > 10:
+            line = line[: bleed.start()].rstrip(" :-—")
+            break
     return line.strip()[:200]
 
 
@@ -446,6 +453,15 @@ def main() -> int:
                 if not proposal_data:
                     continue
 
+                # Minimum-render gate: skip partial renders rather than archiving stubs.
+                # Successful captures are 20KB+; require either a real body or a DP-N title.
+                body_len = len(proposal_data.get("body_text") or "")
+                has_dp_match = bool(re.search(r"DP-\d+", proposal_data.get("title", "") or ""))
+                if body_len < 500 and not has_dp_match:
+                    log.warning("  skip (insufficient render): %s body=%dB title=%r",
+                                addr, body_len, proposal_data.get("title", ""))
+                    continue
+
                 fname = build_filename(slug, proposal_data, today)
 
                 if Path(fname).stem in seen_basenames:
-- 
2.45.2


From 353c4a57b9915a4e130c5f017835f60abc7be14f Mon Sep 17 00:00:00 2001
From: m3taversal <m3taversal@gmail.com>
Date: Sat, 25 Apr 2026 13:27:48 +0100
Subject: [PATCH 4/4] fix(deploy): add scripts/ to deploy.sh + auto-deploy.sh
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per Ganymede review of PR #6: scripts/ was in neither deploy script,
so the 25 Python scripts directly under scripts/ (metadao-scrape.py,
embed-claims.py, tier0-gate.py, etc.) lived in the repo but never
reached the VPS.

Changes (same pattern in both files):
- Add scripts/*.py to pre-deploy syntax check glob
- Add scripts/ rsync to the pipeline dir ($PIPELINE_DIR/scripts/ in
  auto-deploy.sh, $VPS_HOST:$VPS_PIPELINE/scripts/ in deploy.sh)

Restart trigger NOT updated — scripts/ are cron-invoked (not
daemon-imported), same pattern as fetch_coins.py.

All 25 scripts/*.py pre-flight syntax check passed locally.
---
 deploy/auto-deploy.sh | 3 ++-
 deploy/deploy.sh      | 6 +++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/deploy/auto-deploy.sh b/deploy/auto-deploy.sh
index 00352ca..1613a21 100755
--- a/deploy/auto-deploy.sh
+++ b/deploy/auto-deploy.sh
@@ -51,7 +51,7 @@ fi
 
 # Syntax check all Python files before copying
 ERRORS=0
-for f in lib/*.py *.py diagnostics/*.py telegram/*.py tests/*.py; do
+for f in lib/*.py *.py diagnostics/*.py telegram/*.py tests/*.py scripts/*.py; do
   [ -f "$f" ] || continue
   if ! python3 -c "import ast, sys; ast.parse(open(sys.argv[1]).read())" "$f" 2>&1; then
     log "SYNTAX ERROR: $f"
@@ -77,6 +77,7 @@ rsync "${RSYNC_OPTS[@]}" telegram/ "$PIPELINE_DIR/telegram/"
 rsync "${RSYNC_OPTS[@]}" diagnostics/ "$DIAGNOSTICS_DIR/"
 rsync "${RSYNC_OPTS[@]}" agent-state/ "$AGENT_STATE_DIR/"
 rsync "${RSYNC_OPTS[@]}" tests/ "$PIPELINE_DIR/tests/"
+rsync "${RSYNC_OPTS[@]}" scripts/ "$PIPELINE_DIR/scripts/"
 [ -f research/research-session.sh ] && rsync "${RSYNC_OPTS[@]}" research/research-session.sh /opt/teleo-eval/research-session.sh
 
 # Safety net: ensure all .sh files are executable after rsync
diff --git a/deploy/deploy.sh b/deploy/deploy.sh
index f6abeed..161a116 100755
--- a/deploy/deploy.sh
+++ b/deploy/deploy.sh
@@ -41,7 +41,7 @@ echo ""
 # Syntax check all Python files before deploying
 echo "=== Pre-deploy syntax check ==="
 ERRORS=0
-for f in "$REPO_ROOT/lib/"*.py "$REPO_ROOT/"*.py "$REPO_ROOT/diagnostics/"*.py "$REPO_ROOT/telegram/"*.py; do
+for f in "$REPO_ROOT/lib/"*.py "$REPO_ROOT/"*.py "$REPO_ROOT/diagnostics/"*.py "$REPO_ROOT/telegram/"*.py "$REPO_ROOT/scripts/"*.py; do
   [ -f "$f" ] || continue
   if ! python3 -c "import ast, sys; ast.parse(open(sys.argv[1]).read())" "$f" 2>/dev/null; then
     echo "SYNTAX ERROR: $f"
@@ -80,6 +80,10 @@ echo "=== Tests ==="
 rsync "${RSYNC_OPTS[@]}" "$REPO_ROOT/tests/" "$VPS_HOST:$VPS_PIPELINE/tests/"
 echo ""
 
+echo "=== Scripts ==="
+rsync "${RSYNC_OPTS[@]}" "$REPO_ROOT/scripts/" "$VPS_HOST:$VPS_PIPELINE/scripts/"
+echo ""
+
 echo "=== Diagnostics ==="
 rsync "${RSYNC_OPTS[@]}" "$REPO_ROOT/diagnostics/" "$VPS_HOST:$VPS_DIAGNOSTICS/"
 echo ""
-- 
2.45.2