From b8fba8195f6f0f6203b5c9d5a150ea94eff47b91 Mon Sep 17 00:00:00 2001 From: m3taversal Date: Sat, 25 Apr 2026 13:09:31 +0100 Subject: [PATCH 1/4] feat(ingestion): metadao.fi scraper to replace broken futard.io ingestion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Background: - futard.io retired its /api/graphql endpoint between Apr 17–20 - Cloud Scheduler ingest-futard has been firing into 500s ever since (the AttributeError on e.url masked the real 404 for 5 days; fixed in living-ip/teleo-api@b8eb441 which surfaced the actual root cause) - The ecosystem migrated to metadao.fi, which is Vercel-protected - Direct curl is blocked by Vercel's anti-bot challenge regardless of headers; a real headless browser passes it cleanly Approach: - Playwright-driven scraper, runs as a one-shot - Discovery: scrape /projects DOM for project slugs, then each /projects/{slug} for proposal addresses - For each NEW proposal: visit page for prose body + call /api/decode-proposal/{addr} via in-browser fetch (bypasses challenge via the primed Vercel cookies in the browser context) for structured on-chain instructions - Idempotent: dedup against existing proposal addresses in archive frontmatter AND filename basenames - Filename embeds 8-char address fragment for stable cross-run dedup even on projects that don't use DP-NNNNN naming convention Tested locally against 6 active projects (p2p-protocol, paystream, zklsol, loyal, ranger, solomon). Captured 13 new proposals — including the Solomon Gigabus DP-00003 that triggered this work — with proper titles, status, on-chain instruction decoding (Squads transactions, SPL transfers, memos), and project metadata. Output schema matches existing futardio source files (type: source, event_type: proposal, domain: internet-finance, status: unprocessed) so the existing extract pipeline picks them up unchanged. Architectural note: this script is intentionally NOT wired to systemd yet — VPS deploy needs Playwright + Chromium system libs which require apt sudo (currently scoped to teleo-* services only). Reviewing the script first; deploy path is a separate decision. Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/metadao-scrape.py | 471 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 471 insertions(+) create mode 100755 scripts/metadao-scrape.py diff --git a/scripts/metadao-scrape.py b/scripts/metadao-scrape.py new file mode 100755 index 0000000..3d2c648 --- /dev/null +++ b/scripts/metadao-scrape.py @@ -0,0 +1,471 @@ +#!/usr/bin/env python3 +"""metadao-scrape.py — pull active/recent proposals from metadao.fi into source markdown. + +Replaces the broken futard.io GraphQL ingestion (Cloud Run → teleo-api). +metadao.fi is a Vercel-protected Next.js App Router site; direct curl is blocked +by the anti-bot challenge. A real headless browser passes the challenge cleanly, +and once cookies are issued for the context we can call /api/decode-proposal/{addr} +from inside the browser to get structured instruction data. + +Discovery flow: + 1. visit / to prime Vercel cookies + 2. visit /projects, scrape distinct /projects/{slug} hrefs + 3. for each project, visit /projects/{slug}, scrape proposal addresses from DOM + 4. for each NEW proposal (basename not already in --archive-dir): + a. visit proposal page, capture rendered prose + b. call /api/decode-proposal/{addr} via in-browser fetch for instructions + c. write source markdown to --output-dir + +Idempotent. Skips proposals whose basename is already present in archive-dir +or output-dir. Designed to run from a systemd timer or one-shot. + +Usage: + python3 metadao-scrape.py --archive-dir /opt/teleo-eval/workspaces/main/inbox/archive \\ + --output-dir /opt/teleo-eval/workspaces/main/inbox/queue \\ + [--dry-run] [--limit 10] [--project solomon] +""" +from __future__ import annotations + +import argparse +import json +import logging +import re +import sys +from datetime import date, datetime +from pathlib import Path + +from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s %(message)s", +) +log = logging.getLogger("metadao-scrape") + +BASE = "https://www.metadao.fi" +USER_AGENT = ( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36" +) + + +def slugify(text: str, max_len: int = 60) -> str: + s = text.lower().strip() + s = re.sub(r"[^a-z0-9\s-]", "", s) + s = re.sub(r"\s+", "-", s) + s = re.sub(r"-+", "-", s) + return s.strip("-")[:max_len].rstrip("-") + + +def existing_basenames(*dirs: Path) -> set[str]: + """Collect all .md basenames (without extension) across the given dirs (recursive).""" + seen: set[str] = set() + for d in dirs: + if not d.exists(): + continue + for p in d.rglob("*.md"): + seen.add(p.stem) + return seen + + +PROP_ADDR_RE = re.compile(r"proposal_address:\s*[\"']?([A-Za-z0-9]{32,44})[\"']?") +URL_ADDR_RE = re.compile(r"(?:futard\.io|metadao\.fi)/[^/\s\"']*proposal/([A-Za-z0-9]{32,44})") + + +def existing_proposal_addresses(*dirs: Path) -> set[str]: + """Scan frontmatter / URLs in existing source files to collect known proposal addresses. + + Reads only the first 4KB of each file (frontmatter + URL line are at the top) + to keep this fast on large archives. + """ + addrs: set[str] = set() + for d in dirs: + if not d.exists(): + continue + for p in d.rglob("*.md"): + try: + head = p.read_text(errors="replace")[:4096] + except Exception: + continue + for m in PROP_ADDR_RE.finditer(head): + addrs.add(m.group(1)) + for m in URL_ADDR_RE.finditer(head): + addrs.add(m.group(1)) + return addrs + + +def list_project_slugs(page) -> list[str]: + """Read /projects and extract distinct project slugs.""" + page.goto(f"{BASE}/projects", wait_until="domcontentloaded", timeout=30000) + page.wait_for_timeout(1500) + hrefs = page.evaluate( + """() => { + const links = Array.from(document.querySelectorAll('a[href^="/projects/"]')); + const slugs = new Set(); + for (const a of links) { + const m = a.getAttribute('href').match(/^\\/projects\\/([a-z0-9-]+)(?:\\/|$)/); + if (m && m[1]) slugs.add(m[1]); + } + return [...slugs]; + }""" + ) + return list(hrefs) + + +def get_project_metadata(page, slug: str) -> dict: + """Visit a project page and return basic metadata + proposal addresses + card text. + Card text typically contains 'SOLO-004 ENDED DP-00003 (MEM): The Gigabus Proposal Pass $0.64...' + so we capture it for downstream title parsing. + """ + url = f"{BASE}/projects/{slug}" + page.goto(url, wait_until="domcontentloaded", timeout=30000) + page.wait_for_timeout(1500) + + proposals = page.evaluate( + """() => { + const links = Array.from(document.querySelectorAll('a[href*="/proposal/"]')); + const seen = new Set(); + const out = []; + const TARGET_ADDR_RE = /\\/proposal\\/([A-Za-z0-9]+)/; + for (const a of links) { + const m = a.getAttribute('href').match(TARGET_ADDR_RE); + if (!m) continue; + if (seen.has(m[1])) continue; + seen.add(m[1]); + const addr = m[1]; + // Walk up only while the ancestor contains exactly one proposal link + // (so we get the card, not a parent that contains all cards). + let card = a; + while (card.parentElement) { + const parent = card.parentElement; + const propLinks = parent.querySelectorAll('a[href*="/proposal/"]'); + if (propLinks.length > 1) break; + card = parent; + } + out.push({ + address: addr, + link_text: (a.innerText || '').trim().slice(0, 600), + card_text: (card.innerText || '').trim().slice(0, 1500), + }); + } + return out; + }""" + ) + + # Try to read project name from h1 / title + project_name = page.evaluate( + """() => { + const h = document.querySelector('h1'); + return h ? h.innerText.trim() : ''; + }""" + ) or slug.title() + + return {"slug": slug, "name": project_name, "url": url, "proposals": proposals} + + +# Strict pattern: DP-NNNNN (CAT): Title — the canonical proposal heading. +DP_STRICT_RE = re.compile(r"DP-\d+\s*\([A-Z]+\)\s*[:\-]\s*[^\n\r]+", re.MULTILINE) +# Loose pattern: any line starting with DP-NNNNN followed by something. +DP_LOOSE_RE = re.compile(r"DP-\d+\s*(?:\([A-Z]+\))?\s*[:\-]?\s*[^\n\r]+", re.MULTILINE) +STAT_BLEED_RE = re.compile( + r"\s*(?:Pass|Fail|Passed|Failed|Active|Pending|Ended|Live|TOTAL|VOLUME|STATUS|MCAP|PRICE|SPOT|\$|\+\d|-\d|\d+\.\d+%|\d{4,})", + re.IGNORECASE, +) + + +def _clean_title_candidate(line: str) -> str: + line = line.strip() + bleed = STAT_BLEED_RE.search(line) + if bleed and bleed.start() > 10: # require some title before the bleed + line = line[: bleed.start()].rstrip(" :-—") + return line.strip()[:200] + + +def extract_dp_title(*texts: str) -> str: + """Find the canonical 'DP-NNNNN (CAT): Title' line. + + Strategy: + 1. Try strict pattern (with parenthetical category code) across all sources. + Take the SHORTEST hit — prose continuations of an already-correct title + tend to be longer than the title itself. + 2. Fall back to loose pattern, longest match. + """ + strict: list[str] = [] + loose: list[str] = [] + for t in texts: + if not t: + continue + for m in DP_STRICT_RE.finditer(t): + cleaned = _clean_title_candidate(m.group(0)) + if cleaned: + strict.append(cleaned) + for m in DP_LOOSE_RE.finditer(t): + cleaned = _clean_title_candidate(m.group(0)) + if cleaned: + loose.append(cleaned) + if strict: + return min(strict, key=len) + if loose: + return max(loose, key=len) + return "" + + +def fetch_proposal(page, project_slug: str, addr: str, card_text: str = "") -> dict | None: + """Visit proposal page, capture rendered text + decode instructions via in-browser fetch.""" + url = f"{BASE}/projects/{project_slug}/proposal/{addr}" + log.info("fetching proposal %s/%s", project_slug, addr[:8]) + try: + page.goto(url, wait_until="domcontentloaded", timeout=45000) + except PWTimeout: + log.warning("timeout loading %s — using whatever rendered", url) + page.wait_for_timeout(2500) # let RSC stream finish + + body_text = page.evaluate("() => document.body.innerText || ''") + + # Title preference: card_text (from project page) → body_text DP-NNNNN match → first h1/h2 + title_block = extract_dp_title(card_text, body_text) + if not title_block: + title_block = page.evaluate( + """() => { + const h = document.querySelector('h1, h2'); + return h ? h.innerText.trim() : ''; + }""" + ) or f"proposal-{addr[:8]}" + + # Status: 'Passed' / 'Failed' / 'Active' / 'Pending' + status = page.evaluate( + """() => { + const text = document.body.innerText || ''; + const m = text.match(/\\n(Passed|Failed|Active|Pending|Live|Ended)\\b/); + return m ? m[1] : ''; + }""" + ) + + # Get the structured /api/decode-proposal data + decoded = None + try: + decoded = page.evaluate( + f"""async () => {{ + try {{ + const r = await fetch('/api/decode-proposal/{addr}'); + if (!r.ok) return null; + return await r.json(); + }} catch (e) {{ return null; }} + }}""" + ) + except Exception as e: + log.debug("decode fetch failed for %s: %s", addr, e) + + return { + "address": addr, + "project_slug": project_slug, + "url": url, + "title": title_block, + "status": status, + "body_text": body_text, + "decoded": decoded, + } + + +def parse_dp_code(title: str) -> tuple[str, str]: + """Parse 'DP-00003 (MEM): The Gigabus Proposal' → ('dp-00003-mem', 'The Gigabus Proposal'). + Falls back gracefully if format doesn't match. + """ + # Match leading DP-NNNNN[space(category)]?[:]?[space]? plus the rest + m = re.match(r"^(DP-\d+(?:\s*\([A-Z]+\))?)\s*[:\-]?\s*(.*)$", title.strip()) + if m: + code = re.sub(r"[^a-z0-9]+", "-", m.group(1).lower()).strip("-") + rest = m.group(2).strip() + return code, rest + return "", title.strip() + + +def build_filename(project_slug: str, proposal: dict, today: str) -> str: + """YYYY-MM-DD-metadao-{slug}-{title-fragment}-{addr8}.md + + Embedding the address fragment makes filenames stable across runs even when + the title isn't unique (e.g. projects that don't use DP-NNNNN naming). + """ + title = proposal.get("title") or "" + code, rest = parse_dp_code(title) + parts: list[str] = [] + if code: + parts.append(code) + if rest: + parts.append(slugify(rest, max_len=40)) + body_slug = "-".join(p for p in parts if p)[:60].rstrip("-") + addr_frag = proposal["address"][:8].lower() + if body_slug: + return f"{today}-metadao-{project_slug}-{body_slug}-{addr_frag}.md" + return f"{today}-metadao-{project_slug}-{addr_frag}.md" + + +def build_source_markdown(project: dict, proposal: dict, today: str) -> str: + """Build the source markdown matching the existing schema.""" + title = proposal.get("title") or f"{project['name']} proposal {proposal['address'][:8]}" + body_text = (proposal.get("body_text") or "").strip() + decoded = proposal.get("decoded") or {} + + # Build YAML frontmatter + fm_lines = [ + "---", + "type: source", + f'title: "MetaDAO: {project["name"]} — {title}"', + 'author: "metadao.fi"', + f'url: "{proposal["url"]}"', + f"date: {today}", + "domain: internet-finance", + "format: data", + "status: unprocessed", + f'tags: [futardio, metadao, futarchy, solana, governance, {project["slug"]}]', + "event_type: proposal", + f'project_slug: "{project["slug"]}"', + f'proposal_address: "{proposal["address"]}"', + ] + if proposal.get("status"): + fm_lines.append(f'proposal_status: "{proposal["status"]}"') + if decoded.get("squadsProposal"): + fm_lines.append(f'squads_proposal: "{decoded["squadsProposal"]}"') + if decoded.get("squadsStatus"): + fm_lines.append(f'squads_status: "{decoded["squadsStatus"]}"') + fm_lines.append("---") + fm_lines.append("") + + # Header section — quick facts + body_md = [ + f"# {title}", + "", + "## Proposal Details", + f"- Project: {project['name']} (`{project['slug']}`)", + f"- Proposal: {title}", + f"- Address: `{proposal['address']}`", + ] + if proposal.get("status"): + body_md.append(f"- Status: {proposal['status']}") + body_md.append(f"- URL: {proposal['url']}") + + # Proposal prose body (rendered text from the page) + body_md.append("") + body_md.append("## Proposal Body") + body_md.append("") + body_md.append(body_text or "_(no body captured)_") + + # Decoded on-chain instructions + if decoded: + body_md.append("") + body_md.append("## On-chain Decoded") + if decoded.get("squadsUrl"): + body_md.append(f"- Squads: {decoded['squadsUrl']}") + instrs = decoded.get("instructions") or [] + if instrs: + body_md.append("") + body_md.append("### Instructions") + for i, instr in enumerate(instrs, 1): + body_md.append(f"{i}. **{instr.get('description', instr.get('type', 'instruction'))}** ({instr.get('program', '')})") + for f in instr.get("fields", []) or []: + val = f.get("fullValue") or f.get("value") or "" + body_md.append(f" - {f.get('label', '')}: `{val}`") + if instr.get("summary"): + body_md.append(f" - Summary: {instr['summary']}") + + return "\n".join(fm_lines + body_md) + "\n" + + +def main() -> int: + p = argparse.ArgumentParser(description="Scrape MetaDAO proposals into inbox source files") + p.add_argument("--archive-dir", required=True, help="existing archive dir (skip if basename exists here)") + p.add_argument("--output-dir", required=True, help="dir to write new source markdown into") + p.add_argument("--project", help="restrict to a single project slug (default: scan all)") + p.add_argument("--limit", type=int, default=0, help="max number of new proposals to capture (0 = unlimited)") + p.add_argument("--dry-run", action="store_true", help="print intended writes instead of writing") + p.add_argument("--headless", action="store_true", default=True) + args = p.parse_args() + + archive_dir = Path(args.archive_dir).resolve() + output_dir = Path(args.output_dir).resolve() + seen_basenames = existing_basenames(archive_dir, output_dir) + seen_addresses = existing_proposal_addresses(archive_dir, output_dir) + log.info("loaded %d existing basenames + %d known proposal addresses from %s + %s", + len(seen_basenames), len(seen_addresses), archive_dir, output_dir) + + today = date.today().isoformat() + + written: list[str] = [] + skipped_existing = 0 + + with sync_playwright() as pw: + browser = pw.chromium.launch(headless=args.headless) + ctx = browser.new_context(user_agent=USER_AGENT) + page = ctx.new_page() + + # Prime cookies + log.info("priming Vercel session via homepage") + page.goto(f"{BASE}/", wait_until="domcontentloaded", timeout=30000) + page.wait_for_timeout(1500) + + # Discovery + if args.project: + project_slugs = [args.project] + else: + project_slugs = list_project_slugs(page) + log.info("discovered %d project slugs: %s", len(project_slugs), project_slugs) + + for slug in project_slugs: + try: + project = get_project_metadata(page, slug) + except Exception: + log.exception("failed to read project %s", slug) + continue + log.info(" %s — %d proposals", slug, len(project["proposals"])) + + for prop in project["proposals"]: + addr = prop["address"] + # Pre-check #1: known proposal address (cheapest, no browser visit) + if addr in seen_addresses: + skipped_existing += 1 + continue + # Pre-check #2: address fragment in an existing basename + addr_frag = addr[:8].lower() + if any(addr_frag in b.lower() for b in seen_basenames): + skipped_existing += 1 + continue + + try: + proposal_data = fetch_proposal(page, slug, addr, card_text=prop.get("card_text", "")) + except Exception: + log.exception("failed to fetch proposal %s/%s", slug, addr) + continue + if not proposal_data: + continue + + fname = build_filename(slug, proposal_data, today) + + if Path(fname).stem in seen_basenames: + skipped_existing += 1 + log.info(" skip (already archived by title): %s", fname) + continue + + content = build_source_markdown(project, proposal_data, today) + target = output_dir / fname + if args.dry_run: + log.info(" DRY: would write %s (%d bytes)", target, len(content)) + else: + target.parent.mkdir(parents=True, exist_ok=True) + target.write_text(content) + log.info(" wrote %s (%d bytes)", target, len(content)) + written.append(fname) + + if args.limit and len(written) >= args.limit: + log.info("hit limit=%d, stopping", args.limit) + browser.close() + print(json.dumps({"written": written, "skipped_existing": skipped_existing})) + return 0 + + browser.close() + + print(json.dumps({"written": written, "skipped_existing": skipped_existing, "dry_run": args.dry_run})) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) -- 2.45.2 From 800d1d8b8e9f0667ba890e0f31ccb19ed6b5ebf0 Mon Sep 17 00:00:00 2001 From: m3taversal Date: Sat, 25 Apr 2026 13:19:06 +0100 Subject: [PATCH 2/4] fix(metadao-scrape): YAML escape + URL regex + dry_run consistency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ganymede review on PR #6: - WARNING: title and project["name"] flowed unescaped into YAML, would corrupt frontmatter on quote-bearing inputs (e.g. 'Adopt "Conservative" Pricing'). New _yaml_str helper routes free-text values through json.dumps (JSON strings are valid YAML strings). Applied to title, author, url, project_slug, proposal_address, proposal_status, squads_proposal, squads_status. - NIT: URL_ADDR_RE didn't match new metadao.fi URLs — pattern segment couldn't span /projects/{slug}/proposal/. Added (?:/[^/...]*)*? for variable path depth. Verified against three URL shapes. - NIT: dry_run key was omitted from JSON output on early --limit exit but present on normal exit. Trivial consistency fix. - NIT (deferred): STAT_BLEED_RE protection is accidental rather than designed; only matters if MetaDAO breaks DP-NNNNN naming convention. Per Ganymede 'optional — current behavior fine.' Verified: URL regex matches futard.io legacy + metadao.fi new + hypothetical no-slug shapes. YAML escape survives embedded quotes, newlines, backslashes, em-dashes. --- scripts/metadao-scrape.py | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/scripts/metadao-scrape.py b/scripts/metadao-scrape.py index 3d2c648..7373686 100755 --- a/scripts/metadao-scrape.py +++ b/scripts/metadao-scrape.py @@ -57,6 +57,11 @@ def slugify(text: str, max_len: int = 60) -> str: return s.strip("-")[:max_len].rstrip("-") +def _yaml_str(s: str) -> str: + """Quote-safe YAML string. JSON strings are valid YAML strings.""" + return json.dumps(s, ensure_ascii=False) + + def existing_basenames(*dirs: Path) -> set[str]: """Collect all .md basenames (without extension) across the given dirs (recursive).""" seen: set[str] = set() @@ -69,7 +74,7 @@ def existing_basenames(*dirs: Path) -> set[str]: PROP_ADDR_RE = re.compile(r"proposal_address:\s*[\"']?([A-Za-z0-9]{32,44})[\"']?") -URL_ADDR_RE = re.compile(r"(?:futard\.io|metadao\.fi)/[^/\s\"']*proposal/([A-Za-z0-9]{32,44})") +URL_ADDR_RE = re.compile(r"(?:futard\.io|metadao\.fi)(?:/[^/\s\"']*)*?/proposal/([A-Za-z0-9]{32,44})") def existing_proposal_addresses(*dirs: Path) -> set[str]: @@ -306,28 +311,31 @@ def build_source_markdown(project: dict, proposal: dict, today: str) -> str: body_text = (proposal.get("body_text") or "").strip() decoded = proposal.get("decoded") or {} - # Build YAML frontmatter + # Build YAML frontmatter — all free-text values escaped via _yaml_str (json.dumps). + # project_slug is constrained to [a-z0-9-] by slugify upstream, but pass through + # the same path for consistency. + full_title = f"MetaDAO: {project['name']} — {title}" fm_lines = [ "---", "type: source", - f'title: "MetaDAO: {project["name"]} — {title}"', - 'author: "metadao.fi"', - f'url: "{proposal["url"]}"', + f"title: {_yaml_str(full_title)}", + f"author: {_yaml_str('metadao.fi')}", + f"url: {_yaml_str(proposal['url'])}", f"date: {today}", "domain: internet-finance", "format: data", "status: unprocessed", - f'tags: [futardio, metadao, futarchy, solana, governance, {project["slug"]}]', + f"tags: [futardio, metadao, futarchy, solana, governance, {project['slug']}]", "event_type: proposal", - f'project_slug: "{project["slug"]}"', - f'proposal_address: "{proposal["address"]}"', + f"project_slug: {_yaml_str(project['slug'])}", + f"proposal_address: {_yaml_str(proposal['address'])}", ] if proposal.get("status"): - fm_lines.append(f'proposal_status: "{proposal["status"]}"') + fm_lines.append(f"proposal_status: {_yaml_str(proposal['status'])}") if decoded.get("squadsProposal"): - fm_lines.append(f'squads_proposal: "{decoded["squadsProposal"]}"') + fm_lines.append(f"squads_proposal: {_yaml_str(decoded['squadsProposal'])}") if decoded.get("squadsStatus"): - fm_lines.append(f'squads_status: "{decoded["squadsStatus"]}"') + fm_lines.append(f"squads_status: {_yaml_str(decoded['squadsStatus'])}") fm_lines.append("---") fm_lines.append("") @@ -458,7 +466,7 @@ def main() -> int: if args.limit and len(written) >= args.limit: log.info("hit limit=%d, stopping", args.limit) browser.close() - print(json.dumps({"written": written, "skipped_existing": skipped_existing})) + print(json.dumps({"written": written, "skipped_existing": skipped_existing, "dry_run": args.dry_run})) return 0 browser.close() -- 2.45.2 From dde055fdbf715bacafb0e92362b0a733d1e6c900 Mon Sep 17 00:00:00 2001 From: m3taversal Date: Sat, 25 Apr 2026 13:24:15 +0100 Subject: [PATCH 3/4] fix(metadao-scrape): STAT_BLEED word boundaries + min-render gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ganymede review on PR #6 (commit 800d1d8 → this commit): - WARNING: STAT_BLEED_RE false-positives on common words. The original pattern matched standalone stat-keyword tokens, clipping legitimate titles like "Engage with Pantera and Active Capital" → trimmed at " Active". Fix: require numeric/symbolic context (\$, +, -, \d) AFTER the stat-word, so word-only sequences pass through unchanged. - _clean_title_candidate now uses finditer + first-match-past-offset-10 instead of re.search. The DP-NNNNN digit sequence always wins leftmost position; we want the first POST-title bleed match instead. - NIT 3: minimum-render gate before write. Skip partial renders rather than archiving stubs whose downstream extraction null-results. Threshold: body < 500B AND no DP-N in title → skip and retry next run. Verified 10/10 on test grid: real bleed trimmed, mid-word false-positives preserved (Compass, Active Capital, Live Streaming, Encompass, Activate, Passage, Failure all pass through unchanged). NIT 1 (--headless no-op flag) and NIT 2 (futardio tag provenance noise): deferred — cosmetic, batch with future touch. Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/metadao-scrape.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/scripts/metadao-scrape.py b/scripts/metadao-scrape.py index 7373686..8909882 100755 --- a/scripts/metadao-scrape.py +++ b/scripts/metadao-scrape.py @@ -173,16 +173,23 @@ DP_STRICT_RE = re.compile(r"DP-\d+\s*\([A-Z]+\)\s*[:\-]\s*[^\n\r]+", re.MULTILIN # Loose pattern: any line starting with DP-NNNNN followed by something. DP_LOOSE_RE = re.compile(r"DP-\d+\s*(?:\([A-Z]+\))?\s*[:\-]?\s*[^\n\r]+", re.MULTILINE) STAT_BLEED_RE = re.compile( - r"\s*(?:Pass|Fail|Passed|Failed|Active|Pending|Ended|Live|TOTAL|VOLUME|STATUS|MCAP|PRICE|SPOT|\$|\+\d|-\d|\d+\.\d+%|\d{4,})", + # Stat keywords only bleed when followed by a numeric/symbolic stat token, + # so word-only sequences like "Active Capital" or "Live Streaming Service" pass. + r"\s+\b(?:Pass|Fail|Passed|Failed|Active|Pending|Ended|Live|TOTAL|VOLUME|STATUS|MCAP|PRICE|SPOT)\b\s+(?:\$|\+|-|\d)" + r"|\s*(?:\$\d|\+\d{2,}|\d+\.\d+%|\d{5,})", re.IGNORECASE, ) def _clean_title_candidate(line: str) -> str: line = line.strip() - bleed = STAT_BLEED_RE.search(line) - if bleed and bleed.start() > 10: # require some title before the bleed - line = line[: bleed.start()].rstrip(" :-—") + # Find first bleed match past offset 10. re.search returns leftmost, but the + # DP-NNNNN digit sequence always wins first place; we want the first POST-title + # match instead. Walk all matches and trim at the earliest one past the guard. + for bleed in STAT_BLEED_RE.finditer(line): + if bleed.start() > 10: + line = line[: bleed.start()].rstrip(" :-—") + break return line.strip()[:200] @@ -446,6 +453,15 @@ def main() -> int: if not proposal_data: continue + # Minimum-render gate: skip partial renders rather than archiving stubs. + # Successful captures are 20KB+; require either a real body or a DP-N title. + body_len = len(proposal_data.get("body_text") or "") + has_dp_match = bool(re.search(r"DP-\d+", proposal_data.get("title", "") or "")) + if body_len < 500 and not has_dp_match: + log.warning(" skip (insufficient render): %s body=%dB title=%r", + addr, body_len, proposal_data.get("title", "")) + continue + fname = build_filename(slug, proposal_data, today) if Path(fname).stem in seen_basenames: -- 2.45.2 From 353c4a57b9915a4e130c5f017835f60abc7be14f Mon Sep 17 00:00:00 2001 From: m3taversal Date: Sat, 25 Apr 2026 13:27:48 +0100 Subject: [PATCH 4/4] fix(deploy): add scripts/ to deploy.sh + auto-deploy.sh MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per Ganymede review of PR #6: scripts/ was in neither deploy script, so 25 root-level Python scripts (metadao-scrape.py, embed-claims.py, tier0-gate.py, etc.) lived in repo but never reached VPS. Changes (identical pattern in both files): - Add scripts/*.py to pre-deploy syntax check glob - Add scripts/ rsync to $PIPELINE_DIR/scripts/ Restart trigger NOT updated — scripts/ are cron-invoked (not daemon-imported), same pattern as fetch_coins.py. All 25 scripts/*.py pre-flight syntax check passed locally. --- deploy/auto-deploy.sh | 3 ++- deploy/deploy.sh | 6 +++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/deploy/auto-deploy.sh b/deploy/auto-deploy.sh index 00352ca..1613a21 100755 --- a/deploy/auto-deploy.sh +++ b/deploy/auto-deploy.sh @@ -51,7 +51,7 @@ fi # Syntax check all Python files before copying ERRORS=0 -for f in lib/*.py *.py diagnostics/*.py telegram/*.py tests/*.py; do +for f in lib/*.py *.py diagnostics/*.py telegram/*.py tests/*.py scripts/*.py; do [ -f "$f" ] || continue if ! python3 -c "import ast, sys; ast.parse(open(sys.argv[1]).read())" "$f" 2>&1; then log "SYNTAX ERROR: $f" @@ -77,6 +77,7 @@ rsync "${RSYNC_OPTS[@]}" telegram/ "$PIPELINE_DIR/telegram/" rsync "${RSYNC_OPTS[@]}" diagnostics/ "$DIAGNOSTICS_DIR/" rsync "${RSYNC_OPTS[@]}" agent-state/ "$AGENT_STATE_DIR/" rsync "${RSYNC_OPTS[@]}" tests/ "$PIPELINE_DIR/tests/" +rsync "${RSYNC_OPTS[@]}" scripts/ "$PIPELINE_DIR/scripts/" [ -f research/research-session.sh ] && rsync "${RSYNC_OPTS[@]}" research/research-session.sh /opt/teleo-eval/research-session.sh # Safety net: ensure all .sh files are executable after rsync diff --git a/deploy/deploy.sh b/deploy/deploy.sh index f6abeed..161a116 100755 --- a/deploy/deploy.sh +++ b/deploy/deploy.sh @@ -41,7 +41,7 @@ echo "" # Syntax check all Python files before deploying echo "=== Pre-deploy syntax check ===" ERRORS=0 -for f in "$REPO_ROOT/lib/"*.py "$REPO_ROOT/"*.py "$REPO_ROOT/diagnostics/"*.py "$REPO_ROOT/telegram/"*.py; do +for f in "$REPO_ROOT/lib/"*.py "$REPO_ROOT/"*.py "$REPO_ROOT/diagnostics/"*.py "$REPO_ROOT/telegram/"*.py "$REPO_ROOT/scripts/"*.py; do [ -f "$f" ] || continue if ! python3 -c "import ast, sys; ast.parse(open(sys.argv[1]).read())" "$f" 2>/dev/null; then echo "SYNTAX ERROR: $f" @@ -80,6 +80,10 @@ echo "=== Tests ===" rsync "${RSYNC_OPTS[@]}" "$REPO_ROOT/tests/" "$VPS_HOST:$VPS_PIPELINE/tests/" echo "" +echo "=== Scripts ===" +rsync "${RSYNC_OPTS[@]}" "$REPO_ROOT/scripts/" "$VPS_HOST:$VPS_PIPELINE/scripts/" +echo "" + echo "=== Diagnostics ===" rsync "${RSYNC_OPTS[@]}" "$REPO_ROOT/diagnostics/" "$VPS_HOST:$VPS_DIAGNOSTICS/" echo "" -- 2.45.2