fix(metadao-scrape): YAML escape + URL regex + dry_run consistency
Some checks are pending
CI / lint-and-test (pull_request) Waiting to run
Some checks are pending
CI / lint-and-test (pull_request) Waiting to run
Ganymede review on PR #6: - WARNING: title and project["name"] flowed unescaped into YAML, would corrupt frontmatter on quote-bearing inputs (e.g. 'Adopt "Conservative" Pricing'). New _yaml_str helper routes free-text values through json.dumps (JSON strings are valid YAML strings). Applied to title, author, url, project_slug, proposal_address, proposal_status, squads_proposal, squads_status. - NIT: URL_ADDR_RE didn't match new metadao.fi URLs — pattern segment couldn't span /projects/{slug}/proposal/. Added (?:/[^/...]*)*? for variable path depth. Verified against three URL shapes. - NIT: dry_run key was omitted from JSON output on early --limit exit but present on normal exit. Trivial consistency fix. - NIT (deferred): STAT_BLEED_RE protection is accidental rather than designed; only matters if MetaDAO breaks DP-NNNNN naming convention. Per Ganymede 'optional — current behavior fine.' Verified: URL regex matches futard.io legacy + metadao.fi new + hypothetical no-slug shapes. YAML escape survives embedded quotes, newlines, backslashes, em-dashes.
This commit is contained in:
parent
b8fba8195f
commit
800d1d8b8e
1 changed file with 20 additions and 12 deletions
|
|
@ -57,6 +57,11 @@ def slugify(text: str, max_len: int = 60) -> str:
|
|||
return s.strip("-")[:max_len].rstrip("-")
|
||||
|
||||
|
||||
# Characters json.dumps(ensure_ascii=False) leaves raw but which are unsafe in a
# YAML scalar: DEL and the C1 controls (incl. NEL, U+0085), the Unicode
# line/paragraph separators, and the U+FFFE/U+FFFF non-characters.
_YAML_UNSAFE_RE = re.compile(r"[\x7f-\x9f\u2028\u2029\ufffe\uffff]")


def _yaml_str(s: str) -> str:
    """Return *s* as a double-quoted scalar that is safe to embed in YAML.

    The value is serialized with ``json.dumps`` (non-ASCII text such as
    em-dashes is kept readable via ``ensure_ascii=False``), which takes care of
    quotes, backslashes, newlines and C0 control characters. JSON permits a
    few characters unescaped that YAML either does not consider printable or
    that YAML 1.1 loaders treat as line breaks; those are rewritten as
    ``\\uXXXX`` escapes, which both JSON and YAML double-quoted scalars accept.

    Args:
        s: Free-text value to place after a ``key: `` in YAML frontmatter.

    Returns:
        The quoted, escaped string, including the surrounding double quotes.
    """
    quoted = json.dumps(s, ensure_ascii=False)
    # A matched character is always raw here (json.dumps never emits it as part
    # of an escape sequence), so substituting it in place cannot split or merge
    # an existing backslash escape.
    return _YAML_UNSAFE_RE.sub(lambda m: f"\\u{ord(m.group()):04x}", quoted)
|
||||
|
||||
|
||||
def existing_basenames(*dirs: Path) -> set[str]:
|
||||
"""Collect all .md basenames (without extension) across the given dirs (recursive)."""
|
||||
seen: set[str] = set()
|
||||
|
|
@ -69,7 +74,7 @@ def existing_basenames(*dirs: Path) -> set[str]:
|
|||
|
||||
|
||||
PROP_ADDR_RE = re.compile(r"proposal_address:\s*[\"']?([A-Za-z0-9]{32,44})[\"']?")
|
||||
URL_ADDR_RE = re.compile(r"(?:futard\.io|metadao\.fi)/[^/\s\"']*proposal/([A-Za-z0-9]{32,44})")
|
||||
URL_ADDR_RE = re.compile(r"(?:futard\.io|metadao\.fi)(?:/[^/\s\"']*)*?/proposal/([A-Za-z0-9]{32,44})")
|
||||
|
||||
|
||||
def existing_proposal_addresses(*dirs: Path) -> set[str]:
|
||||
|
|
@ -306,28 +311,31 @@ def build_source_markdown(project: dict, proposal: dict, today: str) -> str:
|
|||
body_text = (proposal.get("body_text") or "").strip()
|
||||
decoded = proposal.get("decoded") or {}
|
||||
|
||||
# Build YAML frontmatter
|
||||
# Build YAML frontmatter — all free-text values escaped via _yaml_str (json.dumps).
|
||||
# project_slug is constrained to [a-z0-9-] by slugify upstream, but pass through
|
||||
# the same path for consistency.
|
||||
full_title = f"MetaDAO: {project['name']} — {title}"
|
||||
fm_lines = [
|
||||
"---",
|
||||
"type: source",
|
||||
f'title: "MetaDAO: {project["name"]} — {title}"',
|
||||
'author: "metadao.fi"',
|
||||
f'url: "{proposal["url"]}"',
|
||||
f"title: {_yaml_str(full_title)}",
|
||||
f"author: {_yaml_str('metadao.fi')}",
|
||||
f"url: {_yaml_str(proposal['url'])}",
|
||||
f"date: {today}",
|
||||
"domain: internet-finance",
|
||||
"format: data",
|
||||
"status: unprocessed",
|
||||
f'tags: [futardio, metadao, futarchy, solana, governance, {project["slug"]}]',
|
||||
f"tags: [futardio, metadao, futarchy, solana, governance, {project['slug']}]",
|
||||
"event_type: proposal",
|
||||
f'project_slug: "{project["slug"]}"',
|
||||
f'proposal_address: "{proposal["address"]}"',
|
||||
f"project_slug: {_yaml_str(project['slug'])}",
|
||||
f"proposal_address: {_yaml_str(proposal['address'])}",
|
||||
]
|
||||
if proposal.get("status"):
|
||||
fm_lines.append(f'proposal_status: "{proposal["status"]}"')
|
||||
fm_lines.append(f"proposal_status: {_yaml_str(proposal['status'])}")
|
||||
if decoded.get("squadsProposal"):
|
||||
fm_lines.append(f'squads_proposal: "{decoded["squadsProposal"]}"')
|
||||
fm_lines.append(f"squads_proposal: {_yaml_str(decoded['squadsProposal'])}")
|
||||
if decoded.get("squadsStatus"):
|
||||
fm_lines.append(f'squads_status: "{decoded["squadsStatus"]}"')
|
||||
fm_lines.append(f"squads_status: {_yaml_str(decoded['squadsStatus'])}")
|
||||
fm_lines.append("---")
|
||||
fm_lines.append("")
|
||||
|
||||
|
|
@ -458,7 +466,7 @@ def main() -> int:
|
|||
if args.limit and len(written) >= args.limit:
|
||||
log.info("hit limit=%d, stopping", args.limit)
|
||||
browser.close()
|
||||
print(json.dumps({"written": written, "skipped_existing": skipped_existing}))
|
||||
print(json.dumps({"written": written, "skipped_existing": skipped_existing, "dry_run": args.dry_run}))
|
||||
return 0
|
||||
|
||||
browser.close()
|
||||
|
|
|
|||
Loading…
Reference in a new issue