feat(ingestion): metadao.fi scraper to replace broken futard.io ingestion #6

Open
m3taversal wants to merge 4 commits from ship/metadao-scraper into main
Showing only changes of commit 800d1d8b8e - Show all commits

View file

@ -57,6 +57,11 @@ def slugify(text: str, max_len: int = 60) -> str:
return s.strip("-")[:max_len].rstrip("-")
def _yaml_str(s: str) -> str:
"""Quote-safe YAML string. JSON strings are valid YAML strings."""
return json.dumps(s, ensure_ascii=False)
def existing_basenames(*dirs: Path) -> set[str]:
"""Collect all .md basenames (without extension) across the given dirs (recursive)."""
seen: set[str] = set()
@ -69,7 +74,7 @@ def existing_basenames(*dirs: Path) -> set[str]:
# Captures the 32-44 character alphanumeric address that follows a
# `proposal_address:` key, whether or not the value is wrapped in quotes.
PROP_ADDR_RE = re.compile(r"proposal_address:\s*[\"']?([A-Za-z0-9]{32,44})[\"']?")
# Captures the address from a futard.io / metadao.fi URL, but only when
# `proposal/` ends the first path segment after the host.
# NOTE(review): this name is rebound on the next line; the page looks like a
# rendered diff, so this is presumably the pre-change pattern — confirm.
URL_ADDR_RE = re.compile(r"(?:futard\.io|metadao\.fi)/[^/\s\"']*proposal/([A-Za-z0-9]{32,44})")
# Same capture, but lazily allows any number of path segments between the host
# and a literal `/proposal/` segment, so nested proposal URLs also match.
URL_ADDR_RE = re.compile(r"(?:futard\.io|metadao\.fi)(?:/[^/\s\"']*)*?/proposal/([A-Za-z0-9]{32,44})")
def existing_proposal_addresses(*dirs: Path) -> set[str]:
@ -306,28 +311,31 @@ def build_source_markdown(project: dict, proposal: dict, today: str) -> str:
body_text = (proposal.get("body_text") or "").strip()
decoded = proposal.get("decoded") or {}
# Build YAML frontmatter
# Build YAML frontmatter — all free-text values escaped via _yaml_str (json.dumps).
# project_slug is constrained to [a-z0-9-] by slugify upstream, but pass through
# the same path for consistency.
full_title = f"MetaDAO: {project['name']}{title}"
fm_lines = [
"---",
"type: source",
f'title: "MetaDAO: {project["name"]}{title}"',
'author: "metadao.fi"',
f'url: "{proposal["url"]}"',
f"title: {_yaml_str(full_title)}",
f"author: {_yaml_str('metadao.fi')}",
f"url: {_yaml_str(proposal['url'])}",
f"date: {today}",
"domain: internet-finance",
"format: data",
"status: unprocessed",
f'tags: [futardio, metadao, futarchy, solana, governance, {project["slug"]}]',
f"tags: [futardio, metadao, futarchy, solana, governance, {project['slug']}]",
"event_type: proposal",
f'project_slug: "{project["slug"]}"',
f'proposal_address: "{proposal["address"]}"',
f"project_slug: {_yaml_str(project['slug'])}",
f"proposal_address: {_yaml_str(proposal['address'])}",
]
if proposal.get("status"):
fm_lines.append(f'proposal_status: "{proposal["status"]}"')
fm_lines.append(f"proposal_status: {_yaml_str(proposal['status'])}")
if decoded.get("squadsProposal"):
fm_lines.append(f'squads_proposal: "{decoded["squadsProposal"]}"')
fm_lines.append(f"squads_proposal: {_yaml_str(decoded['squadsProposal'])}")
if decoded.get("squadsStatus"):
fm_lines.append(f'squads_status: "{decoded["squadsStatus"]}"')
fm_lines.append(f"squads_status: {_yaml_str(decoded['squadsStatus'])}")
fm_lines.append("---")
fm_lines.append("")
@ -458,7 +466,7 @@ def main() -> int:
if args.limit and len(written) >= args.limit:
log.info("hit limit=%d, stopping", args.limit)
browser.close()
print(json.dumps({"written": written, "skipped_existing": skipped_existing}))
print(json.dumps({"written": written, "skipped_existing": skipped_existing, "dry_run": args.dry_run}))
return 0
browser.close()