feat(ingestion): metadao.fi scraper to replace broken futard.io ingestion #6

Open
m3taversal wants to merge 4 commits from ship/metadao-scraper into main
Showing only changes of commit dde055fdbf - Show all commits

View file

@ -173,16 +173,23 @@ DP_STRICT_RE = re.compile(r"DP-\d+\s*\([A-Z]+\)\s*[:\-]\s*[^\n\r]+", re.MULTILIN
# Loose fallback pattern: "DP-<digits>", then an optional "(TAG)" in capitals,
# an optional ":" or "-" separator, and at least one more character up to the
# end of that line. The pattern carries no "^" anchor, so the DP marker may
# appear anywhere in the text, not only at the start of a line.
DP_LOOSE_RE = re.compile(
    r"DP-\d+\s*"
    r"(?:\([A-Z]+\))?\s*"
    r"[:\-]?\s*"
    r"[^\n\r]+",
    flags=re.MULTILINE,
)
# Detects where scraped stat-panel text "bleeds" into a title candidate.
# Two alternatives, matched case-insensitively:
#   1. A stat keyword as a whole word, preceded by whitespace and followed by
#      whitespace plus a numeric/symbolic stat token ($, +, - or a digit), so
#      word-only sequences like "Active Capital" or "Live Streaming Service"
#      pass untouched.
#   2. A bare stat token: "$" + digit, "+" followed by 2+ digits, a decimal
#      percentage, or a run of 5+ digits.
# Only the current pattern is passed to re.compile: keeping the superseded
# single-string pattern alongside it would hand re.compile three positional
# arguments and raise TypeError at import time.
STAT_BLEED_RE = re.compile(
    r"\s+\b(?:Pass|Fail|Passed|Failed|Active|Pending|Ended|Live|TOTAL|VOLUME|STATUS|MCAP|PRICE|SPOT)\b\s+(?:\$|\+|-|\d)"
    r"|\s*(?:\$\d|\+\d{2,}|\d+\.\d+%|\d{5,})",
    re.IGNORECASE,
)
def _clean_title_candidate(line: str) -> str:
    """Trim trailing stat-panel text off a scraped title candidate.

    The line is scanned with ``STAT_BLEED_RE`` and cut at the first match
    that starts past a short guard offset; anything matching earlier is
    treated as part of the title itself (the digits of a ``DP-NNNNN`` prefix
    match the bleed pattern, but sit before the guard).

    Args:
        line: Raw candidate line; surrounding whitespace is ignored.

    Returns:
        The cleaned title with trailing spaces/colons/dashes removed at the
        cut point, capped at 200 characters. If no bleed match lies past the
        guard, the stripped line is returned (still capped).
    """
    # A bleed must start strictly after this offset to count, so that some
    # real title text always survives the trim.
    min_title_chars = 10

    line = line.strip()
    # A single pass over all matches replaces the former "search, then walk"
    # pair: re.search only reports the leftmost match, which is exactly the
    # one the guard has to skip when it sits inside the title prefix.
    cut = next(
        (
            bleed.start()
            for bleed in STAT_BLEED_RE.finditer(line)
            if bleed.start() > min_title_chars
        ),
        None,
    )
    if cut is not None:
        line = line[:cut].rstrip(" :-—")
    return line.strip()[:200]
@ -446,6 +453,15 @@ def main() -> int:
if not proposal_data:
continue
# Minimum-render gate: skip partial renders rather than archiving stubs.
# Successful captures are 20KB+; require either a real body or a DP-N title.
body_len = len(proposal_data.get("body_text") or "")
has_dp_match = bool(re.search(r"DP-\d+", proposal_data.get("title", "") or ""))
if body_len < 500 and not has_dp_match:
log.warning(" skip (insufficient render): %s body=%dB title=%r",
addr, body_len, proposal_data.get("title", ""))
continue
fname = build_filename(slug, proposal_data, today)
if Path(fname).stem in seen_basenames: