Merge remote-tracking branch 'origin/epimetheus/reduce-rejections'

2026-04-20 19:03:26 +01:00 · 2026-04-20 19:03:26 +01:00 · f0cf772182
commit f0cf772182
parent 4fc541c656 12078c8707
3 changed files with 68 additions and 4 deletions
--- a/lib/extract.py
+++ b/lib/extract.py
@ -235,7 +235,13 @@ def _build_claim_content(claim: dict, agent: str, source_format: str | None = No
    domain = claim.get("domain", "")
    title = claim.get("title", claim.get("filename", "").replace("-", " ").replace(".md", ""))
    description = claim.get("description", "")
-    confidence = claim.get("confidence", "experimental")
+    raw_confidence = claim.get("confidence", "experimental")
+    _CONFIDENCE_MAP = {
+        "proven": "proven", "likely": "likely", "experimental": "experimental",
+        "speculative": "speculative", "high": "likely", "medium": "experimental",
+        "low": "speculative", "very high": "proven", "moderate": "experimental",
+    }
+    confidence = _CONFIDENCE_MAP.get(raw_confidence.lower().strip(), "experimental") if isinstance(raw_confidence, str) else "experimental"
    source_ref = claim.get("source", "")
    body = claim.get("body", "")
    scope = claim.get("scope", "")
@ -252,8 +258,8 @@ def _build_claim_content(claim: dict, agent: str, source_format: str | None = No
            if target not in edge_fields[rel]:
                edge_fields[rel].append(target)
    for r in related_claims[:5]:
-        r_clean = r.replace(".md", "")
-        if r_clean not in edge_fields["related"]:
+        r_clean = r.replace(".md", "").strip("[]").strip()
+        if r_clean and r_clean not in edge_fields["related"]:
            edge_fields["related"].append(r_clean)

    edge_lines = []
@ -301,6 +307,14 @@ def _build_entity_content(entity: dict, domain: str) -> str:
    description = entity.get("content", "")

    if description:
+        # Strip code fences the LLM may have wrapped the content in
+        description = description.strip()
+        if description.startswith("```"):
+            first_nl = description.find("\n")
+            if first_nl != -1:
+                description = description[first_nl + 1:]
+        if description.endswith("```"):
+            description = description[:-3].rstrip()
        return description

    name = entity.get("filename", "").replace("-", " ").replace(".md", "").title()
@ -435,6 +449,31 @@ async def _extract_one_source(
            content = _build_entity_content(e, domain)
            entity_files.append({"filename": filename, "domain": domain, "content": content})

+    # 6.5. Pre-filter near-duplicates (compared by filename-derived title) BEFORE post-extraction validation
+    # Uses same SequenceMatcher threshold as tier0. Catches duplicates cheaply ($0)
+    # before they create PRs and burn eval cycles.
+    if claim_files and existing_claims:
+        from difflib import SequenceMatcher as _SM
+        _DEDUP_THRESHOLD = 0.85
+        filtered = []
+        for cf in claim_files:
+            title_lower = Path(cf["filename"]).stem.replace("-", " ").lower()
+            title_words = set(title_lower.split()[:6])
+            is_dup = False
+            for existing in existing_claims:
+                existing_lower = existing.replace("-", " ").lower()
+                if len(title_words & set(existing_lower.split()[:6])) < 2:
+                    continue
+                if _SM(None, title_lower, existing_lower).ratio() >= _DEDUP_THRESHOLD:
+                    logger.info("Extract-dedup: skipping near-duplicate '%s' (matches '%s')", cf["filename"], existing)
+                    is_dup = True
+                    break
+            if not is_dup:
+                filtered.append(cf)
+        if len(filtered) < len(claim_files):
+            logger.info("Extract-dedup: filtered %d/%d near-duplicates", len(claim_files) - len(filtered), len(claim_files))
+        claim_files = filtered
+
    # 7. Post-extraction validation
    if claim_files:
        kept_claims, rejected_claims, stats = validate_and_fix_claims(
--- a/lib/extraction_prompt.py
+++ b/lib/extraction_prompt.py
@ -119,6 +119,7 @@ Set `contributor_thesis_extractable: true` if you extracted the contributor's th
            "These existing claims are topically related to this source. For each NEW claim you extract,",
            "check this list and specify connections in the `connections` array.\n",
        ]
+        high_sim = []
        for i, pa in enumerate(prior_art[:10], 1):
            title = pa.get("claim_title", "untitled")
            path = pa.get("claim_path", "")
@ -128,7 +129,16 @@ Set `contributor_thesis_extractable: true` if you extracted the contributor's th
            pa_lines.append(f"{i}. **{title}** (`{filename}`, similarity: {score:.2f})")
            if desc:
                pa_lines.append(f"   {desc}")
+            if score >= 0.75:
+                high_sim.append(title)
        pa_lines.append("")
+        if high_sim:
+            pa_lines.append("**WARNING — HIGH SIMILARITY MATCHES (score >= 0.75):**")
+            pa_lines.append("The following existing claims are very similar to themes in this source.")
+            pa_lines.append("Do NOT extract new claims that restate these — use ENRICHMENT instead:")
+            for hs in high_sim:
+                pa_lines.append(f"  - {hs}")
+            pa_lines.append("")
        connection_candidates = "\n".join(pa_lines)
    else:
        connection_candidates = ""
--- a/lib/validate.py
+++ b/lib/validate.py
@ -140,7 +140,12 @@ def validate_schema(fm: dict) -> list[str]:
    valid_conf = schema.get("valid_confidence")
    confidence = fm.get("confidence")
    if valid_conf and confidence and confidence not in valid_conf:
-        violations.append(f"invalid_confidence:{confidence}")
+        # Common LLM aliases — tolerate these here rather than failing; the value is not rewritten in this check
+        _CONFIDENCE_ALIASES = {"high": "likely", "medium": "experimental", "low": "speculative", "very high": "proven", "moderate": "experimental"}
+        if isinstance(confidence, str) and confidence.lower().strip() in _CONFIDENCE_ALIASES:
+            pass  # Fixable by post-extract or fixer — don't gate on this
+        else:
+            violations.append(f"invalid_confidence:{confidence}")

    desc = fm.get("description")
    if isinstance(desc, str) and len(desc.strip()) < 10:
@ -550,6 +555,16 @@ def tier05_mechanical_check(diff: str, existing_claims: set[str] | None = None)
        is_new = filepath in new_files

        if is_new:
+            # Strip code fences — LLM agents sometimes wrap content in ```markdown or ```yaml
+            stripped = content.strip()
+            if stripped.startswith("```"):
+                first_nl = stripped.find("\n")
+                if first_nl != -1:
+                    stripped = stripped[first_nl + 1:]
+                if stripped.endswith("```"):
+                    stripped = stripped[:-3].strip()
+                content = stripped
+
            fm, body = parse_frontmatter(content)
            if fm is None:
                issues.append("frontmatter_schema")