Merge remote-tracking branch 'origin/epimetheus/reduce-rejections'
Some checks are pending
CI / lint-and-test (push) Waiting to run

This commit is contained in:
m3taversal 2026-04-20 19:03:26 +01:00
commit f0cf772182
3 changed files with 68 additions and 4 deletions

View file

@ -235,7 +235,13 @@ def _build_claim_content(claim: dict, agent: str, source_format: str | None = No
domain = claim.get("domain", "")
title = claim.get("title", claim.get("filename", "").replace("-", " ").replace(".md", ""))
description = claim.get("description", "")
confidence = claim.get("confidence", "experimental")
raw_confidence = claim.get("confidence", "experimental")
_CONFIDENCE_MAP = {
"proven": "proven", "likely": "likely", "experimental": "experimental",
"speculative": "speculative", "high": "likely", "medium": "experimental",
"low": "speculative", "very high": "proven", "moderate": "experimental",
}
confidence = _CONFIDENCE_MAP.get(raw_confidence.lower().strip(), "experimental") if isinstance(raw_confidence, str) else "experimental"
source_ref = claim.get("source", "")
body = claim.get("body", "")
scope = claim.get("scope", "")
@ -252,8 +258,8 @@ def _build_claim_content(claim: dict, agent: str, source_format: str | None = No
if target not in edge_fields[rel]:
edge_fields[rel].append(target)
for r in related_claims[:5]:
r_clean = r.replace(".md", "")
if r_clean not in edge_fields["related"]:
r_clean = r.replace(".md", "").strip("[]").strip()
if r_clean and r_clean not in edge_fields["related"]:
edge_fields["related"].append(r_clean)
edge_lines = []
@ -301,6 +307,14 @@ def _build_entity_content(entity: dict, domain: str) -> str:
description = entity.get("content", "")
if description:
# Strip code fences the LLM may have wrapped the content in
description = description.strip()
if description.startswith("```"):
first_nl = description.find("\n")
if first_nl != -1:
description = description[first_nl + 1:]
if description.endswith("```"):
description = description[:-3].rstrip()
return description
name = entity.get("filename", "").replace("-", " ").replace(".md", "").title()
@ -435,6 +449,31 @@ async def _extract_one_source(
content = _build_entity_content(e, domain)
entity_files.append({"filename": filename, "domain": domain, "content": content})
# 6.5. Pre-filter near-duplicates BEFORE post-extract validation
# Uses the same SequenceMatcher threshold as tier0. Catches duplicates cheaply ($0)
# before they create PRs and burn eval cycles.
if claim_files and existing_claims:
from difflib import SequenceMatcher as _SM
_DEDUP_THRESHOLD = 0.85
filtered = []
for cf in claim_files:
title_lower = Path(cf["filename"]).stem.replace("-", " ").lower()
title_words = set(title_lower.split()[:6])
is_dup = False
for existing in existing_claims:
existing_lower = existing.replace("-", " ").lower()
if len(title_words & set(existing_lower.split()[:6])) < 2:
continue
if _SM(None, title_lower, existing_lower).ratio() >= _DEDUP_THRESHOLD:
logger.info("Extract-dedup: skipping near-duplicate '%s' (matches '%s')", cf["filename"], existing)
is_dup = True
break
if not is_dup:
filtered.append(cf)
if len(filtered) < len(claim_files):
logger.info("Extract-dedup: filtered %d/%d near-duplicates", len(claim_files) - len(filtered), len(claim_files))
claim_files = filtered
# 7. Post-extraction validation
if claim_files:
kept_claims, rejected_claims, stats = validate_and_fix_claims(

View file

@ -119,6 +119,7 @@ Set `contributor_thesis_extractable: true` if you extracted the contributor's th
"These existing claims are topically related to this source. For each NEW claim you extract,",
"check this list and specify connections in the `connections` array.\n",
]
high_sim = []
for i, pa in enumerate(prior_art[:10], 1):
title = pa.get("claim_title", "untitled")
path = pa.get("claim_path", "")
@ -128,7 +129,16 @@ Set `contributor_thesis_extractable: true` if you extracted the contributor's th
pa_lines.append(f"{i}. **{title}** (`{filename}`, similarity: {score:.2f})")
if desc:
pa_lines.append(f" {desc}")
if score >= 0.75:
high_sim.append(title)
pa_lines.append("")
if high_sim:
pa_lines.append("**WARNING — HIGH SIMILARITY MATCHES (score >= 0.75):**")
pa_lines.append("The following existing claims are very similar to themes in this source.")
pa_lines.append("Do NOT extract new claims that restate these — use ENRICHMENT instead:")
for hs in high_sim:
pa_lines.append(f" - {hs}")
pa_lines.append("")
connection_candidates = "\n".join(pa_lines)
else:
connection_candidates = ""

View file

@ -140,7 +140,12 @@ def validate_schema(fm: dict) -> list[str]:
valid_conf = schema.get("valid_confidence")
confidence = fm.get("confidence")
if valid_conf and confidence and confidence not in valid_conf:
violations.append(f"invalid_confidence:{confidence}")
# Common LLM aliases — recorded as no violation here; this check does not rewrite the value
_CONFIDENCE_ALIASES = {"high": "likely", "medium": "experimental", "low": "speculative", "very high": "proven", "moderate": "experimental"}
if isinstance(confidence, str) and confidence.lower().strip() in _CONFIDENCE_ALIASES:
pass # Fixable by post-extract or fixer — don't gate on this
else:
violations.append(f"invalid_confidence:{confidence}")
desc = fm.get("description")
if isinstance(desc, str) and len(desc.strip()) < 10:
@ -550,6 +555,16 @@ def tier05_mechanical_check(diff: str, existing_claims: set[str] | None = None)
is_new = filepath in new_files
if is_new:
# Strip code fences — LLM agents sometimes wrap content in ```markdown or ```yaml
stripped = content.strip()
if stripped.startswith("```"):
first_nl = stripped.find("\n")
if first_nl != -1:
stripped = stripped[first_nl + 1:]
if stripped.endswith("```"):
stripped = stripped[:-3].strip()
content = stripped
fm, body = parse_frontmatter(content)
if fm is None:
issues.append("frontmatter_schema")