Merge remote-tracking branch 'origin/epimetheus/reduce-rejections'
commit f0cf772182
3 changed files with 68 additions and 4 deletions
@@ -235,7 +235,13 @@ def _build_claim_content(claim: dict, agent: str, source_format: str | None = No
     domain = claim.get("domain", "")
     title = claim.get("title", claim.get("filename", "").replace("-", " ").replace(".md", ""))
     description = claim.get("description", "")
-    confidence = claim.get("confidence", "experimental")
+    raw_confidence = claim.get("confidence", "experimental")
+    _CONFIDENCE_MAP = {
+        "proven": "proven", "likely": "likely", "experimental": "experimental",
+        "speculative": "speculative", "high": "likely", "medium": "experimental",
+        "low": "speculative", "very high": "proven", "moderate": "experimental",
+    }
+    confidence = _CONFIDENCE_MAP.get(raw_confidence.lower().strip(), "experimental") if isinstance(raw_confidence, str) else "experimental"
     source_ref = claim.get("source", "")
     body = claim.get("body", "")
     scope = claim.get("scope", "")
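Reviewer note: a standalone sketch of what the new mapping buys, runnable as-is. The map is copied from the hunk; the `normalize_confidence` helper is hypothetical (the diff inlines the same logic as a conditional expression).

    _CONFIDENCE_MAP = {
        "proven": "proven", "likely": "likely", "experimental": "experimental",
        "speculative": "speculative", "high": "likely", "medium": "experimental",
        "low": "speculative", "very high": "proven", "moderate": "experimental",
    }

    def normalize_confidence(raw) -> str:
        # Hypothetical helper for illustration; the hunk inlines this expression.
        if isinstance(raw, str):
            return _CONFIDENCE_MAP.get(raw.lower().strip(), "experimental")
        return "experimental"

    assert normalize_confidence("Very High") == "proven"    # LLM alias -> canonical value
    assert normalize_confidence("bogus") == "experimental"  # unknown string -> safe default
    assert normalize_confidence(None) == "experimental"     # non-string -> safe default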
@@ -252,8 +258,8 @@ def _build_claim_content(claim: dict, agent: str, source_format: str | None = No
             if target not in edge_fields[rel]:
                 edge_fields[rel].append(target)
     for r in related_claims[:5]:
-        r_clean = r.replace(".md", "")
-        if r_clean not in edge_fields["related"]:
+        r_clean = r.replace(".md", "").strip("[]").strip()
+        if r_clean and r_clean not in edge_fields["related"]:
             edge_fields["related"].append(r_clean)

     edge_lines = []
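Before/after of the tightened cleanup, assuming related-claim entries can arrive wikilink-wrapped (the sample string is illustrative):

    r = "[[some-claim.md]]"
    old = r.replace(".md", "")                      # "[[some-claim]]" -- brackets leaked into edge fields
    new = r.replace(".md", "").strip("[]").strip()  # "some-claim"
    assert new == "some-claim"
    # The added `if r_clean` guard also drops entries that clean down to an empty string.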
@@ -301,6 +307,14 @@ def _build_entity_content(entity: dict, domain: str) -> str:
     description = entity.get("content", "")

     if description:
+        # Strip code fences the LLM may have wrapped the content in
+        description = description.strip()
+        if description.startswith("```"):
+            first_nl = description.find("\n")
+            if first_nl != -1:
+                description = description[first_nl + 1:]
+        if description.endswith("```"):
+            description = description[:-3].rstrip()
         return description

     name = entity.get("filename", "").replace("-", " ").replace(".md", "").title()
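The same fence-stripping pattern recurs in `tier05_mechanical_check` below; a standalone trace of its behavior (the sample input is illustrative):

    description = "```markdown\n# Some Entity\nWhat it is and why it matters.\n```"
    description = description.strip()
    if description.startswith("```"):
        first_nl = description.find("\n")
        if first_nl != -1:
            description = description[first_nl + 1:]  # drop the opening fence and language tag
    if description.endswith("```"):
        description = description[:-3].rstrip()        # drop the closing fence
    assert description == "# Some Entity\nWhat it is and why it matters."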
@@ -435,6 +449,31 @@ async def _extract_one_source(
         content = _build_entity_content(e, domain)
         entity_files.append({"filename": filename, "domain": domain, "content": content})

+    # 6.5. Pre-filter near-duplicates BEFORE post-extract validation
+    # Uses same SequenceMatcher threshold as tier0. Catches duplicates cheaply ($0)
+    # before they create PRs and burn eval cycles.
+    if claim_files and existing_claims:
+        from difflib import SequenceMatcher as _SM
+        _DEDUP_THRESHOLD = 0.85
+        filtered = []
+        for cf in claim_files:
+            title_lower = Path(cf["filename"]).stem.replace("-", " ").lower()
+            title_words = set(title_lower.split()[:6])
+            is_dup = False
+            for existing in existing_claims:
+                existing_lower = existing.replace("-", " ").lower()
+                if len(title_words & set(existing_lower.split()[:6])) < 2:
+                    continue
+                if _SM(None, title_lower, existing_lower).ratio() >= _DEDUP_THRESHOLD:
+                    logger.info("Extract-dedup: skipping near-duplicate '%s' (matches '%s')", cf["filename"], existing)
+                    is_dup = True
+                    break
+            if not is_dup:
+                filtered.append(cf)
+        if len(filtered) < len(claim_files):
+            logger.info("Extract-dedup: filtered %d/%d near-duplicates", len(claim_files) - len(filtered), len(claim_files))
+        claim_files = filtered
+
     # 7. Post-extraction validation
     if claim_files:
         kept_claims, rejected_claims, stats = validate_and_fix_claims(
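A condensed sketch of the pre-filter's two-stage match, runnable standalone; the titles are illustrative, the thresholds are the diff's:

    from difflib import SequenceMatcher

    new_title = "retrieval augmented generation reduces hallucinations"
    existing  = "retrieval augmented generation cuts hallucinations"

    # Stage 1: cheap gate -- require at least 2 shared words among the first six.
    if len(set(new_title.split()[:6]) & set(existing.split()[:6])) >= 2:
        # Stage 2: character-level similarity against the 0.85 threshold.
        ratio = SequenceMatcher(None, new_title, existing).ratio()
        print(f"{ratio:.2f}")  # ~0.93 here, so this claim would be skipped as a near-duplicate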
@@ -119,6 +119,7 @@ Set `contributor_thesis_extractable: true` if you extracted the contributor's th
         "These existing claims are topically related to this source. For each NEW claim you extract,",
         "check this list and specify connections in the `connections` array.\n",
     ]
+    high_sim = []
     for i, pa in enumerate(prior_art[:10], 1):
         title = pa.get("claim_title", "untitled")
         path = pa.get("claim_path", "")
@@ -128,7 +129,16 @@ Set `contributor_thesis_extractable: true` if you extracted the contributor's th
         pa_lines.append(f"{i}. **{title}** (`{filename}`, similarity: {score:.2f})")
         if desc:
             pa_lines.append(f" {desc}")
+        if score >= 0.75:
+            high_sim.append(title)
         pa_lines.append("")
+    if high_sim:
+        pa_lines.append("**WARNING — HIGH SIMILARITY MATCHES (score >= 0.75):**")
+        pa_lines.append("The following existing claims are very similar to themes in this source.")
+        pa_lines.append("Do NOT extract new claims that restate these — use ENRICHMENT instead:")
+        for hs in high_sim:
+            pa_lines.append(f"  - {hs}")
+        pa_lines.append("")
     connection_candidates = "\n".join(pa_lines)
 else:
     connection_candidates = ""
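Roughly what the appended lines join to in the final prompt when one match clears 0.75 (the claim title, filename, and score are illustrative; the warning text is the diff's own):

    3. **Vector stores drift under heavy writes** (`vector-store-drift.md`, similarity: 0.81)

    **WARNING — HIGH SIMILARITY MATCHES (score >= 0.75):**
    The following existing claims are very similar to themes in this source.
    Do NOT extract new claims that restate these — use ENRICHMENT instead:
      - Vector stores drift under heavy writes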
@@ -140,7 +140,12 @@ def validate_schema(fm: dict) -> list[str]:
     valid_conf = schema.get("valid_confidence")
     confidence = fm.get("confidence")
     if valid_conf and confidence and confidence not in valid_conf:
-        violations.append(f"invalid_confidence:{confidence}")
+        # Common LLM aliases — normalize before failing
+        _CONFIDENCE_ALIASES = {"high": "likely", "medium": "experimental", "low": "speculative", "very high": "proven", "moderate": "experimental"}
+        if isinstance(confidence, str) and confidence.lower().strip() in _CONFIDENCE_ALIASES:
+            pass  # Fixable by post-extract or fixer — don't gate on this
+        else:
+            violations.append(f"invalid_confidence:{confidence}")

     desc = fm.get("description")
     if isinstance(desc, str) and len(desc.strip()) < 10:
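A minimal check of the new gating behavior. `check_confidence` is a hypothetical standalone lift of the branch (the real code appends to `violations` inside `validate_schema`), and the canonical list is assumed:

    valid_conf = ["proven", "likely", "experimental", "speculative"]  # assumed canonical values
    _CONFIDENCE_ALIASES = {"high": "likely", "medium": "experimental", "low": "speculative", "very high": "proven", "moderate": "experimental"}

    def check_confidence(confidence):
        if valid_conf and confidence and confidence not in valid_conf:
            if isinstance(confidence, str) and confidence.lower().strip() in _CONFIDENCE_ALIASES:
                return None  # alias: fixable downstream, don't gate here
            return f"invalid_confidence:{confidence}"
        return None

    assert check_confidence("likely") is None                           # canonical passes as before
    assert check_confidence("High") is None                             # alias no longer rejected
    assert check_confidence("certain") == "invalid_confidence:certain"  # genuinely invalid still fails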
@@ -550,6 +555,16 @@ def tier05_mechanical_check(diff: str, existing_claims: set[str] | None = None)
         is_new = filepath in new_files

+        if is_new:
+            # Strip code fences — LLM agents sometimes wrap content in ```markdown or ```yaml
+            stripped = content.strip()
+            if stripped.startswith("```"):
+                first_nl = stripped.find("\n")
+                if first_nl != -1:
+                    stripped = stripped[first_nl + 1:]
+            if stripped.endswith("```"):
+                stripped = stripped[:-3].strip()
+            content = stripped
         fm, body = parse_frontmatter(content)
         if fm is None:
             issues.append("frontmatter_schema")