fix: 3 extraction bugs causing 0% approval rate
1. Remove hardcoded [[_map]] dead link from reconstruct_claim_content
2. Add source status check + archive move after extraction (prevents re-extraction loops)
3. Fix wiki link slug→space normalization (resolves hyphens to spaces before stripping)

Pentagon-Agent: Epimetheus <0144398e-4ed3-4fe2-95a3-3d72e1abf887>
This commit is contained in:
parent
2253f48993
commit
eee2b5c78b
3 changed files with 49 additions and 6 deletions
|
|
@ -100,6 +100,13 @@ for SOURCE in $SOURCES; do
|
||||||
BASENAME=$(basename "$SOURCE" .md)
|
BASENAME=$(basename "$SOURCE" .md)
|
||||||
BRANCH="extract/$BASENAME"
|
BRANCH="extract/$BASENAME"
|
||||||
|
|
||||||
|
# Skip already-processed sources (status set by extraction script, merged to main)
|
||||||
|
if grep -q "^status: processed\|^status: enrichment\|^status: null-result" "$SOURCE" 2>/dev/null; then
|
||||||
|
echo "[$(date)] [$COUNT/$MAX] SKIP $BASENAME (already processed — status in frontmatter)" >> $LOG
|
||||||
|
SKIPPED=$((SKIPPED + 1))
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
# Skip conversation archives — valuable content enters through standalone sources,
|
# Skip conversation archives — valuable content enters through standalone sources,
|
||||||
# inline tags (SOURCE:/CLAIM:), and transcript review. Raw conversations produce
|
# inline tags (SOURCE:/CLAIM:), and transcript review. Raw conversations produce
|
||||||
# low-quality claims with schema failures. (Epimetheus session 4)
|
# low-quality claims with schema failures. (Epimetheus session 4)
|
||||||
|
|
@ -244,6 +251,28 @@ Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>" >> $LOG 2>&1
|
||||||
SUCCESS=$((SUCCESS + 1))
|
SUCCESS=$((SUCCESS + 1))
|
||||||
echo " -> SUCCESS ($CHANGED files)" >> $LOG
|
echo " -> SUCCESS ($CHANGED files)" >> $LOG
|
||||||
|
|
||||||
|
# Move source from queue/ to archive/ on main worktree (prevents re-extraction)
|
||||||
|
DOMAIN=$(grep -m1 "^domain:" "$MAIN_REPO/inbox/queue/$BASENAME.md" 2>/dev/null | sed 's/^domain:\s*//')
|
||||||
|
ARCHIVE_DIR="$MAIN_REPO/inbox/archive/${DOMAIN:-uncategorized}"
|
||||||
|
mkdir -p "$ARCHIVE_DIR"
|
||||||
|
if [ -f "$MAIN_REPO/inbox/queue/$BASENAME.md" ]; then
|
||||||
|
mv "$MAIN_REPO/inbox/queue/$BASENAME.md" "$ARCHIVE_DIR/$BASENAME.md"
|
||||||
|
cd $MAIN_REPO
|
||||||
|
git add "inbox/queue/$BASENAME.md" "inbox/archive/" 2>/dev/null
|
||||||
|
git commit -m "pipeline: archive $BASENAME after extraction
|
||||||
|
|
||||||
|
Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>" >> $LOG 2>&1
|
||||||
|
for attempt in 1 2 3; do
|
||||||
|
git pull --rebase origin main >> $LOG 2>&1
|
||||||
|
git push origin main >> $LOG 2>&1 && break
|
||||||
|
sleep 1
|
||||||
|
done
|
||||||
|
cd $REPO
|
||||||
|
git fetch origin main >> $LOG 2>&1
|
||||||
|
git reset --hard origin/main >> $LOG 2>&1
|
||||||
|
echo " -> Archived $BASENAME to $ARCHIVE_DIR" >> $LOG
|
||||||
|
fi
|
||||||
|
|
||||||
# Back to main
|
# Back to main
|
||||||
git checkout -f main >> $LOG 2>&1
|
git checkout -f main >> $LOG 2>&1
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -163,15 +163,29 @@ def fix_frontmatter(content: str, domain: str, agent: str) -> tuple[str, list[st
|
||||||
|
|
||||||
|
|
||||||
def fix_wiki_links(content: str, existing_claims: set[str]) -> tuple[str, list[str]]:
    """Fix or strip broken wiki links. Resolves slug→space mismatches before stripping.

    The LLM often generates wiki links as slugs (hyphens) but KB filenames use spaces.
    Try normalizing hyphens→spaces before giving up and stripping brackets.

    Returns (fixed_content, fixes), where `fixes` records each resolved or
    stripped link.
    """
    fixes: list[str] = []

    # Lookup from normalized form (lowercased, hyphens replaced by spaces)
    # back to the original claim stem, built once per call.
    slug_to_stem: dict[str, str] = {
        stem.lower().replace("-", " "): stem for stem in existing_claims
    }

    def _rewrite(match):
        target = match.group(1).strip()
        if target in existing_claims:
            # Exact match — link is already valid, leave it untouched.
            return match.group(0)
        # Not an exact match: treat the target as a slug and normalize.
        stem = slug_to_stem.get(target.lower().replace("-", " "))
        if stem is not None:
            fixes.append(f"resolved_wiki_link:{target[:40]}->{stem[:40]}")
            return f"[[{stem}]]"
        # Unresolvable — drop the brackets but keep the visible text.
        fixes.append(f"stripped_wiki_link:{target[:60]}")
        return target

    return WIKI_LINK_RE.sub(_rewrite, content), fixes
|
||||||
|
|
|
||||||
|
|
@ -262,7 +262,7 @@ def reconstruct_claim_content(claim, domain, agent):
|
||||||
]
|
]
|
||||||
for r in related[:5]:
|
for r in related[:5]:
|
||||||
lines.append(f"- [[{r}]]")
|
lines.append(f"- [[{r}]]")
|
||||||
lines.extend(["", "Topics:", "- [[_map]]", ""])
|
lines.extend(["", "Topics:", ""])
|
||||||
return "\n".join(lines)
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue