Some checks are pending
CI / lint-and-test (push) Waiting to run
Move scattered root-level files into categorized directories: - deploy/ — deployment + mirror scripts (Ship) - scripts/ — one-off backfills + migrations (Ship) - research/ — nightly research + prompts (Ship) - docs/ — all operational documentation (shared) Delete 3 dead cron scripts replaced by pipeline daemon: - batch-extract-50.sh, evaluate-trigger.sh, extract-cron.sh Add CODEOWNERS mapping every path to its owning agent. Add README with directory structure, ownership table, and VPS layout. Update deploy.sh paths to match new structure. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
115 lines
3.8 KiB
Bash
Executable file
115 lines
3.8 KiB
Bash
Executable file
#!/bin/bash
# Reconcile source archive status: mark sources as processed if claims already exist
# Usage: ./reconcile-source-status.sh [--apply]
#   Default: dry-run (preview only)
#   --apply: actually modify files
# Environment:
#   CODEX_DIR  root of the codex checkout (optional; see default below)

# Root of the codex checkout. Overridable from the environment so the script
# is not tied to a single machine; the default is the original fixed path.
CODEX_DIR="${CODEX_DIR:-/Users/coryabdalla/Pentagon/teleo-codex}"
ARCHIVE_DIR="$CODEX_DIR/inbox/archive"   # source files whose status is reconciled
DOMAINS_DIR="$CODEX_DIR/domains"         # claim files searched for references

# Dry-run is the safe default; only the exact flag --apply enables writes.
MODE="dry-run"
case "${1:-}" in
  --apply)
    MODE="apply"
    ;;
  "")
    ;;
  *)
    # A mistyped flag must never look like a successful apply: say so on
    # stderr and stay in dry-run.
    echo "warning: ignoring unrecognized argument '${1}' (expected --apply); staying in dry-run" >&2
    ;;
esac

echo "=== Source Status Reconciliation ==="
echo "Mode: $MODE"
echo ""

# Counters updated by the reconciliation loop and printed in the summary.
matched=0       # unprocessed sources with a matching claim
null_result=0   # unprocessed sources recognised as test/spam
skipped=0       # unprocessed sources with no match
already_ok=0    # sources whose status is not "unprocessed"

# Print the value of the first "KEY: value" line of FILE; prints nothing when
# the key is absent or the file cannot be read.
field_value() {
  local key=$1 file=$2
  grep -m1 -- "^${key}:" "$file" 2>/dev/null | sed "s/^${key}: *//" || true
}

# Like field_value, but also strips any run of double quotes from both ends.
unquoted_field() {
  field_value "$1" "$2" | sed -e 's/^"*//' -e 's/"*$//'
}

# Succeed when NEEDLE ($1) occurs anywhere under $DOMAINS_DIR.
# The needle is matched as a fixed string (-F): titles and URL segments
# routinely contain regex metacharacters such as "[", "?", "." or "*".
# Pass "nocase" as $2 for a case-insensitive search.
claims_mention() {
  local needle=$1
  local -a flags=(-r -q -F)
  if [[ "${2:-}" == "nocase" ]]; then
    flags+=(-i)
  fi
  grep "${flags[@]}" -- "$needle" "$DOMAINS_DIR" 2>/dev/null
}

# Rewrite FILE ($1) in place: every line starting with "status: unprocessed"
# gets that prefix replaced by "status: NEW_STATUS" ($2). When the file has no
# "processed_by:" line yet, the processed_by / processed_date / notes lines
# (notes text = $3) are added after each "status: NEW_STATUS" line.
# Uses awk plus a temp file rather than "sed -i ''", which only BSD sed accepts.
mark_source() {
  local file=$1 new_status=$2 note=$3
  local add_meta=1 tmp
  if grep -q '^processed_by:' -- "$file"; then
    add_meta=0
  fi
  tmp=$(mktemp) || {
    echo "warning: mktemp failed, left unchanged: $file" >&2
    return 1
  }
  if awk -v st="$new_status" -v add="$add_meta" \
         -v today="$(date +%Y-%m-%d)" -v note="$note" '
       BEGIN { old = "status: unprocessed"; want = "status: " st }
       index($0, old) == 1 { $0 = want substr($0, length(old) + 1) }
       { print }
       add && index($0, want) == 1 {
         print "processed_by: epimetheus-reconcile"
         print "processed_date: " today
         print "notes: \"" note "\""
       }
     ' < "$file" > "$tmp"; then
    # cat (not mv) keeps the original inode and permissions.
    cat "$tmp" > "$file"
  else
    echo "warning: rewrite failed, left unchanged: $file" >&2
  fi
  rm -f -- "$tmp"
}

# Walk every *.md file under $ARCHIVE_DIR and classify each source that is
# still "unprocessed":
#   - test/spam title           -> null-result
#   - URL or title found in the
#     claims under $DOMAINS_DIR -> processed
#   - otherwise                 -> left alone (counted as skipped)
# Files are only modified when MODE is "apply". Updates the global counters
# matched, null_result, skipped and already_ok.
reconcile_sources() {
  local src status url title fname
  local url_key path_seg title_key
  local url_matched title_matched match_type

  # IFS= keeps leading/trailing whitespace in paths intact.
  while IFS= read -r src; do
    # Only process unprocessed sources
    status=$(field_value status "$src")
    if [[ "$status" != "unprocessed" ]]; then
      already_ok=$((already_ok + 1))
      continue
    fi

    url=$(unquoted_field url "$src")
    title=$(unquoted_field title "$src")
    fname=${src##*/}

    # Check 1: test/spam source. Handled first so the recursive claim
    # searches below are not run for sources whose outcome is already decided.
    if grep -qiE '^(Futardio: )?test[ -]' <<<"$title"; then
      echo " NULL-RESULT (test/spam): $fname"
      null_result=$((null_result + 1))
      if [[ "$MODE" == "apply" ]]; then
        mark_source "$src" "null-result" "auto-reconciled: test/spam source"
      fi
      continue
    fi

    # Check 2: URL-based match. First try the last run of 20+ alphanumerics
    # in the URL (the unique hash/slug); if that is absent or not found, fall
    # back to the last path segment when it is longer than 10 characters.
    url_matched=false
    if [[ -n "$url" ]]; then
      url_key=$(grep -oE '[A-Za-z0-9]{20,}' <<<"$url" | tail -1 || true)
      if [[ -n "$url_key" ]] && claims_mention "$url_key"; then
        url_matched=true
      else
        path_seg=${url##*/}
        if (( ${#path_seg} > 10 )) && claims_mention "$path_seg"; then
          url_matched=true
        fi
      fi
    fi

    # Check 3: title match. Drop the "Futardio: " prefix and search for the
    # first 30 characters, case-insensitively; keys of 8 characters or fewer
    # are not searched.
    title_matched=false
    title_key=${title#Futardio: }
    title_key=${title_key:0:30}
    if (( ${#title_key} > 8 )) && claims_mention "$title_key" nocase; then
      title_matched=true
    fi

    if [[ "$url_matched" == false && "$title_matched" == false ]]; then
      skipped=$((skipped + 1))
      continue
    fi

    # Report which checks hit: "url", "title" or "url+title".
    match_type=""
    if [[ "$url_matched" == true ]]; then
      match_type="url"
    fi
    if [[ "$title_matched" == true ]]; then
      match_type="${match_type:+$match_type+}title"
    fi
    echo " PROCESSED ($match_type): $fname"
    matched=$((matched + 1))
    if [[ "$MODE" == "apply" ]]; then
      mark_source "$src" "processed" "auto-reconciled: claims found matching this source"
    fi
  done < <(find "$ARCHIVE_DIR" -name "*.md" -type f)
}

reconcile_sources

# Final report: a blank separator line, a heading, one line per counter, and
# the number of *.md files currently found under $ARCHIVE_DIR.
printf '\n'
printf '%s\n' "=== Summary ==="
printf '%s\n' \
  "Already correct status: $already_ok" \
  "Matched → processed: $matched" \
  "Test/spam → null-result: $null_result" \
  "Still unprocessed: $skipped"

# wc pads its count with spaces on some platforms; tr removes the padding.
total_files=$(find "$ARCHIVE_DIR" -name '*.md' -type f 2>/dev/null | wc -l | tr -d ' ')
printf '%s\n' "Total archive files: $total_files"