# teleo-infrastructure/scripts/reconcile-source-status.sh

#!/bin/bash
# Reconcile source archive status: mark sources as processed if claims already exist
# Usage: ./reconcile-source-status.sh [--apply]
#   Default: dry-run (preview only)
#   --apply: actually modify files

# Root of the teleo-codex checkout. Export CODEX_DIR before invoking the
# script to point it at a different checkout; otherwise the historical
# default location is used.
CODEX_DIR="${CODEX_DIR:-/Users/coryabdalla/Pentagon/teleo-codex}"
# Source files whose "status:" line is reconciled.
ARCHIVE_DIR="$CODEX_DIR/inbox/archive"
# Claim files that are searched for references to each source.
DOMAINS_DIR="$CODEX_DIR/domains"

# Run mode: "dry-run" only reports, "apply" rewrites the source files.
# Only the first argument is inspected. Anything other than --apply keeps
# the dry-run default, but is reported so a mistyped flag does not pass
# unnoticed.
MODE="dry-run"
case "${1:-}" in
  "") ;;
  --apply) MODE="apply" ;;
  *)
    printf 'warning: ignoring unrecognized argument "%s" (only --apply is supported); staying in dry-run mode\n' "$1" >&2
    ;;
esac

# Announce the run and the selected mode, followed by a blank separator line.
printf '%s\n' "=== Source Status Reconciliation ===" "Mode: $MODE" ""

# Tallies reported in the final summary; all start at zero.
already_ok=0   # sources whose status is not "unprocessed"
skipped=0      # unprocessed sources for which no claim match was found
null_result=0  # sources classified as test/spam
matched=0      # sources with a URL or title match in the claims

# ---------------------------------------------------------------------------
# Helpers used by the reconciliation loop below.
# ---------------------------------------------------------------------------

# Print the value of the first "<field>:" line of a file, with the field
# name and any spaces after the colon removed.
#   $1 - field name (status, url, title, ...)
#   $2 - path of the file to read
# Prints nothing when the field is absent or the file cannot be read.
_source_field() {
  grep -m 1 "^$1:" "$2" 2>/dev/null | sed "s/^$1: *//"
}

# Filter: drop any run of double quotes at the start and end of each line.
_strip_quotes() {
  sed 's/^"*//;s/"*$//'
}

# Succeed when the literal string $1 occurs in any file under $DOMAINS_DIR.
#   $1 - text to look for; an empty string never matches
#   $2 - optional "-i" to ignore case
# The text is matched as a fixed string (grep -F): titles and URL segments
# routinely contain '.', '*', '[' and similar characters, which as a regex
# either match the wrong text or make grep fail outright. "-e" keeps a key
# that starts with '-' from being parsed as an option.
_claims_mention() {
  local needle=$1 fold_case=${2:-}
  local -a grep_opts=(-r -q -F)
  [[ -n "$needle" ]] || return 1
  if [[ "$fold_case" == "-i" ]]; then
    grep_opts+=(-i)
  fi
  # stderr is dropped on purpose: unreadable files must not abort the scan.
  grep "${grep_opts[@]}" -e "$needle" -- "$DOMAINS_DIR" 2>/dev/null
}

# Rewrite a source file's "status: unprocessed" line and, unless the file
# already has a "processed_by:" line, append the audit fields after it.
#   $1 - path of the source file
#   $2 - new status value (processed, null-result)
#   $3 - text for the notes: field
# Returns non-zero when sed could not rewrite the file.
_mark_source() {
  local file=$1 new_status=$2 note=$3 today
  today=$(date +%Y-%m-%d)

  # "status: *" mirrors the pattern used to read the status, so a file that
  # was detected as unprocessed is always actually rewritten.
  # "-i.bak" (suffix attached) is accepted by both BSD and GNU sed; the
  # backup file is removed once the edit is complete.
  sed -i.bak "s/^status: *unprocessed/status: $new_status/" "$file" || return 1
  if ! grep -q "^processed_by:" "$file"; then
    sed -i.bak "/^status: $new_status/a\\
processed_by: epimetheus-reconcile\\
processed_date: $today\\
notes: \"$note\"" "$file" || return 1
  fi
  rm -f -- "$file.bak"
}

# ---------------------------------------------------------------------------
# Walk every markdown file in the archive and classify the unprocessed ones.
# ---------------------------------------------------------------------------
# IFS= keeps leading/trailing whitespace in file names intact.
while IFS= read -r src; do
  # Only sources still marked "unprocessed" are candidates.
  status=$(_source_field status "$src")
  if [[ "$status" != "unprocessed" ]]; then
    already_ok=$((already_ok + 1))
    continue
  fi

  url=$(_source_field url "$src" | _strip_quotes)
  title=$(_source_field title "$src" | _strip_quotes)
  fname=${src##*/}

  # Check 1: test/spam sources (title starts with "test " or "test-",
  # optionally after a "Futardio: " prefix, case-insensitive). They are
  # settled here, so the recursive claim searches are skipped for them.
  if grep -qiE "^(Futardio: )?test[ -]" <<<"$title"; then
    echo "  NULL-RESULT (test/spam): $fname"
    null_result=$((null_result + 1))
    if [[ "$MODE" == "apply" ]]; then
      _mark_source "$src" "null-result" "auto-reconciled: test/spam source" \
        || echo "  WARNING: could not update $fname" >&2
    fi
    continue
  fi

  # Check 2: URL match. First try the last run of 20+ alphanumerics in the
  # URL (the long hash/slug); if that is absent or not found, fall back to
  # the final path segment when it is longer than 10 characters.
  url_matched=false
  if [[ -n "$url" ]]; then
    url_key=$(grep -oE '[A-Za-z0-9]{20,}' <<<"$url" | tail -1)
    if _claims_mention "$url_key"; then
      url_matched=true
    else
      path_seg=${url##*/}
      if (( ${#path_seg} > 10 )) && _claims_mention "$path_seg"; then
        url_matched=true
      fi
    fi
  fi

  # Check 3: title match. Drop the "Futardio: " prefix and search for the
  # first 30 characters, case-insensitively; keys of 8 characters or fewer
  # are not searched for.
  title_matched=false
  clean_title=${title#Futardio: }
  title_key=${clean_title:0:30}
  if (( ${#title_key} > 8 )) && _claims_mention "$title_key" -i; then
    title_matched=true
  fi

  if $url_matched || $title_matched; then
    # Label is "url", "title" or "url+title".
    match_type=""
    if $url_matched; then
      match_type="url"
    fi
    if $title_matched; then
      match_type="${match_type:+$match_type+}title"
    fi
    echo "  PROCESSED ($match_type): $fname"
    matched=$((matched + 1))
    if [[ "$MODE" == "apply" ]]; then
      _mark_source "$src" "processed" "auto-reconciled: claims found matching this source" \
        || echo "  WARNING: could not update $fname" >&2
    fi
  else
    skipped=$((skipped + 1))
  fi
done < <(find "$ARCHIVE_DIR" -name "*.md" -type f)

# Final report: the four tallies plus a fresh count of all markdown files
# under the archive directory (find errors are suppressed, so a missing
# directory is reported as 0).
archive_total=$(find "$ARCHIVE_DIR" -name '*.md' -type f 2>/dev/null | wc -l | tr -d ' ')

cat <<EOF

=== Summary ===
Already correct status: $already_ok
Matched → processed:   $matched
Test/spam → null-result: $null_result
Still unprocessed:      $skipped
Total archive files:    $archive_total
EOF