feat: author handle domain signal + conversation skip at source (Ganymede)
1. Author handle map: known X accounts (MetaDAO, Anthropic, SpaceX etc.) count as 1 keyword match toward domain routing threshold. Lightweight, no URL parsing. 2. Conversation archives now write to conversations/ subdir instead of top-level staging dir. The cron only moves top-level *.md to queue, so conversations never enter the extraction pipeline. Skip happens at write time, not at batch-extract read time — eliminates wasted I/O every 15 minutes. Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
This commit is contained in:
parent
b5aabe0364
commit
1dfc6dcc5c
1 changed file with 36 additions and 6 deletions
|
|
@ -142,17 +142,43 @@ _DOMAIN_KEYWORDS = {
|
|||
}
|
||||
|
||||
|
||||
def _classify_content(text: str) -> tuple[str, list[str]]:
|
||||
"""Classify content into domain + sub-tags based on keywords.
|
||||
# Author handle → domain map (Ganymede: counts as 1 keyword match)
|
||||
_AUTHOR_DOMAIN_MAP = {
|
||||
"metadaoproject": "internet-finance",
|
||||
"metadaofi": "internet-finance",
|
||||
"futardio": "internet-finance",
|
||||
"p2pdotme": "internet-finance",
|
||||
"oxranga": "internet-finance",
|
||||
"metanallok": "internet-finance",
|
||||
"proph3t_": "internet-finance",
|
||||
"01resolved": "internet-finance",
|
||||
"anthropicai": "ai-alignment",
|
||||
"openai": "ai-alignment",
|
||||
"daborai": "ai-alignment",
|
||||
"deepmind": "ai-alignment",
|
||||
"spacex": "space-development",
|
||||
"blaborig": "space-development",
|
||||
"nasa": "space-development",
|
||||
}
|
||||
|
||||
|
||||
def _classify_content(text: str, author: str = "") -> tuple[str, list[str]]:
|
||||
"""Classify content into domain + sub-tags based on keywords + author.
|
||||
|
||||
Returns (domain, [sub-tags]). Default: internet-finance with no sub-tags.
|
||||
"""
|
||||
text_lower = text.lower()
|
||||
author_lower = author.lower().lstrip("@")
|
||||
|
||||
# Author handle gives 1 keyword match toward domain threshold
|
||||
author_domain = _AUTHOR_DOMAIN_MAP.get(author_lower, "")
|
||||
|
||||
# Check non-IF domains first
|
||||
for domain, keywords in _DOMAIN_KEYWORDS.items():
|
||||
matches = sum(1 for kw in keywords if kw in text_lower)
|
||||
if matches >= 2: # Need 2+ keyword matches to override default domain
|
||||
if author_domain == domain:
|
||||
matches += 1 # Author signal counts as 1 match
|
||||
if matches >= 2:
|
||||
return domain, []
|
||||
|
||||
# Default to internet-finance, classify sub-topics
|
||||
|
|
@ -1234,15 +1260,19 @@ def _extract_urls(text: str) -> list[str]:
|
|||
|
||||
def _archive_exchange(user_text: str, rio_response: str, user, msg,
|
||||
url_content: str | None = None, urls: list[str] | None = None):
|
||||
"""Archive a tagged exchange to inbox/queue/ for pipeline processing."""
|
||||
"""Archive a tagged exchange. Conversations go to telegram-archives/conversations/
|
||||
(not queue — skips extraction). Sources with URLs already have standalone files."""
|
||||
try:
|
||||
date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
|
||||
username = user.username if user else "anonymous"
|
||||
slug = re.sub(r"[^a-z0-9]+", "-", user_text[:50].lower()).strip("-")
|
||||
filename = f"{date_str}-telegram-{username}-{slug}.md"
|
||||
|
||||
archive_path = Path(ARCHIVE_DIR) / filename
|
||||
archive_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
# Conversations go to conversations/ subdir (Ganymede: skip extraction at source).
|
||||
# The cron only moves top-level ARCHIVE_DIR/*.md to queue — subdirs are untouched.
|
||||
conv_dir = Path(ARCHIVE_DIR) / "conversations"
|
||||
conv_dir.mkdir(parents=True, exist_ok=True)
|
||||
archive_path = conv_dir / filename
|
||||
|
||||
# Extract rationale (the user's text minus the @mention and URL)
|
||||
rationale = re.sub(r"@\w+", "", user_text).strip()
|
||||
|
|
|
|||
Loading…
Reference in a new issue