From 82d6d357eb27ecfe5481f3c180e342fb644e16ee Mon Sep 17 00:00:00 2001 From: chemavx Date: Tue, 14 Apr 2026 12:42:19 +0000 Subject: [PATCH] feat(news): replace keyword sentiment with VADER MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vaderSentiment==3.3.2 added to requirements.txt. _score_headlines now: - scores each article (title + description) with VADER compound ∈ [-1, +1] - filters out articles with |compound| ≤ 0.05 (no clear signal) - weights remaining articles by recency (GNews newest-first, rank 0 → highest weight) - returns weighted mean clamped to [-1, +1] Removes the custom keyword sets (_POSITIVE/_NEGATIVE) and the set-based bag-of-words algorithm that capped scores at ~±0.5 in practice. Co-Authored-By: Claude Sonnet 4.6 --- bot/data/news.py | 73 ++++++++++++++++++++++++++---------------------- requirements.txt | 1 + 2 files changed, 40 insertions(+), 34 deletions(-) diff --git a/bot/data/news.py b/bot/data/news.py index 8eb1914..fe3fe4e 100644 --- a/bot/data/news.py +++ b/bot/data/news.py @@ -7,6 +7,10 @@ Free tier: 100 requests/day. Budget: - 2-second sleep between actual API calls to avoid burst 429s With ≤9 politics markets and 6 h cache → ≤9 requests per 6 h = ≤36/day. +Sentiment engine: VADER (Valence Aware Dictionary and sEntiment Reasoner). +Designed for short social/news text — handles negations, intensifiers, and +punctuation natively. Returns compound ∈ [-1, +1] per article. + Score returned: -1.0 (very negative headlines) → +1.0 (very positive). Returns 0.0 on any error or missing API key so the caller degrades gracefully. 
""" @@ -17,32 +21,17 @@ import re import time import httpx +from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer log = logging.getLogger(__name__) GNEWS_API = "https://gnews.io/api/v4/search" CACHE_TTL = 6 * 3600 # seconds — ≤9 politics markets × 4 cycles/day = ≤36 req/day _INTER_REQUEST_SLEEP = 2 # seconds between consecutive real API calls +_NEUTRAL_THRESHOLD = 0.05 # |compound| below this → article excluded from average -# --------------------------------------------------------------------------- -# Keyword lists for headline sentiment -# --------------------------------------------------------------------------- -_POSITIVE = { - "win", "wins", "won", "victory", "success", "successful", - "agree", "agreed", "agreement", "approve", "approved", "approval", - "confirm", "confirmed", "sign", "signed", "deal", "advance", - "progress", "support", "peace", "likely", "probable", "imminent", - "historic", "breakthrough", "resolve", "resolved", "resume", "resumed", -} -_NEGATIVE = { - "fail", "fails", "failed", "failure", "reject", "rejected", "rejection", - "block", "blocked", "refuse", "refused", "deny", "denied", - "lose", "lost", "collapse", "collapsed", "crisis", "war", "attack", - "veto", "oppose", "opposed", "unlikely", "impossible", "never", - "stall", "stalled", "withdraw", "withdrew", "sanction", "sanctions", - "threat", "threatens", "dead", "halt", "halted", "cancel", "cancelled", - "breakdown", "escalate", "escalation", -} +# Shared VADER analyzer (stateless, thread-safe, cheap to create once) +_vader = SentimentIntensityAnalyzer() # Words stripped when building the search query (too generic to help relevance) _QUERY_STOPWORDS = { @@ -54,7 +43,7 @@ _QUERY_STOPWORDS = { "win", "lose", "get", "make", "take", } -# Regex patterns for dates / noise +# Regex patterns for dates / noise in market questions _DATE_RE = re.compile( r"\b(january|february|march|april|may|june|july|august|" r"september|october|november|december)\s+\d{1,2}\b" @@ -184,22 
+173,38 @@ class NewsClient: @staticmethod def _score_headlines(articles: list[dict]) -> float: """ - Score each article title + description independently, then average. - Each article vote: (pos_hits - neg_hits) / (pos_hits + neg_hits) ∈ [-1, 1]. - Articles with no sentiment keywords contribute 0 (not excluded). + Score articles with VADER, weight by recency, return ∈ [-1, +1]. + + Algorithm: + 1. For each article: run VADER on title + description → compound ∈ [-1, +1] + 2. Exclude articles with |compound| ≤ _NEUTRAL_THRESHOLD (no clear signal) + 3. GNews returns articles newest-first, so assign linear recency weights: + kept[0] → weight N, kept[1] → weight N-1, ..., kept[N-1] → weight 1 + (kept = articles that passed the threshold filter, in order; N = len(kept)) + 4. Return weighted mean, clamped to [-1, +1] """ if not articles: return 0.0 - votes: list[float] = [] - for art in articles: - text = ( - f"{art.get('title', '')} {art.get('description', '')}" - ).lower() - words = set(re.findall(r"\b\w+\b", text)) - pos = len(words & _POSITIVE) - neg = len(words & _NEGATIVE) - total = pos + neg - votes.append((pos - neg) / total if total > 0 else 0.0) + scored: list[tuple[float, int]] = [] # (compound, original_index) + for idx, art in enumerate(articles): + text = f"{art.get('title', '')} {art.get('description', '')}" + compound = _vader.polarity_scores(text)["compound"] + if abs(compound) > _NEUTRAL_THRESHOLD: + scored.append((compound, idx)) - return max(-1.0, min(1.0, sum(votes) / len(votes))) + if not scored: + return 0.0 + + n = len(scored) + # Weight by recency: earlier index (newer article) → higher weight. + # scored is already in original (newest-first) order since we enumerate + # articles sequentially and only append those that pass the threshold. 
+ total_weight = 0.0 + weighted_sum = 0.0 + for rank, (compound, _idx) in enumerate(scored): + weight = n - rank # rank 0 → weight n, rank n-1 → weight 1 + weighted_sum += compound * weight + total_weight += weight + + return max(-1.0, min(1.0, weighted_sum / total_weight)) diff --git a/requirements.txt b/requirements.txt index ba871dc..a669791 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,6 +10,7 @@ pydantic==2.7.0 # Utils python-dotenv==1.0.1 +vaderSentiment==3.3.2 # Testing pytest==8.2.0