From 82d6d357eb27ecfe5481f3c180e342fb644e16ee Mon Sep 17 00:00:00 2001
From: chemavx <chemavx@chemavx.xyz>
Date: Tue, 14 Apr 2026 12:42:19 +0000
Subject: [PATCH] feat(news): replace keyword sentiment with VADER
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

vaderSentiment==3.3.2 added to requirements.txt.

_score_headlines now:
- scores each article (title + description) with VADER compound ∈ [-1, +1]
- filters out articles with |compound| ≤ 0.05 (no clear signal)
- weights remaining articles by recency (GNews newest-first, rank 0 → highest weight)
- returns weighted mean clamped to [-1, +1]

Removes the custom keyword sets (_POSITIVE/_NEGATIVE) and the set-based
bag-of-words algorithm that capped scores at ~±0.5 in practice.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 bot/data/news.py | 73 ++++++++++++++++++++++++++----------------------
 requirements.txt |  1 +
 2 files changed, 40 insertions(+), 34 deletions(-)

diff --git a/bot/data/news.py b/bot/data/news.py
index 8eb1914..fe3fe4e 100644
--- a/bot/data/news.py
+++ b/bot/data/news.py
@@ -7,6 +7,10 @@ Free tier: 100 requests/day.  Budget:
 - 2-second sleep between actual API calls to avoid burst 429s
 With ≤9 politics markets and 6 h cache → ≤9 requests per 6 h = ≤36/day.
 
+Sentiment engine: VADER (Valence Aware Dictionary and sEntiment Reasoner).
+Designed for short social/news text — handles negations, intensifiers, and
+punctuation natively.  Returns compound ∈ [-1, +1] per article.
+
 Score returned: -1.0 (very negative headlines) → +1.0 (very positive).
 Returns 0.0 on any error or missing API key so the caller degrades gracefully.
 """
@@ -17,32 +21,17 @@ import re
 import time
 
 import httpx
+from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
 
 log = logging.getLogger(__name__)
 
 GNEWS_API = "https://gnews.io/api/v4/search"
 CACHE_TTL = 6 * 3600   # seconds — ≤9 politics markets × 4 cycles/day = ≤36 req/day
 _INTER_REQUEST_SLEEP = 2  # seconds between consecutive real API calls
+_NEUTRAL_THRESHOLD = 0.05  # |compound| at or below this → article excluded from average
 
-# ---------------------------------------------------------------------------
-# Keyword lists for headline sentiment
-# ---------------------------------------------------------------------------
-_POSITIVE = {
-    "win", "wins", "won", "victory", "success", "successful",
-    "agree", "agreed", "agreement", "approve", "approved", "approval",
-    "confirm", "confirmed", "sign", "signed", "deal", "advance",
-    "progress", "support", "peace", "likely", "probable", "imminent",
-    "historic", "breakthrough", "resolve", "resolved", "resume", "resumed",
-}
-_NEGATIVE = {
-    "fail", "fails", "failed", "failure", "reject", "rejected", "rejection",
-    "block", "blocked", "refuse", "refused", "deny", "denied",
-    "lose", "lost", "collapse", "collapsed", "crisis", "war", "attack",
-    "veto", "oppose", "opposed", "unlikely", "impossible", "never",
-    "stall", "stalled", "withdraw", "withdrew", "sanction", "sanctions",
-    "threat", "threatens", "dead", "halt", "halted", "cancel", "cancelled",
-    "breakdown", "escalate", "escalation",
-}
+# Shared VADER analyzer, created once at import and reused (stateless, thread-safe)
+_vader = SentimentIntensityAnalyzer()
 
 # Words stripped when building the search query (too generic to help relevance)
 _QUERY_STOPWORDS = {
@@ -54,7 +43,7 @@ _QUERY_STOPWORDS = {
     "win", "lose", "get", "make", "take",
 }
 
-# Regex patterns for dates / noise
+# Regex patterns for dates / noise in market questions
 _DATE_RE = re.compile(
     r"\b(january|february|march|april|may|june|july|august|"
     r"september|october|november|december)\s+\d{1,2}\b"
@@ -184,22 +173,42 @@ class NewsClient:
     @staticmethod
     def _score_headlines(articles: list[dict]) -> float:
         """
-        Score each article title + description independently, then average.
-        Each article vote: (pos_hits - neg_hits) / (pos_hits + neg_hits) ∈ [-1, 1].
-        Articles with no sentiment keywords contribute 0 (not excluded).
+        Score articles with VADER, weight by recency, return ∈ [-1, +1].
+
+        Algorithm:
+        1. For each article: run VADER on title + description → compound ∈ [-1, +1]
+           (a missing or null title/description is treated as empty text)
+        2. Exclude articles with |compound| ≤ _NEUTRAL_THRESHOLD (no clear signal)
+        3. GNews returns articles newest-first, so the K articles that survive
+           the filter get linear recency weights in their original order:
+           newest survivor → weight K, next → K-1, ..., oldest survivor → weight 1
+        4. Return weighted mean, clamped to [-1, +1]
+
+        Returns 0.0 when there are no articles or none passes the filter.
         """
         if not articles:
             return 0.0
 
-        votes: list[float] = []
-        for art in articles:
-            text = (
-                f"{art.get('title', '')} {art.get('description', '')}"
-            ).lower()
-            words = set(re.findall(r"\b\w+\b", text))
-            pos = len(words & _POSITIVE)
-            neg = len(words & _NEGATIVE)
-            total = pos + neg
-            votes.append((pos - neg) / total if total > 0 else 0.0)
+        scored: list[float] = []  # compounds of signal-bearing articles, newest first
+        for art in articles:
+            # `or ""` also covers keys present with a null value; interpolating
+            # None would add the token "None", which VADER treats as a negator.
+            title = art.get("title") or ""
+            description = art.get("description") or ""
+            compound = _vader.polarity_scores(f"{title} {description}")["compound"]
+            if abs(compound) > _NEUTRAL_THRESHOLD:
+                scored.append(compound)
 
-        return max(-1.0, min(1.0, sum(votes) / len(votes)))
+        if not scored:
+            return 0.0
+
+        n = len(scored)
+        # Weight by recency: rank 0 (newest survivor) → weight n, rank n-1 → weight 1.
+        total_weight = 0.0
+        weighted_sum = 0.0
+        for rank, compound in enumerate(scored):
+            weight = n - rank
+            weighted_sum += compound * weight
+            total_weight += weight
+
+        return max(-1.0, min(1.0, weighted_sum / total_weight))
diff --git a/requirements.txt b/requirements.txt
index ba871dc..a669791 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,6 +10,7 @@ pydantic==2.7.0
 
 # Utils
 python-dotenv==1.0.1
+vaderSentiment==3.3.2
 
 # Testing
 pytest==8.2.0