feat(news): replace keyword sentiment with VADER
CI/CD / build-and-push (push) Successful in 1m27s

vaderSentiment==3.3.2 added to requirements.txt.

_score_headlines now:
- scores each article (title + description) with VADER compound ∈ [-1, +1]
- filters out articles with |compound| ≤ 0.05 (no clear signal)
- weights remaining articles by recency (GNews newest-first, rank 0 → highest weight)
- returns weighted mean clamped to [-1, +1]

Removes the custom keyword sets (_POSITIVE/_NEGATIVE) and the set-based
bag-of-words algorithm that capped scores at ~±0.5 in practice.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
chemavx
2026-04-14 12:42:19 +00:00
parent 33ad86f352
commit 82d6d357eb
2 changed files with 40 additions and 34 deletions
+39 -34
View File
@@ -7,6 +7,10 @@ Free tier: 100 requests/day. Budget:
- 2-second sleep between actual API calls to avoid burst 429s - 2-second sleep between actual API calls to avoid burst 429s
With ≤9 politics markets and 6 h cache → ≤9 requests per 6 h = ≤36/day. With ≤9 politics markets and 6 h cache → ≤9 requests per 6 h = ≤36/day.
Sentiment engine: VADER (Valence Aware Dictionary and sEntiment Reasoner).
Designed for short social/news text — handles negations, intensifiers, and
punctuation natively. Returns compound ∈ [-1, +1] per article.
Score returned: -1.0 (very negative headlines) → +1.0 (very positive). Score returned: -1.0 (very negative headlines) → +1.0 (very positive).
Returns 0.0 on any error or missing API key so the caller degrades gracefully. Returns 0.0 on any error or missing API key so the caller degrades gracefully.
""" """
@@ -17,32 +21,17 @@ import re
import time import time
import httpx import httpx
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
GNEWS_API = "https://gnews.io/api/v4/search" GNEWS_API = "https://gnews.io/api/v4/search"
CACHE_TTL = 6 * 3600 # seconds — ≤9 politics markets × 4 cycles/day = ≤36 req/day CACHE_TTL = 6 * 3600 # seconds — ≤9 politics markets × 4 cycles/day = ≤36 req/day
_INTER_REQUEST_SLEEP = 2 # seconds between consecutive real API calls _INTER_REQUEST_SLEEP = 2 # seconds between consecutive real API calls
_NEUTRAL_THRESHOLD = 0.05 # |compound| at or below this → article excluded from average
# --------------------------------------------------------------------------- # Shared VADER analyzer (stateless, thread-safe; built once at import and reused for every call)
# Keyword lists for headline sentiment _vader = SentimentIntensityAnalyzer()
# ---------------------------------------------------------------------------
_POSITIVE = {
"win", "wins", "won", "victory", "success", "successful",
"agree", "agreed", "agreement", "approve", "approved", "approval",
"confirm", "confirmed", "sign", "signed", "deal", "advance",
"progress", "support", "peace", "likely", "probable", "imminent",
"historic", "breakthrough", "resolve", "resolved", "resume", "resumed",
}
_NEGATIVE = {
"fail", "fails", "failed", "failure", "reject", "rejected", "rejection",
"block", "blocked", "refuse", "refused", "deny", "denied",
"lose", "lost", "collapse", "collapsed", "crisis", "war", "attack",
"veto", "oppose", "opposed", "unlikely", "impossible", "never",
"stall", "stalled", "withdraw", "withdrew", "sanction", "sanctions",
"threat", "threatens", "dead", "halt", "halted", "cancel", "cancelled",
"breakdown", "escalate", "escalation",
}
# Words stripped when building the search query (too generic to help relevance) # Words stripped when building the search query (too generic to help relevance)
_QUERY_STOPWORDS = { _QUERY_STOPWORDS = {
@@ -54,7 +43,7 @@ _QUERY_STOPWORDS = {
"win", "lose", "get", "make", "take", "win", "lose", "get", "make", "take",
} }
# Regex patterns for dates / noise # Regex patterns for dates / noise in market questions
_DATE_RE = re.compile( _DATE_RE = re.compile(
r"\b(january|february|march|april|may|june|july|august|" r"\b(january|february|march|april|may|june|july|august|"
r"september|october|november|december)\s+\d{1,2}\b" r"september|october|november|december)\s+\d{1,2}\b"
@@ -184,22 +173,38 @@ class NewsClient:
@staticmethod @staticmethod
def _score_headlines(articles: list[dict]) -> float: def _score_headlines(articles: list[dict]) -> float:
""" """
Score each article title + description independently, then average. Score articles with VADER, weight by recency, return ∈ [-1, +1].
Each article vote: (pos_hits - neg_hits) / (pos_hits + neg_hits) ∈ [-1, 1].
Articles with no sentiment keywords contribute 0 (not excluded). Algorithm:
1. For each article: run VADER on title + description → compound ∈ [-1, +1]
2. Exclude articles with |compound| ≤ _NEUTRAL_THRESHOLD (no clear signal)
3. GNews returns articles newest-first, so assign linear recency weights to the
N articles that passed the threshold filter, kept in their original order:
kept[0] → weight N, kept[1] → weight N-1, ..., kept[N-1] → weight 1
(filtered-out articles get no weight and do not consume a rank)
4. Return weighted mean, clamped to [-1, +1]
""" """
if not articles: if not articles:
return 0.0 return 0.0
votes: list[float] = [] scored: list[tuple[float, int]] = [] # (compound, original_index)
for art in articles: for idx, art in enumerate(articles):
text = ( text = f"{art.get('title', '')} {art.get('description', '')}"
f"{art.get('title', '')} {art.get('description', '')}" compound = _vader.polarity_scores(text)["compound"]
).lower() if abs(compound) > _NEUTRAL_THRESHOLD:
words = set(re.findall(r"\b\w+\b", text)) scored.append((compound, idx))
pos = len(words & _POSITIVE)
neg = len(words & _NEGATIVE)
total = pos + neg
votes.append((pos - neg) / total if total > 0 else 0.0)
return max(-1.0, min(1.0, sum(votes) / len(votes))) if not scored:
return 0.0
n = len(scored)
# Weight by recency: earlier index (newer article) → higher weight.
# scored is already in original (newest-first) order since we enumerate
# articles sequentially and only append those that pass the threshold.
total_weight = 0.0
weighted_sum = 0.0
for rank, (compound, _idx) in enumerate(scored):
weight = n - rank # rank 0 → weight n, rank n-1 → weight 1
weighted_sum += compound * weight
total_weight += weight
return max(-1.0, min(1.0, weighted_sum / total_weight))
+1
View File
@@ -10,6 +10,7 @@ pydantic==2.7.0
# Utils # Utils
python-dotenv==1.0.1 python-dotenv==1.0.1
vaderSentiment==3.3.2
# Testing # Testing
pytest==8.2.0 pytest==8.2.0