vaderSentiment==3.3.2 added to requirements.txt. _score_headlines now: - scores each article (title + description) with VADER compound ∈ [-1, +1] - filters out articles with |compound| ≤ 0.05 (no clear signal) - weights remaining articles by recency (GNews newest-first, rank 0 → highest weight) - returns weighted mean clamped to [-1, +1] Removes the custom keyword sets (_POSITIVE/_NEGATIVE) and the set-based bag-of-words algorithm that capped scores at ~±0.5 in practice. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+39
-34
@@ -7,6 +7,10 @@ Free tier: 100 requests/day. Budget:
|
|||||||
- 2-second sleep between actual API calls to avoid burst 429s
|
- 2-second sleep between actual API calls to avoid burst 429s
|
||||||
With ≤9 politics markets and 6 h cache → ≤9 requests per 6 h = ≤36/day.
|
With ≤9 politics markets and 6 h cache → ≤9 requests per 6 h = ≤36/day.
|
||||||
|
|
||||||
|
Sentiment engine: VADER (Valence Aware Dictionary and sEntiment Reasoner).
|
||||||
|
Designed for short social/news text — handles negations, intensifiers, and
|
||||||
|
punctuation natively. Returns compound ∈ [-1, +1] per article.
|
||||||
|
|
||||||
Score returned: -1.0 (very negative headlines) → +1.0 (very positive).
|
Score returned: -1.0 (very negative headlines) → +1.0 (very positive).
|
||||||
Returns 0.0 on any error or missing API key so the caller degrades gracefully.
|
Returns 0.0 on any error or missing API key so the caller degrades gracefully.
|
||||||
"""
|
"""
|
||||||
@@ -17,32 +21,17 @@ import re
|
|||||||
import time
|
import time
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
GNEWS_API = "https://gnews.io/api/v4/search"
|
GNEWS_API = "https://gnews.io/api/v4/search"
|
||||||
CACHE_TTL = 6 * 3600 # seconds — ≤9 politics markets × 4 cycles/day = ≤36 req/day
|
CACHE_TTL = 6 * 3600 # seconds — ≤9 politics markets × 4 cycles/day = ≤36 req/day
|
||||||
_INTER_REQUEST_SLEEP = 2 # seconds between consecutive real API calls
|
_INTER_REQUEST_SLEEP = 2 # seconds between consecutive real API calls
|
||||||
|
_NEUTRAL_THRESHOLD = 0.05 # |compound| at or below this → article excluded from average
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# Shared VADER analyzer (stateless, thread-safe; created once and reused)
|
||||||
# Keyword lists for headline sentiment
|
_vader = SentimentIntensityAnalyzer()
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
_POSITIVE = {
|
|
||||||
"win", "wins", "won", "victory", "success", "successful",
|
|
||||||
"agree", "agreed", "agreement", "approve", "approved", "approval",
|
|
||||||
"confirm", "confirmed", "sign", "signed", "deal", "advance",
|
|
||||||
"progress", "support", "peace", "likely", "probable", "imminent",
|
|
||||||
"historic", "breakthrough", "resolve", "resolved", "resume", "resumed",
|
|
||||||
}
|
|
||||||
_NEGATIVE = {
|
|
||||||
"fail", "fails", "failed", "failure", "reject", "rejected", "rejection",
|
|
||||||
"block", "blocked", "refuse", "refused", "deny", "denied",
|
|
||||||
"lose", "lost", "collapse", "collapsed", "crisis", "war", "attack",
|
|
||||||
"veto", "oppose", "opposed", "unlikely", "impossible", "never",
|
|
||||||
"stall", "stalled", "withdraw", "withdrew", "sanction", "sanctions",
|
|
||||||
"threat", "threatens", "dead", "halt", "halted", "cancel", "cancelled",
|
|
||||||
"breakdown", "escalate", "escalation",
|
|
||||||
}
|
|
||||||
|
|
||||||
# Words stripped when building the search query (too generic to help relevance)
|
# Words stripped when building the search query (too generic to help relevance)
|
||||||
_QUERY_STOPWORDS = {
|
_QUERY_STOPWORDS = {
|
||||||
@@ -54,7 +43,7 @@ _QUERY_STOPWORDS = {
|
|||||||
"win", "lose", "get", "make", "take",
|
"win", "lose", "get", "make", "take",
|
||||||
}
|
}
|
||||||
|
|
||||||
# Regex patterns for dates / noise
|
# Regex patterns for dates / noise in market questions
|
||||||
_DATE_RE = re.compile(
|
_DATE_RE = re.compile(
|
||||||
r"\b(january|february|march|april|may|june|july|august|"
|
r"\b(january|february|march|april|may|june|july|august|"
|
||||||
r"september|october|november|december)\s+\d{1,2}\b"
|
r"september|october|november|december)\s+\d{1,2}\b"
|
||||||
@@ -184,22 +173,38 @@ class NewsClient:
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def _score_headlines(articles: list[dict]) -> float:
|
def _score_headlines(articles: list[dict]) -> float:
|
||||||
"""
|
"""
|
||||||
Score each article title + description independently, then average.
|
Score articles with VADER, weight by recency, return ∈ [-1, +1].
|
||||||
Each article vote: (pos_hits - neg_hits) / (pos_hits + neg_hits) ∈ [-1, 1].
|
|
||||||
Articles with no sentiment keywords contribute 0 (not excluded).
|
Algorithm:
|
||||||
|
1. For each article: run VADER on title + description → compound ∈ [-1, +1]
|
||||||
|
2. Exclude articles with |compound| ≤ _NEUTRAL_THRESHOLD (no clear signal)
|
||||||
|
3. GNews returns articles newest-first, so assign linear recency weights:
|
||||||
|
article[0] → weight N, article[1] → weight N-1, ..., article[N-1] → weight 1
|
||||||
|
(N and the ranks are counted only among articles that passed the threshold filter)
|
||||||
|
4. Return weighted mean, clamped to [-1, +1]
|
||||||
"""
|
"""
|
||||||
if not articles:
|
if not articles:
|
||||||
return 0.0
|
return 0.0
|
||||||
|
|
||||||
votes: list[float] = []
|
scored: list[tuple[float, int]] = [] # (compound, original_index)
|
||||||
for art in articles:
|
for idx, art in enumerate(articles):
|
||||||
text = (
|
text = f"{art.get('title', '')} {art.get('description', '')}"
|
||||||
f"{art.get('title', '')} {art.get('description', '')}"
|
compound = _vader.polarity_scores(text)["compound"]
|
||||||
).lower()
|
if abs(compound) > _NEUTRAL_THRESHOLD:
|
||||||
words = set(re.findall(r"\b\w+\b", text))
|
scored.append((compound, idx))
|
||||||
pos = len(words & _POSITIVE)
|
|
||||||
neg = len(words & _NEGATIVE)
|
|
||||||
total = pos + neg
|
|
||||||
votes.append((pos - neg) / total if total > 0 else 0.0)
|
|
||||||
|
|
||||||
return max(-1.0, min(1.0, sum(votes) / len(votes)))
|
if not scored:
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
n = len(scored)
|
||||||
|
# Weight by recency: earlier index (newer article) → higher weight.
|
||||||
|
# scored is already in original (newest-first) order since we enumerate
|
||||||
|
# articles sequentially and only append those that pass the threshold.
|
||||||
|
total_weight = 0.0
|
||||||
|
weighted_sum = 0.0
|
||||||
|
for rank, (compound, _idx) in enumerate(scored):
|
||||||
|
weight = n - rank # rank 0 → weight n, rank n-1 → weight 1
|
||||||
|
weighted_sum += compound * weight
|
||||||
|
total_weight += weight
|
||||||
|
|
||||||
|
return max(-1.0, min(1.0, weighted_sum / total_weight))
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ pydantic==2.7.0
|
|||||||
|
|
||||||
# Utils
|
# Utils
|
||||||
python-dotenv==1.0.1
|
python-dotenv==1.0.1
|
||||||
|
vaderSentiment==3.3.2
|
||||||
|
|
||||||
# Testing
|
# Testing
|
||||||
pytest==8.2.0
|
pytest==8.2.0
|
||||||
|
|||||||
Reference in New Issue
Block a user