"""
News sentiment client for GNews API.

Free tier: 100 requests/day — we stay well within this by caching each
unique (query, look-back window) pair for CACHE_TTL seconds (4 hours).
With ~9 political markets refreshed every 4 h that is 9 × 6 = 54 requests/day.

Score returned: -1.0 (very negative headlines) → +1.0 (very positive).
Returns 0.0 on any error or missing API key so the caller degrades gracefully.
"""

import logging
import os
import re
import time
from datetime import datetime, timezone, timedelta

import httpx

log = logging.getLogger(__name__)

GNEWS_API = "https://gnews.io/api/v4/search"
CACHE_TTL = 4 * 3600  # seconds — fits 100 req/day free tier
# How long a neutral placeholder is kept after a 403, so a bad key or an
# exhausted quota is retried hourly instead of on every call.
_FORBIDDEN_BACKOFF = 3600  # seconds

# ---------------------------------------------------------------------------
# Keyword lists for headline sentiment
# ---------------------------------------------------------------------------

_POSITIVE = {
    "win", "wins", "won", "victory", "success", "successful",
    "agree", "agreed", "agreement", "approve", "approved", "approval",
    "confirm", "confirmed", "sign", "signed", "deal", "advance",
    "progress", "support", "peace", "likely", "probable", "imminent",
    "historic", "breakthrough", "resolve", "resolved", "resume", "resumed",
}

_NEGATIVE = {
    "fail", "fails", "failed", "failure", "reject", "rejected", "rejection",
    "block", "blocked", "refuse", "refused", "deny", "denied",
    "lose", "lost", "collapse", "collapsed", "crisis", "war", "attack",
    "veto", "oppose", "opposed", "unlikely", "impossible", "never",
    "stall", "stalled", "withdraw", "withdrew", "sanction", "sanctions",
    "threat", "threatens", "dead", "halt", "halted", "cancel", "cancelled",
    "breakdown", "escalate", "escalation",
}

# Words stripped when building the search query (too generic to help relevance)
_QUERY_STOPWORDS = {
    "will", "the", "a", "an", "by", "in", "on", "at", "to", "of",
    "and", "or", "is", "be", "are", "was", "were", "have", "has", "had",
    "do", "does", "did", "for", "from", "with", "not", "no",
    "this", "that", "it", "its", "their", "they", "he", "she", "we",
    "most", "more", "least", "less", "any", "all", "both", "each",
    "win", "lose", "get", "make", "take",
}

# Regex patterns for dates / noise
_DATE_RE = re.compile(
    r"\b(january|february|march|april|may|june|july|august|"
    r"september|october|november|december)\s+\d{1,2}\b"
    r"|\b20\d{2}\b"
    r"|\bQ[1-4]\b",
    flags=re.IGNORECASE,
)
_PUNCT_RE = re.compile(r"[?!\"'.,;:()\[\]{}]")


class NewsClient:
    """
    Async GNews client with in-memory result cache.

    Usage::

        client = NewsClient()
        score = await client.get_sentiment("Will Trump visit China")
        # score ∈ [-1.0, +1.0] — positive means bullish for the YES outcome
        await client.close()
    """

    def __init__(self) -> None:
        """Read GNEWS_API_KEY from the environment and open the HTTP client."""
        self._api_key = os.getenv("GNEWS_API_KEY", "")
        self._client = httpx.AsyncClient(timeout=10)
        # {cache_key: (expires_at_monotonic, score)}
        self._cache: dict[str, tuple[float, float]] = {}

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def get_sentiment(self, question: str, days: int = 7) -> float:
        """
        Return a sentiment score ∈ [-1.0, +1.0] for the market question.

        - Positive: most recent headlines suggest the YES outcome is more likely
        - Negative: headlines suggest the YES outcome is less likely
        - 0.0: neutral, no data, or API unavailable

        Args:
            question: Free-text market question; reduced to search keywords
                by ``_build_query``.
            days: Look-back window passed to the API as the ``from`` date.

        Returns:
            The averaged headline score. Never raises for network, HTTP or
            payload problems — those are logged and reported as 0.0.
        """
        if not self._api_key:
            log.debug("GNEWS_API_KEY not set — skipping news signal")
            return 0.0

        query = self._build_query(question)
        if len(query) < 3:
            return 0.0

        # The look-back window is part of the key: the same keywords over a
        # different number of days are a different request with a different
        # answer, and must not be served from each other's cache entry.
        cache_key = f"{query.lower()}|{days}"
        now = time.monotonic()

        cached = self._cache.get(cache_key)
        if cached is not None:
            expires_at, score = cached
            if now < expires_at:
                log.debug("News cache hit %r → %.3f", query, score)
                return score

        try:
            resp = await self._client.get(
                GNEWS_API,
                params={
                    "q": query,
                    "lang": "en",
                    "max": 10,
                    "from": _iso_days_ago(days),
                    "token": self._api_key,
                },
            )
        except Exception as exc:
            # Deliberately broad: this is the best-effort boundary, and the
            # failure is not cached so the next call retries.
            log.warning("GNews network error for %r: %s", query, exc)
            return 0.0

        if resp.status_code == 403:
            log.warning("GNews: 403 — invalid key or daily quota exhausted")
            # Cache a neutral result for 1 h to avoid hammering the endpoint
            self._cache[cache_key] = (now + _FORBIDDEN_BACKOFF, 0.0)
            return 0.0

        try:
            resp.raise_for_status()
            data = resp.json()
        except Exception as exc:
            log.warning("GNews bad response for %r: %s", query, exc)
            return 0.0

        articles = self._extract_articles(data)
        score = self._score_headlines(articles)
        self._cache[cache_key] = (now + CACHE_TTL, score)
        log.info(
            "GNews %r → %d articles, sentiment=%.3f",
            query, len(articles), score,
        )
        return score

    async def close(self) -> None:
        """Close the underlying HTTP client."""
        await self._client.aclose()

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _build_query(question: str) -> str:
        """Extract meaningful search terms from a market question."""
        q = _DATE_RE.sub(" ", question)
        q = _PUNCT_RE.sub(" ", q)
        tokens = [
            w for w in q.split()
            if w.lower() not in _QUERY_STOPWORDS and len(w) > 2
        ]
        return " ".join(tokens[:8])  # GNews handles ~8 keyword queries well

    @staticmethod
    def _extract_articles(data: object) -> list[dict]:
        """
        Return the article objects from a decoded response payload.

        Anything that is not ``{"articles": [<object>, ...]}`` — a non-object
        payload, a missing or null ``articles`` field, non-object entries —
        yields an empty list (or is dropped) rather than raising, so a
        malformed response is scored as "no data".
        """
        if not isinstance(data, dict):
            return []
        articles = data.get("articles")
        if not isinstance(articles, list):
            return []
        return [art for art in articles if isinstance(art, dict)]

    @staticmethod
    def _score_headlines(articles: list[dict]) -> float:
        """
        Score each article title + description independently, then average.

        Each article vote: (pos_hits - neg_hits) / (pos_hits + neg_hits) ∈ [-1, 1].
        Articles with no sentiment keywords contribute 0 (not excluded).
        """
        if not articles:
            return 0.0

        votes: list[float] = []
        for art in articles:
            # `or ""` keeps a null field from becoming the literal text "None".
            title = art.get("title") or ""
            description = art.get("description") or ""
            text = f"{title} {description}".lower()
            # A set, so a keyword repeated within one article counts once.
            words = set(re.findall(r"\b\w+\b", text))
            pos = len(words & _POSITIVE)
            neg = len(words & _NEGATIVE)
            total = pos + neg
            votes.append((pos - neg) / total if total > 0 else 0.0)

        return max(-1.0, min(1.0, sum(votes) / len(votes)))


def _iso_days_ago(days: int) -> str:
    """Return the UTC timestamp `days` days before now as ``YYYY-MM-DDTHH:MM:SSZ``."""
    dt = datetime.now(timezone.utc) - timedelta(days=days)
    return dt.strftime("%Y-%m-%dT%H:%M:%SZ")