""" News sentiment client for GNews API. Free tier: 100 requests/day. Budget: - Cache TTL: 6 hours — same query is never repeated within 6 h - Max 5 queries per trading cycle (politics markets only) - 2-second sleep between actual API calls to avoid burst 429s With ≤9 politics markets and 6 h cache → ≤9 requests per 6 h = ≤36/day. Sentiment engine: VADER (Valence Aware Dictionary and sEntiment Reasoner). Designed for short social/news text — handles negations, intensifiers, and punctuation natively. Returns compound ∈ [-1, +1] per article. Score returned: -1.0 (very negative headlines) → +1.0 (very positive). Returns 0.0 on any error or missing API key so the caller degrades gracefully. """ import asyncio import logging import os import re import time import httpx from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer log = logging.getLogger(__name__) GNEWS_API = "https://gnews.io/api/v4/search" CACHE_TTL = 6 * 3600 # seconds — ≤9 politics markets × 4 cycles/day = ≤36 req/day _INTER_REQUEST_SLEEP = 2 # seconds between consecutive real API calls _NEUTRAL_THRESHOLD = 0.05 # |compound| below this → article excluded from average # Shared VADER analyzer (stateless, thread-safe, cheap to create once) _vader = SentimentIntensityAnalyzer() # Words stripped when building the search query (too generic to help relevance) _QUERY_STOPWORDS = { "will", "the", "a", "an", "by", "in", "on", "at", "to", "of", "and", "or", "is", "be", "are", "was", "were", "have", "has", "had", "do", "does", "did", "for", "from", "with", "not", "no", "this", "that", "it", "its", "their", "they", "he", "she", "we", "most", "more", "least", "less", "any", "all", "both", "each", "win", "lose", "get", "make", "take", } # Regex patterns for dates / noise in market questions _DATE_RE = re.compile( r"\b(january|february|march|april|may|june|july|august|" r"september|october|november|december)\s+\d{1,2}\b" r"|\b20\d{2}\b" r"|\bQ[1-4]\b", flags=re.IGNORECASE, ) _PUNCT_RE = re.compile(r"[?!\"'.,;:()\[\]{}]") class NewsClient: """ Async GNews client with in-memory result cache. Usage:: client = NewsClient() score = await client.get_sentiment("Will Trump visit China") # score ∈ [-1.0, +1.0] — positive means bullish for the YES outcome await client.close() """ def __init__(self) -> None: self._api_key = os.getenv("GNEWS_API_KEY", "") self._client = httpx.AsyncClient( timeout=10, headers={"User-Agent": "Mozilla/5.0 (compatible; polymarket-bot/1.0)"}, ) # {cache_key: (fetched_at_monotonic, score)} self._cache: dict[str, tuple[float, float]] = {} # ------------------------------------------------------------------ # Public API # ------------------------------------------------------------------ async def get_sentiment(self, question: str) -> float: """ Return a sentiment score ∈ [-1.0, +1.0] for the market question. - Positive: most recent headlines suggest the YES outcome is more likely - Negative: headlines suggest the YES outcome is less likely - 0.0: neutral, no data, or API unavailable """ if not self._api_key: log.debug("GNEWS_API_KEY not set — skipping news signal") return 0.0 query = self._build_query(question) if len(query) < 3: return 0.0 cache_key = query.lower() now = time.monotonic() cached = self._cache.get(cache_key) if cached is not None: fetched_at, score = cached if now - fetched_at < CACHE_TTL: log.debug("News cache hit %r → %.3f", query, score) return score # Build URL exactly as documented for free tier: # https://gnews.io/api/v4/search?q=...&lang=en&max=10&token=... # NOTE: "from"/"to" date filters are paid-tier only — omit them. try: resp = await self._client.get( GNEWS_API, params={ "q": query, "lang": "en", "max": 10, "token": self._api_key, }, ) except Exception as exc: log.warning("GNews network error for %r: %s", query, exc) return 0.0 finally: # Always sleep after a real network attempt to avoid burst 429s await asyncio.sleep(_INTER_REQUEST_SLEEP) log.info("GNews HTTP %d for query %r", resp.status_code, query) if resp.status_code != 200: try: body = resp.json() except Exception: body = resp.text[:200] log.warning("GNews error body: %s", body) # Cache neutral for 1 h on client errors to avoid hammering the endpoint if resp.status_code in (400, 401, 403, 429): self._cache[cache_key] = (now, 0.0) return 0.0 try: data = resp.json() except Exception as exc: log.warning("GNews JSON decode error for %r: %s", query, exc) return 0.0 articles = data.get("articles", []) score = self._score_headlines(articles) self._cache[cache_key] = (now, score) log.info( "GNews %r → %d articles, sentiment=%.3f", query, len(articles), score, ) return score async def close(self) -> None: await self._client.aclose() def get_freshness(self, question: str) -> float: """ Return a freshness score [0.1, 1.0] for GNews priority calculation. Score interpretation: 1.00 — never queried (maximum priority for GNews budget) 0.75 — last queried >6 h ago (cache expired, worth re-querying) 0.40 — queried 2–6 h ago (in-cache but moderately stale) 0.10 — queried <2 h ago (cache very fresh, low re-query value) If the API key is absent, always returns 1.0 (key missing means the query will be skipped anyway; don't penalise the priority score). """ if not self._api_key: return 1.0 query = self._build_query(question) cached = self._cache.get(query.lower()) if cached is None: return 1.0 age_seconds = time.monotonic() - cached[0] if age_seconds > 6 * 3600: return 0.75 if age_seconds > 2 * 3600: return 0.40 return 0.10 # ------------------------------------------------------------------ # Internal helpers # ------------------------------------------------------------------ @staticmethod def _build_query(question: str) -> str: """Extract meaningful search terms from a market question.""" q = _DATE_RE.sub(" ", question) q = _PUNCT_RE.sub(" ", q) tokens = [ w for w in q.split() if w.lower() not in _QUERY_STOPWORDS and len(w) > 2 ] return " ".join(tokens[:8]) # GNews handles ~8 keyword queries well @staticmethod def _score_headlines(articles: list[dict]) -> float: """ Score articles with VADER, weight by recency, return ∈ [-1, +1]. Algorithm: 1. For each article: run VADER on title + description → compound ∈ [-1, +1] 2. Exclude articles with |compound| ≤ _NEUTRAL_THRESHOLD (no clear signal) 3. GNews returns articles newest-first, so assign linear recency weights: article[0] → weight N, article[1] → weight N-1, ..., article[N-1] → weight 1 (only counted for articles that passed the threshold filter) 4. Return weighted mean, clamped to [-1, +1] """ if not articles: return 0.0 scored: list[tuple[float, int]] = [] # (compound, original_index) for idx, art in enumerate(articles): text = f"{art.get('title', '')} {art.get('description', '')}" compound = _vader.polarity_scores(text)["compound"] if abs(compound) > _NEUTRAL_THRESHOLD: scored.append((compound, idx)) if not scored: return 0.0 n = len(scored) # Weight by recency: earlier index (newer article) → higher weight. # scored is already in original (newest-first) order since we enumerate # articles sequentially and only append those that pass the threshold. total_weight = 0.0 weighted_sum = 0.0 for rank, (compound, _idx) in enumerate(scored): weight = n - rank # rank 0 → weight n, rank n-1 → weight 1 weighted_sum += compound * weight total_weight += weight return max(-1.0, min(1.0, weighted_sum / total_weight))