feat: add GNews sentiment signal for politics/tech/events markets

bot/data/news.py (new): - NewsClient with in-memory cache (TTL=4h) to stay within 100 req/day limit - _build_query(): strips dates, punctuation and stopwords from market question - _score_headlines(): keyword-based pos/neg vote per article, averaged ∈ [-1, +1] - Degrades to 0.0 on missing key, 403 quota, or network error bot/strategy/bayesian.py: - BayesianStrategy(news=NewsClient) — optional, backwards compatible - Signal 4: GNews sentiment applied as direct log-odds shift (weight=1.5) so a ±1.0 sentiment score moves a 50% prior to 82%/18% - +0.10 confidence boost when news signal is present - NEWS_LOGODDS_WEIGHT constant documented at module level bot/main.py: - Instantiate NewsClient, pass to BayesianStrategy, close in finally block Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 08:24:11 +00:00
parent 98e7f5fe73
commit 4dadd3c2c4
3 changed files with 223 additions and 4 deletions
@@ -0,0 +1,193 @@
 """
 News sentiment client for GNews API.
 Free tier: 100 requests/day — we stay well within this by caching each
 unique query for CACHE_TTL seconds (4 hours).  With ~9 political markets
 refreshed every 4 h that is 9 × 6 = 54 requests/day.
 Score returned: -1.0 (very negative headlines) → +1.0 (very positive).
 Returns 0.0 on any error or missing API key so the caller degrades gracefully.
 """
 import logging
 import os
 import re
 import time
 from datetime import datetime, timezone, timedelta
 import httpx
 log = logging.getLogger(__name__)
 # Search endpoint of the GNews v4 REST API; NewsClient.get_sentiment() sends
 # its GET requests here.
 GNEWS_API = "https://gnews.io/api/v4/search"
 # Maximum age (in seconds) of a cached sentiment score before the same query
 # is fetched again: 4 hours.
 CACHE_TTL = 4 * 3600   # seconds — fits 100 req/day free tier
 # ---------------------------------------------------------------------------
 # Keyword lists for headline sentiment
 #
 # NewsClient._score_headlines() lower-cases each article's title and
 # description, splits the text into whole words, and counts how many distinct
 # words fall into each set.  Entries must therefore be lower-case single
 # words; inflected forms ("win", "wins", "won") are listed explicitly because
 # no stemming is applied.
 # ---------------------------------------------------------------------------
 # Words counted as a vote that the YES outcome is becoming more likely.
 _POSITIVE = {
    "win", "wins", "won", "victory", "success", "successful",
    "agree", "agreed", "agreement", "approve", "approved", "approval",
    "confirm", "confirmed", "sign", "signed", "deal", "advance",
    "progress", "support", "peace", "likely", "probable", "imminent",
    "historic", "breakthrough", "resolve", "resolved", "resume", "resumed",
 }
 # Words counted as a vote that the YES outcome is becoming less likely.
 _NEGATIVE = {
    "fail", "fails", "failed", "failure", "reject", "rejected", "rejection",
    "block", "blocked", "refuse", "refused", "deny", "denied",
    "lose", "lost", "collapse", "collapsed", "crisis", "war", "attack",
    "veto", "oppose", "opposed", "unlikely", "impossible", "never",
    "stall", "stalled", "withdraw", "withdrew", "sanction", "sanctions",
    "threat", "threatens", "dead", "halt", "halted", "cancel", "cancelled",
    "breakdown", "escalate", "escalation",
 }
 # Words stripped when building the search query (too generic to help relevance).
 # NewsClient._build_query() compares the lower-cased form of every token of the
 # market question against this set, so all entries are lower-case.
 _QUERY_STOPWORDS = {
    "will", "the", "a", "an", "by", "in", "on", "at", "to", "of",
    "and", "or", "is", "be", "are", "was", "were", "have", "has",
    "had", "do", "does", "did", "for", "from", "with", "not", "no",
    "this", "that", "it", "its", "their", "they", "he", "she", "we",
    "most", "more", "least", "less", "any", "all", "both", "each",
    "win", "lose", "get", "make", "take",
 }
 # Regex patterns for dates / noise.  NewsClient._build_query() replaces every
 # match of either pattern with a space before tokenising the question.
 #
 # _DATE_RE matches, case-insensitively, any of:
 #   - a full English month name followed by whitespace and a 1-2 digit day
 #   - a standalone four-digit year of the form 20xx
 #   - a standalone quarter label Q1..Q4
 _DATE_RE = re.compile(
    r"\b(january|february|march|april|may|june|july|august|"
    r"september|october|november|december)\s+\d{1,2}\b"
    r"|\b20\d{2}\b"
    r"|\bQ[1-4]\b",
    flags=re.IGNORECASE,
 )
 # _PUNCT_RE matches one punctuation character: ? ! " ' . , ; : ( ) [ ] { }
 _PUNCT_RE = re.compile(r"[?!\"'.,;:()\[\]{}]")
 class NewsClient:
    """
    Async GNews client with an in-memory, per-query sentiment cache.

    Every public call degrades to a neutral ``0.0`` instead of raising, so a
    missing API key, an exhausted quota or a network failure never breaks the
    caller.

    Usage::

        client = NewsClient()
        score = await client.get_sentiment("Will Trump visit China")
        # score ∈ [-1.0, +1.0] — positive means bullish for the YES outcome
        await client.close()
    """

    # How long (seconds) a neutral score is remembered after a 403 response,
    # so that an invalid key / exhausted quota is retried after one hour
    # rather than on every evaluation cycle.
    _QUOTA_BACKOFF = 3600

    def __init__(self) -> None:
        """Read ``GNEWS_API_KEY`` from the environment and open the HTTP client."""
        self._api_key = os.getenv("GNEWS_API_KEY", "")
        self._client = httpx.AsyncClient(timeout=10)
        # {cache_key: (fetched_at_monotonic, score)}
        self._cache: dict[str, tuple[float, float]] = {}

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    async def get_sentiment(self, question: str, days: int = 7) -> float:
        """
        Return a sentiment score ∈ [-1.0, +1.0] for the market question.

        - Positive: most recent headlines suggest the YES outcome is more likely
        - Negative: headlines suggest the YES outcome is less likely
        - 0.0: neutral, no data, or API unavailable

        Args:
            question: Market question; reduced to search keywords by
                ``_build_query``.
            days: Only articles published within this many days are requested.

        Returns:
            The averaged headline score.  Successful results are cached for
            ``CACHE_TTL`` seconds; a 403 is cached as 0.0 for
            ``_QUOTA_BACKOFF`` seconds; other failures are not cached.
        """
        if not self._api_key:
            log.debug("GNEWS_API_KEY not set — skipping news signal")
            return 0.0

        query = self._build_query(question)
        if len(query) < 3:
            return 0.0

        # NOTE: the cache key is the query only; it does not include `days`.
        cache_key = query.lower()
        now = time.monotonic()
        cached = self._cache.get(cache_key)
        if cached is not None:
            fetched_at, score = cached
            if now - fetched_at < CACHE_TTL:
                log.debug("News cache hit %r → %.3f", query, score)
                return score

        try:
            resp = await self._client.get(
                GNEWS_API,
                params={
                    "q": query,
                    "lang": "en",
                    "max": 10,
                    "from": _iso_days_ago(days),
                    "token": self._api_key,
                },
            )
        except Exception as exc:
            # Deliberately broad: the news signal is best-effort.
            log.warning("GNews network error for %r: %s", query, exc)
            return 0.0

        status = resp.status_code
        if status == 403:
            log.warning("GNews: 403 — invalid key or daily quota exhausted")
            # Cache a neutral result for 1 h to avoid hammering the endpoint.
            # Entries expire CACHE_TTL after their timestamp, so the timestamp
            # is back-dated to leave exactly the back-off period remaining.
            backoff = min(self._QUOTA_BACKOFF, CACHE_TTL)
            self._cache[cache_key] = (now - (CACHE_TTL - backoff), 0.0)
            return 0.0
        if not 200 <= status < 300:
            # Checked by hand instead of raise_for_status(): that exception's
            # message contains the full request URL, including the API token,
            # which must not end up in the logs.
            log.warning("GNews bad response for %r: HTTP %d", query, status)
            return 0.0

        try:
            data = resp.json()
        except Exception as exc:
            log.warning("GNews bad response for %r: %s", query, exc)
            return 0.0

        # Tolerate unexpected payload shapes (non-object body, null or
        # non-list "articles", non-object entries) by treating them as no data.
        raw_articles = data.get("articles") if isinstance(data, dict) else None
        if isinstance(raw_articles, list):
            articles = [art for art in raw_articles if isinstance(art, dict)]
        else:
            articles = []

        score = self._score_headlines(articles)
        self._cache[cache_key] = (now, score)
        log.info(
            "GNews %r → %d articles, sentiment=%.3f",
            query, len(articles), score,
        )
        return score

    async def close(self) -> None:
        """Close the underlying HTTP client."""
        await self._client.aclose()

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _build_query(question: str) -> str:
        """
        Extract meaningful search terms from a market question.

        Dates and punctuation are blanked out, then stopwords and tokens of
        two characters or fewer are dropped.  At most the first eight
        remaining tokens are kept, joined by single spaces.
        """
        q = _DATE_RE.sub(" ", question)
        q = _PUNCT_RE.sub(" ", q)
        tokens = [
            w for w in q.split()
            if w.lower() not in _QUERY_STOPWORDS and len(w) > 2
        ]
        return " ".join(tokens[:8])   # GNews handles ~8 keyword queries well

    @staticmethod
    def _score_headlines(articles: list[dict]) -> float:
        """
        Score each article title + description independently, then average.

        Each article vote: (pos_hits - neg_hits) / (pos_hits + neg_hits) ∈ [-1, 1],
        where hits are counted over the distinct words of the article text.
        Articles with no sentiment keywords contribute 0 (not excluded).
        Returns 0.0 for an empty article list.
        """
        if not articles:
            return 0.0
        votes: list[float] = []
        for art in articles:
            # `or ""` also covers an explicit null title/description.
            title = art.get("title") or ""
            description = art.get("description") or ""
            words = set(re.findall(r"\b\w+\b", f"{title} {description}".lower()))
            pos = len(words & _POSITIVE)
            neg = len(words & _NEGATIVE)
            total = pos + neg
            votes.append((pos - neg) / total if total > 0 else 0.0)
        return max(-1.0, min(1.0, sum(votes) / len(votes)))
 def _iso_days_ago(days: int) -> str:
    dt = datetime.now(timezone.utc) - timedelta(days=days)
    return dt.strftime("%Y-%m-%dT%H:%M:%SZ")
@@ -9,6 +9,7 @@ from contextlib import asynccontextmanager
 from bot.data.polymarket import PolymarketClient
 from bot.data.external import ExternalDataClient
 from bot.data.news import NewsClient
 from bot.strategy.bayesian import BayesianStrategy
 from bot.risk.manager import RiskManager
 from bot.executor.paper import PaperExecutor
@@ -98,7 +99,8 @@ async def main() -> None:
    poly = PolymarketClient()
    external = ExternalDataClient()
-    strategy = BayesianStrategy()
+    news = NewsClient()
    strategy = BayesianStrategy(news=news)
    risk = RiskManager(max_position_pct=0.05, max_exposure_pct=0.30)
    executor = PaperExecutor(db=db, bankroll=PAPER_BANKROLL) if PAPER_MODE else None
    metrics = MetricsTracker(db=db)
@@ -115,6 +117,7 @@ async def main() -> None:
        await run_trading_loop(poly, external, strategy, risk, executor, metrics)
    finally:
        await db.disconnect()
        await news.close()
 if __name__ == "__main__":
@@ -17,6 +17,7 @@ from typing import Optional
 from bot.data.polymarket import Market
 from bot.data.external import ExternalSignals
 from bot.data.news import NewsClient
 log = logging.getLogger(__name__)
@@ -26,6 +27,11 @@ log = logging.getLogger(__name__)
 MIN_EDGE = 0.10  # 10% edge minimum
 MIN_CONFIDENCE = 0.55  # Minimum confidence in our estimate
 # Log-odds weight applied to the GNews sentiment score (range ±1.0).
 # A weight of 1.5 means a fully negative/positive signal shifts log-odds by ±1.5,
 # which moves a 50% prior to ~18%/82% — strong but not overwhelming.
 NEWS_LOGODDS_WEIGHT = 1.5
@dataclass
 class TradingSignal:
@@ -53,8 +59,9 @@ class BayesianStrategy:
    to justify the fee + slippage cost (MIN_EDGE).
    """
-    def __init__(self) -> None:
+    def __init__(self, news: Optional[NewsClient] = None) -> None:
        self._signal_count = 0
        self._news = news  # Optional; degrades gracefully when None or key missing
    async def evaluate(
        self,
@@ -165,16 +172,29 @@ class BayesianStrategy:
            adjustments.append(dom_adj)
            sources.append(f"BTC dom: {ext.btc_dominance:.1f}% (low → alt season)")
        # Signal 4: GNews sentiment (politics / tech / events only)
        # Applied as a direct log-odds shift — stronger signal than macro proxies.
        # Weight NEWS_LOGODDS_WEIGHT=1.5 means a ±1.0 sentiment score shifts
        # log-odds by ±1.5 (e.g. 50% prior → ~82% / ~18%).
        news_log_adj = 0.0
        if (is_politics or is_tech or is_events) and self._news is not None:
            sentiment = await self._news.get_sentiment(market.question)
            if abs(sentiment) > 0.05:
                news_log_adj = sentiment * NEWS_LOGODDS_WEIGHT
                sources.append(f"GNews: {sentiment:+.2f}")
        # Macro/politics/tech/events: cap confidence lower to reflect weaker signal quality
        if is_macro or is_politics or is_tech or is_events:
            confidence_cap = 0.65
        else:
            confidence_cap = 0.90
-        # Compute posterior using log-odds updating
+        # Compute posterior using log-odds updating.
        # total_adj (BTC/F&G/dominance) is amplified ×2 because those are weak proxies.
        # news_log_adj is applied at face value — it IS a direct log-odds signal.
        log_odds_prior = math.log(prior / (1 - prior))
        total_adj = sum(adjustments)
-        estimated_prob = _sigmoid(log_odds_prior + total_adj * 2)
+        estimated_prob = _sigmoid(log_odds_prior + total_adj * 2 + news_log_adj)
        estimated_prob = max(0.05, min(0.95, estimated_prob))
        # Compute edge
@@ -185,6 +205,9 @@ class BayesianStrategy:
        # Confidence based on signal agreement
        agreement = sum(1 for a in adjustments if (a > 0) == (total_adj > 0))
        confidence = min(confidence_cap, 0.4 + (agreement / max(len(adjustments), 1)) * 0.5)
        # News signal available → boost confidence by 0.10 (news corroborates macro signals)
        if news_log_adj != 0.0:
            confidence = min(confidence_cap, confidence + 0.10)
        # Log evaluation result for every market
        action = "TRADE" if (abs_edge >= MIN_EDGE and confidence >= MIN_CONFIDENCE) else "SKIP"