# polymarket-bot/bot/data/news.py

"""
News sentiment client for GNews API.

Free tier: 100 requests/day — we stay well within this by caching each
unique query for CACHE_TTL seconds (4 hours).  With ~9 political markets
refreshed every 4 h that is 9 × 6 = 54 requests/day.

Score returned: -1.0 (very negative headlines) → +1.0 (very positive).
Returns 0.0 on any error or missing API key so the caller degrades gracefully.
"""
import logging
import os
import re
import time

import httpx

log = logging.getLogger(__name__)

# GNews v4 search endpoint.
GNEWS_API = "https://gnews.io/api/v4/search"
# Lifetime of a cached score in seconds (4 hours) — sized for the
# 100 requests/day free tier.
CACHE_TTL = 4 * 60 * 60

# ---------------------------------------------------------------------------
# Headline sentiment vocabulary (lower-case words, grouped by theme)
# ---------------------------------------------------------------------------
_POSITIVE = {
    # winning / achievement
    "win", "wins", "won", "victory",
    "success", "successful", "breakthrough", "historic",
    # agreement / endorsement
    "agree", "agreed", "agreement",
    "approve", "approved", "approval",
    "confirm", "confirmed", "sign", "signed",
    "deal", "support", "peace",
    # forward momentum
    "advance", "progress",
    "resolve", "resolved", "resume", "resumed",
    # likelihood
    "likely", "probable", "imminent",
}
_NEGATIVE = {
    # failure / loss
    "fail", "fails", "failed", "failure",
    "lose", "lost", "collapse", "collapsed", "breakdown", "dead",
    # rejection / opposition
    "reject", "rejected", "rejection",
    "block", "blocked", "refuse", "refused",
    "deny", "denied", "veto", "oppose", "opposed",
    # conflict / pressure
    "crisis", "war", "attack",
    "sanction", "sanctions", "threat", "threatens",
    "escalate", "escalation",
    # stalling / reversal
    "stall", "stalled", "withdraw", "withdrew",
    "halt", "halted", "cancel", "cancelled",
    # unlikelihood
    "unlikely", "impossible", "never",
}

# Generic words dropped from search queries because they add no relevance.
_QUERY_STOPWORDS = {
    # articles, prepositions, conjunctions
    "the", "a", "an",
    "by", "in", "on", "at", "to", "of", "for", "from", "with",
    "and", "or",
    # auxiliaries and negation
    "will", "is", "be", "are", "was", "were",
    "have", "has", "had", "do", "does", "did",
    "not", "no",
    # pronouns / determiners
    "this", "that", "it", "its", "their", "they", "he", "she", "we",
    # quantifiers
    "most", "more", "least", "less", "any", "all", "both", "each",
    # generic verbs
    "win", "lose", "get", "make", "take",
}

# Date-like noise: "<month name> <day>", a 20xx year, or a quarter tag
# (Q1–Q4).  Matching is case-insensitive.
_DATE_RE = re.compile(
    r"\b(january|february|march|april|may|june|july|august|"
    r"september|october|november|december)\s+\d{1,2}\b"
    r"|\b20\d{2}\b"
    r"|\bQ[1-4]\b",
    re.IGNORECASE,
)
# Punctuation and bracket characters matched individually.
_PUNCT_RE = re.compile(r"[?!\"'.,;:()\[\]{}]")


class NewsClient:
    """
    Async GNews client with in-memory result cache.

    Successful lookups are cached for ``CACHE_TTL`` seconds.  Client-error
    responses (bad request, rejected key, rate limit) are cached as a
    neutral 0.0 for the shorter ``_ERROR_CACHE_TTL`` so the endpoint is not
    hammered, while a corrected key or a reset quota is still picked up
    within the hour.

    Usage::

        client = NewsClient()
        score = await client.get_sentiment("Will Trump visit China")
        # score ∈ [-1.0, +1.0] — positive means bullish for the YES outcome
        await client.close()
    """

    # Lifetime (seconds) of the neutral score cached after a client error: 1 h.
    _ERROR_CACHE_TTL = 3600
    # HTTP statuses treated as client errors worth caching.
    _CACHEABLE_ERROR_STATUSES = frozenset({400, 401, 403, 429})

    def __init__(self) -> None:
        """Read ``GNEWS_API_KEY`` from the environment and open the HTTP client."""
        self._api_key = os.getenv("GNEWS_API_KEY", "")
        self._client = httpx.AsyncClient(
            timeout=10,
            headers={"User-Agent": "Mozilla/5.0 (compatible; polymarket-bot/1.0)"},
        )
        # {cache_key: (expires_at_monotonic, score)} — storing the expiry
        # rather than the fetch time lets each entry carry its own lifetime.
        self._cache: dict[str, tuple[float, float]] = {}

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def get_sentiment(self, question: str) -> float:
        """
        Return a sentiment score ∈ [-1.0, +1.0] for the market question.

        - Positive: positive keywords dominate the returned headlines
        - Negative: negative keywords dominate the returned headlines
        - 0.0: neutral, no data, missing API key, or API unavailable

        Never raises for network, HTTP, or payload problems; those are
        logged and reported as 0.0.
        """
        if not self._api_key:
            log.debug("GNEWS_API_KEY not set — skipping news signal")
            return 0.0

        query = self._build_query(question)
        if len(query) < 3:
            return 0.0

        cache_key = query.lower()
        now = time.monotonic()
        cached = self._cache.get(cache_key)
        if cached is not None:
            expires_at, score = cached
            if now < expires_at:
                log.debug("News cache hit %r → %.3f", query, score)
                return score

        # Build URL exactly as documented for free tier:
        # https://gnews.io/api/v4/search?q=...&lang=en&max=10&token=...
        # NOTE: "from"/"to" date filters are paid-tier only — omit them.
        try:
            resp = await self._client.get(
                GNEWS_API,
                params={
                    "q": query,
                    "lang": "en",
                    "max": 10,
                    "token": self._api_key,
                },
            )
        except Exception as exc:
            # Deliberately broad: any transport failure degrades to neutral.
            # Not cached, so the next call retries.
            log.warning("GNews network error for %r: %s", query, exc)
            return 0.0

        log.info("GNews HTTP %d for query %r", resp.status_code, query)

        if resp.status_code != 200:
            try:
                body = resp.json()
            except Exception:
                body = resp.text[:200]
            log.warning("GNews error body: %s", body)
            # Cache neutral for 1 h on client errors to avoid hammering the
            # endpoint; other statuses (e.g. 5xx) are retried on the next call.
            if resp.status_code in self._CACHEABLE_ERROR_STATUSES:
                self._cache[cache_key] = (now + self._ERROR_CACHE_TTL, 0.0)
            return 0.0

        try:
            data = resp.json()
        except Exception as exc:
            log.warning("GNews JSON decode error for %r: %s", query, exc)
            return 0.0

        # Tolerate unexpected payload shapes instead of raising.
        articles = self._extract_articles(data)
        score = self._score_headlines(articles)
        self._cache[cache_key] = (now + CACHE_TTL, score)
        log.info(
            "GNews %r → %d articles, sentiment=%.3f",
            query, len(articles), score,
        )
        return score

    async def close(self) -> None:
        """Close the underlying HTTP client."""
        await self._client.aclose()

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _extract_articles(data: object) -> list[dict]:
        """
        Return the article dicts from a decoded response body.

        Yields an empty list when the body is not a JSON object, when
        ``"articles"`` is missing, null, or not a list; non-dict entries
        inside the list are dropped.
        """
        if not isinstance(data, dict):
            return []
        articles = data.get("articles")
        if not isinstance(articles, list):
            return []
        return [art for art in articles if isinstance(art, dict)]

    @staticmethod
    def _build_query(question: str) -> str:
        """
        Extract meaningful search terms from a market question.

        Date-like fragments and punctuation are replaced by spaces, then
        stopwords and tokens of one or two characters are dropped.  At most
        the first eight remaining tokens are kept, in original order and
        original case.
        """
        q = _DATE_RE.sub(" ", question)
        q = _PUNCT_RE.sub(" ", q)
        tokens = [
            w for w in q.split()
            if w.lower() not in _QUERY_STOPWORDS and len(w) > 2
        ]
        return " ".join(tokens[:8])   # GNews handles ~8 keyword queries well

    @staticmethod
    def _score_headlines(articles: list[dict]) -> float:
        """
        Score each article title + description independently, then average.

        Each article vote is (pos - neg) / (pos + neg) ∈ [-1, 1], where
        ``pos`` / ``neg`` count the *distinct* positive / negative keywords
        present.  Articles with no sentiment keywords contribute 0 (not
        excluded).  Returns 0.0 for an empty or missing article list.
        """
        if not articles:
            return 0.0

        votes: list[float] = []
        for art in articles:
            # `or ""` keeps a JSON null from being rendered as the word "None".
            title = art.get("title") or ""
            description = art.get("description") or ""
            text = f"{title} {description}".lower()
            words = set(re.findall(r"\b\w+\b", text))
            pos = len(words & _POSITIVE)
            neg = len(words & _NEGATIVE)
            total = pos + neg
            votes.append((pos - neg) / total if total > 0 else 0.0)

        return max(-1.0, min(1.0, sum(votes) / len(votes)))