63d9f637ff
CI/CD / build-and-push (push) Successful in 2m30s
Phase 1 — Real net edge (paper.py, bayesian.py, risk/manager.py, db.py):
- Trade records now store edge_gross, edge_net, prior_prob, final_prob,
mid_price, spread_estimate, commission, family_key
- edge_net = edge_gross - SPREAD_ESTIMATE(0.02) - COMMISSION_RATE(0.02)
NOTE: both constants are heuristics, not exact Polymarket exchange costs
- Execution gate changed from edge_gross > MIN_EDGE to edge_net > regime_min_edge
Phase 2 — Market families (polymarket.py):
- market_family_key(market) groups related markets:
texas-republican-2026, fed-april-2026, openai-2026, etc.
- At most 1 trade per family per cycle; occupied_families propagated via main.py
- Family key logged on every TRADE and SKIP line
Phase 3 — GNews priority (news.py, bayesian.py, main.py):
- NewsClient.get_freshness() returns 1.0/0.75/0.40/0.10 by cache age
- gnews_priority(market, news) = uncertainty × volume_score × freshness
- Politics markets sorted by priority DESC before eval so best markets get
the 5-query/cycle GNews budget first
Phase 4 — Regime min-edge by category/horizon (bayesian.py):
- politics >60d → 0.12, 30-60d → 0.10, <30d → 0.08
- tech / crypto/finance → 0.10
- All thresholds applied to edge_net (not edge_gross)
Phase 5 — Observability (bayesian.py, main.py):
- Structured skip labels: SKIP_UNSUPPORTED, SKIP_NO_SIGNALS,
SKIP_PRIOR_EXTREME, SKIP_FAMILY, SKIP_GNEWS_PRIORITY, SKIP_EDGE_NET
- TRADE lines now include family_key, edge_gross, edge_net, regime_min, days
- schema.sql: 8 new cols on trades, 7 new cols on signals (via ALTER TABLE … ADD COLUMN IF NOT EXISTS)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
237 lines
8.7 KiB
Python
237 lines
8.7 KiB
Python
"""
|
||
News sentiment client for GNews API.
|
||
|
||
Free tier: 100 requests/day. Budget:
|
||
- Cache TTL: 6 hours — same query is never repeated within 6 h
|
||
- Max 5 queries per trading cycle (politics markets only)
|
||
- 2-second sleep between actual API calls to avoid burst 429s
|
||
With ≤9 politics markets and 6 h cache → ≤9 requests per 6 h = ≤36/day.
|
||
|
||
Sentiment engine: VADER (Valence Aware Dictionary and sEntiment Reasoner).
|
||
Designed for short social/news text — handles negations, intensifiers, and
|
||
punctuation natively. Returns compound ∈ [-1, +1] per article.
|
||
|
||
Score returned: -1.0 (very negative headlines) → +1.0 (very positive).
|
||
Returns 0.0 on any error or missing API key so the caller degrades gracefully.
|
||
"""
|
||
import asyncio
|
||
import logging
|
||
import os
|
||
import re
|
||
import time
|
||
|
||
import httpx
|
||
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
||
|
||
# Module-level logger.
log = logging.getLogger(__name__)

# GNews v4 search endpoint.
GNEWS_API = "https://gnews.io/api/v4/search"

# Cache lifetime in seconds (6 h) — ≤9 politics markets × 4 cycles/day = ≤36 req/day.
CACHE_TTL = 6 * 60 * 60

# Seconds to pause after each real API call.
_INTER_REQUEST_SLEEP = 2

# Articles whose |compound| is at or below this value are left out of the average.
_NEUTRAL_THRESHOLD = 0.05

# One VADER analyzer, built at import time and reused for every article.
_vader = SentimentIntensityAnalyzer()

# Words removed when building the search query (too generic to help relevance).
_QUERY_STOPWORDS = {
    # determiners / quantifiers
    "a", "an", "the", "this", "that", "any", "all", "both", "each",
    # prepositions / conjunctions
    "by", "in", "on", "at", "to", "of", "for", "from", "with", "and", "or",
    # auxiliaries
    "will", "is", "be", "are", "was", "were", "have", "has", "had",
    "do", "does", "did",
    # negation
    "not", "no",
    # pronouns
    "it", "its", "their", "they", "he", "she", "we",
    # comparatives
    "most", "more", "least", "less",
    # generic outcome verbs
    "win", "lose", "get", "make", "take",
}

# Dates / noise in market questions: "<month> <day>", a 20xx year, or a
# quarter marker such as "Q3" (case-insensitive).
_DATE_RE = re.compile(
    r"\b(january|february|march|april|may|june|july|august|"
    r"september|october|november|december)\s+\d{1,2}\b"
    r"|\b20\d{2}\b"
    r"|\bQ[1-4]\b",
    re.IGNORECASE,
)

# Punctuation replaced by spaces before tokenising a question.
_PUNCT_RE = re.compile(r"[?!\"'.,;:()\[\]{}]")
|
||
|
||
|
||
class NewsClient:
    """
    Async GNews client with an in-memory, per-instance result cache.

    Usage::

        client = NewsClient()
        score = await client.get_sentiment("Will Trump visit China")
        # score ∈ [-1.0, +1.0] — positive headline sentiment is treated as
        # bullish for the YES outcome
        await client.close()
    """

    def __init__(self) -> None:
        """Read ``GNEWS_API_KEY`` from the environment and open the HTTP client."""
        self._api_key = os.getenv("GNEWS_API_KEY", "")
        self._client = httpx.AsyncClient(
            timeout=10,
            headers={"User-Agent": "Mozilla/5.0 (compatible; polymarket-bot/1.0)"},
        )
        # {cache_key: (fetched_at_monotonic, score)}
        self._cache: dict[str, tuple[float, float]] = {}

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def get_sentiment(self, question: str) -> float:
        """
        Return a sentiment score ∈ [-1.0, +1.0] for the market question.

        - Positive: recent headlines read positively (treated as YES more likely)
        - Negative: recent headlines read negatively (treated as YES less likely)
        - 0.0: neutral, no data, query too short, or API unavailable

        Results are cached per normalised query for ``CACHE_TTL`` seconds.
        Every real network attempt is followed by an ``_INTER_REQUEST_SLEEP``
        pause, whether or not it succeeded. This method never raises for
        network, HTTP, or payload problems; it logs and returns 0.0 instead.
        """
        if not self._api_key:
            log.debug("GNEWS_API_KEY not set — skipping news signal")
            return 0.0

        query = self._build_query(question)
        if len(query) < 3:
            return 0.0

        cache_key = query.lower()
        # Timestamp is taken before the request, so cache age includes the
        # request time and the post-request sleep.
        now = time.monotonic()
        cached = self._cache.get(cache_key)
        if cached is not None:
            fetched_at, score = cached
            if now - fetched_at < CACHE_TTL:
                log.debug("News cache hit %r → %.3f", query, score)
                return score

        # Build URL exactly as documented for free tier:
        #   https://gnews.io/api/v4/search?q=...&lang=en&max=10&token=...
        # NOTE: "from"/"to" date filters are paid-tier only — omit them.
        try:
            resp = await self._client.get(
                GNEWS_API,
                params={
                    "q": query,
                    "lang": "en",
                    "max": 10,
                    "token": self._api_key,
                },
            )
        except Exception as exc:
            # Deliberate best-effort boundary: any transport failure degrades
            # to a neutral score rather than propagating to the caller.
            log.warning("GNews network error for %r: %s", query, exc)
            return 0.0
        finally:
            # Always sleep after a real network attempt to avoid burst 429s
            await asyncio.sleep(_INTER_REQUEST_SLEEP)

        log.info("GNews HTTP %d for query %r", resp.status_code, query)

        if resp.status_code != 200:
            try:
                body = resp.json()
            except Exception:
                body = resp.text[:200]
            log.warning("GNews error body: %s", body)
            # Negative-cache client errors as neutral so the endpoint is not
            # hammered. The entry uses the normal CACHE_TTL; there is no
            # separate, shorter TTL for error responses.
            if resp.status_code in (400, 401, 403, 429):
                self._cache[cache_key] = (now, 0.0)
            return 0.0

        try:
            data = resp.json()
        except Exception as exc:
            log.warning("GNews JSON decode error for %r: %s", query, exc)
            return 0.0

        if not isinstance(data, dict):
            # A 200 response whose body is not a JSON object is treated like
            # a decode error: neutral score, nothing cached.
            log.warning(
                "GNews unexpected payload type for %r: %s",
                query, type(data).__name__,
            )
            return 0.0

        articles = self._extract_articles(data)
        score = self._score_headlines(articles)
        self._cache[cache_key] = (now, score)
        log.info(
            "GNews %r → %d articles, sentiment=%.3f",
            query, len(articles), score,
        )
        return score

    async def close(self) -> None:
        """Close the underlying HTTP client."""
        await self._client.aclose()

    def get_freshness(self, question: str) -> float:
        """
        Return a freshness score in [0.1, 1.0] for GNews priority calculation.

        The question is normalised with the same query builder that
        ``get_sentiment`` uses, so both methods address the same cache entry.

        Score interpretation:
            1.00 — never queried (maximum priority for GNews budget)
            0.75 — cache entry expired (age ≥ CACHE_TTL, worth re-querying)
            0.40 — queried more than 2 h ago but still within CACHE_TTL
            0.10 — queried within the last 2 h (cache very fresh)

        If the API key is absent, always returns 1.0 (key missing means the
        query will be skipped anyway; don't penalise the priority score).
        """
        if not self._api_key:
            return 1.0

        entry = self._cache.get(self._build_query(question).lower())
        if entry is None:
            return 1.0

        age_seconds = time.monotonic() - entry[0]
        # Same expiry rule as get_sentiment: an entry is stale once its age
        # reaches CACHE_TTL.
        if age_seconds >= CACHE_TTL:
            return 0.75
        if age_seconds > 2 * 3600:
            return 0.40
        return 0.10

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _build_query(question: str) -> str:
        """
        Extract meaningful search terms from a market question.

        Date patterns and punctuation are replaced by spaces, then stopwords
        and tokens of two characters or fewer are dropped. At most the first
        eight remaining tokens are kept, in their original order and casing.
        """
        q = _DATE_RE.sub(" ", question)
        q = _PUNCT_RE.sub(" ", q)
        tokens = [
            w for w in q.split()
            if w.lower() not in _QUERY_STOPWORDS and len(w) > 2
        ]
        return " ".join(tokens[:8])  # GNews handles ~8 keyword queries well

    @staticmethod
    def _extract_articles(data: object) -> list[dict]:
        """
        Return the list of article dicts from a decoded GNews payload.

        Tolerates malformed payloads: a non-dict payload, a missing or null
        ``"articles"`` value, or a non-list value all yield ``[]``; entries
        that are not dicts are dropped.
        """
        if not isinstance(data, dict):
            return []
        raw = data.get("articles")
        if not isinstance(raw, list):
            return []
        return [art for art in raw if isinstance(art, dict)]

    @staticmethod
    def _article_text(article: dict) -> str:
        """
        Join an article's title and description into one string for scoring.

        Missing, null, empty, or non-string fields are skipped so they never
        show up as literal text such as ``"None"``.
        """
        parts = (article.get("title"), article.get("description"))
        return " ".join(p for p in parts if isinstance(p, str) and p)

    @staticmethod
    def _score_headlines(articles: list[dict]) -> float:
        """
        Score articles with VADER, weight by recency, return ∈ [-1, +1].

        Algorithm:
        1. For each article: run VADER on title + description → compound ∈ [-1, +1]
        2. Exclude articles with |compound| ≤ _NEUTRAL_THRESHOLD (no clear signal)
        3. Articles are assumed to arrive newest-first, so the survivors get
           linear recency weights in input order:
           first → weight N, second → weight N-1, ..., last → weight 1
           (N counts only the articles that passed the threshold filter)
        4. Return weighted mean, clamped to [-1, +1]

        Returns 0.0 when there are no articles or none carries a clear signal.
        Entries that are not dicts, or that have no usable text, are ignored.
        """
        if not articles:
            return 0.0

        compounds: list[float] = []
        for art in articles:
            if not isinstance(art, dict):
                continue
            text = NewsClient._article_text(art)
            if not text:
                continue
            compound = _vader.polarity_scores(text)["compound"]
            if abs(compound) > _NEUTRAL_THRESHOLD:
                compounds.append(compound)

        if not compounds:
            return 0.0

        # compounds keeps the input order, so position 0 is the most recent
        # article and receives the largest weight.
        n = len(compounds)
        weights = range(n, 0, -1)
        weighted_sum = sum(c * w for c, w in zip(compounds, weights))
        total_weight = sum(weights)

        return max(-1.0, min(1.0, weighted_sum / total_weight))
|