fix(metrics): replace inflated PnL formula; drop fake calibration_score

total_pnl now uses edge_net × net_cost instead of (0.5 - entry_price) × shares. The old formula overestimated BUY_NO trades at low entry prices by 3–10×: buying at price 0.158 yields 3164 shares, so assuming an exit at 0.5 produced $1072 PnL on $500 deployed. edge_net × net_cost is bounded by net_cost per trade and uses the model's own signal, giving $122 for the same position. calibration_score is now None (null in API) instead of 1 - 2×|avg_edge|. That formula was not a real calibration: a real calibration score requires knowing market resolutions (YES=1/NO=0), which we do not store yet. Returning null is more honest than returning 0.0 or a meaningless proxy. Fix 3 will compute it from closed trades. check_promotion_thresholds updated to handle None calibration (null → not ready). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-21 16:47:05 +00:00
parent 46f8f4b79a
commit 5a3df975d9
2 changed files with 36 additions and 19 deletions
@@ -54,43 +54,53 @@ class MetricsTracker:
        trades = self._trades
        n = len(trades)

-        # Total cost deployed
+        # ── Capital: all in-session trades (open + closed this session) ────────
+        # NOTE: self._trades is in-memory; resets on pod restart.
+        # Fix 3 (planned): replace with DB-computed metrics so restarts don't
+        # truncate history. Until then, these numbers reflect the current session.
        total_deployed = sum(t.net_cost for t in trades)
        total_fees = sum(t.fee_usdc for t in trades)

-        # Win rate (trades where we had positive edge — in paper mode we estimate)
-        # A trade "wins" if entry_price < 0.5 (buying undervalued token)
+        # ── Win rate ─────────────────────────────────────────────────────────
+        # Proxy: fraction of trades with entry_price < 0.5 (not edge_net-based).
+        # Not a realized win rate (no market resolutions available yet).
        wins = sum(1 for t in trades if t.entry_price < 0.5)
        win_rate = wins / n if n > 0 else 0

-        # Estimated P&L (paper — based on edge captured)
-        # Edge = (estimated_prob - entry_price) * shares
+        # ── Estimated P&L (model-implied; every trade in self._trades) ───────
+        # Formula: model_edge × deployed_capital per trade.
+        # Conservative bound: edge_net ∈ [-1, 1] → max PnL = net_cost per trade.
+        # Previous formula (0.5 − entry_price) × shares inflated BUY_NO trades
+        # at low entry prices by 3–10× (e.g. entry=0.158 → 3164 shares → $1072
+        # PnL on $500 deployed, vs $122 with edge_net=0.2589 here).
+        # Trades with NULL edge_net (legacy data) contribute only −fee_usdc.
        total_pnl = sum(
-            (0.5 - t.entry_price) * t.shares - t.fee_usdc
+            (t.edge_net or 0.0) * t.net_cost - t.fee_usdc
            for t in trades
        )

-        # Average edge per trade
        avg_edge = total_pnl / total_deployed if total_deployed > 0 else 0

-        # Sharpe ratio (simplified — daily returns not yet available in paper mode)
-        # Will improve once markets resolve and we have actual returns
        sharpe = self._compute_sharpe()

-        # Calibration score (Brier score based)
-        # Perfect calibration = 1.0, random = 0.0
-        calibration = 1 - (2 * abs(avg_edge))  # Simplified until markets resolve
+        # ── Calibration score: not available ─────────────────────────────────
+        # Real calibration (Brier score) requires knowing how each market
+        # resolved (YES=1 or NO=0). Until close_price / resolution is stored
+        # per trade, any formula here is a proxy, not a calibration.
+        # Returns None so the API can surface "unavailable" rather than a
+        # misleading number. Will be computed from closed trades in Fix 3.
+        calibration = None  # type: ignore[assignment]

        return {
            "timestamp": datetime.now(UTC),
            "total_trades": n,
            "total_deployed": total_deployed,
            "total_fees": total_fees,
-            "total_pnl": total_pnl,
-            "win_rate": win_rate,
+            "total_pnl": total_pnl,           # model-implied estimate (all in-session trades)
+            "win_rate": win_rate,              # proxy: fraction with entry_price < 0.5
            "avg_edge": avg_edge,
            "sharpe_ratio": sharpe,
-            "calibration_score": max(0, min(1, calibration)),
+            "calibration_score": calibration,  # None — requires market resolution data
            "paper_mode": True,
        }

@@ -106,10 +116,11 @@ class MetricsTracker:
    def check_promotion_thresholds(self) -> tuple[bool, dict]:
        """Check if metrics qualify for real money trading."""
        metrics = self.compute_metrics()
+        cal = metrics["calibration_score"]  # may be None
        checks = {
            "sharpe_ratio": (metrics["sharpe_ratio"], 0.5, metrics["sharpe_ratio"] >= 0.5),
            "win_rate": (metrics["win_rate"], 0.52, metrics["win_rate"] >= 0.52),
-            "calibration_score": (metrics["calibration_score"], 0.7, metrics["calibration_score"] >= 0.7),
+            "calibration_score": (cal, 0.7, cal is not None and cal >= 0.7),
            "min_trades": (metrics["total_trades"], 50, metrics["total_trades"] >= 50),
        }
        all_pass = all(v[2] for v in checks.values())
@@ -125,6 +136,6 @@ class MetricsTracker:
            "win_rate": 0,
            "avg_edge": 0,
            "sharpe_ratio": 0,
-            "calibration_score": 0,
+            "calibration_score": None,  # requires market resolution data
            "paper_mode": True,
        }