feat: modo diff para /watch — notifica solo si hay novedades reales

2026-05-05 07:43:41 +00:00
parent f4e167f3b6
commit 53cf7a04a8
3 changed files with 142 additions and 6 deletions
@@ -73,7 +73,8 @@ async def send_chunked(message: Message, text: str, parse_mode=None):

 async def run_scheduled_research(bot, chat_id: int, topic: str,
                                  session_id: int, db: ResearchDB,
-                                  progress_message=None):
+                                  progress_message=None,
+                                  silent_completion: bool = False):
    if progress_message is not None:
        reporter = ProgressReporter(progress_message)
    else:
@@ -106,6 +107,11 @@ async def run_scheduled_research(bot, chat_id: int, topic: str,

            await processor.process_session(session_id, topic, proc_progress)
            chunk_count = await db.get_chunks_count(session_id)
+            if silent_completion:
+                await reporter.done(
+                    f"🔍 Investigación completada — analizando novedades…"
+                )
+            else:
                await reporter.done(
                    f"✅ Listo — `{scraped}` fuentes · `{chunk_count}` chunks · usa /generate <tipo>"
                )
@@ -726,11 +732,41 @@ async def _scheduler_loop(app: Application):
                _active_sessions[chat_id] = session_id
                await db.update_watch_run(watch["id"])

-                async def _task(c=chat_id, t=topic, s=session_id):
+                async def _task(c=chat_id, t=topic, s=session_id, w_id=watch["id"]):
                    inner_db_conn = await get_db()
                    inner_db = ResearchDB(inner_db_conn)
                    try:
-                        await run_scheduled_research(app.bot, c, t, s, inner_db)
+                        await run_scheduled_research(app.bot, c, t, s, inner_db,
+                                                     silent_completion=True)
+
+                        prev_session = await inner_db.get_previous_session(c, t, s)
+                        new_urls = await inner_db.get_session_urls(s)
+                        old_urls = await inner_db.get_session_urls(prev_session["id"]) \
+                                   if prev_session else set()
+                        new_chunks = await inner_db.get_top_chunks(s, limit=30)
+
+                        try:
+                            from src.generator.generator import generate_diff_summary
+                            summary = await generate_diff_summary(
+                                t, new_urls, old_urls, new_chunks, s, inner_db
+                            )
+                        except Exception as e:
+                            logger.warning("Diff summary failed", error=str(e))
+                            summary = (
+                                f"📊 *Actualización disponible — {t}*\n\n"
+                                f"Usa /generate report para ver el análisis completo."
+                            )
+
+                        if summary:
+                            await app.bot.send_message(
+                                c, summary, parse_mode=ParseMode.MARKDOWN
+                            )
+                        else:
+                            await app.bot.send_message(
+                                c,
+                                f"🔄 *{t}* — sin novedades significativas esta vez.",
+                                parse_mode=ParseMode.MARKDOWN
+                            )
                    finally:
                        await inner_db_conn.close()

@@ -159,6 +159,27 @@ class ResearchDB:
        row = await cursor.fetchone()
        return dict(row) if row else None

+    async def get_session_urls(self, session_id: int) -> set:
+        async with self.db.execute(
+            "SELECT url FROM sources WHERE session_id = ?", (session_id,)
+        ) as cur:
+            rows = await cur.fetchall()
+            return {r[0] for r in rows}
+
+    async def get_previous_session(self, chat_id: int, topic: str,
+                                    exclude_session_id: int) -> Optional[dict]:
+        async with self.db.execute(
+            """SELECT id, topic, status, created_at FROM research_sessions
+               WHERE telegram_chat_id = ? AND topic = ? AND id != ?
+               ORDER BY created_at DESC LIMIT 1""",
+            (chat_id, topic, exclude_session_id)
+        ) as cur:
+            row = await cur.fetchone()
+            if not row:
+                return None
+            return {"id": row[0], "topic": row[1],
+                    "status": row[2], "created_at": row[3]}
+
    async def get_active_session(self, chat_id: int) -> Optional[dict]:
        cursor = await self.db.execute(
            """SELECT * FROM research_sessions 
@@ -574,3 +574,82 @@ def generate_pdf(content: str, title: str = "ResearchOwl Output") -> bytes:

    doc.build(story)
    return buf.getvalue()
+
+
+async def generate_diff_summary(
+    topic: str,
+    new_urls: set,
+    old_urls: set,
+    new_chunks: list,
+    session_id: int,
+    db,
+) -> str | None:
+    from src.config import settings
+    import structlog
+    diff_logger = structlog.get_logger()
+
+    added_urls = new_urls - old_urls
+    pct_new = len(added_urls) / max(len(new_urls), 1)
+
+    diff_logger.info("Diff analysis", topic=topic,
+                     new_urls=len(new_urls), old_urls=len(old_urls),
+                     added=len(added_urls), pct_new=round(pct_new, 2))
+
+    if pct_new < 0.20 and len(added_urls) < 5:
+        diff_logger.info("Diff: no significant new sources", topic=topic)
+        return None
+
+    if not new_chunks:
+        return None
+
+    context = "\n\n---\n\n".join(
+        f"[{c.get('source_type', 'web').upper()}] {c.get('title', '')}\n{c['content'][:400]}"
+        for c in new_chunks[:20]
+    )
+
+    if not settings.anthropic_api_key:
+        return (
+            f"📊 *Novedades detectadas sobre {topic}*\n\n"
+            f"• {len(added_urls)} fuentes nuevas encontradas\n"
+            f"• {len(new_chunks)} chunks de contenido procesados\n\n"
+            f"Usa /generate report para ver el análisis completo."
+        )
+
+    try:
+        import anthropic
+        client = anthropic.AsyncAnthropic(api_key=settings.anthropic_api_key)
+        prompt = (
+            f'Analiza el siguiente material de investigación sobre "{topic}" '
+            f'y genera un resumen BREVE (máximo 300 palabras) de las novedades '
+            f'más importantes encontradas. Escribe en español.\n\n'
+            f'Si el contenido es muy similar a investigaciones anteriores o no '
+            f'contiene información genuinamente nueva, responde SOLO con: '
+            f'"SIN_NOVEDADES"\n\n'
+            f'Material nuevo:\n{context}'
+        )
+        msg = await client.messages.create(
+            model=settings.claude_model,
+            max_tokens=500,
+            messages=[{"role": "user", "content": prompt}]
+        )
+        summary = msg.content[0].text.strip()
+
+        if summary == "SIN_NOVEDADES":
+            diff_logger.info("Diff: Claude found no new information", topic=topic)
+            return None
+
+        try:
+            await db.log_api_call(session_id, "diff", settings.claude_model,
+                                  msg.usage.input_tokens, msg.usage.output_tokens)
+        except Exception:
+            pass
+
+        return f"🔔 *Novedades — {topic}*\n\n{summary}\n\nUsa /generate para report completo."
+
+    except Exception as e:
+        diff_logger.warning("Diff summary generation failed", error=str(e))
+        return (
+            f"📊 *Actualización — {topic}*\n\n"
+            f"• {len(added_urls)} fuentes nuevas\n"
+            f"Usa /generate report para ver el análisis completo."
+        )