diff --git a/src/bot/bot.py b/src/bot/bot.py index 1258cb8..6e97016 100644 --- a/src/bot/bot.py +++ b/src/bot/bot.py @@ -73,7 +73,8 @@ async def send_chunked(message: Message, text: str, parse_mode=None): async def run_scheduled_research(bot, chat_id: int, topic: str, session_id: int, db: ResearchDB, - progress_message=None): + progress_message=None, + silent_completion: bool = False): if progress_message is not None: reporter = ProgressReporter(progress_message) else: @@ -106,9 +107,14 @@ async def run_scheduled_research(bot, chat_id: int, topic: str, await processor.process_session(session_id, topic, proc_progress) chunk_count = await db.get_chunks_count(session_id) - await reporter.done( - f"✅ Listo — `{scraped}` fuentes · `{chunk_count}` chunks · usa /generate " - ) + if silent_completion: + await reporter.done( + f"🔍 Investigación completada — analizando novedades…" + ) + else: + await reporter.done( + f"✅ Listo — `{scraped}` fuentes · `{chunk_count}` chunks · usa /generate " + ) else: await reporter.done( f"⚠️ Ollama no disponible — `{scraped}` fuentes scraped.\n" @@ -726,11 +732,41 @@ async def _scheduler_loop(app: Application): _active_sessions[chat_id] = session_id await db.update_watch_run(watch["id"]) - async def _task(c=chat_id, t=topic, s=session_id): + async def _task(c=chat_id, t=topic, s=session_id, w_id=watch["id"]): inner_db_conn = await get_db() inner_db = ResearchDB(inner_db_conn) try: - await run_scheduled_research(app.bot, c, t, s, inner_db) + await run_scheduled_research(app.bot, c, t, s, inner_db, + silent_completion=True) + + prev_session = await inner_db.get_previous_session(c, t, s) + new_urls = await inner_db.get_session_urls(s) + old_urls = await inner_db.get_session_urls(prev_session["id"]) \ + if prev_session else set() + new_chunks = await inner_db.get_top_chunks(s, limit=30) + + try: + from src.generator.generator import generate_diff_summary + summary = await generate_diff_summary( + t, new_urls, old_urls, new_chunks, s, inner_db + ) + except Exception as e: + logger.warning("Diff summary failed", error=str(e)) + summary = ( + f"📊 *Actualización disponible — {t}*\n\n" + f"Usa /generate report para ver el análisis completo." + ) + + if summary: + await app.bot.send_message( + c, summary, parse_mode=ParseMode.MARKDOWN + ) + else: + await app.bot.send_message( + c, + f"🔄 *{t}* — sin novedades significativas esta vez.", + parse_mode=ParseMode.MARKDOWN + ) finally: await inner_db_conn.close() diff --git a/src/db/database.py b/src/db/database.py index cd4beaa..f6312bf 100644 --- a/src/db/database.py +++ b/src/db/database.py @@ -159,6 +159,27 @@ class ResearchDB: row = await cursor.fetchone() return dict(row) if row else None + async def get_session_urls(self, session_id: int) -> set: + async with self.db.execute( + "SELECT url FROM sources WHERE session_id = ?", (session_id,) + ) as cur: + rows = await cur.fetchall() + return {r[0] for r in rows} + + async def get_previous_session(self, chat_id: int, topic: str, + exclude_session_id: int) -> Optional[dict]: + async with self.db.execute( + """SELECT id, topic, status, created_at FROM research_sessions + WHERE telegram_chat_id = ? AND topic = ? AND id != ? + ORDER BY created_at DESC LIMIT 1""", + (chat_id, topic, exclude_session_id) + ) as cur: + row = await cur.fetchone() + if not row: + return None + return {"id": row[0], "topic": row[1], + "status": row[2], "created_at": row[3]} + async def get_active_session(self, chat_id: int) -> Optional[dict]: cursor = await self.db.execute( """SELECT * FROM research_sessions diff --git a/src/generator/generator.py b/src/generator/generator.py index f7f3e6a..5ddeb5f 100644 --- a/src/generator/generator.py +++ b/src/generator/generator.py @@ -574,3 +574,82 @@ def generate_pdf(content: str, title: str = "ResearchOwl Output") -> bytes: doc.build(story) return buf.getvalue() + + +async def generate_diff_summary( + topic: str, + new_urls: set, + old_urls: set, + new_chunks: list, + session_id: int, + db, +) -> str | None: + from src.config import settings + import structlog + diff_logger = structlog.get_logger() + + added_urls = new_urls - old_urls + pct_new = len(added_urls) / max(len(new_urls), 1) + + diff_logger.info("Diff analysis", topic=topic, + new_urls=len(new_urls), old_urls=len(old_urls), + added=len(added_urls), pct_new=round(pct_new, 2)) + + if pct_new < 0.20 and len(added_urls) < 5: + diff_logger.info("Diff: no significant new sources", topic=topic) + return None + + if not new_chunks: + return None + + context = "\n\n---\n\n".join( + f"[{c.get('source_type', 'web').upper()}] {c.get('title', '')}\n{c['content'][:400]}" + for c in new_chunks[:20] + ) + + if not settings.anthropic_api_key: + return ( + f"📊 *Novedades detectadas sobre {topic}*\n\n" + f"• {len(added_urls)} fuentes nuevas encontradas\n" + f"• {len(new_chunks)} chunks de contenido procesados\n\n" + f"Usa /generate report para ver el análisis completo." + ) + + try: + import anthropic + client = anthropic.AsyncAnthropic(api_key=settings.anthropic_api_key) + prompt = ( + f'Analiza el siguiente material de investigación sobre "{topic}" ' + f'y genera un resumen BREVE (máximo 300 palabras) de las novedades ' + f'más importantes encontradas. Escribe en español.\n\n' + f'Si el contenido es muy similar a investigaciones anteriores o no ' + f'contiene información genuinamente nueva, responde SOLO con: ' + f'"SIN_NOVEDADES"\n\n' + f'Material nuevo:\n{context}' + ) + msg = await client.messages.create( + model=settings.claude_model, + max_tokens=500, + messages=[{"role": "user", "content": prompt}] + ) + summary = msg.content[0].text.strip() + + if summary == "SIN_NOVEDADES": + diff_logger.info("Diff: Claude found no new information", topic=topic) + return None + + try: + await db.log_api_call(session_id, "diff", settings.claude_model, + msg.usage.input_tokens, msg.usage.output_tokens) + except Exception: + pass + + return f"🔔 *Novedades — {topic}*\n\n{summary}\n\nUsa /generate para report completo." + + except Exception as e: + diff_logger.warning("Diff summary generation failed", error=str(e)) + return ( + f"📊 *Actualización — {topic}*\n\n" + f"• {len(added_urls)} fuentes nuevas\n" + f"Usa /generate report para ver el análisis completo." + )