feat: retry+backoff en scraper, ProgressReporter en bot
Build & Deploy ResearchOwl / build-and-push (push) Successful in 6s

This commit is contained in:
ChemaVX
2026-05-03 16:40:37 +00:00
parent e66d728d68
commit 7704f071d6
2 changed files with 77 additions and 66 deletions
+44 -54
View File
@@ -33,17 +33,24 @@ def is_authorized(user_id: int) -> bool:
return not allowed or user_id in allowed
def fmt_progress(iteration: int, total: int, new: int, stats: dict) -> str:
    """Build the Markdown progress summary for one scraping iteration.

    Args:
        iteration: Number of the iteration being reported.
        total: Count of sources found so far.
        new: Count of URLs discovered in this round.
        stats: Mapping read for the "scraped", "skipped", "failed" and
            "pending" counters; a missing or falsy value is shown as 0.

    Returns:
        A four-line string using Markdown emphasis and inline-code markup.
    """
    # Normalise every counter up front so the template below stays flat.
    counts = {
        key: stats.get(key) or 0
        for key in ("scraped", "skipped", "failed", "pending")
    }
    rows = [
        f"🔄 *Iteration {iteration}*",
        f"📚 Sources found: `{total}`",
        (
            f"✅ Scraped: `{counts['scraped']}`"
            f" | ⏭️ Skipped: `{counts['skipped']}`"
            f" | ❌ Failed: `{counts['failed']}`"
            f" | ⏳ Pending: `{counts['pending']}`"
        ),
        f"🆕 New URLs this round: `{new}`",
    ]
    return "\n".join(rows)
class ProgressReporter:
    """Report progress by editing a single status message in place.

    `start()` sends the status message as a reply to `reply_target`;
    `update()` edits that message on a best-effort basis; `done()` delivers
    the final status and, unlike `update()`, falls back to sending a new
    reply when the status message cannot be edited.
    """

    def __init__(self, reply_target: Message):
        """Remember the message that status replies are attached to."""
        self._reply_target = reply_target
        # Set by start(); stays None until the status message has been sent.
        self._msg: Optional[Message] = None

    async def start(self, text: str):
        """Send the initial status message (Markdown) and keep a handle to it."""
        self._msg = await self._reply_target.reply_text(
            text, parse_mode=ParseMode.MARKDOWN
        )

    async def _try_edit(self, text: str) -> bool:
        """Edit the status message; return True only if the edit succeeded."""
        if not self._msg:
            return False
        try:
            await self._msg.edit_text(text, parse_mode=ParseMode.MARKDOWN)
        except Exception:
            # Deliberately broad: any edit failure is reported to the caller
            # as False instead of interrupting the work being reported on.
            return False
        return True

    async def update(self, text: str):
        """Best-effort intermediate update; a failed edit is dropped."""
        await self._try_edit(text)

    async def done(self, text: str):
        """Deliver the final status, never raising.

        Tries to edit the status message first. If there is no status message
        or the edit fails, sends `text` as a new reply: first as Markdown, then
        as plain text so that markup-unsafe content (for example an exception
        message) can still be delivered.
        """
        if await self._try_edit(text):
            return
        for mode in (ParseMode.MARKDOWN, None):
            try:
                self._msg = await self._reply_target.reply_text(
                    text, parse_mode=mode
                )
            except Exception:
                # Keep done() non-raising, as before; try the next mode.
                continue
            return
async def send_chunked(message: Message, text: str, parse_mode=None):
@@ -91,77 +98,54 @@ async def cmd_research(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
)
return
# Check for existing active research
if chat_id in _active_tasks and not _active_tasks[chat_id].done():
await update.message.reply_text(
"⚠️ Research already in progress. Use /status or /finish first."
)
return
msg = await update.message.reply_text(
f"🦉 *ResearchOwl* starting research on:\n`{topic}`\n\n"
f"🌱 Seeding sources from:\n"
f"• DuckDuckGo (8 queries)\n"
f"• Wikipedia + internal links\n"
f"• Reddit top posts\n"
f"• YouTube transcripts\n\n"
f"This will run exhaustively until saturation. Use /finish to stop early.",
parse_mode=ParseMode.MARKDOWN
)
async def run_research():
db_conn = await get_db()
db = ResearchDB(db_conn)
reporter = None
try:
session_id = await db.create_session(topic, chat_id)
_active_sessions[chat_id] = session_id
progress_msg = msg
iteration_count = [0]
reporter = ProgressReporter(update.message)
await reporter.start(f"🔍 Iniciando scraping de `{topic}`…")
async def on_progress(iteration, total, new_this_round, stats):
iteration_count[0] = iteration
text = fmt_progress(iteration, total, new_this_round, stats)
try:
await progress_msg.edit_text(text, parse_mode=ParseMode.MARKDOWN)
except Exception:
pass
async def on_progress(iter_num, total_sources):
await reporter.update(
f"🔍 Scraping — iteración `{iter_num}` | `{total_sources}` fuentes encontradas"
)
scraper = ExhaustiveScraper(db, session_id, topic, on_progress)
final_stats = await scraper.run()
await db.update_session(session_id, status=ResearchStatus.SATURATED)
scraped = final_stats.get("scraped", 0)
await update.message.reply_text(
f"✅ *Research complete!*\n\n"
f"📊 Results:\n"
f"• Sources found & scraped: `{scraped}`\n"
f"• Iterations: `{iteration_count[0]}`\n\n"
f"Now processing content with Ollama...\n"
f"Use `/generate podcast|blog|report|thread` when ready.",
parse_mode=ParseMode.MARKDOWN
)
# Auto-process after scraping
await reporter.update(f"⚡ Procesando `{scraped}` fuentes…")
ollama = OllamaClient()
if await ollama.is_available():
processor = ContentProcessor(db, ollama)
async def proc_progress(total_chunks, total_words):
await update.message.reply_text(
f"🧠 *Processing complete!*\n"
f"• Chunks stored: `{total_chunks}`\n"
f"• Words researched: `{total_words:,}`\n\n"
f"Ready! Use `/generate podcast|blog|report|thread`",
parse_mode=ParseMode.MARKDOWN
await reporter.update(
f"⚡ Scoring chunks… (`{total_chunks}` procesados)"
)
await processor.process_session(session_id, topic, proc_progress)
chunk_count = await db.get_chunks_count(session_id)
await reporter.done(
f"✅ Listo — `{scraped}` fuentes · `{chunk_count}` chunks · usa /generate <tipo>"
)
else:
await update.message.reply_text(
"⚠️ Ollama not reachable — skipping processing.\n"
"You can still use `/generate` (will use raw content)."
await reporter.done(
f"⚠️ Ollama no disponible — `{scraped}` fuentes scraped.\n"
f"Usa /generate para generar contenido."
)
except asyncio.CancelledError:
@@ -169,10 +153,16 @@ async def cmd_research(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
_active_sessions.get(chat_id, 0),
status=ResearchStatus.FINISHED
)
await update.message.reply_text("🛑 Research cancelled.")
if reporter:
await reporter.done("🛑 Investigación cancelada.")
else:
await update.message.reply_text("🛑 Research cancelled.")
except Exception as e:
logger.error("Research task failed", error=str(e))
await update.message.reply_text(f"❌ Research failed: {str(e)[:200]}")
if reporter:
await reporter.done(f"❌ Error: {str(e)[:200]}")
else:
await update.message.reply_text(f"❌ Research failed: {str(e)[:200]}")
finally:
await db_conn.close()