Files
researchowl/src/bot/bot.py
T
ChemaVX 0c7176dd0b
Build & Deploy ResearchOwl / build-and-push (push) Successful in 5s
fix: add /process command, log quality filtering, improve Reddit headers
- bot.py: add cmd_process handler to manually trigger chunk processing
  on the last session; register CommandHandler("process")
- processor.py: log exceptions from asyncio.gather instead of silently
  dropping them; add per-chunk quality score debug logging; warn when
  all chunks filtered by quality threshold with actionable hint;
  raise fallback score to 0.6 so Ollama failures don't filter chunks
- exhaustive.py: replace bot User-Agent with full browser UA + headers
  for REDDIT_HEADERS; downgrade Reddit 403 from warning to info since
  server IPs are routinely blocked; use content_type=None on json()
  to avoid aiohttp content-type mismatch errors

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-27 20:37:39 +00:00

528 lines
19 KiB
Python

"""
ResearchOwl Telegram Bot
Main user interface — all commands handled here
"""
import asyncio
import os
from datetime import datetime
from typing import Optional
import structlog
from telegram import Update, Message
from telegram.ext import (
Application, CommandHandler, MessageHandler,
filters, ContextTypes
)
from telegram.constants import ParseMode
from src.config import settings
from src.db.database import get_db, ResearchDB, ResearchStatus, OutputType
from src.scraper.exhaustive import ExhaustiveScraper
from src.processor.processor import OllamaClient, ContentProcessor
from src.generator.generator import OutputGenerator
logger = structlog.get_logger()
# Active research tasks per chat
_active_tasks: dict[int, asyncio.Task] = {}
_active_sessions: dict[int, int] = {} # chat_id -> session_id
def is_authorized(user_id: int) -> bool:
allowed = settings.allowed_user_ids
return not allowed or user_id in allowed
def fmt_progress(iteration: int, total: int, new: int, stats: dict) -> str:
scraped = stats.get("scraped", 0)
failed = stats.get("failed", 0)
pending = stats.get("pending", 0)
return (
f"🔄 *Iteration {iteration}*\n"
f"📚 Sources found: `{total}`\n"
f"✅ Scraped: `{scraped}` | ❌ Failed: `{failed}` | ⏳ Pending: `{pending}`\n"
f"🆕 New this round: `{new}`"
)
async def send_chunked(message: Message, text: str, parse_mode=None):
"""Send long text in chunks of 4000 chars (Telegram limit)"""
max_len = 4000
for i in range(0, len(text), max_len):
chunk = text[i:i + max_len]
await message.reply_text(chunk, parse_mode=parse_mode)
if len(text) > max_len:
await asyncio.sleep(0.5)
# ─── Commands ─────────────────────────────────────────────────────────────────
async def cmd_start(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
if not is_authorized(update.effective_user.id):
return
await update.message.reply_text(
"🦉 *ResearchOwl* — Exhaustive Research Engine\n\n"
"Commands:\n"
"`/research <topic>` — Start exhaustive research\n"
"`/status` — Check current research progress\n"
"`/finish` — Stop research and proceed to generation\n"
"`/process` — Manually trigger chunk processing\n"
"`/generate <type>` — Generate output (podcast|blog|report|thread)\n"
"`/sources` — List all sources found\n"
"`/outputs` — List generated outputs\n"
"`/cancel` — Cancel current research\n"
"`/help` — Show this message",
parse_mode=ParseMode.MARKDOWN
)
async def cmd_research(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
if not is_authorized(update.effective_user.id):
return
chat_id = update.effective_chat.id
topic = " ".join(ctx.args).strip() if ctx.args else ""
if not topic:
await update.message.reply_text(
"❌ Please provide a topic.\nExample: `/research Roswell incident`",
parse_mode=ParseMode.MARKDOWN
)
return
# Check for existing active research
if chat_id in _active_tasks and not _active_tasks[chat_id].done():
await update.message.reply_text(
"⚠️ Research already in progress. Use /status or /finish first."
)
return
msg = await update.message.reply_text(
f"🦉 *ResearchOwl* starting research on:\n`{topic}`\n\n"
f"🌱 Seeding sources from:\n"
f"• DuckDuckGo (8 queries)\n"
f"• Wikipedia + internal links\n"
f"• Reddit top posts\n"
f"• YouTube transcripts\n\n"
f"This will run exhaustively until saturation. Use /finish to stop early.",
parse_mode=ParseMode.MARKDOWN
)
async def run_research():
db_conn = await get_db()
db = ResearchDB(db_conn)
try:
session_id = await db.create_session(topic, chat_id)
_active_sessions[chat_id] = session_id
progress_msg = msg
iteration_count = [0]
async def on_progress(iteration, total, new_this_round, stats):
iteration_count[0] = iteration
text = fmt_progress(iteration, total, new_this_round, stats)
try:
await progress_msg.edit_text(text, parse_mode=ParseMode.MARKDOWN)
except Exception:
pass
scraper = ExhaustiveScraper(db, session_id, topic, on_progress)
final_stats = await scraper.run()
await db.update_session(session_id, status=ResearchStatus.SATURATED)
scraped = final_stats.get("scraped", 0)
await update.message.reply_text(
f"✅ *Research complete!*\n\n"
f"📊 Results:\n"
f"• Sources found & scraped: `{scraped}`\n"
f"• Iterations: `{iteration_count[0]}`\n\n"
f"Now processing content with Ollama...\n"
f"Use `/generate podcast|blog|report|thread` when ready.",
parse_mode=ParseMode.MARKDOWN
)
# Auto-process after scraping
ollama = OllamaClient()
if await ollama.is_available():
processor = ContentProcessor(db, ollama)
async def proc_progress(total_chunks, total_words):
await update.message.reply_text(
f"🧠 *Processing complete!*\n"
f"• Chunks stored: `{total_chunks}`\n"
f"• Words researched: `{total_words:,}`\n\n"
f"Ready! Use `/generate podcast|blog|report|thread`",
parse_mode=ParseMode.MARKDOWN
)
await processor.process_session(session_id, topic, proc_progress)
else:
await update.message.reply_text(
"⚠️ Ollama not reachable — skipping processing.\n"
"You can still use `/generate` (will use raw content)."
)
except asyncio.CancelledError:
await db.update_session(
_active_sessions.get(chat_id, 0),
status=ResearchStatus.FINISHED
)
await update.message.reply_text("🛑 Research cancelled.")
except Exception as e:
logger.error("Research task failed", error=str(e))
await update.message.reply_text(f"❌ Research failed: {str(e)[:200]}")
finally:
await db_conn.close()
task = asyncio.create_task(run_research())
_active_tasks[chat_id] = task
async def cmd_status(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
if not is_authorized(update.effective_user.id):
return
chat_id = update.effective_chat.id
db_conn = await get_db()
db = ResearchDB(db_conn)
try:
session = await db.get_active_session(chat_id)
if not session:
# Try to find last session
cursor = await db_conn.execute(
"SELECT * FROM research_sessions WHERE telegram_chat_id = ? ORDER BY created_at DESC LIMIT 1",
(chat_id,)
)
row = await cursor.fetchone()
session = dict(row) if row else None
if not session:
await update.message.reply_text("No research sessions found. Start with /research <topic>")
return
stats = await db.get_session_stats(session["id"])
is_active = chat_id in _active_tasks and not _active_tasks[chat_id].done()
status_emoji = {"running": "🔄", "saturated": "", "finished": "🏁", "error": ""}
emoji = status_emoji.get(session["status"], "")
await update.message.reply_text(
f"{emoji} *Research Status*\n\n"
f"📝 Topic: `{session['topic']}`\n"
f"🔁 Status: `{session['status']}`\n"
f"🔢 Iterations: `{session.get('iterations', 0)}`\n"
f"📚 Total sources: `{stats.get('total', 0)}`\n"
f"✅ Scraped: `{stats.get('scraped', 0)}`\n"
f"❌ Failed: `{stats.get('failed', 0)}`\n"
f"⏳ Pending: `{stats.get('pending', 0)}`\n"
f"💬 Chunks: `{session.get('total_chunks', 0)}`\n"
f"📖 Words: `{session.get('total_words', 0):,}`\n"
f"{'🟢 Active' if is_active else '⚫ Idle'}",
parse_mode=ParseMode.MARKDOWN
)
finally:
await db_conn.close()
async def cmd_finish(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
if not is_authorized(update.effective_user.id):
return
chat_id = update.effective_chat.id
task = _active_tasks.get(chat_id)
if task and not task.done():
task.cancel()
await update.message.reply_text(
"🛑 Stopping research...\n"
"Use `/generate podcast|blog|report|thread` to generate output.",
parse_mode=ParseMode.MARKDOWN
)
else:
await update.message.reply_text(
"No active research. Use `/generate` to create output from last session.",
parse_mode=ParseMode.MARKDOWN
)
async def cmd_generate(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
if not is_authorized(update.effective_user.id):
return
chat_id = update.effective_chat.id
output_arg = ctx.args[0].lower() if ctx.args else ""
type_map = {
"podcast": OutputType.PODCAST,
"blog": OutputType.BLOG,
"report": OutputType.REPORT,
"thread": OutputType.THREAD,
"hilo": OutputType.THREAD,
"informe": OutputType.REPORT,
}
if output_arg not in type_map:
await update.message.reply_text(
"❌ Invalid output type.\n"
"Use: `/generate podcast|blog|report|thread`",
parse_mode=ParseMode.MARKDOWN
)
return
output_type = type_map[output_arg]
db_conn = await get_db()
db = ResearchDB(db_conn)
try:
# Find last session for this chat
cursor = await db_conn.execute(
"""SELECT * FROM research_sessions WHERE telegram_chat_id = ?
ORDER BY created_at DESC LIMIT 1""",
(chat_id,)
)
row = await cursor.fetchone()
if not row:
await update.message.reply_text("No research sessions found. Start with /research <topic>")
return
session = dict(row)
session_id = session["id"]
msg = await update.message.reply_text(
f"⚙️ Generating *{output_type}* for: `{session['topic']}`\n"
f"Using Ollama ({settings.ollama_model})...\n"
f"This may take 2-5 minutes ☕",
parse_mode=ParseMode.MARKDOWN
)
async def gen_progress(text):
try:
await msg.edit_text(text)
except Exception:
pass
ollama = OllamaClient()
processor = ContentProcessor(db, ollama)
generator = OutputGenerator(db, ollama, processor)
output = await generator.generate(session_id, output_type, gen_progress)
# Send as file if very long
if len(output) > 8000:
import tempfile
ext_map = {
OutputType.PODCAST: "script.md",
OutputType.BLOG: "post.md",
OutputType.REPORT: "report.md",
OutputType.THREAD: "thread.txt",
}
filename = f"researchowl_{session['topic'][:30].replace(' ', '_')}_{ext_map[output_type]}"
with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f:
f.write(output)
tmp_path = f.name
with open(tmp_path, "rb") as f:
await update.message.reply_document(
document=f,
filename=filename,
caption=f"📄 *{output_type.upper()}* — {session['topic']}\n"
f"Generated by ResearchOwl 🦉",
parse_mode=ParseMode.MARKDOWN
)
os.unlink(tmp_path)
else:
await send_chunked(update.message, output)
except Exception as e:
logger.error("Generate failed", error=str(e))
await update.message.reply_text(f"❌ Generation failed: {str(e)[:200]}")
finally:
await db_conn.close()
async def cmd_sources(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
if not is_authorized(update.effective_user.id):
return
chat_id = update.effective_chat.id
db_conn = await get_db()
db = ResearchDB(db_conn)
try:
cursor = await db_conn.execute(
"SELECT * FROM research_sessions WHERE telegram_chat_id = ? ORDER BY created_at DESC LIMIT 1",
(chat_id,)
)
row = await cursor.fetchone()
if not row:
await update.message.reply_text("No sessions found.")
return
session_id = row["id"]
sources = await db.get_all_sources(session_id)
by_type: dict = {}
for s in sources:
t = s["source_type"]
by_type.setdefault(t, []).append(s)
lines = [f"📚 *Sources for session #{session_id}*\n"]
for stype, srcs in by_type.items():
scraped = sum(1 for s in srcs if s["status"] == "scraped")
lines.append(f"\n*{stype.upper()}* ({scraped}/{len(srcs)} scraped)")
for s in srcs[:5]: # show top 5 per type
quality = s.get("quality_score", 0)
status_icon = {"scraped": "", "failed": "", "pending": "", "skipped": "⏭️"}.get(s["status"], "")
title = (s.get("title") or s["url"])[:50]
lines.append(f"{status_icon} {title} (q:{quality:.1f})")
if len(srcs) > 5:
lines.append(f" ... and {len(srcs)-5} more")
await send_chunked(update.message, "\n".join(lines), parse_mode=ParseMode.MARKDOWN)
finally:
await db_conn.close()
async def cmd_outputs(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
if not is_authorized(update.effective_user.id):
return
chat_id = update.effective_chat.id
db_conn = await get_db()
db = ResearchDB(db_conn)
try:
cursor = await db_conn.execute(
"SELECT * FROM research_sessions WHERE telegram_chat_id = ? ORDER BY created_at DESC LIMIT 1",
(chat_id,)
)
row = await cursor.fetchone()
if not row:
await update.message.reply_text("No sessions found.")
return
outputs = await db.get_outputs(row["id"])
if not outputs:
await update.message.reply_text(
"No outputs generated yet. Use `/generate podcast|blog|report|thread`",
parse_mode=ParseMode.MARKDOWN
)
return
lines = [f"📄 *Outputs for: {row['topic']}*\n"]
for o in outputs:
from datetime import datetime
dt = datetime.utcfromtimestamp(o['created_at']).strftime("%Y-%m-%d %H:%M")
lines.append(f"• `{o['output_type']}` — {dt} ({len(o['content'])} chars)")
await update.message.reply_text(
"\n".join(lines),
parse_mode=ParseMode.MARKDOWN
)
finally:
await db_conn.close()
async def cmd_process(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
if not is_authorized(update.effective_user.id):
return
chat_id = update.effective_chat.id
db_conn = await get_db()
db = ResearchDB(db_conn)
try:
cursor = await db_conn.execute(
"SELECT * FROM research_sessions WHERE telegram_chat_id = ? ORDER BY created_at DESC LIMIT 1",
(chat_id,)
)
row = await cursor.fetchone()
if not row:
await update.message.reply_text("No research sessions found. Start with /research <topic>")
return
session = dict(row)
session_id = session["id"]
topic = session["topic"]
msg = await update.message.reply_text(
f"🧠 Processing session #{session_id}: `{topic}`\n"
f"Chunking & scoring with Ollama ({settings.ollama_model})...\n"
f"This may take a few minutes.",
parse_mode=ParseMode.MARKDOWN
)
ollama = OllamaClient()
if not await ollama.is_available():
await msg.edit_text("❌ Ollama not reachable. Check OLLAMA_URL setting.")
return
processor = ContentProcessor(db, ollama)
async def proc_progress(total_chunks, total_words):
try:
await msg.edit_text(
f"🧠 *Processing complete!*\n"
f"• Chunks stored: `{total_chunks}`\n"
f"• Words researched: `{total_words:,}`\n\n"
f"Ready! Use `/generate podcast|blog|report|thread`\n"
f"_If 0 chunks: set `QUALITY_THRESHOLD=0.3` or `0` and retry_",
parse_mode=ParseMode.MARKDOWN
)
except Exception:
pass
await processor.process_session(session_id, topic, proc_progress)
except Exception as e:
logger.error("Process command failed", error=str(e))
await update.message.reply_text(f"❌ Processing failed: {str(e)[:200]}")
finally:
await db_conn.close()
async def cmd_cancel(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
if not is_authorized(update.effective_user.id):
return
chat_id = update.effective_chat.id
task = _active_tasks.get(chat_id)
if task and not task.done():
task.cancel()
await update.message.reply_text("🛑 Research cancelled.")
else:
await update.message.reply_text("No active research to cancel.")
async def cmd_help(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
await cmd_start(update, ctx)
# ─── Bot setup ────────────────────────────────────────────────────────────────
def create_bot() -> Application:
app = Application.builder().token(settings.telegram_bot_token).build()
app.add_handler(CommandHandler("start", cmd_start))
app.add_handler(CommandHandler("help", cmd_help))
app.add_handler(CommandHandler("research", cmd_research))
app.add_handler(CommandHandler("status", cmd_status))
app.add_handler(CommandHandler("finish", cmd_finish))
app.add_handler(CommandHandler("generate", cmd_generate))
app.add_handler(CommandHandler("sources", cmd_sources))
app.add_handler(CommandHandler("outputs", cmd_outputs))
app.add_handler(CommandHandler("process", cmd_process))
app.add_handler(CommandHandler("cancel", cmd_cancel))
return app
def run():
logger.info("Starting ResearchOwl bot")
app = create_bot()
app.run_polling(allowed_updates=Update.ALL_TYPES)