fix: add /process command, log quality filtering, improve Reddit headers

- bot.py: add cmd_process handler to manually trigger chunk processing on the last session; register CommandHandler("process")
- processor.py: log exceptions from asyncio.gather instead of silently dropping them; add per-chunk quality score debug logging; warn, with an actionable hint, when all chunks are filtered by the quality threshold; raise fallback score to 0.6 so Ollama failures don't filter chunks
- exhaustive.py: replace bot User-Agent with full browser UA + headers for REDDIT_HEADERS; downgrade Reddit 403 from warning to info since server IPs are routinely blocked; use content_type=None on json() to avoid aiohttp content-type mismatch errors

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-27 20:37:39 +00:00
parent bb8171359d
commit 0c7176dd0b
3 changed files with 119 additions and 19 deletions
@@ -66,6 +66,7 @@ async def cmd_start(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
        "`/research <topic>` — Start exhaustive research\n"
        "`/status` — Check current research progress\n"
        "`/finish` — Stop research and proceed to generation\n"
        "`/process` — Manually trigger chunk processing\n"
        "`/generate <type>` — Generate output (podcast|blog|report|thread)\n"
        "`/sources` — List all sources found\n"
        "`/outputs` — List generated outputs\n"
@@ -426,6 +427,64 @@ async def cmd_outputs(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
        await db_conn.close()
 async def cmd_process(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
    """Handle `/process`: run chunk processing on this chat's latest session.

    Looks up the most recently created row in `research_sessions` for the
    current chat, checks that Ollama is reachable, and then hands the session
    to `ContentProcessor.process_session`. Status is reported by sending one
    message and editing it in place.

    Args:
        update: Incoming Telegram update that carried the command.
        ctx: Handler context (unused here).

    Returns:
        None. All feedback is sent to the chat; unauthorized users get no reply.
    """
    if not is_authorized(update.effective_user.id):
        return

    # CommandHandler also fires for edited messages, in which case
    # `update.message` is None; `effective_message` covers both cases.
    message = update.effective_message
    chat_id = update.effective_chat.id
    db_conn = await get_db()
    db = ResearchDB(db_conn)

    try:
        # Newest session for this chat, regardless of its state.
        cursor = await db_conn.execute(
            "SELECT * FROM research_sessions WHERE telegram_chat_id = ? ORDER BY created_at DESC LIMIT 1",
            (chat_id,)
        )
        row = await cursor.fetchone()
        if not row:
            await message.reply_text("No research sessions found. Start with /research <topic>")
            return

        session = dict(row)
        session_id = session["id"]
        topic = session["topic"]

        msg = await message.reply_text(
            f"🧠 Processing session #{session_id}: `{topic}`\n"
            f"Chunking & scoring with Ollama ({settings.ollama_model})...\n"
            f"This may take a few minutes.",
            parse_mode=ParseMode.MARKDOWN
        )

        ollama = OllamaClient()
        if not await ollama.is_available():
            await msg.edit_text("❌ Ollama not reachable. Check OLLAMA_URL setting.")
            return

        processor = ContentProcessor(db, ollama)

        async def proc_progress(total_chunks, total_words):
            # Passed to process_session as its progress callback. Editing the
            # status message is best-effort: a failed edit must not abort
            # processing, but it is logged so it does not vanish silently.
            try:
                await msg.edit_text(
                    f"🧠 *Processing complete!*\n"
                    f"• Chunks stored: `{total_chunks}`\n"
                    f"• Words researched: `{total_words:,}`\n\n"
                    f"Ready! Use `/generate podcast|blog|report|thread`\n"
                    f"_If 0 chunks: set `QUALITY_THRESHOLD=0.3` or `0` and retry_",
                    parse_mode=ParseMode.MARKDOWN
                )
            except Exception as edit_err:
                logger.warning("Could not update processing status message",
                               session_id=session_id, error=str(edit_err))

        await processor.process_session(session_id, topic, proc_progress)

    except Exception as e:
        # Top-level boundary for the command: keep the traceback in the log
        # and give the user a truncated reason.
        logger.error("Process command failed", error=str(e), exc_info=True)
        await message.reply_text(f"❌ Processing failed: {str(e)[:200]}")
    finally:
        await db_conn.close()
 async def cmd_cancel(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
    if not is_authorized(update.effective_user.id):
        return
@@ -456,6 +515,7 @@ def create_bot() -> Application:
    app.add_handler(CommandHandler("generate", cmd_generate))
    app.add_handler(CommandHandler("sources", cmd_sources))
    app.add_handler(CommandHandler("outputs", cmd_outputs))
    app.add_handler(CommandHandler("process", cmd_process))
    app.add_handler(CommandHandler("cancel", cmd_cancel))
    return app
@@ -137,8 +137,11 @@ class ContentProcessor:
        results = await asyncio.gather(*[process_one(s) for s in scraped],
                                        return_exceptions=True)
-        for r in results:
+        for i, r in enumerate(results):
-            if isinstance(r, int):
+            if isinstance(r, Exception):
                logger.error("Source processing raised exception",
                             source_id=scraped[i]["id"], error=str(r), exc_info=r)
            elif isinstance(r, int):
                total_chunks += r
        total_words = sum(s.get("word_count", 0) for s in scraped)
@@ -159,17 +162,27 @@ class ContentProcessor:
        content = await self.db.get_source_content(source_id)
        if not content:
            logger.warning("No content in source_contents", source_id=source_id)
            return 0
        chunks = simple_chunk(content, settings.chunk_size, settings.chunk_overlap)
        logger.info("Processing source", source_id=source_id,
                    content_len=len(content), num_chunks=len(chunks),
                    quality_threshold=settings.quality_threshold)
        stored = 0
        filtered_quality = 0
        for i, chunk in enumerate(chunks):
-            if len(chunk.split()) < 30:
+            words = len(chunk.split())
            if words < 30:
                continue
            quality = await self._score_quality(chunk, topic)
            if quality < settings.quality_threshold:
                filtered_quality += 1
                logger.debug("Chunk filtered by quality", source_id=source_id,
                             chunk_index=i, quality=round(quality, 2),
                             threshold=settings.quality_threshold, words=words)
                continue
            embedding = await self.ollama.embed(chunk[:1000])
@@ -179,12 +192,22 @@ class ContentProcessor:
                source_id=source_id,
                content=chunk,
                chunk_index=i,
-                token_count=len(chunk.split()),
+                token_count=words,
                quality_score=quality,
                embedding=embedding
            )
            stored += 1
        if filtered_quality > 0 and stored == 0:
            logger.warning(
                "All chunks filtered by quality — consider lowering QUALITY_THRESHOLD "
                "(currently %.1f) or set QUALITY_THRESHOLD=0 to disable",
                settings.quality_threshold,
                source_id=source_id, chunks_total=len(chunks),
                chunks_filtered=filtered_quality
            )
        logger.info("Source processed", source_id=source_id, stored=stored)
        return stored
    async def _score_quality(self, chunk: str, topic: str) -> float:
@@ -204,14 +227,17 @@ Respond with ONLY a single number 0-10. No explanation."""
        try:
            response = await self.ollama.generate(prompt)
            # Extract number from response
            numbers = re.findall(r'\b(\d+(?:\.\d+)?)\b', response)
            if numbers:
                score = float(numbers[0])
-                return min(1.0, score / 10.0)
+                normalized = min(1.0, score / 10.0)
-            return 0.5
+                logger.debug("Quality score", raw=score, normalized=round(normalized, 2))
-        except Exception:
+                return normalized
-            return 0.5  # default on error
+            logger.debug("No number in quality response", response=response[:80])
            return 0.6  # above threshold so chunk is kept
        except Exception as e:
            logger.warning("Quality scoring failed", error=str(e))
            return 0.6  # above threshold so chunk is kept on Ollama error
    async def rag_query(self, session_id: int, query: str, top_k: int = 20) -> str:
        """
@@ -23,9 +23,21 @@ from src.db.database import ResearchDB
 logger = structlog.get_logger()
 HEADERS = {
-    "User-Agent": "Mozilla/5.0 (compatible; ResearchOwl/1.0; +https://chemavx.xyz)",
+    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
-    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9,es;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "DNT": "1",
 }
 # Reddit requires its own headers — generic bots get 403.
 # This set imitates a desktop Chrome browser issuing an XHR from reddit.com:
 # a browser User-Agent, a JSON-first Accept, a reddit.com Referer and
 # X-Requested-With: XMLHttpRequest.
 REDDIT_HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Referer": "https://www.reddit.com/",
    "X-Requested-With": "XMLHttpRequest",
 }
 # Domains to skip — not useful for research
@@ -177,10 +189,10 @@ class ExhaustiveScraper:
        """Search Reddit — sequential to avoid rate limiting"""
        try:
            http = await self._get_http()
-            url = f"https://www.reddit.com/search.json?q={quote_plus(self.topic)}&sort=top&limit=15"
+            url = f"https://www.reddit.com/search.json?q={quote_plus(self.topic)}&sort=top&limit=15&type=link"
-            async with http.get(url, headers={**HEADERS, "User-Agent": "ResearchOwl/1.0"}) as resp:
+            async with http.get(url, headers=REDDIT_HEADERS) as resp:
                if resp.status == 200:
-                    data = await resp.json()
+                    data = await resp.json(content_type=None)
                    posts = data.get("data", {}).get("children", [])
                    for post in posts:
                        post_data = post.get("data", {})
@@ -192,6 +204,8 @@ class ExhaustiveScraper:
                                title=post_data.get("title")
                            )
                    logger.info("Reddit seed", found=len(posts), status=resp.status)
                elif resp.status == 403:
                    logger.info("Reddit seed blocked (403) — server IP likely blocked by Reddit; skipping")
                else:
                    logger.warning("Reddit seed non-200", status=resp.status)
        except Exception as e:
@@ -446,14 +460,14 @@ class ExhaustiveScraper:
        json_url = url.rstrip("/") + ".json?limit=100&sort=top"
        http = await self._get_http()
        try:
-            async with http.get(
+            async with http.get(json_url, headers=REDDIT_HEADERS) as resp:
-                json_url,
+                if resp.status == 403:
-                headers={**HEADERS, "User-Agent": "ResearchOwl/1.0"}
+                    logger.info("Reddit post blocked (403) — skipping", url=url[:60])
-            ) as resp:
+                    return None, None
                if resp.status != 200:
                    logger.debug("Reddit non-200", status=resp.status, url=url[:60])
                    return None, None
-                data = await resp.json()
+                data = await resp.json(content_type=None)
            post = data[0]["data"]["children"][0]["data"]
            title = post.get("title", "")