fix: add /process command, log quality filtering, improve Reddit headers

- bot.py: add cmd_process handler to manually trigger chunk processing on the last session; register CommandHandler("process")
- processor.py: log exceptions from asyncio.gather instead of silently dropping them; add per-chunk quality score debug logging; warn, with an actionable hint, when all chunks are filtered by the quality threshold; raise fallback score to 0.6 so Ollama failures don't filter chunks
- exhaustive.py: replace bot User-Agent with full browser UA + headers for REDDIT_HEADERS; downgrade Reddit 403 from warning to info since server IPs are routinely blocked; use content_type=None on json() to avoid aiohttp content-type mismatch errors

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-27 20:37:39 +00:00
parent bb8171359d
commit 0c7176dd0b
3 changed files with 119 additions and 19 deletions
@@ -137,8 +137,11 @@ class ContentProcessor:
        results = await asyncio.gather(*[process_one(s) for s in scraped],
                                        return_exceptions=True)

-        for r in results:
-            if isinstance(r, int):
+        for i, r in enumerate(results):
+            if isinstance(r, Exception):
+                logger.error("Source processing raised exception",
+                             source_id=scraped[i]["id"], error=str(r), exc_info=r)
+            elif isinstance(r, int):
                total_chunks += r

        total_words = sum(s.get("word_count", 0) for s in scraped)
@@ -159,17 +162,27 @@ class ContentProcessor:

        content = await self.db.get_source_content(source_id)
        if not content:
+            logger.warning("No content in source_contents", source_id=source_id)
            return 0

        chunks = simple_chunk(content, settings.chunk_size, settings.chunk_overlap)
+        logger.info("Processing source", source_id=source_id,
+                    content_len=len(content), num_chunks=len(chunks),
+                    quality_threshold=settings.quality_threshold)
        stored = 0
+        filtered_quality = 0

        for i, chunk in enumerate(chunks):
-            if len(chunk.split()) < 30:
+            words = len(chunk.split())
+            if words < 30:
                continue

            quality = await self._score_quality(chunk, topic)
            if quality < settings.quality_threshold:
+                filtered_quality += 1
+                logger.debug("Chunk filtered by quality", source_id=source_id,
+                             chunk_index=i, quality=round(quality, 2),
+                             threshold=settings.quality_threshold, words=words)
                continue

            embedding = await self.ollama.embed(chunk[:1000])
@@ -179,12 +192,22 @@ class ContentProcessor:
                source_id=source_id,
                content=chunk,
                chunk_index=i,
-                token_count=len(chunk.split()),
+                token_count=words,
                quality_score=quality,
                embedding=embedding
            )
            stored += 1

+        if filtered_quality > 0 and stored == 0:
+            logger.warning(
+                "All chunks filtered by quality — consider lowering QUALITY_THRESHOLD "
+                "(currently %.1f) or set QUALITY_THRESHOLD=0 to disable",
+                settings.quality_threshold,
+                source_id=source_id, chunks_total=len(chunks),
+                chunks_filtered=filtered_quality
+            )
+
+        logger.info("Source processed", source_id=source_id, stored=stored)
        return stored

    async def _score_quality(self, chunk: str, topic: str) -> float:
@@ -204,14 +227,17 @@ Respond with ONLY a single number 0-10. No explanation."""

        try:
            response = await self.ollama.generate(prompt)
-            # Extract number from response
            numbers = re.findall(r'\b(\d+(?:\.\d+)?)\b', response)
            if numbers:
                score = float(numbers[0])
-                return min(1.0, score / 10.0)
-            return 0.5
-        except Exception:
-            return 0.5  # default on error
+                normalized = min(1.0, score / 10.0)
+                logger.debug("Quality score", raw=score, normalized=round(normalized, 2))
+                return normalized
+            logger.debug("No number in quality response", response=response[:80])
+            return 0.6  # above threshold so chunk is kept
+        except Exception as e:
+            logger.warning("Quality scoring failed", error=str(e))
+            return 0.6  # above threshold so chunk is kept on Ollama error

    async def rag_query(self, session_id: int, query: str, top_k: int = 20) -> str:
        """