fix: add /process command, log quality filtering, improve Reddit headers
Build & Deploy ResearchOwl / build-and-push (push) Successful in 5s

- bot.py: add cmd_process handler to manually trigger chunk processing
  on the last session; register CommandHandler("process")
- processor.py: log exceptions returned by asyncio.gather instead of
  silently dropping them; add per-chunk quality-score debug logging;
  warn, with an actionable hint, when every chunk of a source is
  filtered out by the quality threshold; raise the fallback score from
  0.5 to 0.6 so Ollama failures and unparseable responses don't
  filter chunks
- exhaustive.py: replace the bot User-Agent in REDDIT_HEADERS with a
  full browser User-Agent plus accompanying browser headers; downgrade
  Reddit 403 responses from warning to info, since server IPs are
  routinely blocked; pass content_type=None to json() so aiohttp does
  not raise on a Content-Type mismatch

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
ChemaVX
2026-04-27 20:37:39 +00:00
parent bb8171359d
commit 0c7176dd0b
3 changed files with 119 additions and 19 deletions
+35 -9
View File
@@ -137,8 +137,11 @@ class ContentProcessor:
results = await asyncio.gather(*[process_one(s) for s in scraped],
return_exceptions=True)
for r in results:
if isinstance(r, int):
for i, r in enumerate(results):
if isinstance(r, Exception):
logger.error("Source processing raised exception",
source_id=scraped[i]["id"], error=str(r), exc_info=r)
elif isinstance(r, int):
total_chunks += r
total_words = sum(s.get("word_count", 0) for s in scraped)
@@ -159,17 +162,27 @@ class ContentProcessor:
content = await self.db.get_source_content(source_id)
if not content:
logger.warning("No content in source_contents", source_id=source_id)
return 0
chunks = simple_chunk(content, settings.chunk_size, settings.chunk_overlap)
logger.info("Processing source", source_id=source_id,
content_len=len(content), num_chunks=len(chunks),
quality_threshold=settings.quality_threshold)
stored = 0
filtered_quality = 0
for i, chunk in enumerate(chunks):
if len(chunk.split()) < 30:
words = len(chunk.split())
if words < 30:
continue
quality = await self._score_quality(chunk, topic)
if quality < settings.quality_threshold:
filtered_quality += 1
logger.debug("Chunk filtered by quality", source_id=source_id,
chunk_index=i, quality=round(quality, 2),
threshold=settings.quality_threshold, words=words)
continue
embedding = await self.ollama.embed(chunk[:1000])
@@ -179,12 +192,22 @@ class ContentProcessor:
source_id=source_id,
content=chunk,
chunk_index=i,
token_count=len(chunk.split()),
token_count=words,
quality_score=quality,
embedding=embedding
)
stored += 1
if filtered_quality > 0 and stored == 0:
logger.warning(
"All chunks filtered by quality — consider lowering QUALITY_THRESHOLD "
"(currently %.1f) or set QUALITY_THRESHOLD=0 to disable",
settings.quality_threshold,
source_id=source_id, chunks_total=len(chunks),
chunks_filtered=filtered_quality
)
logger.info("Source processed", source_id=source_id, stored=stored)
return stored
async def _score_quality(self, chunk: str, topic: str) -> float:
@@ -204,14 +227,17 @@ Respond with ONLY a single number 0-10. No explanation."""
try:
response = await self.ollama.generate(prompt)
# Extract number from response
numbers = re.findall(r'\b(\d+(?:\.\d+)?)\b', response)
if numbers:
score = float(numbers[0])
return min(1.0, score / 10.0)
return 0.5
except Exception:
return 0.5 # default on error
normalized = min(1.0, score / 10.0)
logger.debug("Quality score", raw=score, normalized=round(normalized, 2))
return normalized
logger.debug("No number in quality response", response=response[:80])
return 0.6 # above threshold so chunk is kept
except Exception as e:
logger.warning("Quality scoring failed", error=str(e))
return 0.6 # above threshold so chunk is kept on Ollama error
async def rag_query(self, session_id: int, query: str, top_k: int = 20) -> str:
"""