fix: add /process command, log quality filtering, improve Reddit headers
Build & Deploy ResearchOwl / build-and-push (push) Successful in 5s
Build & Deploy ResearchOwl / build-and-push (push) Successful in 5s
- bot.py: add cmd_process handler to manually trigger chunk processing
on the last session; register CommandHandler("process")
- processor.py: log exceptions returned by asyncio.gather instead of
silently dropping them; add per-chunk quality-score debug logging; warn,
with an actionable hint, when every chunk is filtered out by the quality
threshold; raise the fallback score from 0.5 to 0.6 so Ollama failures don't filter chunks
- exhaustive.py: replace the bot User-Agent in REDDIT_HEADERS with a full
browser User-Agent plus browser headers; downgrade Reddit 403 from warning
to info, since server IPs are routinely blocked; pass content_type=None to
json() to avoid aiohttp content-type mismatch errors
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -137,8 +137,11 @@ class ContentProcessor:
|
||||
results = await asyncio.gather(*[process_one(s) for s in scraped],
|
||||
return_exceptions=True)
|
||||
|
||||
for r in results:
|
||||
if isinstance(r, int):
|
||||
for i, r in enumerate(results):
|
||||
if isinstance(r, Exception):
|
||||
logger.error("Source processing raised exception",
|
||||
source_id=scraped[i]["id"], error=str(r), exc_info=r)
|
||||
elif isinstance(r, int):
|
||||
total_chunks += r
|
||||
|
||||
total_words = sum(s.get("word_count", 0) for s in scraped)
|
||||
@@ -159,17 +162,27 @@ class ContentProcessor:
|
||||
|
||||
content = await self.db.get_source_content(source_id)
|
||||
if not content:
|
||||
logger.warning("No content in source_contents", source_id=source_id)
|
||||
return 0
|
||||
|
||||
chunks = simple_chunk(content, settings.chunk_size, settings.chunk_overlap)
|
||||
logger.info("Processing source", source_id=source_id,
|
||||
content_len=len(content), num_chunks=len(chunks),
|
||||
quality_threshold=settings.quality_threshold)
|
||||
stored = 0
|
||||
filtered_quality = 0
|
||||
|
||||
for i, chunk in enumerate(chunks):
|
||||
if len(chunk.split()) < 30:
|
||||
words = len(chunk.split())
|
||||
if words < 30:
|
||||
continue
|
||||
|
||||
quality = await self._score_quality(chunk, topic)
|
||||
if quality < settings.quality_threshold:
|
||||
filtered_quality += 1
|
||||
logger.debug("Chunk filtered by quality", source_id=source_id,
|
||||
chunk_index=i, quality=round(quality, 2),
|
||||
threshold=settings.quality_threshold, words=words)
|
||||
continue
|
||||
|
||||
embedding = await self.ollama.embed(chunk[:1000])
|
||||
@@ -179,12 +192,22 @@ class ContentProcessor:
|
||||
source_id=source_id,
|
||||
content=chunk,
|
||||
chunk_index=i,
|
||||
token_count=len(chunk.split()),
|
||||
token_count=words,
|
||||
quality_score=quality,
|
||||
embedding=embedding
|
||||
)
|
||||
stored += 1
|
||||
|
||||
if filtered_quality > 0 and stored == 0:
|
||||
logger.warning(
|
||||
"All chunks filtered by quality — consider lowering QUALITY_THRESHOLD "
|
||||
"(currently %.1f) or set QUALITY_THRESHOLD=0 to disable",
|
||||
settings.quality_threshold,
|
||||
source_id=source_id, chunks_total=len(chunks),
|
||||
chunks_filtered=filtered_quality
|
||||
)
|
||||
|
||||
logger.info("Source processed", source_id=source_id, stored=stored)
|
||||
return stored
|
||||
|
||||
async def _score_quality(self, chunk: str, topic: str) -> float:
|
||||
@@ -204,14 +227,17 @@ Respond with ONLY a single number 0-10. No explanation."""
|
||||
|
||||
try:
|
||||
response = await self.ollama.generate(prompt)
|
||||
# Extract number from response
|
||||
numbers = re.findall(r'\b(\d+(?:\.\d+)?)\b', response)
|
||||
if numbers:
|
||||
score = float(numbers[0])
|
||||
return min(1.0, score / 10.0)
|
||||
return 0.5
|
||||
except Exception:
|
||||
return 0.5 # default on error
|
||||
normalized = min(1.0, score / 10.0)
|
||||
logger.debug("Quality score", raw=score, normalized=round(normalized, 2))
|
||||
return normalized
|
||||
logger.debug("No number in quality response", response=response[:80])
|
||||
return 0.6 # above threshold so chunk is kept
|
||||
except Exception as e:
|
||||
logger.warning("Quality scoring failed", error=str(e))
|
||||
return 0.6 # above threshold so chunk is kept on Ollama error
|
||||
|
||||
async def rag_query(self, session_id: int, query: str, top_k: int = 20) -> str:
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user