fix: add /process command, log quality filtering, improve Reddit headers
Build & Deploy ResearchOwl / build-and-push (push) Successful in 5s

- bot.py: add cmd_process handler to manually trigger chunk processing
  on the last session; register CommandHandler("process")
- processor.py: log exceptions returned by asyncio.gather instead of
  silently dropping them; add per-chunk quality-score debug logging;
  warn, with an actionable hint, when all chunks are filtered out by
  the quality threshold; raise the fallback score to 0.6 so Ollama
  failures don't cause chunks to be filtered
- exhaustive.py: replace the bot User-Agent with a full browser UA and
  add a dedicated REDDIT_HEADERS set of browser-like headers; downgrade
  Reddit 403 from warning to info, since server IPs are routinely
  blocked; pass content_type=None to json() to avoid aiohttp
  content-type mismatch errors

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
ChemaVX
2026-04-27 20:37:39 +00:00
parent bb8171359d
commit 0c7176dd0b
3 changed files with 119 additions and 19 deletions
+60
View File
@@ -66,6 +66,7 @@ async def cmd_start(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
"`/research <topic>` — Start exhaustive research\n" "`/research <topic>` — Start exhaustive research\n"
"`/status` — Check current research progress\n" "`/status` — Check current research progress\n"
"`/finish` — Stop research and proceed to generation\n" "`/finish` — Stop research and proceed to generation\n"
"`/process` — Manually trigger chunk processing\n"
"`/generate <type>` — Generate output (podcast|blog|report|thread)\n" "`/generate <type>` — Generate output (podcast|blog|report|thread)\n"
"`/sources` — List all sources found\n" "`/sources` — List all sources found\n"
"`/outputs` — List generated outputs\n" "`/outputs` — List generated outputs\n"
@@ -426,6 +427,64 @@ async def cmd_outputs(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
await db_conn.close() await db_conn.close()
async def cmd_process(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
    """Handle /process: manually run chunk processing on the latest session.

    Looks up the most recently created research session for the calling
    chat, verifies that Ollama is reachable, and then runs
    ``ContentProcessor.process_session`` on that session. Status and errors
    are reported back to the chat by replying to / editing messages.

    Args:
        update: Incoming Telegram update carrying the command.
        ctx: Handler context (not used by this command).
    """
    if not is_authorized(update.effective_user.id):
        return

    chat_id = update.effective_chat.id
    db_conn = await get_db()
    db = ResearchDB(db_conn)
    try:
        cursor = await db_conn.execute(
            "SELECT * FROM research_sessions WHERE telegram_chat_id = ? ORDER BY created_at DESC LIMIT 1",
            (chat_id,)
        )
        row = await cursor.fetchone()
        if not row:
            await update.message.reply_text("No research sessions found. Start with /research <topic>")
            return

        session = dict(row)
        session_id = session["id"]
        topic = session["topic"]

        # The topic is user-supplied and rendered inside a Markdown code span;
        # a stray backtick would unbalance the entity and make Telegram reject
        # the whole message, so neutralise it for display only.
        display_topic = str(topic).replace("`", "'")

        msg = await update.message.reply_text(
            f"🧠 Processing session #{session_id}: `{display_topic}`\n"
            f"Chunking & scoring with Ollama ({settings.ollama_model})...\n"
            f"This may take a few minutes.",
            parse_mode=ParseMode.MARKDOWN
        )

        ollama = OllamaClient()
        if not await ollama.is_available():
            await msg.edit_text("❌ Ollama not reachable. Check OLLAMA_URL setting.")
            return

        processor = ContentProcessor(db, ollama)

        async def proc_progress(total_chunks, total_words):
            """Replace the status message with the final processing summary."""
            try:
                # The hint line is deliberately NOT wrapped in `_..._` italics:
                # legacy Markdown does not allow nested entities, and the
                # underscore in QUALITY_THRESHOLD would terminate the italic
                # early and leave an unbalanced code span.
                await msg.edit_text(
                    f"🧠 *Processing complete!*\n"
                    f"• Chunks stored: `{total_chunks}`\n"
                    f"• Words researched: `{total_words:,}`\n\n"
                    f"Ready! Use `/generate podcast|blog|report|thread`\n"
                    f"If 0 chunks: set `QUALITY_THRESHOLD=0.3` or `0` and retry",
                    parse_mode=ParseMode.MARKDOWN
                )
            except Exception as exc:
                # Best-effort UI update: never fail processing because the
                # summary could not be shown, but leave a trace in the logs.
                logger.warning("Process summary message update failed", error=str(exc))

        await processor.process_session(session_id, topic, proc_progress)

    except Exception as e:
        logger.error("Process command failed", error=str(e))
        await update.message.reply_text(f"❌ Processing failed: {str(e)[:200]}")
    finally:
        await db_conn.close()
async def cmd_cancel(update: Update, ctx: ContextTypes.DEFAULT_TYPE): async def cmd_cancel(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
if not is_authorized(update.effective_user.id): if not is_authorized(update.effective_user.id):
return return
@@ -456,6 +515,7 @@ def create_bot() -> Application:
app.add_handler(CommandHandler("generate", cmd_generate)) app.add_handler(CommandHandler("generate", cmd_generate))
app.add_handler(CommandHandler("sources", cmd_sources)) app.add_handler(CommandHandler("sources", cmd_sources))
app.add_handler(CommandHandler("outputs", cmd_outputs)) app.add_handler(CommandHandler("outputs", cmd_outputs))
app.add_handler(CommandHandler("process", cmd_process))
app.add_handler(CommandHandler("cancel", cmd_cancel)) app.add_handler(CommandHandler("cancel", cmd_cancel))
return app return app
+35 -9
View File
@@ -137,8 +137,11 @@ class ContentProcessor:
results = await asyncio.gather(*[process_one(s) for s in scraped], results = await asyncio.gather(*[process_one(s) for s in scraped],
return_exceptions=True) return_exceptions=True)
for r in results: for i, r in enumerate(results):
if isinstance(r, int): if isinstance(r, Exception):
logger.error("Source processing raised exception",
source_id=scraped[i]["id"], error=str(r), exc_info=r)
elif isinstance(r, int):
total_chunks += r total_chunks += r
total_words = sum(s.get("word_count", 0) for s in scraped) total_words = sum(s.get("word_count", 0) for s in scraped)
@@ -159,17 +162,27 @@ class ContentProcessor:
content = await self.db.get_source_content(source_id) content = await self.db.get_source_content(source_id)
if not content: if not content:
logger.warning("No content in source_contents", source_id=source_id)
return 0 return 0
chunks = simple_chunk(content, settings.chunk_size, settings.chunk_overlap) chunks = simple_chunk(content, settings.chunk_size, settings.chunk_overlap)
logger.info("Processing source", source_id=source_id,
content_len=len(content), num_chunks=len(chunks),
quality_threshold=settings.quality_threshold)
stored = 0 stored = 0
filtered_quality = 0
for i, chunk in enumerate(chunks): for i, chunk in enumerate(chunks):
if len(chunk.split()) < 30: words = len(chunk.split())
if words < 30:
continue continue
quality = await self._score_quality(chunk, topic) quality = await self._score_quality(chunk, topic)
if quality < settings.quality_threshold: if quality < settings.quality_threshold:
filtered_quality += 1
logger.debug("Chunk filtered by quality", source_id=source_id,
chunk_index=i, quality=round(quality, 2),
threshold=settings.quality_threshold, words=words)
continue continue
embedding = await self.ollama.embed(chunk[:1000]) embedding = await self.ollama.embed(chunk[:1000])
@@ -179,12 +192,22 @@ class ContentProcessor:
source_id=source_id, source_id=source_id,
content=chunk, content=chunk,
chunk_index=i, chunk_index=i,
token_count=len(chunk.split()), token_count=words,
quality_score=quality, quality_score=quality,
embedding=embedding embedding=embedding
) )
stored += 1 stored += 1
if filtered_quality > 0 and stored == 0:
logger.warning(
"All chunks filtered by quality — consider lowering QUALITY_THRESHOLD "
"(currently %.1f) or set QUALITY_THRESHOLD=0 to disable",
settings.quality_threshold,
source_id=source_id, chunks_total=len(chunks),
chunks_filtered=filtered_quality
)
logger.info("Source processed", source_id=source_id, stored=stored)
return stored return stored
async def _score_quality(self, chunk: str, topic: str) -> float: async def _score_quality(self, chunk: str, topic: str) -> float:
@@ -204,14 +227,17 @@ Respond with ONLY a single number 0-10. No explanation."""
try: try:
response = await self.ollama.generate(prompt) response = await self.ollama.generate(prompt)
# Extract number from response
numbers = re.findall(r'\b(\d+(?:\.\d+)?)\b', response) numbers = re.findall(r'\b(\d+(?:\.\d+)?)\b', response)
if numbers: if numbers:
score = float(numbers[0]) score = float(numbers[0])
return min(1.0, score / 10.0) normalized = min(1.0, score / 10.0)
return 0.5 logger.debug("Quality score", raw=score, normalized=round(normalized, 2))
except Exception: return normalized
return 0.5 # default on error logger.debug("No number in quality response", response=response[:80])
return 0.6 # above threshold so chunk is kept
except Exception as e:
logger.warning("Quality scoring failed", error=str(e))
return 0.6 # above threshold so chunk is kept on Ollama error
async def rag_query(self, session_id: int, query: str, top_k: int = 20) -> str: async def rag_query(self, session_id: int, query: str, top_k: int = 20) -> str:
""" """
+24 -10
View File
@@ -23,9 +23,21 @@ from src.db.database import ResearchDB
logger = structlog.get_logger() logger = structlog.get_logger()
HEADERS = { HEADERS = {
"User-Agent": "Mozilla/5.0 (compatible; ResearchOwl/1.0; +https://chemavx.xyz)", "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.9,es;q=0.8", "Accept-Language": "en-US,en;q=0.9,es;q=0.8",
"Accept-Encoding": "gzip, deflate, br",
"DNT": "1",
}
# Reddit requires its own headers — generic bots get 403
REDDIT_HEADERS = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
"Accept": "application/json, text/javascript, */*; q=0.01",
"Accept-Language": "en-US,en;q=0.9",
"Accept-Encoding": "gzip, deflate, br",
"Referer": "https://www.reddit.com/",
"X-Requested-With": "XMLHttpRequest",
} }
# Domains to skip — not useful for research # Domains to skip — not useful for research
@@ -177,10 +189,10 @@ class ExhaustiveScraper:
"""Search Reddit — sequential to avoid rate limiting""" """Search Reddit — sequential to avoid rate limiting"""
try: try:
http = await self._get_http() http = await self._get_http()
url = f"https://www.reddit.com/search.json?q={quote_plus(self.topic)}&sort=top&limit=15" url = f"https://www.reddit.com/search.json?q={quote_plus(self.topic)}&sort=top&limit=15&type=link"
async with http.get(url, headers={**HEADERS, "User-Agent": "ResearchOwl/1.0"}) as resp: async with http.get(url, headers=REDDIT_HEADERS) as resp:
if resp.status == 200: if resp.status == 200:
data = await resp.json() data = await resp.json(content_type=None)
posts = data.get("data", {}).get("children", []) posts = data.get("data", {}).get("children", [])
for post in posts: for post in posts:
post_data = post.get("data", {}) post_data = post.get("data", {})
@@ -192,6 +204,8 @@ class ExhaustiveScraper:
title=post_data.get("title") title=post_data.get("title")
) )
logger.info("Reddit seed", found=len(posts), status=resp.status) logger.info("Reddit seed", found=len(posts), status=resp.status)
elif resp.status == 403:
logger.info("Reddit seed blocked (403) — server IP likely blocked by Reddit; skipping")
else: else:
logger.warning("Reddit seed non-200", status=resp.status) logger.warning("Reddit seed non-200", status=resp.status)
except Exception as e: except Exception as e:
@@ -446,14 +460,14 @@ class ExhaustiveScraper:
json_url = url.rstrip("/") + ".json?limit=100&sort=top" json_url = url.rstrip("/") + ".json?limit=100&sort=top"
http = await self._get_http() http = await self._get_http()
try: try:
async with http.get( async with http.get(json_url, headers=REDDIT_HEADERS) as resp:
json_url, if resp.status == 403:
headers={**HEADERS, "User-Agent": "ResearchOwl/1.0"} logger.info("Reddit post blocked (403) — skipping", url=url[:60])
) as resp: return None, None
if resp.status != 200: if resp.status != 200:
logger.debug("Reddit non-200", status=resp.status, url=url[:60]) logger.debug("Reddit non-200", status=resp.status, url=url[:60])
return None, None return None, None
data = await resp.json() data = await resp.json(content_type=None)
post = data[0]["data"]["children"][0]["data"] post = data[0]["data"]["children"][0]["data"]
title = post.get("title", "") title = post.get("title", "")