fix: add /process command, log quality filtering, improve Reddit headers
Build & Deploy ResearchOwl / build-and-push (push) Successful in 5s
Build & Deploy ResearchOwl / build-and-push (push) Successful in 5s
- bot.py: add cmd_process handler to manually trigger chunk processing
on the last session; register CommandHandler("process")
- processor.py: log exceptions from asyncio.gather instead of silently
dropping them; add per-chunk quality score debug logging; warn when
all chunks filtered by quality threshold with actionable hint;
raise fallback score to 0.6 so Ollama failures don't filter chunks
- exhaustive.py: replace bot User-Agent with full browser UA + headers
for REDDIT_HEADERS; downgrade Reddit 403 from warning to info since
server IPs are routinely blocked; use content_type=None on json()
to avoid aiohttp content-type mismatch errors
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -66,6 +66,7 @@ async def cmd_start(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
|
|||||||
"`/research <topic>` — Start exhaustive research\n"
|
"`/research <topic>` — Start exhaustive research\n"
|
||||||
"`/status` — Check current research progress\n"
|
"`/status` — Check current research progress\n"
|
||||||
"`/finish` — Stop research and proceed to generation\n"
|
"`/finish` — Stop research and proceed to generation\n"
|
||||||
|
"`/process` — Manually trigger chunk processing\n"
|
||||||
"`/generate <type>` — Generate output (podcast|blog|report|thread)\n"
|
"`/generate <type>` — Generate output (podcast|blog|report|thread)\n"
|
||||||
"`/sources` — List all sources found\n"
|
"`/sources` — List all sources found\n"
|
||||||
"`/outputs` — List generated outputs\n"
|
"`/outputs` — List generated outputs\n"
|
||||||
@@ -426,6 +427,64 @@ async def cmd_outputs(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
|
|||||||
await db_conn.close()
|
await db_conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
async def cmd_process(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
    """Handle ``/process``: manually run chunk processing on the latest session.

    Selects the most recently created ``research_sessions`` row for the
    calling chat, checks that Ollama is reachable, and hands the session to
    ``ContentProcessor.process_session``. Status and failures are reported
    back to the chat. Unauthorized users are ignored without a reply.

    Args:
        update: Incoming Telegram update carrying the command message.
        ctx: Handler context (not used by this command).
    """
    if not is_authorized(update.effective_user.id):
        return

    chat_id = update.effective_chat.id
    db_conn = await get_db()
    db = ResearchDB(db_conn)

    try:
        # Only the newest session for this chat is considered.
        cursor = await db_conn.execute(
            "SELECT * FROM research_sessions WHERE telegram_chat_id = ? ORDER BY created_at DESC LIMIT 1",
            (chat_id,)
        )
        row = await cursor.fetchone()
        if not row:
            await update.message.reply_text("No research sessions found. Start with /research <topic>")
            return

        session = dict(row)
        session_id = session["id"]
        topic = session["topic"]

        # This status message is edited in place later (error or completion).
        msg = await update.message.reply_text(
            f"🧠 Processing session #{session_id}: `{topic}`\n"
            f"Chunking & scoring with Ollama ({settings.ollama_model})...\n"
            "This may take a few minutes.",
            parse_mode=ParseMode.MARKDOWN
        )

        ollama = OllamaClient()
        if not await ollama.is_available():
            await msg.edit_text("❌ Ollama not reachable. Check OLLAMA_URL setting.")
            return

        processor = ContentProcessor(db, ollama)

        async def proc_progress(total_chunks, total_words):
            """Callback handed to ``process_session``; rewrites the status message."""
            try:
                await msg.edit_text(
                    "🧠 *Processing complete!*\n"
                    f"• Chunks stored: `{total_chunks}`\n"
                    f"• Words researched: `{total_words:,}`\n\n"
                    "Ready! Use `/generate podcast|blog|report|thread`\n"
                    "_If 0 chunks: set `QUALITY_THRESHOLD=0.3` or `0` and retry_",
                    parse_mode=ParseMode.MARKDOWN
                )
            except Exception as e:
                # Updating the status message is best-effort and must not abort
                # processing, but the failure is logged instead of being
                # silently discarded.
                logger.warning("Failed to update processing status message", error=str(e))

        await processor.process_session(session_id, topic, proc_progress)

    except Exception as e:
        logger.error("Process command failed", error=str(e))
        # Truncate so the reply stays short even for very long error strings.
        await update.message.reply_text(f"❌ Processing failed: {str(e)[:200]}")
    finally:
        # Runs on every exit path, including the early returns above.
        await db_conn.close()
|
||||||
|
|
||||||
|
|
||||||
async def cmd_cancel(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
|
async def cmd_cancel(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
|
||||||
if not is_authorized(update.effective_user.id):
|
if not is_authorized(update.effective_user.id):
|
||||||
return
|
return
|
||||||
@@ -456,6 +515,7 @@ def create_bot() -> Application:
|
|||||||
app.add_handler(CommandHandler("generate", cmd_generate))
|
app.add_handler(CommandHandler("generate", cmd_generate))
|
||||||
app.add_handler(CommandHandler("sources", cmd_sources))
|
app.add_handler(CommandHandler("sources", cmd_sources))
|
||||||
app.add_handler(CommandHandler("outputs", cmd_outputs))
|
app.add_handler(CommandHandler("outputs", cmd_outputs))
|
||||||
|
app.add_handler(CommandHandler("process", cmd_process))
|
||||||
app.add_handler(CommandHandler("cancel", cmd_cancel))
|
app.add_handler(CommandHandler("cancel", cmd_cancel))
|
||||||
|
|
||||||
return app
|
return app
|
||||||
|
|||||||
@@ -137,8 +137,11 @@ class ContentProcessor:
|
|||||||
results = await asyncio.gather(*[process_one(s) for s in scraped],
|
results = await asyncio.gather(*[process_one(s) for s in scraped],
|
||||||
return_exceptions=True)
|
return_exceptions=True)
|
||||||
|
|
||||||
for r in results:
|
for i, r in enumerate(results):
|
||||||
if isinstance(r, int):
|
if isinstance(r, Exception):
|
||||||
|
logger.error("Source processing raised exception",
|
||||||
|
source_id=scraped[i]["id"], error=str(r), exc_info=r)
|
||||||
|
elif isinstance(r, int):
|
||||||
total_chunks += r
|
total_chunks += r
|
||||||
|
|
||||||
total_words = sum(s.get("word_count", 0) for s in scraped)
|
total_words = sum(s.get("word_count", 0) for s in scraped)
|
||||||
@@ -159,17 +162,27 @@ class ContentProcessor:
|
|||||||
|
|
||||||
content = await self.db.get_source_content(source_id)
|
content = await self.db.get_source_content(source_id)
|
||||||
if not content:
|
if not content:
|
||||||
|
logger.warning("No content in source_contents", source_id=source_id)
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
chunks = simple_chunk(content, settings.chunk_size, settings.chunk_overlap)
|
chunks = simple_chunk(content, settings.chunk_size, settings.chunk_overlap)
|
||||||
|
logger.info("Processing source", source_id=source_id,
|
||||||
|
content_len=len(content), num_chunks=len(chunks),
|
||||||
|
quality_threshold=settings.quality_threshold)
|
||||||
stored = 0
|
stored = 0
|
||||||
|
filtered_quality = 0
|
||||||
|
|
||||||
for i, chunk in enumerate(chunks):
|
for i, chunk in enumerate(chunks):
|
||||||
if len(chunk.split()) < 30:
|
words = len(chunk.split())
|
||||||
|
if words < 30:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
quality = await self._score_quality(chunk, topic)
|
quality = await self._score_quality(chunk, topic)
|
||||||
if quality < settings.quality_threshold:
|
if quality < settings.quality_threshold:
|
||||||
|
filtered_quality += 1
|
||||||
|
logger.debug("Chunk filtered by quality", source_id=source_id,
|
||||||
|
chunk_index=i, quality=round(quality, 2),
|
||||||
|
threshold=settings.quality_threshold, words=words)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
embedding = await self.ollama.embed(chunk[:1000])
|
embedding = await self.ollama.embed(chunk[:1000])
|
||||||
@@ -179,12 +192,22 @@ class ContentProcessor:
|
|||||||
source_id=source_id,
|
source_id=source_id,
|
||||||
content=chunk,
|
content=chunk,
|
||||||
chunk_index=i,
|
chunk_index=i,
|
||||||
token_count=len(chunk.split()),
|
token_count=words,
|
||||||
quality_score=quality,
|
quality_score=quality,
|
||||||
embedding=embedding
|
embedding=embedding
|
||||||
)
|
)
|
||||||
stored += 1
|
stored += 1
|
||||||
|
|
||||||
|
if filtered_quality > 0 and stored == 0:
|
||||||
|
logger.warning(
|
||||||
|
"All chunks filtered by quality — consider lowering QUALITY_THRESHOLD "
|
||||||
|
"(currently %.1f) or set QUALITY_THRESHOLD=0 to disable",
|
||||||
|
settings.quality_threshold,
|
||||||
|
source_id=source_id, chunks_total=len(chunks),
|
||||||
|
chunks_filtered=filtered_quality
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.info("Source processed", source_id=source_id, stored=stored)
|
||||||
return stored
|
return stored
|
||||||
|
|
||||||
async def _score_quality(self, chunk: str, topic: str) -> float:
|
async def _score_quality(self, chunk: str, topic: str) -> float:
|
||||||
@@ -204,14 +227,17 @@ Respond with ONLY a single number 0-10. No explanation."""
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
response = await self.ollama.generate(prompt)
|
response = await self.ollama.generate(prompt)
|
||||||
# Extract number from response
|
|
||||||
numbers = re.findall(r'\b(\d+(?:\.\d+)?)\b', response)
|
numbers = re.findall(r'\b(\d+(?:\.\d+)?)\b', response)
|
||||||
if numbers:
|
if numbers:
|
||||||
score = float(numbers[0])
|
score = float(numbers[0])
|
||||||
return min(1.0, score / 10.0)
|
normalized = min(1.0, score / 10.0)
|
||||||
return 0.5
|
logger.debug("Quality score", raw=score, normalized=round(normalized, 2))
|
||||||
except Exception:
|
return normalized
|
||||||
return 0.5 # default on error
|
logger.debug("No number in quality response", response=response[:80])
|
||||||
|
return 0.6 # above threshold so chunk is kept
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Quality scoring failed", error=str(e))
|
||||||
|
return 0.6 # above threshold so chunk is kept on Ollama error
|
||||||
|
|
||||||
async def rag_query(self, session_id: int, query: str, top_k: int = 20) -> str:
|
async def rag_query(self, session_id: int, query: str, top_k: int = 20) -> str:
|
||||||
"""
|
"""
|
||||||
|
|||||||
+24
-10
@@ -23,9 +23,21 @@ from src.db.database import ResearchDB
|
|||||||
logger = structlog.get_logger()
|
logger = structlog.get_logger()
|
||||||
|
|
||||||
HEADERS = {
|
HEADERS = {
|
||||||
"User-Agent": "Mozilla/5.0 (compatible; ResearchOwl/1.0; +https://chemavx.xyz)",
|
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
|
||||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
||||||
"Accept-Language": "en-US,en;q=0.9,es;q=0.8",
|
"Accept-Language": "en-US,en;q=0.9,es;q=0.8",
|
||||||
|
"Accept-Encoding": "gzip, deflate, br",
|
||||||
|
"DNT": "1",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Reddit requires its own headers — generic bots get 403
|
||||||
|
REDDIT_HEADERS = {
|
||||||
|
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
|
||||||
|
"Accept": "application/json, text/javascript, */*; q=0.01",
|
||||||
|
"Accept-Language": "en-US,en;q=0.9",
|
||||||
|
"Accept-Encoding": "gzip, deflate, br",
|
||||||
|
"Referer": "https://www.reddit.com/",
|
||||||
|
"X-Requested-With": "XMLHttpRequest",
|
||||||
}
|
}
|
||||||
|
|
||||||
# Domains to skip — not useful for research
|
# Domains to skip — not useful for research
|
||||||
@@ -177,10 +189,10 @@ class ExhaustiveScraper:
|
|||||||
"""Search Reddit — sequential to avoid rate limiting"""
|
"""Search Reddit — sequential to avoid rate limiting"""
|
||||||
try:
|
try:
|
||||||
http = await self._get_http()
|
http = await self._get_http()
|
||||||
url = f"https://www.reddit.com/search.json?q={quote_plus(self.topic)}&sort=top&limit=15"
|
url = f"https://www.reddit.com/search.json?q={quote_plus(self.topic)}&sort=top&limit=15&type=link"
|
||||||
async with http.get(url, headers={**HEADERS, "User-Agent": "ResearchOwl/1.0"}) as resp:
|
async with http.get(url, headers=REDDIT_HEADERS) as resp:
|
||||||
if resp.status == 200:
|
if resp.status == 200:
|
||||||
data = await resp.json()
|
data = await resp.json(content_type=None)
|
||||||
posts = data.get("data", {}).get("children", [])
|
posts = data.get("data", {}).get("children", [])
|
||||||
for post in posts:
|
for post in posts:
|
||||||
post_data = post.get("data", {})
|
post_data = post.get("data", {})
|
||||||
@@ -192,6 +204,8 @@ class ExhaustiveScraper:
|
|||||||
title=post_data.get("title")
|
title=post_data.get("title")
|
||||||
)
|
)
|
||||||
logger.info("Reddit seed", found=len(posts), status=resp.status)
|
logger.info("Reddit seed", found=len(posts), status=resp.status)
|
||||||
|
elif resp.status == 403:
|
||||||
|
logger.info("Reddit seed blocked (403) — server IP likely blocked by Reddit; skipping")
|
||||||
else:
|
else:
|
||||||
logger.warning("Reddit seed non-200", status=resp.status)
|
logger.warning("Reddit seed non-200", status=resp.status)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -446,14 +460,14 @@ class ExhaustiveScraper:
|
|||||||
json_url = url.rstrip("/") + ".json?limit=100&sort=top"
|
json_url = url.rstrip("/") + ".json?limit=100&sort=top"
|
||||||
http = await self._get_http()
|
http = await self._get_http()
|
||||||
try:
|
try:
|
||||||
async with http.get(
|
async with http.get(json_url, headers=REDDIT_HEADERS) as resp:
|
||||||
json_url,
|
if resp.status == 403:
|
||||||
headers={**HEADERS, "User-Agent": "ResearchOwl/1.0"}
|
logger.info("Reddit post blocked (403) — skipping", url=url[:60])
|
||||||
) as resp:
|
return None, None
|
||||||
if resp.status != 200:
|
if resp.status != 200:
|
||||||
logger.debug("Reddit non-200", status=resp.status, url=url[:60])
|
logger.debug("Reddit non-200", status=resp.status, url=url[:60])
|
||||||
return None, None
|
return None, None
|
||||||
data = await resp.json()
|
data = await resp.json(content_type=None)
|
||||||
|
|
||||||
post = data[0]["data"]["children"][0]["data"]
|
post = data[0]["data"]["children"][0]["data"]
|
||||||
title = post.get("title", "")
|
title = post.get("title", "")
|
||||||
|
|||||||
Reference in New Issue
Block a user