feat: SearXNG como motor principal, DDG como fallback
Build & Deploy ResearchOwl / build-and-push (push) Successful in 6s

This commit is contained in:
ChemaVX
2026-05-04 20:00:24 +00:00
parent ba2b366534
commit f4e167f3b6
+56 -9
View File
@@ -157,7 +157,7 @@ class ExhaustiveScraper:
"""Initial broad search across multiple sources"""
logger.info("Seeding research", topic=self.topic)
tasks = [
self._seed_duckduckgo(),
self._seed_search(),
self._seed_wikipedia(),
self._seed_reddit(),
self._seed_youtube(),
@@ -212,16 +212,66 @@ class ExhaustiveScraper:
error=str(e), error_type=type(e).__name__)
return fallback
async def _seed_duckduckgo(self):
"""Multiple DDG queries — fresh DDGS() per query to avoid cascading ratelimits"""
async def _search_searxng(self, query: str) -> list[dict]:
    """Run one query against the SearXNG JSON search endpoint.

    Args:
        query: Search string sent as the ``q`` request parameter.

    Returns:
        A list of ``{"href": ..., "title": ...}`` dicts, one per result
        entry that carries a non-empty ``url``. An empty list is returned
        when the endpoint answers with a non-200 status or when any
        exception is raised while requesting or decoding the response,
        so callers can treat ``[]`` as "SearXNG unavailable".
    """
    import aiohttp

    endpoint = "http://searxng-svc.researchowl.svc.cluster.local:8080/search"
    query_params = {
        "q": query,
        "format": "json",
        "engines": "duckduckgo,google,bing,brave",
        "language": "all",
    }
    # NOTE(review): the X-Forwarded-For value is presumably there to satisfy
    # SearXNG's request filtering — confirm against the deployment config.
    request_headers = {
        "Accept": "application/json",
        "X-Forwarded-For": "127.0.0.1",
        "User-Agent": "ResearchOwl/1.0",
    }
    try:
        # A fresh session per call; the whole request is capped at 15 seconds.
        async with aiohttp.ClientSession() as http, http.get(
            endpoint,
            params=query_params,
            headers=request_headers,
            timeout=aiohttp.ClientTimeout(total=15),
        ) as response:
            # Guard clause: anything other than 200 counts as "no results".
            if response.status != 200:
                logger.warning("SearXNG non-200", status=response.status, query=query)
                return []

            payload = await response.json()
            hits = payload.get("results", [])
            logger.info("SearXNG query ok", query=query, results=len(hits))

            # Keep only entries that actually have a URL, reshaped to the
            # {href, title} form the caller consumes.
            links = []
            for hit in hits:
                if hit.get("url"):
                    links.append(
                        {"href": hit.get("url", ""), "title": hit.get("title", "")}
                    )
            return links
    except Exception as exc:
        # Best-effort lookup: log the failure and report "nothing found".
        logger.warning("SearXNG failed", query=query, error=str(exc))
        return []
async def _seed_search(self):
"""SearXNG primary + DDG fallback per query"""
queries = await self._generate_ddg_queries()
for query in queries:
if self._stop:
break
results = await self._search_searxng(query)
if not results:
logger.info("SearXNG vacío, usando DDG", query=query)
try:
# Fresh instance per query — a ratelimit on one won't poison the rest
with DDGS() as ddgs:
results = list(ddgs.text(query, max_results=settings.max_pages_per_search))
ddg_results = list(ddgs.text(
query,
max_results=settings.max_pages_per_search
))
results = ddg_results
logger.info("DDG fallback ok", query=query, results=len(results))
except Exception as e:
logger.warning("DDG fallback failed", query=query, error=str(e))
results = []
for r in results:
url = normalize_url(r.get("href", ""))
if url and not is_blacklisted(url):
@@ -231,10 +281,7 @@ class ExhaustiveScraper:
depth=0,
title=r.get("title")
)
logger.info("DDG query ok", query=query, results=len(results))
except Exception as e:
logger.warning("DDG query failed", query=query, error=str(e))
await asyncio.sleep(random.uniform(3, 8))
await asyncio.sleep(random.uniform(1, 3))
async def _seed_wikipedia(self):
"""Search Wikipedia API for correct article URLs.