feat: SearXNG como motor principal, DDG como fallback
Build & Deploy ResearchOwl / build-and-push (push) Successful in 6s

This commit is contained in:
ChemaVX
2026-05-04 20:00:24 +00:00
parent ba2b366534
commit f4e167f3b6
+67 -20
View File
@@ -157,7 +157,7 @@ class ExhaustiveScraper:
"""Initial broad search across multiple sources""" """Initial broad search across multiple sources"""
logger.info("Seeding research", topic=self.topic) logger.info("Seeding research", topic=self.topic)
tasks = [ tasks = [
self._seed_duckduckgo(), self._seed_search(),
self._seed_wikipedia(), self._seed_wikipedia(),
self._seed_reddit(), self._seed_reddit(),
self._seed_youtube(), self._seed_youtube(),
@@ -212,29 +212,76 @@ class ExhaustiveScraper:
error=str(e), error_type=type(e).__name__) error=str(e), error_type=type(e).__name__)
return fallback return fallback
async def _search_searxng(self, query: str) -> list[dict]:
    """Run one query against SearXNG and return its hits.

    Sends a GET request to the in-cluster SearXNG ``/search`` endpoint asking
    for JSON output, with a 15-second total timeout.

    Args:
        query: The search string to send as the ``q`` parameter.

    Returns:
        A list of ``{"href": ..., "title": ...}`` dicts, one per result that
        carries a non-empty ``url``. An empty list is returned when the
        response status is not 200 or when any exception is raised while
        requesting or decoding the response (both cases are logged as
        warnings).
    """
    import aiohttp

    endpoint = "http://searxng-svc.researchowl.svc.cluster.local:8080/search"
    query_params = {
        "q": query,
        "format": "json",
        "engines": "duckduckgo,google,bing,brave",
        "language": "all",
    }
    request_headers = {
        "Accept": "application/json",
        "X-Forwarded-For": "127.0.0.1",
        "User-Agent": "ResearchOwl/1.0",
    }
    try:
        async with aiohttp.ClientSession() as http, http.get(
            endpoint,
            params=query_params,
            headers=request_headers,
            timeout=aiohttp.ClientTimeout(total=15),
        ) as response:
            # Guard clause: anything other than 200 counts as "no results".
            if response.status != 200:
                logger.warning("SearXNG non-200", status=response.status, query=query)
                return []

            payload = await response.json()
            hits = payload.get("results", [])
            # The logged count is the raw number of hits, before URL filtering.
            logger.info("SearXNG query ok", query=query, results=len(hits))

            found = []
            for hit in hits:
                # Skip entries without a usable URL.
                if hit.get("url"):
                    found.append(
                        {"href": hit.get("url", ""), "title": hit.get("title", "")}
                    )
            return found
    except Exception as exc:
        # Best effort: the caller treats an empty list as "SearXNG unavailable".
        logger.warning("SearXNG failed", query=query, error=str(exc))
        return []
async def _seed_search(self):
    """Seed the session's sources from web search, one query at a time.

    For every query produced by ``self._generate_ddg_queries()``, SearXNG is
    tried first through ``self._search_searxng``. When that yields nothing,
    DuckDuckGo is queried as a fallback, capped at
    ``settings.max_pages_per_search`` results. Each result URL is normalized,
    dropped if blacklisted, and otherwise stored via ``self.db.add_source``
    at depth 0 with the result's title.

    The loop ends early once ``self._stop`` is truthy. A failing DuckDuckGo
    fallback is logged and treated as "no results" for that query; it does
    not abort the remaining queries.
    """

    def _ddg_text(q):
        # A new DDGS instance is created for every query rather than reused.
        # NOTE(review): presumably this isolates rate-limit state between
        # queries - confirm against the DDGS library behaviour.
        with DDGS() as ddgs:
            return list(ddgs.text(q, max_results=settings.max_pages_per_search))

    queries = await self._generate_ddg_queries()
    for query in queries:
        if self._stop:
            break

        results = await self._search_searxng(query)
        if not results:
            logger.info("SearXNG vacío, usando DDG", query=query)
            try:
                # DDGS.text() is a synchronous network call; run it in a
                # worker thread so this coroutine does not block the event
                # loop (and the other seeders) while the request is pending.
                results = await asyncio.to_thread(_ddg_text, query)
                logger.info("DDG fallback ok", query=query, results=len(results))
            except Exception as e:
                logger.warning("DDG fallback failed", query=query, error=str(e))
                results = []

        for r in results:
            url = normalize_url(r.get("href", ""))
            if url and not is_blacklisted(url):
                await self.db.add_source(
                    self.session_id, url,
                    detect_source_type(url),
                    depth=0,
                    title=r.get("title")
                )

        # Randomized 1-3 s pause before moving on to the next query.
        await asyncio.sleep(random.uniform(1, 3))
async def _seed_wikipedia(self): async def _seed_wikipedia(self):
"""Search Wikipedia API for correct article URLs. """Search Wikipedia API for correct article URLs.