feat: SearXNG como motor principal, DDG como fallback
Build & Deploy ResearchOwl / build-and-push (push) Successful in 6s
Build & Deploy ResearchOwl / build-and-push (push) Successful in 6s
This commit is contained in:
+67
-20
@@ -157,7 +157,7 @@ class ExhaustiveScraper:
|
|||||||
"""Initial broad search across multiple sources"""
|
"""Initial broad search across multiple sources"""
|
||||||
logger.info("Seeding research", topic=self.topic)
|
logger.info("Seeding research", topic=self.topic)
|
||||||
tasks = [
|
tasks = [
|
||||||
self._seed_duckduckgo(),
|
self._seed_search(),
|
||||||
self._seed_wikipedia(),
|
self._seed_wikipedia(),
|
||||||
self._seed_reddit(),
|
self._seed_reddit(),
|
||||||
self._seed_youtube(),
|
self._seed_youtube(),
|
||||||
@@ -212,29 +212,76 @@ class ExhaustiveScraper:
|
|||||||
error=str(e), error_type=type(e).__name__)
|
error=str(e), error_type=type(e).__name__)
|
||||||
return fallback
|
return fallback
|
||||||
|
|
||||||
async def _seed_duckduckgo(self):
|
async def _search_searxng(self, query: str) -> list[dict]:
    """Query the SearXNG instance for *query* and return its hits.

    Returns a list of ``{"href": ..., "title": ...}`` dicts, one per result
    that carries a non-empty ``url``. Returns ``[]`` when the query is
    blank, when SearXNG answers with a non-200 status, or when the request
    or response parsing fails for any reason. This method never raises for
    those failures, so callers can treat an empty list as "use another
    engine".
    """
    import aiohttp

    # A blank query cannot yield useful hits; skip the network round trip.
    if not query or not query.strip():
        return []

    searxng_url = "http://searxng-svc.researchowl.svc.cluster.local:8080/search"
    params = {
        "q": query,
        "format": "json",
        "engines": "duckduckgo,google,bing,brave",
        "language": "all",
    }
    headers = {
        "Accept": "application/json",
        # NOTE(review): presumably present so SearXNG's request filtering
        # sees a client IP header — confirm against the SearXNG config.
        "X-Forwarded-For": "127.0.0.1",
        "User-Agent": "ResearchOwl/1.0",
    }
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(
                searxng_url,
                params=params,
                headers=headers,
                timeout=aiohttp.ClientTimeout(total=15),
            ) as resp:
                if resp.status != 200:
                    logger.warning("SearXNG non-200", status=resp.status, query=query)
                    return []

                data = await resp.json()
                results = data.get("results", [])
                logger.info("SearXNG query ok", query=query, results=len(results))
                # Skip malformed (non-dict) entries instead of letting one
                # bad item discard the whole batch via the handler below.
                return [
                    {"href": r.get("url", ""), "title": r.get("title", "")}
                    for r in results
                    if isinstance(r, dict) and r.get("url")
                ]
    except Exception as e:
        # Deliberately broad: this is a best-effort lookup and any failure
        # simply means "no SearXNG results". The exception type is logged
        # because str(e) is empty for timeouts.
        logger.warning(
            "SearXNG failed",
            query=query,
            error=str(e),
            error_type=type(e).__name__,
        )
        return []
|
||||||
|
|
||||||
|
async def _seed_search(self):
    """Seed sources from web search: SearXNG first, DDG fallback per query.

    For each generated query, SearXNG is tried first. If it returns no
    results, the same query is sent to DuckDuckGo. Every resulting URL that
    normalizes to a non-empty, non-blacklisted value is stored as a depth-0
    source for this session. The loop stops early once ``self._stop`` is
    set. Search failures are logged and skipped; exceptions raised by
    ``self.db.add_source`` are not caught here and propagate to the caller.
    """

    def _ddg_text(q: str) -> list:
        # Fresh instance per query — a ratelimit on one won't poison the rest.
        with DDGS() as ddgs:
            return list(ddgs.text(q, max_results=settings.max_pages_per_search))

    queries = await self._generate_ddg_queries()
    for query in queries:
        if self._stop:
            break

        results = await self._search_searxng(query)
        if not results:
            logger.info("SearXNG vacío, usando DDG", query=query)
            try:
                # DDGS is synchronous; run it in a worker thread so the
                # blocking HTTP call does not stall the event loop.
                results = await asyncio.to_thread(_ddg_text, query)
                logger.info("DDG fallback ok", query=query, results=len(results))
            except Exception as e:
                logger.warning(
                    "DDG fallback failed",
                    query=query,
                    error=str(e),
                    error_type=type(e).__name__,
                )
                results = []

        for r in results:
            url = normalize_url(r.get("href", ""))
            if url and not is_blacklisted(url):
                await self.db.add_source(
                    self.session_id, url,
                    detect_source_type(url),
                    depth=0,
                    title=r.get("title"),
                )

        # Randomized pause between queries.
        await asyncio.sleep(random.uniform(1, 3))
|
||||||
|
|
||||||
async def _seed_wikipedia(self):
|
async def _seed_wikipedia(self):
|
||||||
"""Search Wikipedia API for correct article URLs.
|
"""Search Wikipedia API for correct article URLs.
|
||||||
|
|||||||
Reference in New Issue
Block a user