From f4e167f3b63e1f27f8980ee5bc95b24f167135a3 Mon Sep 17 00:00:00 2001
From: ChemaVX
Date: Mon, 4 May 2026 20:00:24 +0000
Subject: [PATCH] feat: SearXNG como motor principal, DDG como fallback

---
Review notes (this area is not part of the commit message):
Line breaks and indentation were restored from a whitespace-collapsed copy of
this patch. Indentation of context and removed lines is inferred, so check it
against the target file or apply with --ignore-whitespace. The added code was
revised during review (SearXNG results capped at max_pages_per_search, DDG
fallback moved off the event loop), so the post-image hash on the index line
no longer matches.

 src/scraper/exhaustive.py | 110 +++++++++++++++++++++++++++++++-------
 1 file changed, 90 insertions(+), 20 deletions(-)

diff --git a/src/scraper/exhaustive.py b/src/scraper/exhaustive.py
index aeb007f..b3599d5 100644
--- a/src/scraper/exhaustive.py
+++ b/src/scraper/exhaustive.py
@@ -157,7 +157,7 @@ class ExhaustiveScraper:
         """Initial broad search across multiple sources"""
         logger.info("Seeding research", topic=self.topic)
         tasks = [
-            self._seed_duckduckgo(),
+            self._seed_search(),
             self._seed_wikipedia(),
             self._seed_reddit(),
             self._seed_youtube(),
@@ -212,29 +212,99 @@ class ExhaustiveScraper:
                            error=str(e), error_type=type(e).__name__)
             return fallback
 
-    async def _seed_duckduckgo(self):
-        """Multiple DDG queries — fresh DDGS() per query to avoid cascading ratelimits"""
+    async def _search_searxng(self, query: str) -> list[dict]:
+        """Query the in-cluster SearXNG instance for one search query.
+
+        Returns a list of ``{"href": ..., "title": ...}`` dicts, one per
+        result with a non-empty ``url``, capped at
+        ``settings.max_pages_per_search`` like the DDG fallback. Returns
+        ``[]`` on a non-200 response or on any error, so the caller can
+        fall back to DDG.
+        """
+        import aiohttp
+
+        searxng_url = "http://searxng-svc.researchowl.svc.cluster.local:8080/search"
+        params = {
+            "q": query,
+            "format": "json",
+            "engines": "duckduckgo,google,bing,brave",
+            "language": "all",
+        }
+        headers = {
+            "Accept": "application/json",
+            # NOTE(review): presumably sent to satisfy SearXNG's limiter/bot
+            # detection - confirm before removing.
+            "X-Forwarded-For": "127.0.0.1",
+            "User-Agent": "ResearchOwl/1.0",
+        }
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.get(
+                    searxng_url,
+                    params=params,
+                    headers=headers,
+                    timeout=aiohttp.ClientTimeout(total=15),
+                ) as resp:
+                    if resp.status != 200:
+                        logger.warning("SearXNG non-200", status=resp.status, query=query)
+                        return []
+                    data = await resp.json()
+                    results = data.get("results", [])
+                    logger.info("SearXNG query ok", query=query, results=len(results))
+                    hits = [
+                        {"href": r.get("url", ""), "title": r.get("title", "")}
+                        for r in results
+                        if r.get("url")
+                    ]
+                    # Apply the same per-query cap the DDG fallback uses.
+                    return hits[:settings.max_pages_per_search]
+        except Exception as e:
+            # Best effort: any failure here just triggers the DDG fallback.
+            logger.warning("SearXNG failed", query=query, error=str(e))
+            return []
+
+    def _search_ddg_sync(self, query: str) -> list[dict]:
+        """Run one blocking DDG text search and return its results as a list.
+
+        Synchronous on purpose: ``_seed_search`` runs it in a worker thread.
+        """
+        # Fresh DDGS() per query so a ratelimit on one does not poison the rest.
+        with DDGS() as ddgs:
+            return list(ddgs.text(query, max_results=settings.max_pages_per_search))
+
+    async def _seed_search(self):
+        """Seed sources from web search: SearXNG first, DDG as fallback.
+
+        Each generated query goes to SearXNG; when that returns nothing the
+        query is retried on DDG. Every normalized, non-blacklisted result
+        URL is stored as a depth-0 source.
+        """
         queries = await self._generate_ddg_queries()
         for query in queries:
             if self._stop:
                 break
-            try:
-                # Fresh instance per query — a ratelimit on one won't poison the rest
-                with DDGS() as ddgs:
-                    results = list(ddgs.text(query, max_results=settings.max_pages_per_search))
-                for r in results:
-                    url = normalize_url(r.get("href", ""))
-                    if url and not is_blacklisted(url):
-                        await self.db.add_source(
-                            self.session_id, url,
-                            detect_source_type(url),
-                            depth=0,
-                            title=r.get("title")
-                        )
-                logger.info("DDG query ok", query=query, results=len(results))
-            except Exception as e:
-                logger.warning("DDG query failed", query=query, error=str(e))
-            await asyncio.sleep(random.uniform(3, 8))
+            results = await self._search_searxng(query)
+            if not results:
+                logger.info("SearXNG vacío, usando DDG", query=query)
+                try:
+                    # DDGS is synchronous; run it in a worker thread so it
+                    # does not block the event loop.
+                    results = await asyncio.to_thread(self._search_ddg_sync, query)
+                    logger.info("DDG fallback ok", query=query, results=len(results))
+                except Exception as e:
+                    logger.warning("DDG fallback failed", query=query, error=str(e))
+                    results = []
+
+            for r in results:
+                url = normalize_url(r.get("href", ""))
+                if url and not is_blacklisted(url):
+                    await self.db.add_source(
+                        self.session_id, url,
+                        detect_source_type(url),
+                        depth=0,
+                        title=r.get("title")
+                    )
+            await asyncio.sleep(random.uniform(1, 3))
 
     async def _seed_wikipedia(self):
         """Search Wikipedia API for correct article URLs.