From 82e614e2851ee98a2208c0a2bd7aa3cebc918f74 Mon Sep 17 00:00:00 2001 From: ChemaVX Date: Wed, 6 May 2026 07:05:41 +0000 Subject: [PATCH] =?UTF-8?q?feat:=20cach=C3=A9=20de=20contenido=20de=20fuen?= =?UTF-8?q?tes=20=E2=80=94=20reutiliza=20URLs=20scrapeadas=20en=20=C3=BAlt?= =?UTF-8?q?imos=207=20d=C3=ADas?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/db/database.py | 13 +++++++++++++ src/scraper/exhaustive.py | 17 +++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/src/db/database.py b/src/db/database.py index f6312bf..a74eebb 100644 --- a/src/db/database.py +++ b/src/db/database.py @@ -318,6 +318,19 @@ class ResearchDB: row = await cursor.fetchone() return row[0] if row else None + async def get_cached_content(self, url: str, + max_age_days: int = 7) -> Optional[str]: + threshold = time.time() - (max_age_days * 86400) + async with self.db.execute( + """SELECT sc.content FROM source_contents sc + JOIN sources s ON s.id = sc.source_id + WHERE s.url = ? AND sc.created_at > ? + ORDER BY sc.created_at DESC LIMIT 1""", + (url, threshold) + ) as cur: + row = await cur.fetchone() + return row[0] if row else None + async def get_outputs(self, session_id: int) -> list[dict]: cursor = await self.db.execute( "SELECT * FROM outputs WHERE session_id = ? ORDER BY created_at DESC", diff --git a/src/scraper/exhaustive.py b/src/scraper/exhaustive.py index b3599d5..710a4b5 100644 --- a/src/scraper/exhaustive.py +++ b/src/scraper/exhaustive.py @@ -414,6 +414,23 @@ class ExhaustiveScraper: source_id = source["id"] try: + try: + cached = await self.db.get_cached_content(url) + except Exception as cache_err: + logger.warning("Cache lookup failed", url=url, error=str(cache_err)) + cached = None + + if cached: + logger.debug("Cache hit", url=url) + await self.db.save_source_content(source_id, cached) + await self.db.update_source( + source_id, + status="scraped", + scraped_at=time.time(), + word_count=len(cached.split()), + ) + return 0 + if source_type == "youtube": content, title = await fetch_with_retry( lambda: self._extract_youtube(url), url