From 82e614e2851ee98a2208c0a2bd7aa3cebc918f74 Mon Sep 17 00:00:00 2001
From: ChemaVX <jmivanez@gmail.com>
Date: Wed, 6 May 2026 07:05:41 +0000
Subject: [PATCH] =?UTF-8?q?feat:=20cach=C3=A9=20de=20contenido=20de=20fuen?=
 =?UTF-8?q?tes=20=E2=80=94=20reutiliza=20URLs=20scrapeadas=20en=20=C3=BAlt?=
 =?UTF-8?q?imos=207=20d=C3=ADas?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/db/database.py        | 13 +++++++++++++
 src/scraper/exhaustive.py | 17 +++++++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/src/db/database.py b/src/db/database.py
index f6312bf..a74eebb 100644
--- a/src/db/database.py
+++ b/src/db/database.py
@@ -318,6 +318,19 @@ class ResearchDB:
         row = await cursor.fetchone()
         return row[0] if row else None
 
+    async def get_cached_content(self, url: str, max_age_days: int = 7) -> Optional[str]:
+        """Return the newest content stored for ``url`` within ``max_age_days`` days, else None."""
+        # NOTE(review): assumes created_at holds epoch seconds like time.time() - confirm.
+        cutoff = time.time() - (max_age_days * 86400)
+        query = """SELECT sc.content FROM source_contents sc
+               JOIN sources s ON s.id = sc.source_id
+               WHERE s.url = ? AND sc.created_at > ?
+               ORDER BY sc.created_at DESC LIMIT 1"""
+        async with self.db.execute(query, (url, cutoff)) as cursor:
+            hit = await cursor.fetchone()
+        # No matching row means nothing recent enough is stored for this URL.
+        return hit[0] if hit else None
+
     async def get_outputs(self, session_id: int) -> list[dict]:
         cursor = await self.db.execute(
             "SELECT * FROM outputs WHERE session_id = ? ORDER BY created_at DESC",
diff --git a/src/scraper/exhaustive.py b/src/scraper/exhaustive.py
index b3599d5..710a4b5 100644
--- a/src/scraper/exhaustive.py
+++ b/src/scraper/exhaustive.py
@@ -414,6 +414,23 @@ class ExhaustiveScraper:
             source_id = source["id"]
 
             try:
+                try:
+                    cached = await self.db.get_cached_content(url)
+                except Exception as cache_err:
+                    logger.warning("Cache lookup failed", url=url, error=str(cache_err))
+                    cached = None
+
+                if cached:
+                    logger.debug("Cache hit", url=url)
+                    await self.db.save_source_content(source_id, cached)
+                    await self.db.update_source(
+                        source_id,
+                        status="scraped",
+                        scraped_at=time.time(),
+                        word_count=len(cached.split()),
+                    )
+                    return 0
+
                 if source_type == "youtube":
                     content, title = await fetch_with_retry(
                         lambda: self._extract_youtube(url), url