feat: caché de contenido de fuentes — reutiliza el contenido de URLs scrapeadas en los últimos 7 días

2026-05-06 07:05:41 +00:00
parent aa83cfacbd
commit 82e614e285
2 changed files with 30 additions and 0 deletions
@@ -318,6 +318,19 @@ class ResearchDB:
        row = await cursor.fetchone()
        return row[0] if row else None
    async def get_cached_content(self, url: str,
                                  max_age_days: int = 7) -> Optional[str]:
        """Return the newest stored content for ``url`` if it is recent enough.

        Queries ``source_contents`` joined to ``sources`` on the source id and
        considers only rows whose ``created_at`` is greater than "now minus
        ``max_age_days`` days".

        Args:
            url: Value matched against ``sources.url``.
            max_age_days: Maximum accepted age of a row, in days.

        Returns:
            The ``content`` column of the most recently created matching row,
            or ``None`` when no row matches.
        """
        # The cutoff is derived from time.time(), so this assumes created_at
        # is stored as epoch seconds -- TODO confirm against the schema.
        seconds_per_day = 86400
        cutoff = time.time() - (max_age_days * seconds_per_day)
        query = """SELECT sc.content FROM source_contents sc
               JOIN sources s ON s.id = sc.source_id
               WHERE s.url = ? AND sc.created_at > ?
               ORDER BY sc.created_at DESC LIMIT 1"""
        async with self.db.execute(query, (url, cutoff)) as result:
            newest = await result.fetchone()
            if not newest:
                return None
            return newest[0]
    async def get_outputs(self, session_id: int) -> list[dict]:
        cursor = await self.db.execute(
            "SELECT * FROM outputs WHERE session_id = ? ORDER BY created_at DESC",
@@ -414,6 +414,23 @@ class ExhaustiveScraper:
            source_id = source["id"]
            try:
                try:
                    cached = await self.db.get_cached_content(url)
                except Exception as cache_err:
                    logger.warning("Cache lookup failed", url=url, error=str(cache_err))
                    cached = None
                if cached:
                    logger.debug("Cache hit", url=url)
                    await self.db.save_source_content(source_id, cached)
                    await self.db.update_source(
                        source_id,
                        status="scraped",
                        scraped_at=time.time(),
                        word_count=len(cached.split()),
                    )
                    return 0
                if source_type == "youtube":
                    content, title = await fetch_with_retry(
                        lambda: self._extract_youtube(url), url