feat: caché de contenido de fuentes — reutiliza URLs scrapeadas en últimos 7 días
Build & Deploy ResearchOwl / build-and-push (push) Successful in 6s
Build & Deploy ResearchOwl / build-and-push (push) Successful in 6s
This commit is contained in:
@@ -318,6 +318,19 @@ class ResearchDB:
|
|||||||
row = await cursor.fetchone()
|
row = await cursor.fetchone()
|
||||||
return row[0] if row else None
|
return row[0] if row else None
|
||||||
|
|
||||||
|
async def get_cached_content(self, url: str,
                             max_age_days: int = 7) -> Optional[str]:
    """Return recently stored content for ``url``, or ``None`` if there is none.

    Looks in ``source_contents`` (joined to ``sources`` on
    ``sources.id = source_contents.source_id``) for rows whose source URL
    equals ``url`` and whose ``created_at`` is newer than the cut-off, and
    returns the ``content`` column of the newest such row.

    Args:
        url: Exact URL to match against ``sources.url``.
        max_age_days: Maximum age of a usable row, in days.

    Returns:
        The ``content`` value of the most recent matching row, or ``None``
        when no row matches.
    """
    # The cut-off is derived from time.time(), so created_at is assumed to be
    # stored as Unix epoch seconds -- TODO confirm against the schema.
    seconds_per_day = 86400
    oldest_allowed = time.time() - (max_age_days * seconds_per_day)

    query = """SELECT sc.content FROM source_contents sc
        JOIN sources s ON s.id = sc.source_id
        WHERE s.url = ? AND sc.created_at > ?
        ORDER BY sc.created_at DESC LIMIT 1"""
    params = (url, oldest_allowed)

    async with self.db.execute(query, params) as cur:
        newest = await cur.fetchone()

    # No matching row means there is nothing to reuse.
    if not newest:
        return None
    return newest[0]
|
||||||
|
|
||||||
async def get_outputs(self, session_id: int) -> list[dict]:
|
async def get_outputs(self, session_id: int) -> list[dict]:
|
||||||
cursor = await self.db.execute(
|
cursor = await self.db.execute(
|
||||||
"SELECT * FROM outputs WHERE session_id = ? ORDER BY created_at DESC",
|
"SELECT * FROM outputs WHERE session_id = ? ORDER BY created_at DESC",
|
||||||
|
|||||||
@@ -414,6 +414,23 @@ class ExhaustiveScraper:
|
|||||||
source_id = source["id"]
|
source_id = source["id"]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
try:
|
||||||
|
cached = await self.db.get_cached_content(url)
|
||||||
|
except Exception as cache_err:
|
||||||
|
logger.warning("Cache lookup failed", url=url, error=str(cache_err))
|
||||||
|
cached = None
|
||||||
|
|
||||||
|
if cached:
|
||||||
|
logger.debug("Cache hit", url=url)
|
||||||
|
await self.db.save_source_content(source_id, cached)
|
||||||
|
await self.db.update_source(
|
||||||
|
source_id,
|
||||||
|
status="scraped",
|
||||||
|
scraped_at=time.time(),
|
||||||
|
word_count=len(cached.split()),
|
||||||
|
)
|
||||||
|
return 0
|
||||||
|
|
||||||
if source_type == "youtube":
|
if source_type == "youtube":
|
||||||
content, title = await fetch_with_retry(
|
content, title = await fetch_with_retry(
|
||||||
lambda: self._extract_youtube(url), url
|
lambda: self._extract_youtube(url), url
|
||||||
|
|||||||
Reference in New Issue
Block a user