async def _score_quality(self, chunk: str, topic: str) -> float:
    """Score how relevant ``chunk`` is to ``topic`` on a 0.0-1.0 scale.

    Builds a single-axis relevance prompt (only the first 500 characters of
    the chunk are included), sends it to ``self.ollama.generate`` and parses
    a 0-10 rating out of the reply.

    Args:
        chunk: Text to rate; truncated to 500 characters inside the prompt.
        topic: Topic the chunk is rated against.

    Returns:
        The parsed rating divided by 10 and capped at 1.0. Returns 0.6 when
        the reply contains no number or when the call/parsing raises.
    """
    # Neutral fallback used whenever no usable rating is obtained. The
    # previous revision documented 0.6 as "above threshold so chunk is kept";
    # the threshold itself is defined elsewhere -- TODO confirm it is < 0.6.
    fallback = 0.6

    prompt = (
        f'Score 0-10: how relevant is this text to the topic "{topic}"?\n'
        f"0 = completely unrelated, 10 = directly and specifically about this topic.\n\n"
        f"Text:\n{chunk[:500]}\n\n"
        f"Reply with ONLY a single integer 0-10. No explanation."
    )
    try:
        response = await self.ollama.generate(prompt)
        numbers = [
            float(token)
            for token in re.findall(r'\b(\d+(?:\.\d+)?)\b', response)
        ]
        if not numbers:
            logger.debug("No number in relevance response", response=response[:80])
            return fallback

        # The reply may contain other numbers before the rating (a year, a
        # count quoted from the text). Prefer the first value that fits the
        # requested 0-10 scale; only if none does, keep the old behaviour of
        # clamping the first number.
        score = next((n for n in numbers if n <= 10.0), numbers[0])
        normalized = min(1.0, score / 10.0)
        logger.debug("Relevance score", raw=score, normalized=round(normalized, 2))
        return normalized
    except Exception as e:
        # Deliberately best-effort: a scoring failure must not abort
        # processing, so log it and return the neutral fallback.
        logger.warning("Relevance scoring failed", error=str(e))
        return fallback
def _url_is_relevant(self, url: str, title: str = "") -> bool:
    """Return True if the URL path or the title mentions a topic keyword.

    Only the *path* component of ``url`` is inspected (host and query string
    are ignored), together with ``title`` when one is supplied. Matching is
    a case-insensitive substring test against each entry of
    ``self._keywords``, so a short keyword also matches inside longer words.

    Args:
        url: Candidate URL discovered while scraping.
        title: Optional link/page title; ``None`` is treated as empty.

    Returns:
        True when ``self._keywords`` is empty (nothing to filter on) or when
        at least one keyword occurs in the decoded path or the title;
        False otherwise, including when the URL cannot be parsed and the
        title does not match.
    """
    from urllib.parse import unquote

    if not self._keywords:
        return True

    try:
        raw_path = urlparse(url).path
    except ValueError:
        # urlparse rejects some malformed links (e.g. an unbalanced "[" in
        # the host). Treat them as having no path instead of raising.
        raw_path = ""

    # Paths arrive percent-encoded ("/wiki/Econom%C3%ADa"); decode them so
    # keywords containing non-ASCII characters can match.
    haystack = f"{unquote(raw_path)} {title or ''}".lower()
    return any(keyword in haystack for keyword in self._keywords)
ExhaustiveScraper: content, title = await self._extract_pdf(url) else: content, title, new_urls = await self._extract_web(url, source["depth"]) + added = 0 for new_url in (new_urls or []): - await self.db.add_source( - self.session_id, new_url, - detect_source_type(new_url), - depth=source["depth"] + 1 - ) + if self._url_is_relevant(new_url): + await self.db.add_source( + self.session_id, new_url, + detect_source_type(new_url), + depth=source["depth"] + 1 + ) + added += 1 await self._mark_scraped(source_id, content, title, url) - return len(new_urls or []) + return added await self._mark_scraped(source_id, content, title, url) return 0