fix: relevance scoring per topic + URL keyword filter for child pages

processor.py: simplify the _score_quality prompt to a single axis — "how relevant is this text to topic X?" — instead of averaging relevance + density + credibility, which let off-topic but well-written content pass through.

exhaustive.py: pre-compute topic keywords (stopword-filtered) at scraper init; filter child URLs (discovered during crawl, depth>0) so that only those whose URL path or title contains a topic keyword are added; seed URLs (depth=0, from DDG/Wikipedia/Reddit) are always included, since those searches are already topic-scoped.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-27 20:52:43 +00:00
parent 0c7176dd0b
commit f7d62345b8
2 changed files with 49 additions and 30 deletions
@@ -211,33 +211,26 @@ class ContentProcessor:
        return stored
    async def _score_quality(self, chunk: str, topic: str) -> float:
-        """
+        """Score 0-1 how relevant chunk is to topic. Single axis — avoids off-topic content."""
-        Ask Ollama to score relevance and quality of a chunk.
+        prompt = (
-        Returns 0.0-1.0
+            f'Score 0-10: how relevant is this text to the topic "{topic}"?\n'
-        """
+            f"0 = completely unrelated, 10 = directly and specifically about this topic.\n\n"
-        prompt = f"""Rate this text chunk on a scale of 0-10 for:
+            f"Text:\n{chunk[:500]}\n\n"
-1. Relevance to topic: "{topic}"
+            f"Reply with ONLY a single integer 0-10. No explanation."
-2. Information density (facts, data, insights)
+        )
 3. Credibility (not speculation, not clickbait)
 Text:
 {chunk[:500]}
 Respond with ONLY a single number 0-10. No explanation."""
        try:
            response = await self.ollama.generate(prompt)
            numbers = re.findall(r'\b(\d+(?:\.\d+)?)\b', response)
            if numbers:
                score = float(numbers[0])
                normalized = min(1.0, score / 10.0)
-                logger.debug("Quality score", raw=score, normalized=round(normalized, 2))
+                logger.debug("Relevance score", raw=score, normalized=round(normalized, 2))
                return normalized
-            logger.debug("No number in quality response", response=response[:80])
+            logger.debug("No number in relevance response", response=response[:80])
-            return 0.6  # above threshold so chunk is kept
+            return 0.6
        except Exception as e:
-            logger.warning("Quality scoring failed", error=str(e))
+            logger.warning("Relevance scoring failed", error=str(e))
-            return 0.6  # above threshold so chunk is kept on Ollama error
+            return 0.6
    async def rag_query(self, session_id: int, query: str, top_k: int = 20) -> str:
        """
@@ -90,6 +90,14 @@ class ExhaustiveScraper:
    Keeps expanding until saturation or limits hit.
    """
    # Function words (English first, then Spanish) that are discarded when
    # topic keywords are extracted, so only content-bearing words remain.
    _STOPWORDS = {
        # English
        'the', 'a', 'an', 'and', 'or', 'of', 'in', 'on', 'at', 'to', 'for',
        'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had',
        'do', 'does', 'did', 'about', 'with', 'from',
        # Spanish
        'el', 'la', 'los', 'las', 'de', 'del', 'en', 'un', 'una', 'y', 'o',
        'que', 'se', 'por', 'con', 'para', 'sobre', 'como', 'pero', 'más',
    }
    def __init__(self, db: ResearchDB, session_id: int, topic: str,
                 progress_callback=None):
        self.db = db
@@ -100,6 +108,18 @@ class ExhaustiveScraper:
        self.total_sources = 0
        self._stop = False
        self._http: Optional[aiohttp.ClientSession] = None
        # Pre-compute topic keywords for child-URL relevance filtering
        self._keywords = [
            w for w in re.findall(r'\b\w{3,}\b', topic.lower())
            if w not in self._STOPWORDS
        ]
    def _url_is_relevant(self, url: str, title: str = "") -> bool:
        """Return True if the URL path or title mentions a topic keyword.

        Args:
            url: Candidate URL; only its path component is inspected (the
                host and query string are ignored).
            title: Optional page/link title searched alongside the path.

        Returns:
            True when no keywords are configured (nothing to filter on), or
            when any keyword occurs as a substring of the lowercased,
            percent-decoded path or of the lowercased title.
        """
        # Function-scope import keeps this method self-contained.
        from urllib.parse import unquote

        if not self._keywords:
            return True
        # Paths arrive percent-encoded (e.g. "Educaci%C3%B3n"); decode them
        # so keywords containing non-ASCII letters can actually match.
        path = unquote(urlparse(url).path)
        haystack = f"{path} {title or ''}".lower()
        return any(kw in haystack for kw in self._keywords)
    async def stop(self) -> None:
        """Set the internal stop flag (``self._stop``) to True."""
        self._stop = True
@@ -300,13 +320,16 @@ class ExhaustiveScraper:
                    content, title = await self._extract_youtube(url)
                elif source_type == "wikipedia":
                    content, title, new_urls = await self._extract_wikipedia(url)
                    added = 0
                    for new_url in (new_urls or []):
-                        await self.db.add_source(
+                        if self._url_is_relevant(new_url):
-                            self.session_id, new_url, "wikipedia",
+                            await self.db.add_source(
-                            depth=source["depth"] + 1
+                                self.session_id, new_url, "wikipedia",
-                        )
+                                depth=source["depth"] + 1
                            )
                            added += 1
                    await self._mark_scraped(source_id, content, title, url)
-                    return len(new_urls or [])
+                    return added
                elif source_type == "reddit":
                    content, title = await self._extract_reddit(url)
                    # Small delay between Reddit requests to avoid rate limiting
@@ -315,14 +338,17 @@ class ExhaustiveScraper:
                    content, title = await self._extract_pdf(url)
                else:
                    content, title, new_urls = await self._extract_web(url, source["depth"])
                    added = 0
                    for new_url in (new_urls or []):
-                        await self.db.add_source(
+                        if self._url_is_relevant(new_url):
-                            self.session_id, new_url,
+                            await self.db.add_source(
-                            detect_source_type(new_url),
+                                self.session_id, new_url,
-                            depth=source["depth"] + 1
+                                detect_source_type(new_url),
-                        )
+                                depth=source["depth"] + 1
                            )
                            added += 1
                    await self._mark_scraped(source_id, content, title, url)
-                    return len(new_urls or [])
+                    return added
                await self._mark_scraped(source_id, content, title, url)
                return 0