fix: relevance scoring per topic + URL keyword filter for child pages

processor.py: simplify the _score_quality prompt to a single axis — "how relevant is this text to topic X?" — instead of averaging relevance + density + credibility, which let off-topic but well-written content pass through.

exhaustive.py: pre-compute topic keywords (stopword-filtered) at scraper init; filter child URLs (discovered during crawl, depth>0) so that only those whose URL path or title contains a topic keyword are added; seed URLs (depth=0, from DDG/Wikipedia/Reddit) are always included, since those searches are already topic-scoped.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-27 20:52:43 +00:00
parent 0c7176dd0b
commit f7d62345b8
2 changed files with 49 additions and 30 deletions
@@ -211,33 +211,26 @@ class ContentProcessor:
        return stored

    async def _score_quality(self, chunk: str, topic: str) -> float:
-        """
-        Ask Ollama to score relevance and quality of a chunk.
-        Returns 0.0-1.0
-        """
-        prompt = f"""Rate this text chunk on a scale of 0-10 for:
-1. Relevance to topic: "{topic}"
-2. Information density (facts, data, insights)
-3. Credibility (not speculation, not clickbait)
-
-Text:
-{chunk[:500]}
-
-Respond with ONLY a single number 0-10. No explanation."""
-
+        """Score 0-1 how relevant chunk is to topic. Single axis — avoids off-topic content."""
+        prompt = (
+            f'Score 0-10: how relevant is this text to the topic "{topic}"?\n'
+            f"0 = completely unrelated, 10 = directly and specifically about this topic.\n\n"
+            f"Text:\n{chunk[:500]}\n\n"
+            f"Reply with ONLY a single integer 0-10. No explanation."
+        )
        try:
            response = await self.ollama.generate(prompt)
            numbers = re.findall(r'\b(\d+(?:\.\d+)?)\b', response)
            if numbers:
                score = float(numbers[0])
                normalized = min(1.0, score / 10.0)
-                logger.debug("Quality score", raw=score, normalized=round(normalized, 2))
+                logger.debug("Relevance score", raw=score, normalized=round(normalized, 2))
                return normalized
-            logger.debug("No number in quality response", response=response[:80])
-            return 0.6  # above threshold so chunk is kept
+            logger.debug("No number in relevance response", response=response[:80])
+            return 0.6
        except Exception as e:
-            logger.warning("Quality scoring failed", error=str(e))
-            return 0.6  # above threshold so chunk is kept on Ollama error
+            logger.warning("Relevance scoring failed", error=str(e))
+            return 0.6

    async def rag_query(self, session_id: int, query: str, top_k: int = 20) -> str:
        """