fix: relevance scoring per topic + URL keyword filter for child pages
Build & Deploy ResearchOwl / build-and-push (push) Successful in 6s
Build & Deploy ResearchOwl / build-and-push (push) Successful in 6s
processor.py: simplify the _score_quality prompt to a single axis — "how relevant is this text to topic X?" — instead of averaging relevance + density + credibility, which let off-topic but well-written content pass through.
exhaustive.py: pre-compute topic keywords (stopword-filtered) at scraper init; filter child URLs (discovered during crawl, depth>0) so that only those whose URL path or title contains a topic keyword are added; seed URLs (depth=0, from DDG/Wikipedia/Reddit) are always included, since those searches are already topic-scoped.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+12
-19
@@ -211,33 +211,26 @@ class ContentProcessor:
|
||||
return stored
|
||||
|
||||
async def _score_quality(self, chunk: str, topic: str) -> float:
|
||||
"""
|
||||
Ask Ollama to score relevance and quality of a chunk.
|
||||
Returns 0.0-1.0
|
||||
"""
|
||||
prompt = f"""Rate this text chunk on a scale of 0-10 for:
|
||||
1. Relevance to topic: "{topic}"
|
||||
2. Information density (facts, data, insights)
|
||||
3. Credibility (not speculation, not clickbait)
|
||||
|
||||
Text:
|
||||
{chunk[:500]}
|
||||
|
||||
Respond with ONLY a single number 0-10. No explanation."""
|
||||
|
||||
"""Score 0-1 how relevant chunk is to topic. Single axis — avoids off-topic content."""
|
||||
prompt = (
|
||||
f'Score 0-10: how relevant is this text to the topic "{topic}"?\n'
|
||||
f"0 = completely unrelated, 10 = directly and specifically about this topic.\n\n"
|
||||
f"Text:\n{chunk[:500]}\n\n"
|
||||
f"Reply with ONLY a single integer 0-10. No explanation."
|
||||
)
|
||||
try:
|
||||
response = await self.ollama.generate(prompt)
|
||||
numbers = re.findall(r'\b(\d+(?:\.\d+)?)\b', response)
|
||||
if numbers:
|
||||
score = float(numbers[0])
|
||||
normalized = min(1.0, score / 10.0)
|
||||
logger.debug("Quality score", raw=score, normalized=round(normalized, 2))
|
||||
logger.debug("Relevance score", raw=score, normalized=round(normalized, 2))
|
||||
return normalized
|
||||
logger.debug("No number in quality response", response=response[:80])
|
||||
return 0.6 # above threshold so chunk is kept
|
||||
logger.debug("No number in relevance response", response=response[:80])
|
||||
return 0.6
|
||||
except Exception as e:
|
||||
logger.warning("Quality scoring failed", error=str(e))
|
||||
return 0.6 # above threshold so chunk is kept on Ollama error
|
||||
logger.warning("Relevance scoring failed", error=str(e))
|
||||
return 0.6
|
||||
|
||||
async def rag_query(self, session_id: int, query: str, top_k: int = 20) -> str:
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user