fix: relevance scoring per topic + URL keyword filter for child pages
Build & Deploy ResearchOwl / build-and-push (push) Successful in 6s

processor.py: simplify _score_quality prompt to single axis —
  "how relevant is this text to topic X?" — instead of averaging
  relevance + density + credibility, which let off-topic but
  well-written content pass through

exhaustive.py: pre-compute topic keywords (stopword-filtered) at
  scraper init; filter child URLs (discovered during crawl, depth>0)
  to only add ones whose URL path or title contains a topic keyword;
  seed URLs (depth=0, from DDG/Wikipedia/Reddit) are always included
  since those searches are already topic-scoped

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
ChemaVX
2026-04-27 20:52:43 +00:00
parent 0c7176dd0b
commit f7d62345b8
2 changed files with 49 additions and 30 deletions
+12 -19
View File
@@ -211,33 +211,26 @@ class ContentProcessor:
return stored
async def _score_quality(self, chunk: str, topic: str) -> float:
    """Score how relevant ``chunk`` is to ``topic`` on a 0.0-1.0 scale.

    The model is asked one question only (relevance), rather than a blend
    of relevance, information density and credibility, so that well-written
    but off-topic text does not receive a passing score.

    Args:
        chunk: Text to evaluate; only its first 500 characters are sent.
        topic: Topic name interpolated into the prompt.

    Returns:
        The first number found in the model's reply divided by 10 and
        capped at 1.0. Falls back to 0.6 when the reply contains no
        number or when the call raises.
    """
    prompt = (
        f'Score 0-10: how relevant is this text to the topic "{topic}"?\n'
        "0 = completely unrelated, 10 = directly and specifically about this topic.\n\n"
        f"Text:\n{chunk[:500]}\n\n"
        "Reply with ONLY a single integer 0-10. No explanation."
    )
    try:
        # NOTE(review): presumably returns the reply as a str (it is passed
        # to re.findall and sliced below) -- confirm against the client.
        response = await self.ollama.generate(prompt)
        # The first numeric token wins, even if the model adds extra text.
        numbers = re.findall(r'\b(\d+(?:\.\d+)?)\b', response)
        if numbers:
            score = float(numbers[0])
            # Cap at 1.0 in case the model answers above 10.
            normalized = min(1.0, score / 10.0)
            logger.debug("Relevance score", raw=score, normalized=round(normalized, 2))
            return normalized
        logger.debug("No number in relevance response", response=response[:80])
        return 0.6  # above threshold so chunk is kept
    except Exception as e:
        # Deliberate best-effort: a scoring failure must not drop the chunk.
        logger.warning("Relevance scoring failed", error=str(e))
        return 0.6  # above threshold so chunk is kept on Ollama error
async def rag_query(self, session_id: int, query: str, top_k: int = 20) -> str:
"""