fix: relevance scoring per topic + URL keyword filter for child pages
Build & Deploy ResearchOwl / build-and-push (push) Successful in 6s

processor.py: simplify the _score_quality prompt to a single axis —
  "how relevant is this text to topic X?" — instead of one blended
  score covering relevance + density + credibility, which let
  off-topic but well-written content pass through

exhaustive.py: pre-compute topic keywords (stopword-filtered) at
  scraper init; filter child URLs (discovered during crawl, depth>0)
  so that only those whose URL path contains a topic keyword are
  added (the helper also accepts a title, but the call sites pass
  only the URL); seed URLs (depth=0, from DDG/Wikipedia/Reddit) are
  always included since those searches are already topic-scoped

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
ChemaVX
2026-04-27 20:52:43 +00:00
parent 0c7176dd0b
commit f7d62345b8
2 changed files with 49 additions and 30 deletions
+12 -19
View File
@@ -211,33 +211,26 @@ class ContentProcessor:
return stored
async def _score_quality(self, chunk: str, topic: str) -> float:
    """Score how relevant ``chunk`` is to ``topic`` on a 0.0-1.0 scale.

    The model is asked one question only (topical relevance) and told to
    answer with a number from 0 to 10. The first number found in the reply
    is divided by 10 and capped at 1.0.

    Args:
        chunk: Text to rate. Only the first 500 characters are sent.
        topic: Research topic the chunk is compared against.

    Returns:
        The normalized score, or 0.6 when the reply contains no number or
        the model call raises.
    """
    # Fail open: per the original inline notes, 0.6 is above the keep
    # threshold, so a chunk is not discarded just because scoring failed.
    fallback = 0.6

    prompt = (
        f'Score 0-10: how relevant is this text to the topic "{topic}"?\n'
        "0 = completely unrelated, 10 = directly and specifically about this topic.\n\n"
        f"Text:\n{chunk[:500]}\n\n"
        "Reply with ONLY a single integer 0-10. No explanation."
    )
    try:
        response = await self.ollama.generate(prompt)
        # Accept decimals too; models do not always obey "integer only".
        numbers = re.findall(r'\b(\d+(?:\.\d+)?)\b', response)
        if not numbers:
            logger.debug("No number in relevance response", response=response[:80])
            return fallback
        score = float(numbers[0])
        # Cap at 1.0 in case the model answers above the requested range.
        normalized = min(1.0, score / 10.0)
        logger.debug("Relevance score", raw=score, normalized=round(normalized, 2))
        return normalized
    except Exception as e:
        # Deliberately broad: scoring is best-effort and must not stop processing.
        logger.warning("Relevance scoring failed", error=str(e))
        return fallback
async def rag_query(self, session_id: int, query: str, top_k: int = 20) -> str:
"""
+28 -2
View File
@@ -90,6 +90,14 @@ class ExhaustiveScraper:
Keeps expanding until saturation or limits hit.
"""
# Common stopwords to ignore when extracting topic keywords
_STOPWORDS = {
'the','a','an','and','or','of','in','on','at','to','for','is','are','was',
'were','be','been','have','has','had','do','does','did','about','with','from',
'el','la','los','las','de','del','en','un','una','y','o','que','se','por',
'con','para','sobre','como','pero','más',
}
def __init__(self, db: ResearchDB, session_id: int, topic: str,
progress_callback=None):
self.db = db
@@ -100,6 +108,18 @@ class ExhaustiveScraper:
self.total_sources = 0
self._stop = False
self._http: Optional[aiohttp.ClientSession] = None
# Pre-compute topic keywords for child-URL relevance filtering
self._keywords = [
w for w in re.findall(r'\b\w{3,}\b', topic.lower())
if w not in self._STOPWORDS
]
def _url_is_relevant(self, url: str, title: str = "") -> bool:
"""True if URL path or title contains at least one topic keyword."""
if not self._keywords:
return True
text = (urlparse(url).path + " " + (title or "")).lower()
return any(kw in text for kw in self._keywords)
async def stop(self):
self._stop = True
@@ -300,13 +320,16 @@ class ExhaustiveScraper:
content, title = await self._extract_youtube(url)
elif source_type == "wikipedia":
content, title, new_urls = await self._extract_wikipedia(url)
added = 0
for new_url in (new_urls or []):
if self._url_is_relevant(new_url):
await self.db.add_source(
self.session_id, new_url, "wikipedia",
depth=source["depth"] + 1
)
added += 1
await self._mark_scraped(source_id, content, title, url)
return len(new_urls or [])
return added
elif source_type == "reddit":
content, title = await self._extract_reddit(url)
# Small delay between Reddit requests to avoid rate limiting
@@ -315,14 +338,17 @@ class ExhaustiveScraper:
content, title = await self._extract_pdf(url)
else:
content, title, new_urls = await self._extract_web(url, source["depth"])
added = 0
for new_url in (new_urls or []):
if self._url_is_relevant(new_url):
await self.db.add_source(
self.session_id, new_url,
detect_source_type(new_url),
depth=source["depth"] + 1
)
added += 1
await self._mark_scraped(source_id, content, title, url)
return len(new_urls or [])
return added
await self._mark_scraped(source_id, content, title, url)
return 0