fix: relevance scoring per topic + URL keyword filter for child pages
Build & Deploy ResearchOwl / build-and-push (push) Successful in 6s
Build & Deploy ResearchOwl / build-and-push (push) Successful in 6s
processor.py: simplify the _score_quality prompt to a single axis — "how relevant is this text to topic X?" — instead of averaging relevance + density + credibility, which let off-topic but well-written content pass through.
exhaustive.py: pre-compute topic keywords (stopword-filtered) at scraper init; filter child URLs (discovered during crawl, depth>0) so that only those whose URL path or title contains a topic keyword are added; seed URLs (depth=0, from DDG/Wikipedia/Reddit) are always included, since those searches are already topic-scoped.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+12
-19
@@ -211,33 +211,26 @@ class ContentProcessor:
|
|||||||
return stored
|
return stored
|
||||||
|
|
||||||
async def _score_quality(self, chunk: str, topic: str) -> float:
|
async def _score_quality(self, chunk: str, topic: str) -> float:
|
||||||
"""
|
"""Score 0-1 how relevant chunk is to topic. Single axis — avoids off-topic content."""
|
||||||
Ask Ollama to score relevance and quality of a chunk.
|
prompt = (
|
||||||
Returns 0.0-1.0
|
f'Score 0-10: how relevant is this text to the topic "{topic}"?\n'
|
||||||
"""
|
f"0 = completely unrelated, 10 = directly and specifically about this topic.\n\n"
|
||||||
prompt = f"""Rate this text chunk on a scale of 0-10 for:
|
f"Text:\n{chunk[:500]}\n\n"
|
||||||
1. Relevance to topic: "{topic}"
|
f"Reply with ONLY a single integer 0-10. No explanation."
|
||||||
2. Information density (facts, data, insights)
|
)
|
||||||
3. Credibility (not speculation, not clickbait)
|
|
||||||
|
|
||||||
Text:
|
|
||||||
{chunk[:500]}
|
|
||||||
|
|
||||||
Respond with ONLY a single number 0-10. No explanation."""
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = await self.ollama.generate(prompt)
|
response = await self.ollama.generate(prompt)
|
||||||
numbers = re.findall(r'\b(\d+(?:\.\d+)?)\b', response)
|
numbers = re.findall(r'\b(\d+(?:\.\d+)?)\b', response)
|
||||||
if numbers:
|
if numbers:
|
||||||
score = float(numbers[0])
|
score = float(numbers[0])
|
||||||
normalized = min(1.0, score / 10.0)
|
normalized = min(1.0, score / 10.0)
|
||||||
logger.debug("Quality score", raw=score, normalized=round(normalized, 2))
|
logger.debug("Relevance score", raw=score, normalized=round(normalized, 2))
|
||||||
return normalized
|
return normalized
|
||||||
logger.debug("No number in quality response", response=response[:80])
|
logger.debug("No number in relevance response", response=response[:80])
|
||||||
return 0.6 # above threshold so chunk is kept
|
return 0.6
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("Quality scoring failed", error=str(e))
|
logger.warning("Relevance scoring failed", error=str(e))
|
||||||
return 0.6 # above threshold so chunk is kept on Ollama error
|
return 0.6
|
||||||
|
|
||||||
async def rag_query(self, session_id: int, query: str, top_k: int = 20) -> str:
|
async def rag_query(self, session_id: int, query: str, top_k: int = 20) -> str:
|
||||||
"""
|
"""
|
||||||
|
|||||||
+37
-11
@@ -90,6 +90,14 @@ class ExhaustiveScraper:
|
|||||||
Keeps expanding until saturation or limits hit.
|
Keeps expanding until saturation or limits hit.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# Common stopwords to ignore when extracting topic keywords
|
||||||
|
_STOPWORDS = {
|
||||||
|
'the','a','an','and','or','of','in','on','at','to','for','is','are','was',
|
||||||
|
'were','be','been','have','has','had','do','does','did','about','with','from',
|
||||||
|
'el','la','los','las','de','del','en','un','una','y','o','que','se','por',
|
||||||
|
'con','para','sobre','como','pero','más',
|
||||||
|
}
|
||||||
|
|
||||||
def __init__(self, db: ResearchDB, session_id: int, topic: str,
|
def __init__(self, db: ResearchDB, session_id: int, topic: str,
|
||||||
progress_callback=None):
|
progress_callback=None):
|
||||||
self.db = db
|
self.db = db
|
||||||
@@ -100,6 +108,18 @@ class ExhaustiveScraper:
|
|||||||
self.total_sources = 0
|
self.total_sources = 0
|
||||||
self._stop = False
|
self._stop = False
|
||||||
self._http: Optional[aiohttp.ClientSession] = None
|
self._http: Optional[aiohttp.ClientSession] = None
|
||||||
|
# Pre-compute topic keywords for child-URL relevance filtering
|
||||||
|
self._keywords = [
|
||||||
|
w for w in re.findall(r'\b\w{3,}\b', topic.lower())
|
||||||
|
if w not in self._STOPWORDS
|
||||||
|
]
|
||||||
|
|
||||||
|
def _url_is_relevant(self, url: str, title: str = "") -> bool:
|
||||||
|
"""True if URL path or title contains at least one topic keyword."""
|
||||||
|
if not self._keywords:
|
||||||
|
return True
|
||||||
|
text = (urlparse(url).path + " " + (title or "")).lower()
|
||||||
|
return any(kw in text for kw in self._keywords)
|
||||||
|
|
||||||
async def stop(self):
|
async def stop(self):
|
||||||
self._stop = True
|
self._stop = True
|
||||||
@@ -300,13 +320,16 @@ class ExhaustiveScraper:
|
|||||||
content, title = await self._extract_youtube(url)
|
content, title = await self._extract_youtube(url)
|
||||||
elif source_type == "wikipedia":
|
elif source_type == "wikipedia":
|
||||||
content, title, new_urls = await self._extract_wikipedia(url)
|
content, title, new_urls = await self._extract_wikipedia(url)
|
||||||
|
added = 0
|
||||||
for new_url in (new_urls or []):
|
for new_url in (new_urls or []):
|
||||||
await self.db.add_source(
|
if self._url_is_relevant(new_url):
|
||||||
self.session_id, new_url, "wikipedia",
|
await self.db.add_source(
|
||||||
depth=source["depth"] + 1
|
self.session_id, new_url, "wikipedia",
|
||||||
)
|
depth=source["depth"] + 1
|
||||||
|
)
|
||||||
|
added += 1
|
||||||
await self._mark_scraped(source_id, content, title, url)
|
await self._mark_scraped(source_id, content, title, url)
|
||||||
return len(new_urls or [])
|
return added
|
||||||
elif source_type == "reddit":
|
elif source_type == "reddit":
|
||||||
content, title = await self._extract_reddit(url)
|
content, title = await self._extract_reddit(url)
|
||||||
# Small delay between Reddit requests to avoid rate limiting
|
# Small delay between Reddit requests to avoid rate limiting
|
||||||
@@ -315,14 +338,17 @@ class ExhaustiveScraper:
|
|||||||
content, title = await self._extract_pdf(url)
|
content, title = await self._extract_pdf(url)
|
||||||
else:
|
else:
|
||||||
content, title, new_urls = await self._extract_web(url, source["depth"])
|
content, title, new_urls = await self._extract_web(url, source["depth"])
|
||||||
|
added = 0
|
||||||
for new_url in (new_urls or []):
|
for new_url in (new_urls or []):
|
||||||
await self.db.add_source(
|
if self._url_is_relevant(new_url):
|
||||||
self.session_id, new_url,
|
await self.db.add_source(
|
||||||
detect_source_type(new_url),
|
self.session_id, new_url,
|
||||||
depth=source["depth"] + 1
|
detect_source_type(new_url),
|
||||||
)
|
depth=source["depth"] + 1
|
||||||
|
)
|
||||||
|
added += 1
|
||||||
await self._mark_scraped(source_id, content, title, url)
|
await self._mark_scraped(source_id, content, title, url)
|
||||||
return len(new_urls or [])
|
return added
|
||||||
|
|
||||||
await self._mark_scraped(source_id, content, title, url)
|
await self._mark_scraped(source_id, content, title, url)
|
||||||
return 0
|
return 0
|
||||||
|
|||||||
Reference in New Issue
Block a user