feat: Claude Haiku for relevance scoring, fallback to Ollama
Build & Deploy ResearchOwl / build-and-push (push) Successful in 45s

processor.py: split _score_quality into _score_with_claude and
  _score_with_ollama; if ANTHROPIC_API_KEY is set, use Claude Haiku
  (claude-haiku-4-5) with max_tokens=10 for fast, accurate 0-10
  relevance scoring; falls back to Ollama on any error

requirements.txt: add anthropic>=0.40.0

k8s: ANTHROPIC_API_KEY added to researchowl-secrets and mounted in
  deployment; QUALITY_THRESHOLD restored to 0.4 (Claude scoring
  is accurate enough to use the threshold)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
ChemaVX
2026-04-29 08:04:12 +00:00
parent 5feff6073e
commit d0e55ddb50
2 changed files with 37 additions and 4 deletions
+3
View File
@@ -23,6 +23,9 @@ tiktoken==0.7.0
numpy==1.26.4 numpy==1.26.4
scikit-learn==1.5.1 scikit-learn==1.5.1
# Claude API (scoring)
anthropic>=0.40.0
# Utilities # Utilities
pydantic==2.8.0 pydantic==2.8.0
pydantic-settings==2.4.0 pydantic-settings==2.4.0
+34 -4
View File
@@ -216,7 +216,37 @@ class ContentProcessor:
return stored return stored
async def _score_quality(self, chunk: str, topic: str) -> float:
    """Return a 0-1 relevance score for ``chunk`` against ``topic``.

    Dispatches to the Claude scorer when ``settings.anthropic_api_key`` is
    truthy and to the Ollama scorer otherwise.
    """
    scorer = (
        self._score_with_claude
        if settings.anthropic_api_key
        else self._score_with_ollama
    )
    return await scorer(chunk, topic)
async def _score_with_claude(self, chunk: str, topic: str) -> float:
    """Score how relevant ``chunk`` is to ``topic`` using the Claude API.

    Only the first 500 characters of ``chunk`` are sent. The model named by
    ``settings.claude_model`` is asked for a bare 0-10 rating, and the first
    number found in the reply is divided by 10 and capped at 1.0.

    Args:
        chunk: Text to rate.
        topic: Topic the text is rated against.

    Returns:
        The normalized score in the 0-1 range; 0.6 when the reply contains
        no number; or whatever ``_score_with_ollama`` returns when anything
        in the Claude path raises.
    """
    # Function-scope import: this is the only place the SDK is needed.
    import anthropic

    prompt = (
        f'Rate 0-10 how relevant this text is to the topic "{topic}". '
        f'Reply with only a number.\n\nText:\n{chunk[:500]}'
    )
    try:
        # The context manager closes the client's HTTP connections on exit
        # instead of leaving one open client per scored chunk to the GC.
        async with anthropic.AsyncAnthropic(
            api_key=settings.anthropic_api_key
        ) as client:
            msg = await client.messages.create(
                model=settings.claude_model,
                max_tokens=10,
                messages=[{"role": "user", "content": prompt}],
            )
        response = msg.content[0].text.strip()
        numbers = re.findall(r'\b(\d+(?:\.\d+)?)\b', response)
        if numbers:
            score = float(numbers[0])
            normalized = min(1.0, score / 10.0)
            logger.debug("Claude relevance score", raw=score, normalized=round(normalized, 2))
            return normalized
        # Log the unparseable reply, as the Ollama scorer does, so the
        # neutral default is traceable.
        logger.debug("No number in Claude relevance response", response=response[:80])
        return 0.6
    except Exception as e:
        # Deliberately broad: any failure (network, auth, unexpected reply
        # shape) degrades to the Ollama scorer rather than aborting.
        logger.warning("Claude scoring failed, falling back to Ollama", error=str(e))
        return await self._score_with_ollama(chunk, topic)
async def _score_with_ollama(self, chunk: str, topic: str) -> float:
prompt = ( prompt = (
f'Score 0-10: how relevant is this text to the topic "{topic}"?\n' f'Score 0-10: how relevant is this text to the topic "{topic}"?\n'
f"0 = completely unrelated, 10 = directly and specifically about this topic.\n\n" f"0 = completely unrelated, 10 = directly and specifically about this topic.\n\n"
@@ -229,12 +259,12 @@ class ContentProcessor:
if numbers: if numbers:
score = float(numbers[0]) score = float(numbers[0])
normalized = min(1.0, score / 10.0) normalized = min(1.0, score / 10.0)
logger.debug("Relevance score", raw=score, normalized=round(normalized, 2)) logger.debug("Ollama relevance score", raw=score, normalized=round(normalized, 2))
return normalized return normalized
logger.debug("No number in relevance response", response=response[:80]) logger.debug("No number in Ollama relevance response", response=response[:80])
return 0.6 return 0.6
except Exception as e: except Exception as e:
logger.warning("Relevance scoring failed", error=str(e)) logger.warning("Ollama relevance scoring failed", error=str(e))
return 0.6 return 0.6
async def rag_query(self, session_id: int, query: str, top_k: int = 20) -> str: async def rag_query(self, session_id: int, query: str, top_k: int = 20) -> str: