feat: Claude Haiku for relevance scoring, fallback to Ollama

processor.py: split _score_quality into _score_with_claude and _score_with_ollama; if ANTHROPIC_API_KEY is set, use Claude Haiku (claude-haiku-4-5) with max_tokens=10 for fast, accurate 0-10 relevance scoring; falls back to Ollama on any error.
requirements.txt: add anthropic>=0.40.0
k8s: ANTHROPIC_API_KEY added to researchowl-secrets and mounted in deployment; QUALITY_THRESHOLD restored to 0.4 (Claude scoring is accurate enough to use the threshold).
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-29 08:04:12 +00:00
parent 5feff6073e
commit d0e55ddb50
2 changed files with 37 additions and 4 deletions
@@ -23,6 +23,9 @@ tiktoken==0.7.0
 numpy==1.26.4
 scikit-learn==1.5.1
 # Claude API (scoring)
 anthropic>=0.40.0
 # Utilities
 pydantic==2.8.0
 pydantic-settings==2.4.0
@@ -216,7 +216,37 @@ class ContentProcessor:
        return stored
    async def _score_quality(self, chunk: str, topic: str) -> float:
-        """Score 0-1 how relevant chunk is to topic. Single axis — avoids off-topic content."""
+        """Score 0-1 relevance to topic. Uses Claude Haiku if API key set, else Ollama."""
        # Claude is used only when an API key is configured; that path itself
        # falls back to Ollama when the Claude call raises.
        if settings.anthropic_api_key:
            return await self._score_with_claude(chunk, topic)
        # No API key configured: score with Ollama directly.
        return await self._score_with_ollama(chunk, topic)
    async def _score_with_claude(self, chunk: str, topic: str) -> float:
        """Score 0-1 how relevant ``chunk`` is to ``topic`` using the Claude API.

        Sends the first 500 characters of the chunk, asks for a 0-10 rating and
        normalizes the first number found in the reply to the 0-1 range
        (capped at 1.0).

        Returns:
            The normalized score; 0.6 when the reply contains no number; or the
            result of ``_score_with_ollama`` when anything in the Claude path
            raises (including the ``anthropic`` package being unavailable).
        """
        prompt = (
            f'Rate 0-10 how relevant this text is to the topic "{topic}". '
            f'Reply with only a number.\n\nText:\n{chunk[:500]}'
        )
        try:
            # Imported inside the try so a missing/broken package degrades to
            # Ollama instead of propagating ImportError to the caller.
            import anthropic

            # A client is built per call; the context manager closes its HTTP
            # connection pool deterministically instead of waiting for GC.
            async with anthropic.AsyncAnthropic(api_key=settings.anthropic_api_key) as client:
                msg = await client.messages.create(
                    model=settings.claude_model,
                    max_tokens=10,
                    messages=[{"role": "user", "content": prompt}],
                )
            response = msg.content[0].text.strip()
            numbers = re.findall(r'\b(\d+(?:\.\d+)?)\b', response)
            if numbers:
                score = float(numbers[0])
                # Replies above 10 are clamped so the result never exceeds 1.0.
                normalized = min(1.0, score / 10.0)
                logger.debug("Claude relevance score", raw=score, normalized=round(normalized, 2))
                return normalized
            # Same neutral default as the Ollama path, but leave a trace of it.
            logger.debug("No number in Claude relevance response", response=response[:80])
            return 0.6
        except Exception as e:
            logger.warning("Claude scoring failed, falling back to Ollama", error=str(e))
            return await self._score_with_ollama(chunk, topic)
    async def _score_with_ollama(self, chunk: str, topic: str) -> float:
        prompt = (
            f'Score 0-10: how relevant is this text to the topic "{topic}"?\n'
            f"0 = completely unrelated, 10 = directly and specifically about this topic.\n\n"
@@ -229,12 +259,12 @@ class ContentProcessor:
            if numbers:
                score = float(numbers[0])
                normalized = min(1.0, score / 10.0)
-                logger.debug("Relevance score", raw=score, normalized=round(normalized, 2))
+                logger.debug("Ollama relevance score", raw=score, normalized=round(normalized, 2))
                return normalized
-            logger.debug("No number in relevance response", response=response[:80])
+            logger.debug("No number in Ollama relevance response", response=response[:80])
            return 0.6
        except Exception as e:
-            logger.warning("Relevance scoring failed", error=str(e))
+            logger.warning("Ollama relevance scoring failed", error=str(e))
            return 0.6
    async def rag_query(self, session_id: int, query: str, top_k: int = 20) -> str: