feat: initial ResearchOwl

2026-04-27 13:49:07 +00:00
commit ba08536337
37 changed files with 2431 additions and 0 deletions
@@ -0,0 +1,188 @@
+"""
+ResearchOwl Generators
+Produces structured outputs from processed research using Ollama
+"""
+import structlog
+
+from src.processor.processor import OllamaClient, ContentProcessor
+from src.db.database import ResearchDB, OutputType
+
+# Module-level structlog logger used by OutputGenerator below.
+logger = structlog.get_logger()
+
+# System prompts, one per output type. OutputGenerator._get_system selects one
+# and generate() passes it to the Ollama client as the `system` argument.
+# The text is sent to the model verbatim, so edits here change model behavior.
+
+# System prompt for OutputType.PODCAST: spoken-style script with cue markers.
+PODCAST_SYSTEM = """You are an expert podcast scriptwriter. Create engaging, well-structured scripts 
+that feel natural when spoken aloud. Use conversational language, rhetorical questions, 
+clear transitions, and compelling storytelling. Include [PAUSE], [EMPHASIS], and [MUSIC CUE] markers."""
+
+# System prompt for OutputType.BLOG: markdown article for a general audience.
+BLOG_SYSTEM = """You are an expert blog writer and journalist. Create SEO-optimized, 
+well-structured articles with clear headings, engaging prose, and proper citations. 
+Use markdown formatting. Write for an educated general audience."""
+
+# System prompt for OutputType.REPORT: structured markdown research report.
+REPORT_SYSTEM = """You are an expert research analyst. Create comprehensive, objective reports 
+with executive summary, detailed findings, source analysis, contradictions found, 
+and conclusions. Use structured markdown with tables where appropriate."""
+
+# System prompt for OutputType.THREAD: numbered tweets under 280 characters.
+THREAD_SYSTEM = """You are a social media expert. Create engaging Twitter/X thread content.
+Each tweet must be under 280 characters. Use numbers (1/N, 2/N...), hooks, cliffhangers.
+Make it shareable and engaging. Include relevant hashtags at the end."""
+
+
+# User-prompt templates keyed by OutputType member.
+#
+# OutputGenerator.generate() looks a template up with PROMPTS[output_type] and
+# fills it with str.format(topic=..., context=...), so each template must use
+# only the {topic} and {context} placeholders. Any literal brace added to a
+# template has to be doubled ({{ / }}) or str.format will fail on it.
+PROMPTS = {
+    # Podcast script: fixed segment outline, host notes in [brackets].
+    OutputType.PODCAST: """Based on the research below about "{topic}", write a complete podcast script.
+
+Structure:
+- INTRO (hook + topic intro, 2-3 min)
+- SEGMENT 1: Background & Context
+- SEGMENT 2: Key Facts & Evidence  
+- SEGMENT 3: Controversies & Different Perspectives
+- SEGMENT 4: Deep Dive (most interesting finding)
+- OUTRO + Call to Action
+
+Make it 20-30 minutes of content. Include host notes in [brackets].
+
+RESEARCH MATERIAL:
+{context}
+
+Write the complete script now:""",
+
+    # Blog post: markdown article, 1500-2500 words.
+    OutputType.BLOG: """Based on the research below about "{topic}", write a comprehensive blog post.
+
+Requirements:
+- Compelling headline and meta description
+- Engaging intro with hook
+- Well-structured sections with H2/H3 headers
+- Key facts highlighted
+- Multiple perspectives presented
+- Strong conclusion with takeaways
+- Word count: 1500-2500 words
+- Tone: Informative but engaging
+
+RESEARCH MATERIAL:
+{context}
+
+Write the complete blog post in markdown:""",
+
+    # Research report: nine numbered sections in markdown.
+    OutputType.REPORT: """Based on the research below about "{topic}", write a comprehensive research report.
+
+Structure:
+1. Executive Summary (200 words)
+2. Introduction & Scope
+3. Key Findings (numbered)
+4. Evidence Analysis
+5. Source Quality Assessment
+6. Contradictions & Disputed Claims
+7. Timeline of Events (if applicable)
+8. Conclusions
+9. Further Research Suggestions
+
+RESEARCH MATERIAL:
+{context}
+
+Write the complete report in markdown:""",
+
+    # Twitter/X thread: 15-25 numbered tweets, one per line.
+    OutputType.THREAD: """Based on the research below about "{topic}", write an engaging Twitter/X thread.
+
+Requirements:
+- Start with a KILLER hook tweet
+- 15-25 tweets total
+- Each tweet max 280 chars
+- Number them (1/20, 2/20...)
+- Include surprising facts
+- Build suspense between tweets
+- End with strong conclusion + CTA
+- Add relevant hashtags to last tweet
+
+RESEARCH MATERIAL:
+{context}
+
+Write the complete thread, one tweet per line:"""
+}
+
+
+class OutputGenerator:
+    def __init__(self, db: ResearchDB, ollama: OllamaClient, processor: ContentProcessor):
+        self.db = db
+        self.ollama = ollama
+        self.processor = processor
+
+    async def generate(self, session_id: int, output_type: OutputType,
+                       progress_callback=None) -> str:
+        """Generate an output for a research session"""
+        session = await self.db.get_session(session_id)
+        if not session:
+            raise ValueError(f"Session {session_id} not found")
+
+        topic = session["topic"]
+        logger.info("Generating output", type=output_type, topic=topic)
+
+        if progress_callback:
+            await progress_callback(f"🔍 Retrieving best research material for {output_type}...")
+
+        # RAG: get most relevant context for this output type
+        query = self._get_rag_query(output_type, topic)
+        context = await self.processor.rag_query(session_id, query, top_k=30)
+
+        if not context:
+            # Fallback: use raw top chunks
+            chunks = await self.db.get_top_chunks(session_id, limit=20)
+            context = "\n\n---\n\n".join(c["content"] for c in chunks)
+
+        if not context:
+            raise ValueError("No processed content available. Run /process first.")
+
+        # Truncate context to avoid Ollama context limits
+        context_words = context.split()
+        if len(context_words) > 6000:
+            context = " ".join(context_words[:6000]) + "\n\n[... additional material truncated ...]"
+
+        if progress_callback:
+            await progress_callback(f"✍️ Generating {output_type} with Ollama... (this takes 2-5 min)")
+
+        # Build prompt
+        system = self._get_system(output_type)
+        prompt = PROMPTS[output_type].format(topic=topic, context=context)
+
+        # Generate — may take a while with local LLM
+        output = await self.ollama.generate(prompt, system=system, timeout=300)
+
+        # Add metadata header
+        stats = await self.db.get_session_stats(session_id)
+        header = self._build_header(topic, output_type, session, stats)
+        full_output = header + "\n\n" + output
+
+        # Save to DB
+        await self.db.save_output(session_id, output_type, full_output)
+
+        logger.info("Output generated", type=output_type, length=len(full_output))
+        return full_output
+
+    def _get_rag_query(self, output_type: OutputType, topic: str) -> str:
+        queries = {
+            OutputType.PODCAST: f"{topic} story narrative facts interesting",
+            OutputType.BLOG: f"{topic} key facts evidence analysis",
+            OutputType.REPORT: f"{topic} evidence data official findings",
+            OutputType.THREAD: f"{topic} surprising facts shocking revelations",
+        }
+        return queries.get(output_type, topic)
+
+    def _get_system(self, output_type: OutputType) -> str:
+        systems = {
+            OutputType.PODCAST: PODCAST_SYSTEM,
+            OutputType.BLOG: BLOG_SYSTEM,
+            OutputType.REPORT: REPORT_SYSTEM,
+            OutputType.THREAD: THREAD_SYSTEM,
+        }
+        return systems.get(output_type, "You are a helpful research assistant.")
+
+    def _build_header(self, topic: str, output_type: OutputType,
+                      session: dict, stats: dict) -> str:
+        from datetime import datetime
+        dt = datetime.utcnow().strftime("%Y-%m-%d %H:%M UTC")
+        return f"""---
+ResearchOwl | {output_type.upper()} OUTPUT
+Topic: {topic}
+Generated: {dt}
+Sources: {stats.get('scraped', 0)} scraped | {stats.get('failed', 0)} failed
+Iterations: {session.get('iterations', 0)}
+Total words researched: {session.get('total_words', 0):,}
+---
+"""