feat: initial ResearchOwl
Build & Deploy ResearchOwl / build (push) Failing after 1m38s

This commit is contained in:
ChemaVX
2026-04-27 13:49:07 +00:00
commit ba08536337
37 changed files with 2431 additions and 0 deletions
+188
View File
@@ -0,0 +1,188 @@
"""
ResearchOwl Generators
Produces structured outputs from processed research using Ollama
"""
import structlog
from src.processor.processor import OllamaClient, ContentProcessor
from src.db.database import ResearchDB, OutputType
# Module-level structlog logger used by OutputGenerator for progress/info events.
logger = structlog.get_logger()
# System prompts: one persona per output type. OutputGenerator._get_system
# maps each OutputType member to the matching constant below and passes it
# as the `system` argument of the Ollama generate call.

# Persona for podcast scripts; asks the model to emit [PAUSE], [EMPHASIS]
# and [MUSIC CUE] markers in the script text.
PODCAST_SYSTEM = """You are an expert podcast scriptwriter. Create engaging, well-structured scripts
that feel natural when spoken aloud. Use conversational language, rhetorical questions,
clear transitions, and compelling storytelling. Include [PAUSE], [EMPHASIS], and [MUSIC CUE] markers."""
# Persona for blog posts; requests markdown output.
BLOG_SYSTEM = """You are an expert blog writer and journalist. Create SEO-optimized,
well-structured articles with clear headings, engaging prose, and proper citations.
Use markdown formatting. Write for an educated general audience."""
# Persona for research reports; requests structured markdown with tables.
REPORT_SYSTEM = """You are an expert research analyst. Create comprehensive, objective reports
with executive summary, detailed findings, source analysis, contradictions found,
and conclusions. Use structured markdown with tables where appropriate."""
# Persona for Twitter/X threads; the 280-character limit is stated to the
# model only -- nothing in this module enforces it on the generated text.
THREAD_SYSTEM = """You are a social media expert. Create engaging Twitter/X thread content.
Each tweet must be under 280 characters. Use numbers (1/N, 2/N...), hooks, cliffhangers.
Make it shareable and engaging. Include relevant hashtags at the end."""
# User-prompt templates keyed by OutputType.
#
# Each template contains exactly two placeholders, {topic} and {context},
# which OutputGenerator.generate fills in with str.format(). Because
# str.format() is used, any literal brace added to a template must be
# doubled ({{ or }}), otherwise formatting raises at generation time.
# Unlike the system-prompt lookup, generate() indexes this dict directly,
# so an OutputType without an entry here raises KeyError.
PROMPTS = {
# Podcast: full spoken script split into intro, four segments and an outro.
OutputType.PODCAST: """Based on the research below about "{topic}", write a complete podcast script.
Structure:
- INTRO (hook + topic intro, 2-3 min)
- SEGMENT 1: Background & Context
- SEGMENT 2: Key Facts & Evidence
- SEGMENT 3: Controversies & Different Perspectives
- SEGMENT 4: Deep Dive (most interesting finding)
- OUTRO + Call to Action
Make it 20-30 minutes of content. Include host notes in [brackets].
RESEARCH MATERIAL:
{context}
Write the complete script now:""",
# Blog: markdown article with headline, meta description and H2/H3 sections.
OutputType.BLOG: """Based on the research below about "{topic}", write a comprehensive blog post.
Requirements:
- Compelling headline and meta description
- Engaging intro with hook
- Well-structured sections with H2/H3 headers
- Key facts highlighted
- Multiple perspectives presented
- Strong conclusion with takeaways
- Word count: 1500-2500 words
- Tone: Informative but engaging
RESEARCH MATERIAL:
{context}
Write the complete blog post in markdown:""",
# Report: nine-section markdown research report.
OutputType.REPORT: """Based on the research below about "{topic}", write a comprehensive research report.
Structure:
1. Executive Summary (200 words)
2. Introduction & Scope
3. Key Findings (numbered)
4. Evidence Analysis
5. Source Quality Assessment
6. Contradictions & Disputed Claims
7. Timeline of Events (if applicable)
8. Conclusions
9. Further Research Suggestions
RESEARCH MATERIAL:
{context}
Write the complete report in markdown:""",
# Thread: numbered Twitter/X thread, one tweet per output line.
OutputType.THREAD: """Based on the research below about "{topic}", write an engaging Twitter/X thread.
Requirements:
- Start with a KILLER hook tweet
- 15-25 tweets total
- Each tweet max 280 chars
- Number them (1/20, 2/20...)
- Include surprising facts
- Build suspense between tweets
- End with strong conclusion + CTA
- Add relevant hashtags to last tweet
RESEARCH MATERIAL:
{context}
Write the complete thread, one tweet per line:"""
}
class OutputGenerator:
    """Turn a session's processed research into a finished output.

    For a given session and output type the generator retrieves research
    context (RAG query first, raw top chunks as fallback), fills the matching
    entry of ``PROMPTS``, asks Ollama for the text, prepends a metadata
    header and stores the result through the database layer.
    """

    # Maximum number of whitespace-separated words of research material that
    # is placed into the prompt; anything beyond is cut off.
    MAX_CONTEXT_WORDS = 6000
    # Text appended to the context when it had to be cut.
    TRUNCATION_NOTICE = "\n\n[... additional material truncated ...]"
    # Value forwarded as ``timeout`` to ``OllamaClient.generate``
    # (presumably seconds -- confirm against OllamaClient).
    GENERATION_TIMEOUT = 300
    # ``top_k`` forwarded to ``ContentProcessor.rag_query``.
    RAG_TOP_K = 30
    # ``limit`` forwarded to ``ResearchDB.get_top_chunks`` in the fallback path.
    FALLBACK_CHUNK_LIMIT = 20

    def __init__(self, db: ResearchDB, ollama: OllamaClient, processor: ContentProcessor):
        """Store the collaborators used by :meth:`generate`.

        Args:
            db: Database access object (sessions, chunks, stats, outputs).
            ollama: Client used to run the text generation.
            processor: Content processor providing the RAG query.
        """
        self.db = db
        self.ollama = ollama
        self.processor = processor

    async def generate(self, session_id: int, output_type: OutputType,
                       progress_callback=None) -> str:
        """Generate an output for a research session.

        Args:
            session_id: Identifier of the research session to read from.
            output_type: Which kind of output to produce; must be a key of
                ``PROMPTS``.
            progress_callback: Optional awaitable callable taking one status
                string; awaited before retrieval and before generation.

        Returns:
            The metadata header followed by the generated text. The same
            string is saved via ``db.save_output``.

        Raises:
            ValueError: If the session does not exist or no research content
                is available for it.
            KeyError: If ``output_type`` has no entry in ``PROMPTS``.
        """
        session = await self.db.get_session(session_id)
        if not session:
            raise ValueError(f"Session {session_id} not found")

        topic = session["topic"]
        logger.info("Generating output", type=output_type, topic=topic)

        if progress_callback:
            await progress_callback(f"🔍 Retrieving best research material for {output_type}...")

        # RAG: get the most relevant context for this output type.
        query = self._get_rag_query(output_type, topic)
        context = await self.processor.rag_query(session_id, query, top_k=self.RAG_TOP_K)

        if not context:
            # Fallback: the RAG query returned nothing, use the raw top chunks.
            chunks = await self.db.get_top_chunks(session_id, limit=self.FALLBACK_CHUNK_LIMIT)
            context = "\n\n---\n\n".join(c["content"] for c in chunks)

        if not context:
            raise ValueError("No processed content available. Run /process first.")

        # Cap the context size to avoid Ollama context limits. The cut keeps
        # the original whitespace so chunk separators and paragraph breaks
        # survive in the part that is sent to the model.
        context = self._truncate_context(context, self.MAX_CONTEXT_WORDS)

        if progress_callback:
            await progress_callback(f"✍️ Generating {output_type} with Ollama... (this takes 2-5 min)")

        # Build the prompt. str.format only interprets braces in the template,
        # so braces inside the topic or the context are inserted verbatim.
        system = self._get_system(output_type)
        prompt = PROMPTS[output_type].format(topic=topic, context=context)

        # Generate -- may take a while with a local LLM.
        output = await self.ollama.generate(prompt, system=system, timeout=self.GENERATION_TIMEOUT)

        # Prepend the metadata header.
        stats = await self.db.get_session_stats(session_id)
        header = self._build_header(topic, output_type, session, stats)
        full_output = header + "\n\n" + output

        # Persist the finished output.
        await self.db.save_output(session_id, output_type, full_output)
        logger.info("Output generated", type=output_type, length=len(full_output))
        return full_output

    @classmethod
    def _truncate_context(cls, context: str, max_words: int) -> str:
        """Cut ``context`` after ``max_words`` words, preserving whitespace.

        A word is a maximal run of non-whitespace characters. When the text
        has at most ``max_words`` words it is returned unchanged; otherwise
        everything after the last kept word is dropped and
        ``TRUNCATION_NOTICE`` is appended.
        """
        import re
        from itertools import islice

        words = re.finditer(r"\S+", context)
        cut = 0
        for match in islice(words, max(max_words, 0)):
            cut = match.end()
        # No further word left in the iterator means nothing has to be cut.
        if next(words, None) is None:
            return context
        return context[:cut] + cls.TRUNCATION_NOTICE

    def _get_rag_query(self, output_type: OutputType, topic: str) -> str:
        """Return the retrieval query for ``output_type``.

        The topic is suffixed with keywords that steer retrieval towards
        material suited to the output format; unknown types fall back to the
        bare topic.
        """
        queries = {
            OutputType.PODCAST: f"{topic} story narrative facts interesting",
            OutputType.BLOG: f"{topic} key facts evidence analysis",
            OutputType.REPORT: f"{topic} evidence data official findings",
            OutputType.THREAD: f"{topic} surprising facts shocking revelations",
        }
        return queries.get(output_type, topic)

    def _get_system(self, output_type: OutputType) -> str:
        """Return the system prompt for ``output_type``.

        Unknown types fall back to a generic research-assistant persona.
        """
        systems = {
            OutputType.PODCAST: PODCAST_SYSTEM,
            OutputType.BLOG: BLOG_SYSTEM,
            OutputType.REPORT: REPORT_SYSTEM,
            OutputType.THREAD: THREAD_SYSTEM,
        }
        return systems.get(output_type, "You are a helpful research assistant.")

    def _build_header(self, topic: str, output_type: OutputType,
                      session: dict, stats: dict) -> str:
        """Build the metadata header placed in front of every output.

        Args:
            topic: Research topic shown in the header.
            output_type: Output type; its ``upper()`` form is shown, so it
                must behave like a string.
            session: Session mapping; ``iterations`` and ``total_words`` are
                read from it.
            stats: Stats mapping; ``scraped`` and ``failed`` are read from it.

        Returns:
            A block delimited by ``---`` lines and ending with a newline.
            Missing or ``None`` counters are rendered as ``0``.
        """
        from datetime import datetime, timezone

        # Timezone-aware replacement for the deprecated datetime.utcnow();
        # the formatted text is the same.
        generated_at = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")

        # ``or 0`` guards against keys that are present but hold None, which
        # would otherwise break the thousands-separator format below.
        scraped = stats.get("scraped", 0) or 0
        failed = stats.get("failed", 0) or 0
        iterations = session.get("iterations", 0) or 0
        total_words = session.get("total_words", 0) or 0

        lines = [
            "---",
            f"ResearchOwl | {output_type.upper()} OUTPUT",
            f"Topic: {topic}",
            f"Generated: {generated_at}",
            f"Sources: {scraped} scraped | {failed} failed",
            f"Iterations: {iterations}",
            f"Total words researched: {total_words:,}",
            "---",
        ]
        return "\n".join(lines) + "\n"