This commit is contained in:
@@ -0,0 +1,188 @@
|
||||
"""
|
||||
ResearchOwl Generators
|
||||
Produces structured outputs from processed research using Ollama
|
||||
"""
|
||||
import structlog
|
||||
|
||||
from src.processor.processor import OllamaClient, ContentProcessor
|
||||
from src.db.database import ResearchDB, OutputType
|
||||
|
||||
# Module-level structlog logger used by the generator code in this file.
logger = structlog.get_logger()
|
||||
|
||||
# System prompts: one persona per output format. Each value is a three-line
# string; the explicit "\n" markers keep the line breaks visible in the source.
PODCAST_SYSTEM = (
    "You are an expert podcast scriptwriter. Create engaging, well-structured scripts\n"
    "that feel natural when spoken aloud. Use conversational language, rhetorical questions,\n"
    "clear transitions, and compelling storytelling. Include [PAUSE], [EMPHASIS], and [MUSIC CUE] markers."
)

BLOG_SYSTEM = (
    "You are an expert blog writer and journalist. Create SEO-optimized,\n"
    "well-structured articles with clear headings, engaging prose, and proper citations.\n"
    "Use markdown formatting. Write for an educated general audience."
)

REPORT_SYSTEM = (
    "You are an expert research analyst. Create comprehensive, objective reports\n"
    "with executive summary, detailed findings, source analysis, contradictions found,\n"
    "and conclusions. Use structured markdown with tables where appropriate."
)

THREAD_SYSTEM = (
    "You are a social media expert. Create engaging Twitter/X thread content.\n"
    "Each tweet must be under 280 characters. Use numbers (1/N, 2/N...), hooks, cliffhangers.\n"
    "Make it shareable and engaging. Include relevant hashtags at the end."
)
|
||||
|
||||
|
||||
# User-prompt templates, one per output type. Every template exposes exactly
# two str.format placeholders: {topic} and {context}. They are kept as named
# constants so the PROMPTS mapping below stays a one-glance lookup table.

_PODCAST_PROMPT = """Based on the research below about "{topic}", write a complete podcast script.

Structure:
- INTRO (hook + topic intro, 2-3 min)
- SEGMENT 1: Background & Context
- SEGMENT 2: Key Facts & Evidence
- SEGMENT 3: Controversies & Different Perspectives
- SEGMENT 4: Deep Dive (most interesting finding)
- OUTRO + Call to Action

Make it 20-30 minutes of content. Include host notes in [brackets].

RESEARCH MATERIAL:
{context}

Write the complete script now:"""

_BLOG_PROMPT = """Based on the research below about "{topic}", write a comprehensive blog post.

Requirements:
- Compelling headline and meta description
- Engaging intro with hook
- Well-structured sections with H2/H3 headers
- Key facts highlighted
- Multiple perspectives presented
- Strong conclusion with takeaways
- Word count: 1500-2500 words
- Tone: Informative but engaging

RESEARCH MATERIAL:
{context}

Write the complete blog post in markdown:"""

_REPORT_PROMPT = """Based on the research below about "{topic}", write a comprehensive research report.

Structure:
1. Executive Summary (200 words)
2. Introduction & Scope
3. Key Findings (numbered)
4. Evidence Analysis
5. Source Quality Assessment
6. Contradictions & Disputed Claims
7. Timeline of Events (if applicable)
8. Conclusions
9. Further Research Suggestions

RESEARCH MATERIAL:
{context}

Write the complete report in markdown:"""

_THREAD_PROMPT = """Based on the research below about "{topic}", write an engaging Twitter/X thread.

Requirements:
- Start with a KILLER hook tweet
- 15-25 tweets total
- Each tweet max 280 chars
- Number them (1/20, 2/20...)
- Include surprising facts
- Build suspense between tweets
- End with strong conclusion + CTA
- Add relevant hashtags to last tweet

RESEARCH MATERIAL:
{context}

Write the complete thread, one tweet per line:"""

# Lookup table from output type to its prompt template.
PROMPTS = {
    OutputType.PODCAST: _PODCAST_PROMPT,
    OutputType.BLOG: _BLOG_PROMPT,
    OutputType.REPORT: _REPORT_PROMPT,
    OutputType.THREAD: _THREAD_PROMPT,
}
|
||||
|
||||
|
||||
class OutputGenerator:
    """Turn a processed research session into a finished output document.

    For a given session and output type the generator retrieves research
    context, renders the matching prompt template, asks Ollama for the text,
    prepends a metadata header and stores the result in the database.
    """

    # Number of results requested from the processor's RAG query.
    RAG_TOP_K = 30
    # Number of raw chunks fetched when the RAG query returns nothing.
    FALLBACK_CHUNK_LIMIT = 20
    # Context is cut to this many whitespace-separated words before prompting.
    MAX_CONTEXT_WORDS = 6000
    # Passed to the Ollama client as `timeout` (presumably seconds -- confirm
    # against OllamaClient.generate).
    GENERATION_TIMEOUT = 300

    def __init__(self, db: ResearchDB, ollama: OllamaClient, processor: ContentProcessor):
        """Store the collaborators used by :meth:`generate`.

        Args:
            db: Database handle used to read sessions/chunks/stats and to
                save the generated output.
            ollama: Client whose ``generate`` coroutine produces the text.
            processor: Content processor providing ``rag_query``.
        """
        self.db = db
        self.ollama = ollama
        self.processor = processor

    async def generate(self, session_id: int, output_type: OutputType,
                       progress_callback=None) -> str:
        """Generate an output for a research session.

        Args:
            session_id: Identifier of the research session to read from.
            output_type: Which kind of output to produce; must be a key of
                ``PROMPTS``.
            progress_callback: Optional coroutine function taking a single
                status string; awaited before retrieval and before generation.

        Returns:
            The metadata header followed by the generated text (the same
            string that is saved to the database).

        Raises:
            ValueError: If the session does not exist, or no research
                content is available for it.
            KeyError: If ``PROMPTS`` has no template for ``output_type``.
        """
        session = await self.db.get_session(session_id)
        if not session:
            raise ValueError(f"Session {session_id} not found")

        # Look the template up first so an unsupported output type fails
        # before any retrieval work or progress messages, not after them.
        template = PROMPTS[output_type]

        topic = session["topic"]
        logger.info("Generating output", type=output_type, topic=topic)

        if progress_callback:
            await progress_callback(f"🔍 Retrieving best research material for {output_type}...")

        # RAG: get most relevant context for this output type
        query = self._get_rag_query(output_type, topic)
        context = await self.processor.rag_query(session_id, query, top_k=self.RAG_TOP_K)

        if not context:
            # Fallback: use raw top chunks
            chunks = await self.db.get_top_chunks(session_id, limit=self.FALLBACK_CHUNK_LIMIT)
            context = "\n\n---\n\n".join(c["content"] for c in chunks)

        if not context:
            raise ValueError("No processed content available. Run /process first.")

        # Truncate context to avoid Ollama context limits. Note that the
        # truncated text is re-joined with single spaces, so original line
        # breaks are lost in that case only.
        context_words = context.split()
        if len(context_words) > self.MAX_CONTEXT_WORDS:
            context = (
                " ".join(context_words[:self.MAX_CONTEXT_WORDS])
                + "\n\n[... additional material truncated ...]"
            )

        if progress_callback:
            await progress_callback(f"✍️ Generating {output_type} with Ollama... (this takes 2-5 min)")

        # Build prompt
        system = self._get_system(output_type)
        prompt = template.format(topic=topic, context=context)

        # Generate — may take a while with local LLM
        output = await self.ollama.generate(
            prompt, system=system, timeout=self.GENERATION_TIMEOUT
        )

        # Add metadata header
        stats = await self.db.get_session_stats(session_id)
        header = self._build_header(topic, output_type, session, stats)
        full_output = header + "\n\n" + output

        # Save to DB
        await self.db.save_output(session_id, output_type, full_output)

        logger.info("Output generated", type=output_type, length=len(full_output))
        return full_output

    def _get_rag_query(self, output_type: OutputType, topic: str) -> str:
        """Return the retrieval query for ``output_type``.

        The topic is suffixed with keywords tuned per output type; unknown
        types fall back to the bare topic.
        """
        queries = {
            OutputType.PODCAST: f"{topic} story narrative facts interesting",
            OutputType.BLOG: f"{topic} key facts evidence analysis",
            OutputType.REPORT: f"{topic} evidence data official findings",
            OutputType.THREAD: f"{topic} surprising facts shocking revelations",
        }
        return queries.get(output_type, topic)

    def _get_system(self, output_type: OutputType) -> str:
        """Return the system prompt for ``output_type``.

        Unknown types fall back to a generic research-assistant prompt.
        """
        systems = {
            OutputType.PODCAST: PODCAST_SYSTEM,
            OutputType.BLOG: BLOG_SYSTEM,
            OutputType.REPORT: REPORT_SYSTEM,
            OutputType.THREAD: THREAD_SYSTEM,
        }
        return systems.get(output_type, "You are a helpful research assistant.")

    def _build_header(self, topic: str, output_type: OutputType,
                      session: dict, stats: dict) -> str:
        """Build the metadata header placed above the generated text.

        Args:
            topic: Research topic shown in the header.
            output_type: Output type; its ``upper()`` form is shown.
            session: Session mapping; ``iterations`` and ``total_words``
                are read from it.
            stats: Stats mapping; ``scraped`` and ``failed`` are read.

        Returns:
            A block delimited by ``---`` lines and ending with a newline.
        """
        from datetime import datetime, timezone

        # datetime.utcnow() is deprecated; an aware UTC "now" formats to the
        # same "YYYY-MM-DD HH:MM UTC" text.
        generated = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")

        # `or 0` also covers keys that are present but hold None; a None
        # would otherwise make the ":," format below raise after the
        # expensive generation step has already completed.
        scraped = stats.get("scraped") or 0
        failed = stats.get("failed") or 0
        iterations = session.get("iterations") or 0
        total_words = session.get("total_words") or 0

        # NOTE(review): header lines are emitted flush-left; confirm this
        # matches the format of previously stored outputs.
        header_lines = [
            "---",
            f"ResearchOwl | {output_type.upper()} OUTPUT",
            f"Topic: {topic}",
            f"Generated: {generated}",
            f"Sources: {scraped} scraped | {failed} failed",
            f"Iterations: {iterations}",
            f"Total words researched: {total_words:,}",
            "---",
            "",  # keeps the trailing newline after the closing delimiter
        ]
        return "\n".join(header_lines)
|
||||
Reference in New Issue
Block a user