fix: WAL mode for concurrent reads, skipped stats, anti-repetition prompts
Build & Deploy ResearchOwl / build-and-push (push) Successful in 5s
Build & Deploy ResearchOwl / build-and-push (push) Successful in 5s
database.py: enable PRAGMA journal_mode=WAL + synchronous=NORMAL so /status reads from concurrent connections see committed data without blocking behind the scraper's writes; add 'skipped' to get_session_stats.

bot.py: show skipped count in fmt_progress and cmd_status; use 'or 0' to guard against NULL from SUM(); label active research in /status.

processor.py: raise generate() temperature default to 0.7 + add repeat_penalty=1.15/repeat_last_n=128 to Ollama options to stop qwen2.5:3b from looping; scoring prompt keeps temperature=0.1.

generator.py: rewrite all prompts with explicit "NEVER repeat" constraints and distinct-content rules per section; podcast prompt now asks for spoken-word style (no formal headers); reduce thread to 12-18 tweets (was 15-25) to fit model context; pass temperature=0.7.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+12
-10
@@ -34,14 +34,15 @@ def is_authorized(user_id: int) -> bool:
|
||||
|
||||
|
||||
def fmt_progress(iteration: int, total: int, new: int, stats: dict) -> str:
|
||||
scraped = stats.get("scraped", 0)
|
||||
failed = stats.get("failed", 0)
|
||||
pending = stats.get("pending", 0)
|
||||
scraped = stats.get("scraped") or 0
|
||||
failed = stats.get("failed") or 0
|
||||
pending = stats.get("pending") or 0
|
||||
skipped = stats.get("skipped") or 0
|
||||
return (
|
||||
f"🔄 *Iteration {iteration}*\n"
|
||||
f"📚 Sources found: `{total}`\n"
|
||||
f"✅ Scraped: `{scraped}` | ❌ Failed: `{failed}` | ⏳ Pending: `{pending}`\n"
|
||||
f"🆕 New this round: `{new}`"
|
||||
f"✅ Scraped: `{scraped}` | ⏭️ Skipped: `{skipped}` | ❌ Failed: `{failed}` | ⏳ Pending: `{pending}`\n"
|
||||
f"🆕 New URLs this round: `{new}`"
|
||||
)
|
||||
|
||||
|
||||
@@ -213,13 +214,14 @@ async def cmd_status(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
|
||||
f"📝 Topic: `{session['topic']}`\n"
|
||||
f"🔁 Status: `{session['status']}`\n"
|
||||
f"🔢 Iterations: `{session.get('iterations', 0)}`\n"
|
||||
f"📚 Total sources: `{stats.get('total', 0)}`\n"
|
||||
f"✅ Scraped: `{stats.get('scraped', 0)}`\n"
|
||||
f"❌ Failed: `{stats.get('failed', 0)}`\n"
|
||||
f"⏳ Pending: `{stats.get('pending', 0)}`\n"
|
||||
f"📚 Total sources: `{stats.get('total') or 0}`\n"
|
||||
f"✅ Scraped: `{stats.get('scraped') or 0}`\n"
|
||||
f"⏭️ Skipped: `{stats.get('skipped') or 0}`\n"
|
||||
f"❌ Failed: `{stats.get('failed') or 0}`\n"
|
||||
f"⏳ Pending: `{stats.get('pending') or 0}`\n"
|
||||
f"💬 Chunks: `{session.get('total_chunks', 0)}`\n"
|
||||
f"📖 Words: `{session.get('total_words', 0):,}`\n"
|
||||
f"{'🟢 Active' if is_active else '⚫ Idle'}",
|
||||
f"{'🟢 Active — stats update each iteration' if is_active else '⚫ Idle'}",
|
||||
parse_mode=ParseMode.MARKDOWN
|
||||
)
|
||||
finally:
|
||||
|
||||
+4
-1
@@ -91,6 +91,8 @@ async def get_db() -> aiosqlite.Connection:
|
||||
Path(settings.db_path).parent.mkdir(parents=True, exist_ok=True)
|
||||
db = await aiosqlite.connect(settings.db_path)
|
||||
db.row_factory = aiosqlite.Row
|
||||
await db.execute("PRAGMA journal_mode=WAL")
|
||||
await db.execute("PRAGMA synchronous=NORMAL")
|
||||
await db.executescript(SCHEMA)
|
||||
await db.commit()
|
||||
return db
|
||||
@@ -144,7 +146,8 @@ class ResearchDB:
|
||||
COUNT(*) as total,
|
||||
SUM(CASE WHEN status='scraped' THEN 1 ELSE 0 END) as scraped,
|
||||
SUM(CASE WHEN status='failed' THEN 1 ELSE 0 END) as failed,
|
||||
SUM(CASE WHEN status='pending' THEN 1 ELSE 0 END) as pending
|
||||
SUM(CASE WHEN status='pending' THEN 1 ELSE 0 END) as pending,
|
||||
SUM(CASE WHEN status='skipped' THEN 1 ELSE 0 END) as skipped
|
||||
FROM sources WHERE session_id = ?""",
|
||||
(session_id,)
|
||||
)
|
||||
|
||||
+88
-56
@@ -9,92 +9,124 @@ from src.db.database import ResearchDB, OutputType
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
PODCAST_SYSTEM = """You are an expert podcast scriptwriter. Create engaging, well-structured scripts
|
||||
that feel natural when spoken aloud. Use conversational language, rhetorical questions,
|
||||
clear transitions, and compelling storytelling. Include [PAUSE], [EMPHASIS], and [MUSIC CUE] markers."""
|
||||
PODCAST_SYSTEM = (
|
||||
"You are a podcast scriptwriter. Write exactly as a host SPEAKS — contractions, "
|
||||
"incomplete sentences, natural pauses, rhetorical questions. "
|
||||
"NEVER repeat a sentence, phrase, or idea you already wrote. "
|
||||
"Each paragraph must introduce NEW information. "
|
||||
"Use [PAUSE], [EMPHASIS], [MUSIC CUE] markers sparingly."
|
||||
)
|
||||
|
||||
BLOG_SYSTEM = """You are an expert blog writer and journalist. Create SEO-optimized,
|
||||
well-structured articles with clear headings, engaging prose, and proper citations.
|
||||
Use markdown formatting. Write for an educated general audience."""
|
||||
BLOG_SYSTEM = (
|
||||
"You are a journalist writing a blog post. Use clear markdown headings. "
|
||||
"NEVER repeat the same fact or phrase twice — if you said something, move on. "
|
||||
"Each section must add new information not covered in previous sections."
|
||||
)
|
||||
|
||||
REPORT_SYSTEM = """You are an expert research analyst. Create comprehensive, objective reports
|
||||
with executive summary, detailed findings, source analysis, contradictions found,
|
||||
and conclusions. Use structured markdown with tables where appropriate."""
|
||||
REPORT_SYSTEM = (
|
||||
"You are a research analyst. Write a structured factual report. "
|
||||
"Be concise — do NOT pad with redundant summaries. "
|
||||
"NEVER restate a finding already listed. Each numbered finding must be distinct."
|
||||
)
|
||||
|
||||
THREAD_SYSTEM = """You are a social media expert. Create engaging Twitter/X thread content.
|
||||
Each tweet must be under 280 characters. Use numbers (1/N, 2/N...), hooks, cliffhangers.
|
||||
Make it shareable and engaging. Include relevant hashtags at the end."""
|
||||
THREAD_SYSTEM = (
|
||||
"You write Twitter/X threads. Each tweet must be under 280 chars. "
|
||||
"NEVER repeat information from a previous tweet. "
|
||||
"Each tweet must reveal something NEW. Number them 1/N, 2/N..."
|
||||
)
|
||||
|
||||
|
||||
PROMPTS = {
|
||||
OutputType.PODCAST: """Based on the research below about "{topic}", write a complete podcast script.
|
||||
OutputType.PODCAST: """\
|
||||
Write a podcast script about: "{topic}"
|
||||
|
||||
Structure:
|
||||
- INTRO (hook + topic intro, 2-3 min)
|
||||
- SEGMENT 1: Background & Context
|
||||
- SEGMENT 2: Key Facts & Evidence
|
||||
- SEGMENT 3: Controversies & Different Perspectives
|
||||
- SEGMENT 4: Deep Dive (most interesting finding)
|
||||
- OUTRO + Call to Action
|
||||
RULES — follow strictly:
|
||||
- Write as SPOKEN WORD: contractions, natural rhythm, as if talking to a friend
|
||||
- DO NOT use formal headings like "SEGMENT 1:" — just flow naturally
|
||||
- Each paragraph must introduce a NEW fact or angle — never restate something already said
|
||||
- If you find yourself repeating, stop and jump to the next new point
|
||||
- Aim for 800-1200 words of actual spoken content
|
||||
|
||||
Make it 20-30 minutes of content. Include host notes in [brackets].
|
||||
STRUCTURE (use natural transitions, not headers):
|
||||
1. Hook: open with the most surprising or dramatic fact
|
||||
2. Background: how did we get here?
|
||||
3. The key evidence or events (pick the 3 most interesting)
|
||||
4. Controversy or debate around the topic
|
||||
5. What does this mean / what happened next
|
||||
|
||||
RESEARCH MATERIAL:
|
||||
{context}
|
||||
|
||||
Write the complete script now:""",
|
||||
Write the script now (spoken word only, no stage directions except occasional [PAUSE]):""",
|
||||
|
||||
OutputType.BLOG: """Based on the research below about "{topic}", write a comprehensive blog post.
|
||||
OutputType.BLOG: """\
|
||||
Write a blog post about: "{topic}"
|
||||
|
||||
Requirements:
|
||||
- Compelling headline and meta description
|
||||
- Engaging intro with hook
|
||||
- Well-structured sections with H2/H3 headers
|
||||
- Key facts highlighted
|
||||
- Multiple perspectives presented
|
||||
- Strong conclusion with takeaways
|
||||
- Word count: 1500-2500 words
|
||||
- Tone: Informative but engaging
|
||||
RULES — follow strictly:
|
||||
- Each section under a heading must add NEW information not covered elsewhere
|
||||
- Do NOT summarize previous sections at the start of each new section
|
||||
- Do NOT repeat facts — if a fact appears once, do not mention it again
|
||||
- Use concrete details, numbers, names — avoid vague generalities
|
||||
- Target 1000-1500 words
|
||||
|
||||
STRUCTURE:
|
||||
# [Compelling headline]
|
||||
|
||||
[Hook paragraph — the most surprising fact]
|
||||
|
||||
## Background
|
||||
[Context — what, when, who — only facts not covered elsewhere]
|
||||
|
||||
## Key Facts
|
||||
[The most significant findings — each bullet must be distinct]
|
||||
|
||||
## Analysis / Significance
|
||||
[What this means — no repetition of Key Facts section]
|
||||
|
||||
## Conclusion
|
||||
[Takeaway — no more than 2 sentences summarizing, then a forward-looking statement]
|
||||
|
||||
RESEARCH MATERIAL:
|
||||
{context}
|
||||
|
||||
Write the complete blog post in markdown:""",
|
||||
|
||||
OutputType.REPORT: """Based on the research below about "{topic}", write a comprehensive research report.
|
||||
OutputType.REPORT: """\
|
||||
Write a research report about: "{topic}"
|
||||
|
||||
Structure:
|
||||
1. Executive Summary (200 words)
|
||||
2. Introduction & Scope
|
||||
3. Key Findings (numbered)
|
||||
4. Evidence Analysis
|
||||
5. Source Quality Assessment
|
||||
6. Contradictions & Disputed Claims
|
||||
7. Timeline of Events (if applicable)
|
||||
8. Conclusions
|
||||
9. Further Research Suggestions
|
||||
RULES — follow strictly:
|
||||
- Each numbered finding must be DISTINCT — no overlapping content
|
||||
- The Executive Summary must NOT repeat findings verbatim — only the 2-3 most critical points
|
||||
- Source quality and contradictions must reference specific claims, not generic statements
|
||||
- Be precise and concise — no filler
|
||||
|
||||
STRUCTURE:
|
||||
1. Executive Summary (3-4 sentences, key takeaways only)
|
||||
2. Key Findings (5-10 numbered, each completely distinct)
|
||||
3. Evidence Analysis (what the sources show, with any contradictions)
|
||||
4. Timeline (if applicable — specific dates/events)
|
||||
5. Conclusions & Open Questions
|
||||
|
||||
RESEARCH MATERIAL:
|
||||
{context}
|
||||
|
||||
Write the complete report in markdown:""",
|
||||
|
||||
OutputType.THREAD: """Based on the research below about "{topic}", write an engaging Twitter/X thread.
|
||||
OutputType.THREAD: """\
|
||||
Write a Twitter/X thread about: "{topic}"
|
||||
|
||||
Requirements:
|
||||
- Start with a KILLER hook tweet
|
||||
- 15-25 tweets total
|
||||
- Each tweet max 280 chars
|
||||
- Number them (1/20, 2/20...)
|
||||
- Include surprising facts
|
||||
- Build suspense between tweets
|
||||
- End with strong conclusion + CTA
|
||||
- Add relevant hashtags to last tweet
|
||||
RULES — follow strictly:
|
||||
- Each tweet must reveal ONE new fact or idea — never restate a previous tweet
|
||||
- Max 280 characters per tweet (count carefully)
|
||||
- Number format: 1/ 2/ 3/ ... N/
|
||||
- Hook tweet must be the most surprising/provocative fact
|
||||
- Build toward a conclusion — do not repeat the hook at the end
|
||||
- 12-18 tweets total
|
||||
|
||||
RESEARCH MATERIAL:
|
||||
{context}
|
||||
|
||||
Write the complete thread, one tweet per line:"""
|
||||
Write the thread (one tweet per line, nothing else):"""
|
||||
}
|
||||
|
||||
|
||||
@@ -141,8 +173,8 @@ class OutputGenerator:
|
||||
system = self._get_system(output_type)
|
||||
prompt = PROMPTS[output_type].format(topic=topic, context=context)
|
||||
|
||||
# Generate — may take a while with local LLM
|
||||
output = await self.ollama.generate(prompt, system=system, timeout=300)
|
||||
# Generate — temperature=0.7 reduces repetition in small models
|
||||
output = await self.ollama.generate(prompt, system=system, timeout=300, temperature=0.7)
|
||||
|
||||
# Add metadata header
|
||||
stats = await self.db.get_session_stats(session_id)
|
||||
|
||||
@@ -25,12 +25,17 @@ class OllamaClient:
|
||||
self.model = settings.ollama_model
|
||||
|
||||
async def generate(self, prompt: str, system: str = None,
|
||||
timeout: int = 120) -> str:
|
||||
timeout: int = 120, temperature: float = 0.7) -> str:
|
||||
payload = {
|
||||
"model": self.model,
|
||||
"prompt": prompt,
|
||||
"stream": False,
|
||||
"options": {"temperature": 0.1, "num_predict": 512}
|
||||
"options": {
|
||||
"temperature": temperature,
|
||||
"num_predict": 2048,
|
||||
"repeat_penalty": 1.15,
|
||||
"repeat_last_n": 128,
|
||||
}
|
||||
}
|
||||
if system:
|
||||
payload["system"] = system
|
||||
@@ -219,7 +224,7 @@ class ContentProcessor:
|
||||
f"Reply with ONLY a single integer 0-10. No explanation."
|
||||
)
|
||||
try:
|
||||
response = await self.ollama.generate(prompt)
|
||||
response = await self.ollama.generate(prompt, temperature=0.1)
|
||||
numbers = re.findall(r'\b(\d+(?:\.\d+)?)\b', response)
|
||||
if numbers:
|
||||
score = float(numbers[0])
|
||||
|
||||
Reference in New Issue
Block a user