From c4fb33fbf5bba52e31721fc1f9d45b325725aa32 Mon Sep 17 00:00:00 2001 From: ChemaVX Date: Tue, 28 Apr 2026 10:15:30 +0000 Subject: [PATCH] fix: WAL mode for concurrent reads, skipped stats, anti-repetition prompts database.py: enable PRAGMA journal_mode=WAL + synchronous=NORMAL so /status reads from concurrent connections see committed data without blocking behind the scraper's writes; add 'skipped' to get_session_stats bot.py: show skipped count in fmt_progress and cmd_status; use 'or 0' to guard against NULL from SUM(); label active research in /status processor.py: raise generate() temperature default to 0.7 + add repeat_penalty=1.15/repeat_last_n=128 to Ollama options to stop qwen2.5:3b from looping; scoring prompt keeps temperature=0.1 generator.py: rewrite all prompts with explicit "NEVER repeat" constraints and distinct-content rules per section; podcast prompt now asks for spoken-word style (no formal headers); reduce thread to 12-18 tweets (was 15-25) to fit model context; pass temperature=0.7 Co-Authored-By: Claude Sonnet 4.6 --- src/bot/bot.py | 22 +++--- src/db/database.py | 11 +-- src/generator/generator.py | 144 ++++++++++++++++++++++--------------- src/processor/processor.py | 11 ++- 4 files changed, 115 insertions(+), 73 deletions(-) diff --git a/src/bot/bot.py b/src/bot/bot.py index 79f4a2f..e8082c0 100644 --- a/src/bot/bot.py +++ b/src/bot/bot.py @@ -34,14 +34,15 @@ def is_authorized(user_id: int) -> bool: def fmt_progress(iteration: int, total: int, new: int, stats: dict) -> str: - scraped = stats.get("scraped", 0) - failed = stats.get("failed", 0) - pending = stats.get("pending", 0) + scraped = stats.get("scraped") or 0 + failed = stats.get("failed") or 0 + pending = stats.get("pending") or 0 + skipped = stats.get("skipped") or 0 return ( f"🔄 *Iteration {iteration}*\n" f"📚 Sources found: `{total}`\n" - f"✅ Scraped: `{scraped}` | ❌ Failed: `{failed}` | ⏳ Pending: `{pending}`\n" - f"🆕 New this round: `{new}`" + f"✅ Scraped: `{scraped}` | ⏭️ Skipped: `{skipped}` | ❌ Failed: `{failed}` | ⏳ Pending: `{pending}`\n" + f"🆕 New URLs this round: `{new}`" ) @@ -213,13 +214,14 @@ async def cmd_status(update: Update, ctx: ContextTypes.DEFAULT_TYPE): f"📝 Topic: `{session['topic']}`\n" f"🔁 Status: `{session['status']}`\n" f"🔢 Iterations: `{session.get('iterations', 0)}`\n" - f"📚 Total sources: `{stats.get('total', 0)}`\n" - f"✅ Scraped: `{stats.get('scraped', 0)}`\n" - f"❌ Failed: `{stats.get('failed', 0)}`\n" - f"⏳ Pending: `{stats.get('pending', 0)}`\n" + f"📚 Total sources: `{stats.get('total') or 0}`\n" + f"✅ Scraped: `{stats.get('scraped') or 0}`\n" + f"⏭️ Skipped: `{stats.get('skipped') or 0}`\n" + f"❌ Failed: `{stats.get('failed') or 0}`\n" + f"⏳ Pending: `{stats.get('pending') or 0}`\n" f"💬 Chunks: `{session.get('total_chunks', 0)}`\n" f"📖 Words: `{session.get('total_words', 0):,}`\n" - f"{'🟢 Active' if is_active else '⚫ Idle'}", + f"{'🟢 Active — stats update each iteration' if is_active else '⚫ Idle'}", parse_mode=ParseMode.MARKDOWN ) finally: diff --git a/src/db/database.py b/src/db/database.py index cf099fa..eb0352c 100644 --- a/src/db/database.py +++ b/src/db/database.py @@ -91,6 +91,8 @@ async def get_db() -> aiosqlite.Connection: Path(settings.db_path).parent.mkdir(parents=True, exist_ok=True) db = await aiosqlite.connect(settings.db_path) db.row_factory = aiosqlite.Row + await db.execute("PRAGMA journal_mode=WAL") + await db.execute("PRAGMA synchronous=NORMAL") await db.executescript(SCHEMA) await db.commit() return db @@ -140,11 +142,12 @@ class ResearchDB: async def get_session_stats(self, session_id: int) -> dict: cursor = await self.db.execute( - """SELECT + """SELECT COUNT(*) as total, - SUM(CASE WHEN status='scraped' THEN 1 ELSE 0 END) as scraped, - SUM(CASE WHEN status='failed' THEN 1 ELSE 0 END) as failed, - SUM(CASE WHEN status='pending' THEN 1 ELSE 0 END) as pending + SUM(CASE WHEN status='scraped' THEN 1 ELSE 0 END) as scraped, + SUM(CASE WHEN status='failed' THEN 1 ELSE 0 END) as failed, + SUM(CASE WHEN status='pending' THEN 1 ELSE 0 END) as pending, + SUM(CASE WHEN status='skipped' THEN 1 ELSE 0 END) as skipped FROM sources WHERE session_id = ?""", (session_id,) ) diff --git a/src/generator/generator.py b/src/generator/generator.py index bd11330..2360da7 100644 --- a/src/generator/generator.py +++ b/src/generator/generator.py @@ -9,92 +9,124 @@ from src.db.database import ResearchDB, OutputType logger = structlog.get_logger() -PODCAST_SYSTEM = """You are an expert podcast scriptwriter. Create engaging, well-structured scripts -that feel natural when spoken aloud. Use conversational language, rhetorical questions, -clear transitions, and compelling storytelling. Include [PAUSE], [EMPHASIS], and [MUSIC CUE] markers.""" +PODCAST_SYSTEM = ( + "You are a podcast scriptwriter. Write exactly as a host SPEAKS — contractions, " + "incomplete sentences, natural pauses, rhetorical questions. " + "NEVER repeat a sentence, phrase, or idea you already wrote. " + "Each paragraph must introduce NEW information. " + "Use [PAUSE], [EMPHASIS], [MUSIC CUE] markers sparingly." +) -BLOG_SYSTEM = """You are an expert blog writer and journalist. Create SEO-optimized, -well-structured articles with clear headings, engaging prose, and proper citations. -Use markdown formatting. Write for an educated general audience.""" +BLOG_SYSTEM = ( + "You are a journalist writing a blog post. Use clear markdown headings. " + "NEVER repeat the same fact or phrase twice — if you said something, move on. " + "Each section must add new information not covered in previous sections." +) -REPORT_SYSTEM = """You are an expert research analyst. Create comprehensive, objective reports -with executive summary, detailed findings, source analysis, contradictions found, -and conclusions. Use structured markdown with tables where appropriate.""" +REPORT_SYSTEM = ( + "You are a research analyst. Write a structured factual report. " + "Be concise — do NOT pad with redundant summaries. " + "NEVER restate a finding already listed. Each numbered finding must be distinct." +) -THREAD_SYSTEM = """You are a social media expert. Create engaging Twitter/X thread content. -Each tweet must be under 280 characters. Use numbers (1/N, 2/N...), hooks, cliffhangers. -Make it shareable and engaging. Include relevant hashtags at the end.""" +THREAD_SYSTEM = ( + "You write Twitter/X threads. Each tweet must be under 280 chars. " + "NEVER repeat information from a previous tweet. " + "Each tweet must reveal something NEW. Number them 1/N, 2/N..." +) PROMPTS = { - OutputType.PODCAST: """Based on the research below about "{topic}", write a complete podcast script. + OutputType.PODCAST: """\ +Write a podcast script about: "{topic}" -Structure: -- INTRO (hook + topic intro, 2-3 min) -- SEGMENT 1: Background & Context -- SEGMENT 2: Key Facts & Evidence -- SEGMENT 3: Controversies & Different Perspectives -- SEGMENT 4: Deep Dive (most interesting finding) -- OUTRO + Call to Action +RULES — follow strictly: +- Write as SPOKEN WORD: contractions, natural rhythm, as if talking to a friend +- DO NOT use formal headings like "SEGMENT 1:" — just flow naturally +- Each paragraph must introduce a NEW fact or angle — never restate something already said +- If you find yourself repeating, stop and jump to the next new point +- Aim for 800-1200 words of actual spoken content -Make it 20-30 minutes of content. Include host notes in [brackets]. +STRUCTURE (use natural transitions, not headers): +1. Hook: open with the most surprising or dramatic fact +2. Background: how did we get here? +3. The key evidence or events (pick the 3 most interesting) +4. Controversy or debate around the topic +5. What does this mean / what happened next RESEARCH MATERIAL: {context} -Write the complete script now:""", +Write the script now (spoken word only, no stage directions except occasional [PAUSE]):""", - OutputType.BLOG: """Based on the research below about "{topic}", write a comprehensive blog post. + OutputType.BLOG: """\ +Write a blog post about: "{topic}" -Requirements: -- Compelling headline and meta description -- Engaging intro with hook -- Well-structured sections with H2/H3 headers -- Key facts highlighted -- Multiple perspectives presented -- Strong conclusion with takeaways -- Word count: 1500-2500 words -- Tone: Informative but engaging +RULES — follow strictly: +- Each section under a heading must add NEW information not covered elsewhere +- Do NOT summarize previous sections at the start of each new section +- Do NOT repeat facts — if a fact appears once, do not mention it again +- Use concrete details, numbers, names — avoid vague generalities +- Target 1000-1500 words + +STRUCTURE: +# [Compelling headline] + +[Hook paragraph — the most surprising fact] + +## Background +[Context — what, when, who — only facts not covered elsewhere] + +## Key Facts +[The most significant findings — each bullet must be distinct] + +## Analysis / Significance +[What this means — no repetition of Key Facts section] + +## Conclusion +[Takeaway — no more than 2 sentences summarizing, then a forward-looking statement] RESEARCH MATERIAL: {context} Write the complete blog post in markdown:""", - OutputType.REPORT: """Based on the research below about "{topic}", write a comprehensive research report. + OutputType.REPORT: """\ +Write a research report about: "{topic}" -Structure: -1. Executive Summary (200 words) -2. Introduction & Scope -3. Key Findings (numbered) -4. Evidence Analysis -5. Source Quality Assessment -6. Contradictions & Disputed Claims -7. Timeline of Events (if applicable) -8. Conclusions -9. Further Research Suggestions +RULES — follow strictly: +- Each numbered finding must be DISTINCT — no overlapping content +- The Executive Summary must NOT repeat findings verbatim — only the 2-3 most critical points +- Source quality and contradictions must reference specific claims, not generic statements +- Be precise and concise — no filler + +STRUCTURE: +1. Executive Summary (3-4 sentences, key takeaways only) +2. Key Findings (5-10 numbered, each completely distinct) +3. Evidence Analysis (what the sources show, with any contradictions) +4. Timeline (if applicable — specific dates/events) +5. Conclusions & Open Questions RESEARCH MATERIAL: {context} Write the complete report in markdown:""", - OutputType.THREAD: """Based on the research below about "{topic}", write an engaging Twitter/X thread. + OutputType.THREAD: """\ +Write a Twitter/X thread about: "{topic}" -Requirements: -- Start with a KILLER hook tweet -- 15-25 tweets total -- Each tweet max 280 chars -- Number them (1/20, 2/20...) -- Include surprising facts -- Build suspense between tweets -- End with strong conclusion + CTA -- Add relevant hashtags to last tweet +RULES — follow strictly: +- Each tweet must reveal ONE new fact or idea — never restate a previous tweet +- Max 280 characters per tweet (count carefully) +- Number format: 1/ 2/ 3/ ... N/ +- Hook tweet must be the most surprising/provocative fact +- Build toward a conclusion — do not repeat the hook at the end +- 12-18 tweets total RESEARCH MATERIAL: {context} -Write the complete thread, one tweet per line:""" +Write the thread (one tweet per line, nothing else):""" } @@ -141,8 +173,8 @@ class OutputGenerator: system = self._get_system(output_type) prompt = PROMPTS[output_type].format(topic=topic, context=context) - # Generate — may take a while with local LLM - output = await self.ollama.generate(prompt, system=system, timeout=300) + # Generate — temperature=0.7 reduces repetition in small models + output = await self.ollama.generate(prompt, system=system, timeout=300, temperature=0.7) # Add metadata header stats = await self.db.get_session_stats(session_id) diff --git a/src/processor/processor.py b/src/processor/processor.py index 7fd02a1..041449f 100644 --- a/src/processor/processor.py +++ b/src/processor/processor.py @@ -25,12 +25,17 @@ class OllamaClient: self.model = settings.ollama_model async def generate(self, prompt: str, system: str = None, - timeout: int = 120) -> str: + timeout: int = 120, temperature: float = 0.7) -> str: payload = { "model": self.model, "prompt": prompt, "stream": False, - "options": {"temperature": 0.1, "num_predict": 512} + "options": { + "temperature": temperature, + "num_predict": 2048, + "repeat_penalty": 1.15, + "repeat_last_n": 128, + } } if system: payload["system"] = system @@ -219,7 +224,7 @@ class ContentProcessor: f"Reply with ONLY a single integer 0-10. No explanation." ) try: - response = await self.ollama.generate(prompt) + response = await self.ollama.generate(prompt, temperature=0.1) numbers = re.findall(r'\b(\d+(?:\.\d+)?)\b', response) if numbers: score = float(numbers[0])