From c4fb33fbf5bba52e31721fc1f9d45b325725aa32 Mon Sep 17 00:00:00 2001
From: ChemaVX <jmivanez@gmail.com>
Date: Tue, 28 Apr 2026 10:15:30 +0000
Subject: [PATCH] fix: WAL mode for concurrent reads, skipped stats,
 anti-repetition prompts

database.py: enable PRAGMA journal_mode=WAL + synchronous=NORMAL so
  /status reads from concurrent connections see committed data without
  blocking behind the scraper's writes; add 'skipped' to get_session_stats

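bot.py: show skipped count in fmt_progress and cmd_status; use 'or 0'
  to guard against NULL from SUM() (returned when a session has no
  sources yet); reword "New this round" to "New URLs this round"; note
  in the /status Active label that stats update each iteration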

processor.py: add a temperature parameter to generate() (default 0.7;
  was hard-coded 0.1), raise num_predict from 512 to 2048, and add
  repeat_penalty=1.15/repeat_last_n=128 to Ollama options to stop
  qwen2.5:3b from looping; scoring prompt keeps temperature=0.1

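generator.py: rewrite all prompts with explicit "NEVER repeat"
  constraints and distinct-content rules per section; podcast prompt
  now asks for spoken-word style (no formal headers); reduce thread
  to 12-18 tweets (was 15-25) to fit model context; also shorten the
  podcast (800-1200 words, was 20-30 min), blog (1000-1500 words, was
  1500-2500) and report (5 sections, was 9) targets; pass
  temperature=0.7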

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/bot/bot.py             |  22 +++---
 src/db/database.py         |  11 +--
 src/generator/generator.py | 144 ++++++++++++++++++++++---------------
 src/processor/processor.py |  11 ++-
 4 files changed, 115 insertions(+), 73 deletions(-)

diff --git a/src/bot/bot.py b/src/bot/bot.py
index 79f4a2f..e8082c0 100644
--- a/src/bot/bot.py
+++ b/src/bot/bot.py
@@ -34,14 +34,15 @@ def is_authorized(user_id: int) -> bool:
 
 
 def fmt_progress(iteration: int, total: int, new: int, stats: dict) -> str:
-    scraped = stats.get("scraped", 0)
-    failed = stats.get("failed", 0)
-    pending = stats.get("pending", 0)
+    scraped = stats.get("scraped") or 0
+    failed  = stats.get("failed")  or 0
+    pending = stats.get("pending") or 0
+    skipped = stats.get("skipped") or 0
     return (
         f"🔄 *Iteration {iteration}*\n"
         f"📚 Sources found: `{total}`\n"
-        f"✅ Scraped: `{scraped}` | ❌ Failed: `{failed}` | ⏳ Pending: `{pending}`\n"
-        f"🆕 New this round: `{new}`"
+        f"✅ Scraped: `{scraped}` | ⏭️ Skipped: `{skipped}` | ❌ Failed: `{failed}` | ⏳ Pending: `{pending}`\n"
+        f"🆕 New URLs this round: `{new}`"
     )
 
 
@@ -213,13 +214,14 @@ async def cmd_status(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
             f"📝 Topic: `{session['topic']}`\n"
             f"🔁 Status: `{session['status']}`\n"
             f"🔢 Iterations: `{session.get('iterations', 0)}`\n"
-            f"📚 Total sources: `{stats.get('total', 0)}`\n"
-            f"✅ Scraped: `{stats.get('scraped', 0)}`\n"
-            f"❌ Failed: `{stats.get('failed', 0)}`\n"
-            f"⏳ Pending: `{stats.get('pending', 0)}`\n"
+            f"📚 Total sources: `{stats.get('total') or 0}`\n"
+            f"✅ Scraped: `{stats.get('scraped') or 0}`\n"
+            f"⏭️ Skipped: `{stats.get('skipped') or 0}`\n"
+            f"❌ Failed: `{stats.get('failed') or 0}`\n"
+            f"⏳ Pending: `{stats.get('pending') or 0}`\n"
             f"💬 Chunks: `{session.get('total_chunks', 0)}`\n"
             f"📖 Words: `{session.get('total_words', 0):,}`\n"
-            f"{'🟢 Active' if is_active else '⚫ Idle'}",
+            f"{'🟢 Active — stats update each iteration' if is_active else '⚫ Idle'}",
             parse_mode=ParseMode.MARKDOWN
         )
     finally:
diff --git a/src/db/database.py b/src/db/database.py
index cf099fa..eb0352c 100644
--- a/src/db/database.py
+++ b/src/db/database.py
@@ -91,6 +91,8 @@ async def get_db() -> aiosqlite.Connection:
     Path(settings.db_path).parent.mkdir(parents=True, exist_ok=True)
     db = await aiosqlite.connect(settings.db_path)
     db.row_factory = aiosqlite.Row
+    await db.execute("PRAGMA journal_mode=WAL")
+    await db.execute("PRAGMA synchronous=NORMAL")
     await db.executescript(SCHEMA)
     await db.commit()
     return db
@@ -140,11 +142,12 @@ class ResearchDB:
 
     async def get_session_stats(self, session_id: int) -> dict:
         cursor = await self.db.execute(
-            """SELECT 
+            """SELECT
                 COUNT(*) as total,
-                SUM(CASE WHEN status='scraped' THEN 1 ELSE 0 END) as scraped,
-                SUM(CASE WHEN status='failed' THEN 1 ELSE 0 END) as failed,
-                SUM(CASE WHEN status='pending' THEN 1 ELSE 0 END) as pending
+                SUM(CASE WHEN status='scraped'  THEN 1 ELSE 0 END) as scraped,
+                SUM(CASE WHEN status='failed'   THEN 1 ELSE 0 END) as failed,
+                SUM(CASE WHEN status='pending'  THEN 1 ELSE 0 END) as pending,
+                SUM(CASE WHEN status='skipped'  THEN 1 ELSE 0 END) as skipped
                FROM sources WHERE session_id = ?""",
             (session_id,)
         )
diff --git a/src/generator/generator.py b/src/generator/generator.py
index bd11330..2360da7 100644
--- a/src/generator/generator.py
+++ b/src/generator/generator.py
@@ -9,92 +9,124 @@ from src.db.database import ResearchDB, OutputType
 
 logger = structlog.get_logger()
 
-PODCAST_SYSTEM = """You are an expert podcast scriptwriter. Create engaging, well-structured scripts 
-that feel natural when spoken aloud. Use conversational language, rhetorical questions, 
-clear transitions, and compelling storytelling. Include [PAUSE], [EMPHASIS], and [MUSIC CUE] markers."""
+PODCAST_SYSTEM = (
+    "You are a podcast scriptwriter. Write exactly as a host SPEAKS — contractions, "
+    "incomplete sentences, natural pauses, rhetorical questions. "
+    "NEVER repeat a sentence, phrase, or idea you already wrote. "
+    "Each paragraph must introduce NEW information. "
+    "Use [PAUSE], [EMPHASIS], [MUSIC CUE] markers sparingly."
+)
 
-BLOG_SYSTEM = """You are an expert blog writer and journalist. Create SEO-optimized, 
-well-structured articles with clear headings, engaging prose, and proper citations. 
-Use markdown formatting. Write for an educated general audience."""
+BLOG_SYSTEM = (
+    "You are a journalist writing a blog post. Use clear markdown headings. "
+    "NEVER repeat the same fact or phrase twice — if you said something, move on. "
+    "Each section must add new information not covered in previous sections."
+)
 
-REPORT_SYSTEM = """You are an expert research analyst. Create comprehensive, objective reports 
-with executive summary, detailed findings, source analysis, contradictions found, 
-and conclusions. Use structured markdown with tables where appropriate."""
+REPORT_SYSTEM = (
+    "You are a research analyst. Write a structured factual report. "
+    "Be concise — do NOT pad with redundant summaries. "
+    "NEVER restate a finding already listed. Each numbered finding must be distinct."
+)
 
-THREAD_SYSTEM = """You are a social media expert. Create engaging Twitter/X thread content.
-Each tweet must be under 280 characters. Use numbers (1/N, 2/N...), hooks, cliffhangers.
-Make it shareable and engaging. Include relevant hashtags at the end."""
+THREAD_SYSTEM = (
+    "You write Twitter/X threads. Each tweet must be under 280 chars. "
+    "NEVER repeat information from a previous tweet. "
+    "Each tweet must reveal something NEW. Number them 1/N, 2/N..."
+)
 
 
 PROMPTS = {
-    OutputType.PODCAST: """Based on the research below about "{topic}", write a complete podcast script.
+    OutputType.PODCAST: """\
+Write a podcast script about: "{topic}"
 
-Structure:
-- INTRO (hook + topic intro, 2-3 min)
-- SEGMENT 1: Background & Context
-- SEGMENT 2: Key Facts & Evidence  
-- SEGMENT 3: Controversies & Different Perspectives
-- SEGMENT 4: Deep Dive (most interesting finding)
-- OUTRO + Call to Action
+RULES — follow strictly:
+- Write as SPOKEN WORD: contractions, natural rhythm, as if talking to a friend
+- DO NOT use formal headings like "SEGMENT 1:" — just flow naturally
+- Each paragraph must introduce a NEW fact or angle — never restate something already said
+- If you find yourself repeating, stop and jump to the next new point
+- Aim for 800-1200 words of actual spoken content
 
-Make it 20-30 minutes of content. Include host notes in [brackets].
+STRUCTURE (use natural transitions, not headers):
+1. Hook: open with the most surprising or dramatic fact
+2. Background: how did we get here?
+3. The key evidence or events (pick the 3 most interesting)
+4. Controversy or debate around the topic
+5. What does this mean / what happened next
 
 RESEARCH MATERIAL:
 {context}
 
-Write the complete script now:""",
+Write the script now (spoken word only, no stage directions except occasional [PAUSE]):""",
 
-    OutputType.BLOG: """Based on the research below about "{topic}", write a comprehensive blog post.
+    OutputType.BLOG: """\
+Write a blog post about: "{topic}"
 
-Requirements:
-- Compelling headline and meta description
-- Engaging intro with hook
-- Well-structured sections with H2/H3 headers
-- Key facts highlighted
-- Multiple perspectives presented
-- Strong conclusion with takeaways
-- Word count: 1500-2500 words
-- Tone: Informative but engaging
+RULES — follow strictly:
+- Each section under a heading must add NEW information not covered elsewhere
+- Do NOT summarize previous sections at the start of each new section
+- Do NOT repeat facts — if a fact appears once, do not mention it again
+- Use concrete details, numbers, names — avoid vague generalities
+- Target 1000-1500 words
+
+STRUCTURE:
+# [Compelling headline]
+
+[Hook paragraph — the most surprising fact]
+
+## Background
+[Context — what, when, who — only facts not covered elsewhere]
+
+## Key Facts
+[The most significant findings — each bullet must be distinct]
+
+## Analysis / Significance
+[What this means — no repetition of Key Facts section]
+
+## Conclusion
+[Takeaway — no more than 2 sentences summarizing, then a forward-looking statement]
 
 RESEARCH MATERIAL:
 {context}
 
 Write the complete blog post in markdown:""",
 
-    OutputType.REPORT: """Based on the research below about "{topic}", write a comprehensive research report.
+    OutputType.REPORT: """\
+Write a research report about: "{topic}"
 
-Structure:
-1. Executive Summary (200 words)
-2. Introduction & Scope
-3. Key Findings (numbered)
-4. Evidence Analysis
-5. Source Quality Assessment
-6. Contradictions & Disputed Claims
-7. Timeline of Events (if applicable)
-8. Conclusions
-9. Further Research Suggestions
+RULES — follow strictly:
+- Each numbered finding must be DISTINCT — no overlapping content
+- The Executive Summary must NOT repeat findings verbatim — only the 2-3 most critical points
+- Source quality and contradictions must reference specific claims, not generic statements
+- Be precise and concise — no filler
+
+STRUCTURE:
+1. Executive Summary (3-4 sentences, key takeaways only)
+2. Key Findings (5-10 numbered, each completely distinct)
+3. Evidence Analysis (what the sources show, with any contradictions)
+4. Timeline (if applicable — specific dates/events)
+5. Conclusions & Open Questions
 
 RESEARCH MATERIAL:
 {context}
 
 Write the complete report in markdown:""",
 
-    OutputType.THREAD: """Based on the research below about "{topic}", write an engaging Twitter/X thread.
+    OutputType.THREAD: """\
+Write a Twitter/X thread about: "{topic}"
 
-Requirements:
-- Start with a KILLER hook tweet
-- 15-25 tweets total
-- Each tweet max 280 chars
-- Number them (1/20, 2/20...)
-- Include surprising facts
-- Build suspense between tweets
-- End with strong conclusion + CTA
-- Add relevant hashtags to last tweet
+RULES — follow strictly:
+- Each tweet must reveal ONE new fact or idea — never restate a previous tweet
+- Max 280 characters per tweet (count carefully)
+- Number format: 1/ 2/ 3/ ... N/
+- Hook tweet must be the most surprising/provocative fact
+- Build toward a conclusion — do not repeat the hook at the end
+- 12-18 tweets total
 
 RESEARCH MATERIAL:
 {context}
 
-Write the complete thread, one tweet per line:"""
+Write the thread (one tweet per line, nothing else):"""
 }
 
 
@@ -141,8 +173,8 @@ class OutputGenerator:
         system = self._get_system(output_type)
         prompt = PROMPTS[output_type].format(topic=topic, context=context)
 
-        # Generate — may take a while with local LLM
-        output = await self.ollama.generate(prompt, system=system, timeout=300)
+        # Generate — temperature=0.7 reduces repetition in small models
+        output = await self.ollama.generate(prompt, system=system, timeout=300, temperature=0.7)
 
         # Add metadata header
         stats = await self.db.get_session_stats(session_id)
diff --git a/src/processor/processor.py b/src/processor/processor.py
index 7fd02a1..041449f 100644
--- a/src/processor/processor.py
+++ b/src/processor/processor.py
@@ -25,12 +25,17 @@ class OllamaClient:
         self.model = settings.ollama_model
 
     async def generate(self, prompt: str, system: str = None,
-                       timeout: int = 120) -> str:
+                       timeout: int = 120, temperature: float = 0.7) -> str:
         payload = {
             "model": self.model,
             "prompt": prompt,
             "stream": False,
-            "options": {"temperature": 0.1, "num_predict": 512}
+            "options": {
+                "temperature": temperature,
+                "num_predict": 2048,
+                "repeat_penalty": 1.15,
+                "repeat_last_n": 128,
+            }
         }
         if system:
             payload["system"] = system
@@ -219,7 +224,7 @@ class ContentProcessor:
             f"Reply with ONLY a single integer 0-10. No explanation."
         )
         try:
-            response = await self.ollama.generate(prompt)
+            response = await self.ollama.generate(prompt, temperature=0.1)
             numbers = re.findall(r'\b(\d+(?:\.\d+)?)\b', response)
             if numbers:
                 score = float(numbers[0])