# researchowl/src/generator/generator.py

"""
ResearchOwl Generators
Produces structured outputs from processed research using Claude or Ollama
"""
import base64
import hashlib
import hmac
import json
import re
import time

import structlog

from src.config import settings
from src.processor.processor import OllamaClient, ContentProcessor
from src.db.database import ResearchDB, OutputType

logger = structlog.get_logger()

# System prompt for podcast scripts (Spanish): spoken-style writing, no
# repeated ideas, sparing use of the [PAUSA] / [ÉNFASIS] / [MÚSICA] markers.
PODCAST_SYSTEM = (
    "Escribe SIEMPRE en español. "
    "Eres un guionista de podcast. Escribe exactamente como un presentador HABLA — contracciones, "
    "frases naturales, pausas, preguntas retóricas. "
    "NUNCA repitas una frase o idea que ya escribiste. "
    "Cada párrafo debe introducir información NUEVA. "
    "Usa marcadores [PAUSA], [ÉNFASIS], [MÚSICA] con moderación."
)

# System prompt for blog articles (Spanish): markdown headings, no repeated facts.
BLOG_SYSTEM = (
    "Escribe SIEMPRE en español. "
    "Eres un periodista escribiendo un artículo de blog. Usa encabezados markdown claros. "
    "NUNCA repitas el mismo dato o frase dos veces — si ya lo dijiste, avanza. "
    "Cada sección debe añadir información nueva no cubierta en secciones anteriores."
)

# English counterpart of BLOG_SYSTEM; OutputGenerator switches to it when a
# blog output is requested with lang="en".
BLOG_SYSTEM_EN = (
    "You write ALWAYS in English. "
    "You are a journalist writing a blog article. Use clear markdown headers. "
    "NEVER repeat the same fact or phrase twice — if you said it, move on. "
    "Each section must add new information not covered in other sections."
)

# English user-prompt template for single-pass blog articles.
# Filled with str.format(); placeholders: {topic}, {context}.
BLOG_PROMPT_EN = """\
Write a blog article about: "{topic}"

RULES — follow strictly:
- Each section under a heading must add NEW information not covered elsewhere
- Do NOT summarize previous sections at the start of each new section
- Do NOT repeat facts — if a fact appears once, do not mention it again
- Use concrete details, numbers, names — avoid vague generalities
- Target: 1000-1500 words

STRUCTURE:
# [Impactful headline]

[Hook paragraph — the most surprising fact]

## Background
[Context — what, when, who — only facts not covered elsewhere]

## Key Facts
[Most significant findings — each point must be distinct]

## Analysis / Significance
[What this means — without repeating the Key Facts section]

## Conclusion
[No more than 2 sentences summarizing, then a forward-looking statement]

RESEARCH MATERIAL:
{context}

Write the complete article in markdown:"""

# System prompt for research reports (Spanish): structured, factual, concise,
# with every numbered finding distinct.
REPORT_SYSTEM = (
    "Escribe SIEMPRE en español. "
    "Eres un analista de investigación. Escribe un informe estructurado y factual. "
    "Sé conciso — NO rellenes con resúmenes redundantes. "
    "NUNCA repitas un hallazgo ya listado. Cada hallazgo numerado debe ser distinto."
)

# System prompt for Twitter/X threads (Spanish): tweets under 280 characters,
# numbered, each one revealing something new.
THREAD_SYSTEM = (
    "Escribe SIEMPRE en español. "
    "Escribes hilos de Twitter/X. Cada tweet debe tener menos de 280 caracteres. "
    "NUNCA repitas información de un tweet anterior. "
    "Cada tweet debe revelar algo NUEVO. Numéralos 1/N, 2/N..."
)


# Spanish user-prompt templates for single-pass generation, keyed by output
# type. Each template is filled with str.format(); placeholders: {topic} and
# {context}. Only PODCAST, BLOG, REPORT and THREAD have an entry here.
PROMPTS = {
    OutputType.PODCAST: """\
Escribe un guion de podcast sobre: "{topic}"

REGLAS — sigue estrictamente:
- Escribe como PALABRA HABLADA: contracciones, ritmo natural, como si hablaras con un amigo
- NO uses encabezados formales como "SEGMENTO 1:" — fluye de forma natural
- Cada párrafo debe introducir un NUEVO hecho o ángulo — nunca repitas algo ya dicho
- Si te encuentras repitiendo, para y salta al siguiente punto nuevo
- Objetivo: 800-1200 palabras de contenido hablado real

ESTRUCTURA (usa transiciones naturales, no encabezados):
1. Gancho: abre con el hecho más sorprendente o dramático
2. Contexto: ¿cómo llegamos aquí?
3. Las evidencias o eventos clave (elige los 3 más interesantes)
4. La controversia o debate sobre el tema
5. ¿Qué significa esto / qué pasó después?

MATERIAL DE INVESTIGACIÓN:
{context}

Escribe el guion ahora (solo palabra hablada, sin acotaciones excepto [PAUSA] ocasional):""",

    OutputType.BLOG: """\
Escribe un artículo de blog sobre: "{topic}"

REGLAS — sigue estrictamente:
- Cada sección bajo un encabezado debe añadir información NUEVA no cubierta en otro lugar
- NO resumas secciones anteriores al inicio de cada nueva sección
- NO repitas hechos — si un hecho aparece una vez, no lo menciones de nuevo
- Usa detalles concretos, números, nombres — evita generalidades vagas
- Objetivo: 1000-1500 palabras

ESTRUCTURA:
# [Titular impactante]

[Párrafo gancho — el hecho más sorprendente]

## Contexto
[Contexto — qué, cuándo, quién — solo hechos no cubiertos en otro lugar]

## Hechos Clave
[Los hallazgos más significativos — cada punto debe ser distinto]

## Análisis / Importancia
[Qué significa esto — sin repetir la sección de Hechos Clave]

## Conclusión
[Conclusión — no más de 2 oraciones resumiendo, luego una declaración prospectiva]

MATERIAL DE INVESTIGACIÓN:
{context}

Escribe el artículo completo en markdown:""",

    OutputType.REPORT: """\
Escribe un informe de investigación sobre: "{topic}"

REGLAS — sigue estrictamente:
- Cada hallazgo numerado debe ser DISTINTO — sin contenido que se superponga
- El Resumen Ejecutivo NO debe repetir los hallazgos literalmente — solo los 2-3 puntos más críticos
- La calidad de las fuentes y contradicciones deben referenciar afirmaciones específicas, no declaraciones genéricas
- Sé preciso y conciso — sin relleno

ESTRUCTURA:
1. Resumen Ejecutivo (3-4 oraciones, solo puntos clave)
2. Hallazgos Clave (5-10 numerados, cada uno completamente distinto)
3. Análisis de Evidencia (lo que muestran las fuentes, con cualquier contradicción)
4. Cronología (si aplica — fechas/eventos específicos)
5. Conclusiones y Preguntas Abiertas

MATERIAL DE INVESTIGACIÓN:
{context}

Escribe el informe completo en markdown:""",

    OutputType.THREAD: """\
Escribe un hilo de Twitter/X sobre: "{topic}"

REGLAS — sigue estrictamente:
- Cada tweet debe revelar UN nuevo hecho o idea — nunca repetir un tweet anterior
- Máximo 280 caracteres por tweet (cuenta cuidadosamente)
- Formato de numeración: 1/ 2/ 3/ ... N/
- El tweet gancho debe ser el hecho más sorprendente/provocador
- Avanza hacia una conclusión — no repitas el gancho al final
- 12-18 tweets en total

MATERIAL DE INVESTIGACIÓN:
{context}

Escribe el hilo (un tweet por línea, nada más):"""
}


# Outline prompt for extended reports. Filled with str.format(); placeholders:
# {topic}, {context_summary}. The doubled braces are literal JSON braces that
# must survive formatting. The model is asked for a JSON list of
# {"title", "query", "words"} objects.
OUTLINE_REPORT = """
IMPORTANTE: Los títulos y queries deben estar en español, independientemente del idioma del material de investigación.

Eres un editor de investigación. Dado el tema "{topic}" y el material
disponible, genera un outline detallado para un informe exhaustivo.

Devuelve SOLO una lista JSON de secciones, sin texto adicional, sin
markdown, sin explicaciones. Formato exacto:
[
  {{"title": "Título de la sección", "query": "términos de búsqueda específicos para esta sección", "words": 800}},
  ...
]

Genera entre 6 y 10 secciones. Cada sección debe:
- Cubrir un ángulo distinto del tema
- Tener una query específica para recuperar chunks relevantes
- Indicar longitud objetivo en palabras (400-1200)

Material disponible (resumen):
{context_summary}
"""

# Outline prompt for extended blog articles; same placeholders and JSON shape
# as OUTLINE_REPORT.
OUTLINE_BLOG = """
IMPORTANTE: Los títulos y queries deben estar en español, independientemente del idioma del material de investigación.

Eres un editor de contenido. Dado el tema "{topic}" y el material
disponible, genera un outline para un artículo de blog exhaustivo.

Devuelve SOLO una lista JSON de secciones, sin texto adicional:
[
  {{"title": "Título de sección", "query": "términos búsqueda", "words": 600}},
  ...
]

Genera entre 5 y 8 secciones. Primera sección = introducción gancho.
Última sección = conclusión con perspectiva original.

Material disponible (resumen):
{context_summary}
"""

# Outline prompt for extended podcast scripts; same placeholders and JSON
# shape as OUTLINE_REPORT.
OUTLINE_PODCAST = """
IMPORTANTE: Los títulos y queries deben estar en español, independientemente del idioma del material de investigación.

Eres un productor de podcast. Dado el tema "{topic}" y el material
disponible, genera un outline para un guion de podcast exhaustivo.

Devuelve SOLO una lista JSON de segmentos, sin texto adicional:
[
  {{"title": "Nombre del segmento", "query": "términos búsqueda", "words": 700}},
  ...
]

Genera entre 5 y 7 segmentos. Flujo natural: gancho → contexto →
desarrollo → controversia → conclusión.

Material disponible (resumen):
{context_summary}
"""


# ─── Ghost CMS ────────────────────────────────────────────────────────────────

def _b64url(data: bytes | str) -> str:
    if isinstance(data, str):
        data = data.encode()
    return base64.urlsafe_b64encode(data).rstrip(b"=").decode()


def _extract_title(content: str) -> str:
    """Return first H1 heading from markdown, skipping the ResearchOwl header block."""
    in_header = False
    for line in content.splitlines():
        stripped = line.strip()
        if stripped == "---":
            in_header = not in_header
            continue
        if in_header:
            continue
        if stripped.startswith("# ") and not stripped.startswith("## "):
            return stripped[2:].strip()
    return ""


def _strip_researchowl_header(content: str) -> str:
    """Remove the ---...--- metadata block that ResearchOwl prepends to outputs."""
    lines = content.splitlines(keepends=True)
    dashes_seen = 0
    for i, line in enumerate(lines):
        if line.strip() == "---":
            dashes_seen += 1
            if dashes_seen == 2:
                return "".join(lines[i + 1:]).lstrip("\n")
    return content


class GhostPublisher:
    def __init__(self, lang: str = "es"):
        if lang == "en":
            self.url = (settings.ghost_url_en or "").rstrip("/")
            self.api_key = settings.ghost_api_key_en or ""
        else:
            self.url = (settings.ghost_url or "").rstrip("/")
            self.api_key = settings.ghost_api_key or ""

    def is_configured(self) -> bool:
        return bool(self.url and self.api_key)

    def _make_token(self) -> str:
        key_id, secret = self.api_key.split(":", 1)
        now = int(time.time())
        header = _b64url(json.dumps({"alg": "HS256", "typ": "JWT", "kid": key_id}))
        payload = _b64url(json.dumps({"iat": now, "exp": now + 300, "aud": "/admin/"}))
        signing = f"{header}.{payload}"
        sig = _b64url(
            hmac.new(bytes.fromhex(secret), signing.encode(), hashlib.sha256).digest()
        )
        return f"{signing}.{sig}"

    async def publish_draft(self, title: str, markdown_content: str,
                            tags: list[str] | None = None) -> dict:
        import aiohttp as _aio
        import markdown as _md

        clean = _strip_researchowl_header(markdown_content)
        html = _md.markdown(clean, extensions=["extra"])

        # Ghost añade el título automáticamente — eliminar el primer <h1> para evitar duplicado
        html = re.sub(r"<h1[^>]*>.*?</h1>", "", html, count=1, flags=re.DOTALL).lstrip()

        logger.info("Ghost publish_draft", html_length=len(html),
                    html_preview=html[:200])

        if not html.strip():
            raise ValueError("Ghost: HTML vacío tras conversión markdown — contenido no enviado")

        # Ghost 5.x (Lexical editor): el campo "html" es solo de lectura en la API.
        # La forma fiable de enviar HTML arbitrario es via mobiledoc con HTML card,
        # que Ghost acepta en todas las versiones de v5 y renderiza sin conversión.
        mobiledoc = json.dumps({
            "version": "0.3.1",
            "atoms": [],
            "cards": [["html", {"html": html}]],
            "markups": [],
            "sections": [[10, 0]],
        })

        token = self._make_token()
        body = {
            "posts": [{
                "title": title,
                "mobiledoc": mobiledoc,
                "status": "draft",
                "tags": [{"name": t} for t in (tags or ["investigacion"])],
            }]
        }
        async with _aio.ClientSession() as sess:
            async with sess.post(
                f"{self.url}/ghost/api/admin/posts/",
                json=body,
                headers={
                    "Authorization": f"Ghost {token}",
                    "Accept-Version": "v5.0",
                },
            ) as resp:
                if resp.status not in (200, 201):
                    text = await resp.text()
                    raise ValueError(f"Ghost API {resp.status}: {text[:300]}")
                return await resp.json()


# ─── Output generation ────────────────────────────────────────────────────────

class OutputGenerator:
    """Builds podcast, blog, report and thread outputs from a session's research.

    Text is generated with Claude when ``settings.anthropic_api_key`` is set
    (falling back to Ollama if the Claude call fails), otherwise with Ollama.
    Finished outputs get a metadata header and are saved through ``db``; blog
    outputs are additionally pushed to Ghost as drafts when Ghost is configured.
    """

    def __init__(self, db: ResearchDB, ollama: OllamaClient, processor: ContentProcessor):
        self.db = db
        self.ollama = ollama
        self.processor = processor

    async def generate(self, session_id: int, output_type: OutputType,
                       progress_callback=None, lang: str = "es") -> str:
        """Generate, save and return an output for a research session.

        ``*_EXTENDED`` output types are delegated to ``generate_extended``.

        Args:
            session_id: Research session to generate from.
            output_type: Kind of output to produce.
            progress_callback: Optional async callable receiving status strings.
            lang: ``"en"`` switches blog outputs to the English prompts and
                Ghost site; any other value uses the Spanish defaults.

        Returns:
            Header plus generated text, followed by a Ghost notice when a
            draft was published.

        Raises:
            ValueError: if the session does not exist or has no processed
                content.
        """
        if output_type in (OutputType.REPORT_EXTENDED,
                           OutputType.BLOG_EXTENDED,
                           OutputType.PODCAST_EXTENDED):
            return await self.generate_extended(session_id, output_type, progress_callback,
                                                lang=lang)

        session = await self.db.get_session(session_id)
        if not session:
            raise ValueError(f"Session {session_id} not found")

        topic = session["topic"]
        logger.info("Generating output", type=output_type, topic=topic)

        if progress_callback:
            await progress_callback(f"🔍 Retrieving best research material for {output_type}...")

        # RAG: fetch the context most relevant to this output type.
        query = self._get_rag_query(output_type, topic)
        context = await self.processor.rag_query(session_id, query, top_k=80)

        if not context:
            # Fallback: use the raw top chunks.
            chunks = await self.db.get_top_chunks(session_id, limit=20)
            context = "\n\n---\n\n".join(c["content"] for c in chunks)

        if not context:
            raise ValueError("No processed content available. Run /process first.")

        backend = "Claude Haiku" if settings.anthropic_api_key else "Ollama"
        if progress_callback:
            await progress_callback(f"✍️ Generando {output_type} con {backend}... (2-5 min)")

        if lang == "en" and output_type == OutputType.BLOG:
            system = BLOG_SYSTEM_EN
            prompt = BLOG_PROMPT_EN.format(topic=topic, context=context)
        else:
            system = self._get_system(output_type)
            prompt = PROMPTS[output_type].format(topic=topic, context=context)

        output = await self._generate(prompt, system, output_type, session_id)

        stats = await self.db.get_session_stats(session_id)
        header = self._build_header(topic, output_type, session, stats)
        full_output = header + "\n\n" + output

        await self.db.save_output(session_id, output_type, full_output)

        # Extended blogs never reach this point (delegated above), so only
        # the plain BLOG type is auto-published here.
        ghost_notice = ""
        if output_type == OutputType.BLOG:
            ghost_notice = await self._publish_blog_draft(full_output, topic, lang)

        logger.info("Output generated", type=output_type, length=len(full_output))
        return full_output + ghost_notice

    async def _publish_blog_draft(self, full_output: str, topic: str, lang: str,
                                  extended: bool = False) -> str:
        """Best-effort auto-publish of a blog output as a Ghost draft.

        Returns the notice to append to the output, or ``""`` when Ghost is
        not configured or publishing failed (failures are logged, not raised).
        """
        ghost = GhostPublisher(lang=lang)
        if not ghost.is_configured():
            return ""
        try:
            title = _extract_title(full_output) or topic
            result = await ghost.publish_draft(title, full_output)
            post = result["posts"][0]
            notice = (
                f"\n\n---\n"
                f"📤 *Borrador publicado en Ghost*\n"
                f"Editar: {ghost.url}/ghost/#/editor/post/{post['id']}"
            )
            logger.info(
                "Auto-published extended blog to Ghost" if extended
                else "Auto-published blog to Ghost",
                post_id=post["id"],
            )
            return notice
        except Exception as e:
            logger.warning(
                "Auto-publish to Ghost failed (extended)" if extended
                else "Auto-publish to Ghost failed",
                error=str(e),
            )
            return ""

    async def _generate(self, prompt: str, system: str, output_type: OutputType,
                         session_id: int | None = None) -> str:
        """Route to Claude when an Anthropic API key is configured, else Ollama."""
        if not settings.anthropic_api_key:
            return await self._generate_with_ollama(prompt, system)
        return await self._generate_with_claude(prompt, system, output_type, session_id)

    async def _generate_with_claude(self, prompt: str, system: str, output_type: OutputType,
                                     session_id: int | None = None) -> str:
        """Generate with Claude, logging token usage; fall back to Ollama on any error."""
        import anthropic
        max_tokens = 4096 if output_type == OutputType.THREAD else 16000
        try:
            client = anthropic.AsyncAnthropic(api_key=settings.anthropic_api_key)
            msg = await client.messages.create(
                model=settings.claude_model,
                max_tokens=max_tokens,
                system=system,
                messages=[{"role": "user", "content": prompt}],
            )
            if session_id is not None:
                # Usage logging must never cost us the generated text.
                try:
                    await self.db.log_api_call(
                        session_id, "generation", settings.claude_model,
                        msg.usage.input_tokens, msg.usage.output_tokens
                    )
                except Exception as log_err:
                    logger.warning("Failed to log API usage", error=str(log_err))
            return msg.content[0].text.strip()
        except Exception as e:
            logger.warning("Claude generation failed, falling back to Ollama", error=str(e))
            return await self._generate_with_ollama(prompt, system)

    async def _generate_with_ollama(self, prompt: str, system: str) -> str:
        """Generate with the local Ollama client."""
        return await self.ollama.generate(prompt, system=system, timeout=300, temperature=0.7)

    def _get_rag_query(self, output_type: OutputType, topic: str) -> str:
        """Return the retrieval query for *output_type*; the bare topic if unmapped."""
        queries = {
            OutputType.PODCAST: f"{topic} story narrative facts interesting",
            OutputType.BLOG: f"{topic} key facts evidence analysis",
            OutputType.REPORT: f"{topic} evidence data official findings",
            OutputType.THREAD: f"{topic} surprising facts shocking revelations",
        }
        return queries.get(output_type, topic)

    def _get_system(self, output_type: OutputType) -> str:
        """Return the system prompt for *output_type*; a generic one if unmapped."""
        systems = {
            OutputType.PODCAST: PODCAST_SYSTEM,
            OutputType.BLOG: BLOG_SYSTEM,
            OutputType.REPORT: REPORT_SYSTEM,
            OutputType.THREAD: THREAD_SYSTEM,
        }
        return systems.get(output_type, "You are a helpful research assistant.")

    @staticmethod
    def _parse_outline(raw: str) -> list[dict]:
        """Parse the model's outline reply into a non-empty list of section dicts.

        A surrounding markdown code fence (opening line / closing line) is
        removed before decoding.

        Raises:
            ValueError: if the text is not valid JSON (``json.JSONDecodeError``
                is a ``ValueError``), is not a list, is empty, or contains
                entries that are not JSON objects.
        """
        clean = raw.strip()
        if clean.startswith("```"):
            clean = "\n".join(clean.split("\n")[1:])
        if clean.endswith("```"):
            clean = "\n".join(clean.split("\n")[:-1])
        parsed = json.loads(clean.strip())

        if not isinstance(parsed, list):
            raise ValueError("outline is not a JSON list")
        if not parsed:
            raise ValueError("outline has no sections")
        if not all(isinstance(item, dict) for item in parsed):
            raise ValueError("outline sections must be JSON objects")
        return parsed

    async def generate_extended(self, session_id: int, output_type: OutputType,
                                progress_callback=None, lang: str = "es") -> str:
        """Generate an exhaustive output section by section.

        Steps:
        1. Build a context summary from the session's top chunks.
        2. Ask Claude for an outline (a JSON list of sections).
        3. For each section: section-specific RAG query, then generation.
        4. Concatenate, prepend the header, save, and auto-publish extended
           blogs to Ghost.

        Raises:
            ValueError: if the session is missing, has no processed content,
                the output type has no outline prompt, the outline cannot be
                parsed, or no Claude API key is configured.
        """
        session = await self.db.get_session(session_id)
        if not session:
            raise ValueError(f"Session {session_id} not found")
        topic = session["topic"]

        # Step 1: summary context for the outline (top 10 chunks).
        top_chunks = await self.db.get_top_chunks(session_id, limit=10)
        if not top_chunks:
            raise ValueError("No processed content available. Run /process first.")
        context_summary = "\n\n".join(
            f"- {c.get('title', '')}: {c['content'][:300]}"
            for c in top_chunks
        )

        if progress_callback:
            await progress_callback("🗂️ Generando estructura del documento…")

        # Step 2: outline.
        base_type = output_type.value.replace("_extended", "")
        outline_templates = {
            "report": OUTLINE_REPORT,
            "blog": OUTLINE_BLOG,
            "podcast": OUTLINE_PODCAST,
        }
        template = outline_templates.get(base_type)
        if template is None:
            raise ValueError(f"Extended generation is not supported for {output_type}")
        outline_prompt = template.format(topic=topic, context_summary=context_summary)

        outline_json = await self._generate_raw(outline_prompt, session_id)
        try:
            # Validate up front so a malformed outline fails before any
            # section has been paid for.
            sections = self._parse_outline(outline_json)
        except ValueError as e:
            logger.error("Failed to parse outline", error=str(e), raw=outline_json[:200])
            raise ValueError(f"No se pudo generar el outline: {e}") from e

        if progress_callback:
            await progress_callback(
                f"✍️ Generando {len(sections)} secciones… (esto tardará varios minutos)"
            )

        # Step 3: generate every section.
        base_output_type = OutputType(base_type)
        system = self._get_system(base_output_type)
        if lang == "en" and output_type == OutputType.BLOG_EXTENDED:
            system = BLOG_SYSTEM_EN
        lang_rule = "- Write in English\n" if lang == "en" else "- Escribe en español\n"
        sections_text = []

        for i, section in enumerate(sections, 1):
            # `or` also covers keys present with a null/empty value.
            title = str(section.get("title") or f"Sección {i}")
            query = section.get("query") or topic
            target_words = section.get("words") or 600

            if progress_callback:
                await progress_callback(
                    f"✍️ Sección {i}/{len(sections)}: {title[:40]}…"
                )

            section_context = await self.processor.rag_query(session_id, query, top_k=40)
            if not section_context:
                section_context = context_summary

            section_prompt = (
                f"Escribe la sección '{title}' del {base_type} sobre: '{topic}'\n\n"
                f"REGLAS:\n"
                f"- NO incluyas ningún título o encabezado al inicio de tu respuesta — el título de la sección ya está incluido externamente\n"
                f"- Esta es UNA sección de un documento más largo — no repitas introducción ni conclusión general\n"
                f"- No incluyas encabezados del documento completo, solo el contenido de esta sección\n"
                f"- Objetivo: aproximadamente {target_words} palabras\n"
                f"- Usa SOLO información del material siguiente — no inventes datos\n"
                f"{lang_rule}"
                f"\nMATERIAL:\n{section_context}"
            )

            section_text = await self._generate(
                section_prompt, system, base_output_type, session_id
            )
            sections_text.append(f"## {title}\n\n{section_text}")

        # Step 4: concatenate.
        full_content = "\n\n---\n\n".join(sections_text)
        stats = await self.db.get_session_stats(session_id)
        header = self._build_header(topic, output_type, session, stats)
        full_output = header + "\n\n" + full_content

        await self.db.save_output(session_id, output_type, full_output)

        ghost_notice = ""
        if output_type == OutputType.BLOG_EXTENDED:
            ghost_notice = await self._publish_blog_draft(
                full_output, topic, lang, extended=True
            )

        logger.info("Extended output generated", type=output_type,
                    sections=len(sections), length=len(full_output))
        return full_output + ghost_notice

    async def _generate_raw(self, prompt: str,
                             session_id: int | None = None) -> str:
        """Send *prompt* to Claude without a system prompt (used for outlines).

        There is no Ollama fallback here.

        Raises:
            ValueError: if no Anthropic API key is configured.
            Exception: any error from the Claude call is logged and re-raised.
        """
        if not settings.anthropic_api_key:
            raise ValueError("Claude API key required for extended generation")

        import anthropic
        try:
            client = anthropic.AsyncAnthropic(api_key=settings.anthropic_api_key)
            msg = await client.messages.create(
                model=settings.claude_model,
                max_tokens=2048,
                messages=[{"role": "user", "content": prompt}],
            )
            if session_id is not None:
                try:
                    await self.db.log_api_call(
                        session_id, "outline", settings.claude_model,
                        msg.usage.input_tokens, msg.usage.output_tokens
                    )
                except Exception as log_err:
                    # Same policy as _generate_with_claude: report, don't fail.
                    logger.warning("Failed to log API usage", error=str(log_err))
            return msg.content[0].text.strip()
        except Exception as e:
            logger.warning("Claude outline failed", error=str(e))
            raise

    def _build_header(self, topic: str, output_type: OutputType,
                      session: dict, stats: dict) -> str:
        """Return the ``---``-delimited metadata block prepended to outputs.

        Missing or ``None`` counters are rendered as 0. The timestamp is the
        current UTC time.
        """
        from datetime import datetime, timezone

        # Timezone-aware replacement for the deprecated datetime.utcnow().
        dt = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
        scraped = stats.get('scraped') or 0
        failed = stats.get('failed') or 0
        iterations = session.get('iterations') or 0
        total_words = session.get('total_words') or 0

        header_lines = [
            "---",
            f"ResearchOwl | {output_type.upper()} OUTPUT",
            f"Topic: {topic}",
            f"Generated: {dt}",
            f"Sources: {scraped} scraped | {failed} failed",
            f"Iterations: {iterations}",
            f"Total words researched: {total_words:,}",
            "---",
        ]
        return "\n".join(header_lines) + "\n"


def _remove_duplicate_headings(text: str) -> str:
    lines = text.split('\n')
    result = []
    i = 0
    while i < len(lines):
        line = lines[i].rstrip()
        if line.startswith('# ') and not line.startswith('## '):
            h1_text = line[2:].strip().lower()
            window = result[-5:] if len(result) >= 5 else result[:]
            recent_h2s = {
                p.strip()[3:].strip().lower()
                for p in window
                if p.strip().startswith('## ')
            }
            if h1_text in recent_h2s:
                i += 1
                continue
        result.append(lines[i])
        i += 1
    return '\n'.join(result)


def generate_pdf(content: str, title: str = "ResearchOwl Output") -> bytes:
    """Render markdown-like *content* to an A4 PDF and return the PDF bytes.

    Supported constructs: ``#``/``##``/``###`` headings, ``-``/``*``/``+``
    bullets, numbered items, fenced code blocks, horizontal rules, and inline
    bold / italic / code. H1 lines duplicating a nearby H2 are removed first.

    Args:
        content: Markdown text to render.
        title: Title passed to the PDF document template.

    Raises:
        ImportError: if ``reportlab`` is not installed.
    """
    import io

    content = _remove_duplicate_headings(content)
    try:
        from reportlab.lib.pagesizes import A4
        from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
        from reportlab.lib.units import cm
        from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, HRFlowable
        from reportlab.lib import colors
    except ImportError as err:
        raise ImportError(
            "reportlab is required for PDF export — pip install reportlab"
        ) from err

    buf = io.BytesIO()
    doc = SimpleDocTemplate(
        buf,
        pagesize=A4,
        rightMargin=2 * cm,
        leftMargin=2 * cm,
        topMargin=2.5 * cm,
        bottomMargin=2 * cm,
        title=title,
    )

    base = getSampleStyleSheet()
    normal = ParagraphStyle("RO_Normal", parent=base["Normal"],
                            fontSize=10, leading=14, spaceAfter=4)
    h1 = ParagraphStyle("RO_H1", parent=base["Heading1"],
                        fontSize=18, spaceBefore=12, spaceAfter=6,
                        textColor=colors.HexColor("#1a1a2e"))
    h2 = ParagraphStyle("RO_H2", parent=base["Heading2"],
                        fontSize=14, spaceBefore=10, spaceAfter=4,
                        textColor=colors.HexColor("#16213e"))
    h3 = ParagraphStyle("RO_H3", parent=base["Heading3"],
                        fontSize=12, spaceBefore=8, spaceAfter=4)
    code_style = ParagraphStyle("RO_Code", parent=base["Code"],
                                fontSize=9, leading=12, fontName="Courier",
                                backColor=colors.HexColor("#f4f4f4"), spaceAfter=4)
    bullet_style = ParagraphStyle("RO_Bullet", parent=normal,
                                  leftIndent=20, bulletIndent=10, spaceAfter=2)

    def escape(text: str) -> str:
        """Escape the characters that are markup in Paragraph text."""
        return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")

    def md_to_para(text: str) -> str:
        """Translate inline markdown (bold, italic, code) to Paragraph markup."""
        text = escape(text)
        text = re.sub(r'\*\*(.+?)\*\*', r'<b>\1</b>', text)
        # Underscore emphasis must not start or end inside a word, otherwise
        # snake_case identifiers and URLs get italicised.
        text = re.sub(r'(?<!\w)__(.+?)__(?!\w)', r'<b>\1</b>', text)
        text = re.sub(r'\*(.+?)\*', r'<i>\1</i>', text)
        text = re.sub(r'(?<!\w)_(.+?)_(?!\w)', r'<i>\1</i>', text)
        text = re.sub(r'`(.+?)`', r'<font name="Courier">\1</font>', text)
        return text

    story = []

    def append_code_block(code_lines: list) -> None:
        """Add a fenced code block to the story as one monospaced paragraph."""
        try:
            story.append(Paragraph(
                "<br/>".join(escape(code_line) for code_line in code_lines),
                code_style,
            ))
        except Exception as err:
            logger.warning("PDF: skipped unrenderable code block", error=str(err))

    in_code = False
    code_buf = []

    for line in content.split("\n"):
        if line.startswith("```"):
            if not in_code:
                in_code = True
                code_buf = []
            else:
                in_code = False
                append_code_block(code_buf)
            continue

        if in_code:
            code_buf.append(line)
            continue

        if re.match(r'^[-*_]{3,}$', line.strip()):
            story.append(HRFlowable(width="100%", thickness=0.5,
                                    color=colors.grey, spaceAfter=6))
            continue

        try:
            if line.startswith("### "):
                story.append(Paragraph(md_to_para(line[4:]), h3))
            elif line.startswith("## "):
                story.append(Paragraph(md_to_para(line[3:]), h2))
            elif line.startswith("# "):
                story.append(Paragraph(md_to_para(line[2:]), h1))
            elif re.match(r'^[-*+] ', line):
                story.append(Paragraph("• " + md_to_para(line[2:]), bullet_style))
            elif re.match(r'^\d+\. ', line):
                story.append(Paragraph(md_to_para(line), bullet_style))
            elif line.strip() == "":
                story.append(Spacer(1, 6))
            else:
                story.append(Paragraph(md_to_para(line), normal))
        except Exception:
            # The inline markup could not be rendered: fall back to the
            # escaped plain text so the line is still present in the PDF.
            try:
                story.append(Paragraph(escape(line[:300]), normal))
            except Exception as err:
                logger.warning("PDF: skipped unrenderable line", error=str(err))

    # An unterminated fence would otherwise drop everything after it.
    if in_code and code_buf:
        append_code_block(code_buf)

    doc.build(story)
    return buf.getvalue()


async def generate_diff_summary(
    topic: str,
    new_urls: set,
    old_urls: set,
    new_chunks: list,
    session_id: int,
    db,
) -> str | None:
    """Summarise what is new in a re-run of research on *topic*.

    Args:
        topic: Research topic, used in prompts and messages.
        new_urls: URLs found by the current run.
        old_urls: URLs known from previous runs.
        new_chunks: Chunk dicts from the current run; ``content`` is required,
            ``title`` and ``source_type`` are optional. Only the first 20
            chunks (400 characters each) are sent to the model.
        session_id: Session the API usage is logged against.
        db: Object providing an async ``log_api_call``.

    Returns:
        A notification message, or ``None`` when fewer than 20% of the URLs
        and fewer than 5 URLs are new, when there are no new chunks, or when
        Claude answers with the ``SIN_NOVEDADES`` sentinel. Without a Claude
        API key, or if the Claude call fails, a generic message with counts
        is returned instead.
    """
    from src.config import settings
    import structlog
    diff_logger = structlog.get_logger()

    added_urls = new_urls - old_urls
    pct_new = len(added_urls) / max(len(new_urls), 1)

    diff_logger.info("Diff analysis", topic=topic,
                     new_urls=len(new_urls), old_urls=len(old_urls),
                     added=len(added_urls), pct_new=round(pct_new, 2))

    if pct_new < 0.20 and len(added_urls) < 5:
        diff_logger.info("Diff: no significant new sources", topic=topic)
        return None

    if not new_chunks:
        return None

    context = "\n\n---\n\n".join(
        f"[{c.get('source_type', 'web').upper()}] {c.get('title', '')}\n{c['content'][:400]}"
        for c in new_chunks[:20]
    )

    if not settings.anthropic_api_key:
        return (
            f"📊 *Novedades detectadas sobre {topic}*\n\n"
            f"• {len(added_urls)} fuentes nuevas encontradas\n"
            f"• {len(new_chunks)} chunks de contenido procesados\n\n"
            f"Usa /generate report para ver el análisis completo."
        )

    try:
        import anthropic
        client = anthropic.AsyncAnthropic(api_key=settings.anthropic_api_key)
        prompt = (
            f'Analiza el siguiente material de investigación sobre "{topic}" '
            f'y genera un resumen BREVE (máximo 300 palabras) de las novedades '
            f'más importantes encontradas. Escribe en español.\n\n'
            f'Si el contenido es muy similar a investigaciones anteriores o no '
            f'contiene información genuinamente nueva, responde SOLO con: '
            f'"SIN_NOVEDADES"\n\n'
            f'Material nuevo:\n{context}'
        )
        msg = await client.messages.create(
            model=settings.claude_model,
            max_tokens=500,
            messages=[{"role": "user", "content": prompt}]
        )
        summary = msg.content[0].text.strip()

        # Log usage before looking at the answer: the call was made either
        # way, including when the model reports nothing new.
        try:
            await db.log_api_call(session_id, "diff", settings.claude_model,
                                  msg.usage.input_tokens, msg.usage.output_tokens)
        except Exception as log_err:
            diff_logger.warning("Failed to log API usage", error=str(log_err))

        # The prompt shows the sentinel in quotes, so accept it wrapped in
        # quotes, markdown emphasis or trailing punctuation, in any case.
        sentinel = summary.strip(" \t\r\n\"'`*.“”«»").upper()
        if sentinel == "SIN_NOVEDADES":
            diff_logger.info("Diff: Claude found no new information", topic=topic)
            return None

        return f"🔔 *Novedades — {topic}*\n\n{summary}\n\nUsa /generate para report completo."

    except Exception as e:
        diff_logger.warning("Diff summary generation failed", error=str(e))
        return (
            f"📊 *Actualización — {topic}*\n\n"
            f"• {len(added_urls)} fuentes nuevas\n"
            f"Usa /generate report para ver el análisis completo."
        )


async def generate_comparison(
    topic_a: str,
    topic_b: str,
    context_a: str,
    context_b: str,
    session_id_a: int,
    db,
) -> str:
    """Ask Claude for a structured Spanish comparison of two researched topics.

    Args:
        topic_a: First topic.
        topic_b: Second topic.
        context_a: Research material for the first topic; truncated to its
            first 3000 words before being sent.
        context_b: Research material for the second topic; truncated likewise.
        session_id_a: Session the API usage is logged against.
        db: Object providing an async ``log_api_call``.

    Returns:
        The comparison text (markdown) produced by Claude.

    Raises:
        ValueError: if no Anthropic API key is configured.
        Exception: any error from the Claude call is logged and re-raised.
    """
    from src.config import settings
    import structlog
    cmp_logger = structlog.get_logger()

    if not settings.anthropic_api_key:
        raise ValueError("Claude API key required for comparison")

    def _truncate(text: str, max_words: int = 3000) -> str:
        """Keep the first *max_words* words of *text*, preserving its layout.

        The text is cut at the end of the last kept word, so line breaks and
        markdown structure in the kept part reach the model unchanged.
        """
        end = 0
        for index, word in enumerate(re.finditer(r"\S+", text)):
            if index == max_words:
                return text[:end] + "\n\n[... contenido adicional truncado ...]"
            end = word.end()
        return text

    context_a = _truncate(context_a)
    context_b = _truncate(context_b)

    prompt = (
        f'Eres un analista experto. Compara en profundidad estos dos temas:\n'
        f'TEMA A: "{topic_a}"\n'
        f'TEMA B: "{topic_b}"\n\n'
        f'Escribe el análisis en español con esta estructura:\n\n'
        f'## Resumen comparativo\n'
        f'(2-3 párrafos con las diferencias y similitudes más importantes)\n\n'
        f'## {topic_a}\n'
        f'(Puntos clave únicos de este tema)\n\n'
        f'## {topic_b}\n'
        f'(Puntos clave únicos de este tema)\n\n'
        f'## Similitudes\n'
        f'(Qué tienen en común)\n\n'
        f'## Diferencias clave\n'
        f'(Tabla markdown o lista de las diferencias más relevantes)\n\n'
        f'## Conclusión\n'
        f'(Cuál es mejor/más relevante según el contexto, o qué conclusión se extrae)\n\n'
        f'---\n'
        f'MATERIAL DE INVESTIGACIÓN — {topic_a}:\n{context_a}\n\n'
        f'---\n'
        f'MATERIAL DE INVESTIGACIÓN — {topic_b}:\n{context_b}\n'
    )

    try:
        import anthropic
        client = anthropic.AsyncAnthropic(api_key=settings.anthropic_api_key)
        msg = await client.messages.create(
            model=settings.claude_model,
            max_tokens=8192,
            messages=[{"role": "user", "content": prompt}]
        )
        # Usage logging is best-effort: report a failure, keep the result.
        try:
            await db.log_api_call(
                session_id_a, "comparison", settings.claude_model,
                msg.usage.input_tokens, msg.usage.output_tokens
            )
        except Exception as log_err:
            cmp_logger.warning("Failed to log API usage", error=str(log_err))
        return msg.content[0].text.strip()
    except Exception as e:
        cmp_logger.error("Comparison generation failed", error=str(e))
        raise