feat: fase 2 — generación por secciones report_extended, blog_extended, podcast_extended

2026-05-04 10:58:06 +00:00
parent e5b77ad72d
commit a47d7b26ca
3 changed files with 199 additions and 1 deletions
@@ -141,7 +141,9 @@ async def cmd_start(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
        "`/status` — Check current research progress\n"
        "`/finish` — Stop research and proceed to generation\n"
        "`/process` — Manually trigger chunk processing\n"
-        "`/generate <type>` — Generate output (podcast|blog|report|thread)\n"
+        "`/generate <type>` — Generate output\n"
        "  Tipos: podcast|blog|report|thread\n"
        "  Extended: podcast_extended|blog_extended|report_extended\n"
        "`/sources` — List all sources found\n"
        "`/outputs` — List generated outputs\n"
        "`/costs` — Show API usage costs\n"
@@ -274,6 +276,10 @@ async def cmd_generate(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
        "thread": OutputType.THREAD,
        "hilo": OutputType.THREAD,
        "informe": OutputType.REPORT,
        "report_extended":  OutputType.REPORT_EXTENDED,
        "blog_extended":    OutputType.BLOG_EXTENDED,
        "podcast_extended": OutputType.PODCAST_EXTENDED,
        "informe_extended": OutputType.REPORT_EXTENDED,
    }
    if output_arg not in type_map:
@@ -331,6 +337,9 @@ async def cmd_generate(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
                OutputType.BLOG: "post.md",
                OutputType.REPORT: "report.md",
                OutputType.THREAD: "thread.txt",
                OutputType.REPORT_EXTENDED: "report_extended.md",
                OutputType.BLOG_EXTENDED: "blog_extended.md",
                OutputType.PODCAST_EXTENDED: "script_extended.md",
            }
            filename = f"researchowl_{session['topic'][:30].replace(' ', '_')}_{ext_map[output_type]}"
@@ -24,6 +24,9 @@ class OutputType(str, Enum):
    BLOG = "blog"
    REPORT = "report"
    THREAD = "thread"
    REPORT_EXTENDED = "report_extended"
    BLOG_EXTENDED = "blog_extended"
    PODCAST_EXTENDED = "podcast_extended"
 SCHEMA = """
@@ -135,6 +135,61 @@ Escribe el hilo (un tweet por línea, nada más):"""
 }
 # Prompt template used to request the section outline of an extended report.
 # It is filled in with str.format(topic=..., context_summary=...) by
 # generate_extended; the doubled braces ({{ }}) are literal JSON braces that
 # format() must leave untouched. The model is asked to answer with a bare
 # JSON list of objects carrying "title", "query" and "words".
 OUTLINE_REPORT = """
 Eres un editor de investigación. Dado el tema "{topic}" y el material
 disponible, genera un outline detallado para un informe exhaustivo.
 Devuelve SOLO una lista JSON de secciones, sin texto adicional, sin
 markdown, sin explicaciones. Formato exacto:
 [
  {{"title": "Título de la sección", "query": "términos de búsqueda específicos para esta sección", "words": 800}},
  ...
 ]
 Genera entre 6 y 10 secciones. Cada sección debe:
 - Cubrir un ángulo distinto del tema
 - Tener una query específica para recuperar chunks relevantes
 - Indicar longitud objetivo en palabras (400-1200)
 Material disponible (resumen):
 {context_summary}
 """
 # Prompt template used to request the section outline of an extended blog
 # post. Filled in with str.format(topic=..., context_summary=...) by
 # generate_extended; doubled braces ({{ }}) are literal JSON braces escaped
 # for format(). The expected reply is a bare JSON list of section objects.
 OUTLINE_BLOG = """
 Eres un editor de contenido. Dado el tema "{topic}" y el material
 disponible, genera un outline para un artículo de blog exhaustivo.
 Devuelve SOLO una lista JSON de secciones, sin texto adicional:
 [
  {{"title": "Título de sección", "query": "términos búsqueda", "words": 600}},
  ...
 ]
 Genera entre 5 y 8 secciones. Primera sección = introducción gancho.
 Última sección = conclusión con perspectiva original.
 Material disponible (resumen):
 {context_summary}
 """
 # Prompt template used to request the segment outline of an extended podcast
 # script. Filled in with str.format(topic=..., context_summary=...) by
 # generate_extended; doubled braces ({{ }}) are literal JSON braces escaped
 # for format(). The expected reply is a bare JSON list of segment objects.
 OUTLINE_PODCAST = """
 Eres un productor de podcast. Dado el tema "{topic}" y el material
 disponible, genera un outline para un guion de podcast exhaustivo.
 Devuelve SOLO una lista JSON de segmentos, sin texto adicional:
 [
  {{"title": "Nombre del segmento", "query": "términos búsqueda", "words": 700}},
  ...
 ]
 Genera entre 5 y 7 segmentos. Flujo natural: gancho → contexto →
 desarrollo → controversia → conclusión.
 Material disponible (resumen):
 {context_summary}
 """
 class OutputGenerator:
    def __init__(self, db: ResearchDB, ollama: OllamaClient, processor: ContentProcessor):
        self.db = db
@@ -144,6 +199,11 @@ class OutputGenerator:
    async def generate(self, session_id: int, output_type: OutputType,
                       progress_callback=None) -> str:
        """Generate an output for a research session"""
        if output_type in (OutputType.REPORT_EXTENDED,
                           OutputType.BLOG_EXTENDED,
                           OutputType.PODCAST_EXTENDED):
            return await self.generate_extended(session_id, output_type, progress_callback)
        session = await self.db.get_session(session_id)
        if not session:
            raise ValueError(f"Session {session_id} not found")
@@ -239,6 +299,132 @@ class OutputGenerator:
        }
        return systems.get(output_type, "You are a helpful research assistant.")
    async def generate_extended(self, session_id: int, output_type: OutputType,
                                progress_callback=None) -> str:
        """Generate an exhaustive output section by section.

        Steps:
            1. Fetch the top chunks of the session and build a short context
               summary for the outline prompt.
            2. Ask the model for a JSON outline (a list of sections).
            3. For every section, run a section-specific RAG query and
               generate the section text.
            4. Join the sections under a header and persist the result.

        Args:
            session_id: Identifier of the research session.
            output_type: One of the ``*_extended`` output types; its base
                type (value without the ``_extended`` suffix) selects the
                outline prompt and the system prompt.
            progress_callback: Optional async callable that receives
                human-readable progress messages.

        Returns:
            The full generated document (header plus all sections).

        Raises:
            ValueError: If the session does not exist, no processed content
                is available, or the outline cannot be parsed or is empty.
            KeyError: If the base type of ``output_type`` has no outline
                prompt.
        """
        session = await self.db.get_session(session_id)
        if not session:
            raise ValueError(f"Session {session_id} not found")
        topic = session["topic"]

        # Step 1: context summary for the outline (top 10 chunks).
        top_chunks = await self.db.get_top_chunks(session_id, limit=10)
        if not top_chunks:
            raise ValueError("No processed content available. Run /process first.")
        context_summary = "\n\n".join(
            f"- {c.get('title', '')}: {c['content'][:300]}"
            for c in top_chunks
        )
        if progress_callback:
            await progress_callback("🗂️ Generando estructura del documento…")

        # Step 2: outline.
        base_type = output_type.value.replace("_extended", "")
        outline_prompts = {
            "report": OUTLINE_REPORT,
            "blog": OUTLINE_BLOG,
            "podcast": OUTLINE_PODCAST,
        }
        outline_prompt = outline_prompts[base_type].format(
            topic=topic, context_summary=context_summary
        )
        outline_json = await self._generate_raw(outline_prompt, session_id)
        try:
            sections = self._parse_outline(outline_json)
        except ValueError as e:
            # json.JSONDecodeError is a ValueError subclass, so this covers
            # both malformed JSON and a structurally unusable outline.
            logger.error("Failed to parse outline", error=str(e), raw=outline_json[:200])
            raise ValueError(f"No se pudo generar el outline: {e}") from e
        if progress_callback:
            await progress_callback(
                f"✍️ Generando {len(sections)} secciones… (esto tardará varios minutos)"
            )

        # Step 3: generate every section with its own retrieved context.
        base_output_type = OutputType(base_type)
        system = self._get_system(base_output_type)
        sections_text = []
        for i, section in enumerate(sections, 1):
            # `or` (rather than a .get default) also covers explicit nulls
            # and empty strings coming back from the model.
            title = str(section.get("title") or f"Sección {i}")
            query = str(section.get("query") or topic)
            target_words = section.get("words") or 600
            if progress_callback:
                await progress_callback(
                    f"✍️ Sección {i}/{len(sections)}: {title[:40]}…"
                )
            section_context = await self.processor.rag_query(session_id, query, top_k=40)
            if not section_context:
                # Fall back to the outline summary when retrieval finds nothing.
                section_context = context_summary
            section_prompt = (
                f"Escribe la sección '{title}' del {base_type} sobre: '{topic}'\n\n"
                f"REGLAS:\n"
                f"- Esta es UNA sección de un documento más largo — no repitas introducción ni conclusión general\n"
                f"- No incluyas encabezados del documento completo, solo el contenido de esta sección\n"
                f"- Objetivo: aproximadamente {target_words} palabras\n"
                f"- Usa SOLO información del material siguiente — no inventes datos\n"
                f"- Escribe en español\n\n"
                f"MATERIAL:\n{section_context}"
            )
            section_text = await self._generate(
                section_prompt, system, base_output_type, session_id
            )
            sections_text.append(f"## {title}\n\n{section_text}")

        # Step 4: concatenate, prepend the header and persist.
        full_content = "\n\n---\n\n".join(sections_text)
        stats = await self.db.get_session_stats(session_id)
        header = self._build_header(topic, output_type, session, stats)
        full_output = header + "\n\n" + full_content
        await self.db.save_output(session_id, output_type, full_output)
        logger.info("Extended output generated", type=output_type,
                    sections=len(sections), length=len(full_output))
        return full_output

    @staticmethod
    def _parse_outline(raw: str) -> list[dict]:
        """Extract the list of section dicts from the model's outline reply.

        Tolerates Markdown code fences and stray prose around the JSON by
        parsing only the span from the first ``[`` to the last ``]``.

        Raises:
            ValueError: If no JSON list is present, the JSON is malformed,
                or the list contains no object entries.
        """
        import json

        text = raw.strip()
        start, end = text.find("["), text.rfind("]")
        if start == -1 or end < start:
            raise ValueError("la respuesta no contiene una lista JSON")
        parsed = json.loads(text[start:end + 1])
        # Drop anything that is not a JSON object; later code calls .get().
        sections = [entry for entry in parsed if isinstance(entry, dict)]
        if not sections:
            raise ValueError("el outline no contiene secciones")
        return sections
    async def _generate_raw(self, prompt: str,
                             session_id: int | None = None) -> str:
        """Send a single user prompt to Claude and return the stripped reply text.

        No system prompt is used. When ``session_id`` is given, token usage
        is recorded through ``self.db.log_api_call`` under the ``"outline"``
        label; that logging is best-effort and never fails the request.

        Args:
            prompt: Full user prompt to send.
            session_id: Session to attribute API usage to, or ``None`` to
                skip usage logging.

        Returns:
            The text of the first content block of the reply, stripped.

        Raises:
            ValueError: If no Anthropic API key is configured.
            Exception: Any error raised by the Anthropic client or while
                reading the reply is logged and re-raised unchanged.
        """
        if not settings.anthropic_api_key:
            raise ValueError("Claude API key required for extended generation")

        import anthropic
        try:
            client = anthropic.AsyncAnthropic(api_key=settings.anthropic_api_key)
            msg = await client.messages.create(
                model=settings.claude_model,
                max_tokens=2048,
                messages=[{"role": "user", "content": prompt}],
            )
            if session_id is not None:
                try:
                    await self.db.log_api_call(
                        session_id, "outline", settings.claude_model,
                        msg.usage.input_tokens, msg.usage.output_tokens
                    )
                except Exception as log_err:
                    # Usage accounting must not break generation, but a
                    # failure here should still be visible in the logs.
                    logger.warning("Failed to log outline API usage",
                                   error=str(log_err))
            return msg.content[0].text.strip()
        except Exception as e:
            logger.warning("Claude outline failed", error=str(e))
            raise
    def _build_header(self, topic: str, output_type: OutputType,
                      session: dict, stats: dict) -> str:
        from datetime import datetime