feat: fase 2 — generación por secciones report_extended, blog_extended, podcast_extended

2026-05-04 10:58:06 +00:00
parent e5b77ad72d
commit a47d7b26ca
3 changed files with 199 additions and 1 deletions
@@ -141,7 +141,9 @@ async def cmd_start(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
        "`/status` — Check current research progress\n"
        "`/finish` — Stop research and proceed to generation\n"
        "`/process` — Manually trigger chunk processing\n"
-        "`/generate <type>` — Generate output (podcast|blog|report|thread)\n"
+        "`/generate <type>` — Generate output\n"
+        "  Tipos: podcast|blog|report|thread\n"
+        "  Extended: podcast_extended|blog_extended|report_extended\n"
        "`/sources` — List all sources found\n"
        "`/outputs` — List generated outputs\n"
        "`/costs` — Show API usage costs\n"
@@ -274,6 +276,10 @@ async def cmd_generate(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
        "thread": OutputType.THREAD,
        "hilo": OutputType.THREAD,
        "informe": OutputType.REPORT,
+        "report_extended":  OutputType.REPORT_EXTENDED,
+        "blog_extended":    OutputType.BLOG_EXTENDED,
+        "podcast_extended": OutputType.PODCAST_EXTENDED,
+        "informe_extended": OutputType.REPORT_EXTENDED,
    }

    if output_arg not in type_map:
@@ -331,6 +337,9 @@ async def cmd_generate(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
                OutputType.BLOG: "post.md",
                OutputType.REPORT: "report.md",
                OutputType.THREAD: "thread.txt",
+                OutputType.REPORT_EXTENDED: "report_extended.md",
+                OutputType.BLOG_EXTENDED: "blog_extended.md",
+                OutputType.PODCAST_EXTENDED: "script_extended.md",
            }
            filename = f"researchowl_{session['topic'][:30].replace(' ', '_')}_{ext_map[output_type]}"

@@ -24,6 +24,9 @@ class OutputType(str, Enum):
    BLOG = "blog"
    REPORT = "report"
    THREAD = "thread"
+    REPORT_EXTENDED = "report_extended"
+    BLOG_EXTENDED = "blog_extended"
+    PODCAST_EXTENDED = "podcast_extended"


 SCHEMA = """
@@ -135,6 +135,61 @@ Escribe el hilo (un tweet por línea, nada más):"""
 }


+# Prompt template for the outline of an extended report. ``{topic}`` and
+# ``{context_summary}`` are filled in with ``str.format``; the doubled braces
+# in the example are literal JSON braces. The model is asked to reply with a
+# bare JSON list of 6-10 sections, each carrying "title", "query" and "words".
+OUTLINE_REPORT = """
+Eres un editor de investigación. Dado el tema "{topic}" y el material
+disponible, genera un outline detallado para un informe exhaustivo.
+
+Devuelve SOLO una lista JSON de secciones, sin texto adicional, sin
+markdown, sin explicaciones. Formato exacto:
+[
+  {{"title": "Título de la sección", "query": "términos de búsqueda específicos para esta sección", "words": 800}},
+  ...
+]
+
+Genera entre 6 y 10 secciones. Cada sección debe:
+- Cubrir un ángulo distinto del tema
+- Tener una query específica para recuperar chunks relevantes
+- Indicar longitud objetivo en palabras (400-1200)
+
+Material disponible (resumen):
+{context_summary}
+"""
+
+# Prompt template for the outline of an extended blog article. ``{topic}`` and
+# ``{context_summary}`` are filled in with ``str.format``; the doubled braces
+# in the example are literal JSON braces. The model is asked to reply with a
+# bare JSON list of 5-8 sections, opening with a hook and closing with a
+# conclusion.
+OUTLINE_BLOG = """
+Eres un editor de contenido. Dado el tema "{topic}" y el material
+disponible, genera un outline para un artículo de blog exhaustivo.
+
+Devuelve SOLO una lista JSON de secciones, sin texto adicional:
+[
+  {{"title": "Título de sección", "query": "términos búsqueda", "words": 600}},
+  ...
+]
+
+Genera entre 5 y 8 secciones. Primera sección = introducción gancho.
+Última sección = conclusión con perspectiva original.
+
+Material disponible (resumen):
+{context_summary}
+"""
+
+# Prompt template for the outline of an extended podcast script. ``{topic}``
+# and ``{context_summary}`` are filled in with ``str.format``; the doubled
+# braces in the example are literal JSON braces. The model is asked to reply
+# with a bare JSON list of 5-7 segments following a hook-to-conclusion flow.
+OUTLINE_PODCAST = """
+Eres un productor de podcast. Dado el tema "{topic}" y el material
+disponible, genera un outline para un guion de podcast exhaustivo.
+
+Devuelve SOLO una lista JSON de segmentos, sin texto adicional:
+[
+  {{"title": "Nombre del segmento", "query": "términos búsqueda", "words": 700}},
+  ...
+]
+
+Genera entre 5 y 7 segmentos. Flujo natural: gancho → contexto →
+desarrollo → controversia → conclusión.
+
+Material disponible (resumen):
+{context_summary}
+"""
+
+
 class OutputGenerator:
    def __init__(self, db: ResearchDB, ollama: OllamaClient, processor: ContentProcessor):
        self.db = db
@@ -144,6 +199,11 @@ class OutputGenerator:
    async def generate(self, session_id: int, output_type: OutputType,
                       progress_callback=None) -> str:
        """Generate an output for a research session"""
+        if output_type in (OutputType.REPORT_EXTENDED,
+                           OutputType.BLOG_EXTENDED,
+                           OutputType.PODCAST_EXTENDED):
+            return await self.generate_extended(session_id, output_type, progress_callback)
+
        session = await self.db.get_session(session_id)
        if not session:
            raise ValueError(f"Session {session_id} not found")
@@ -239,6 +299,132 @@ class OutputGenerator:
        }
        return systems.get(output_type, "You are a helpful research assistant.")

+    async def generate_extended(self, session_id: int, output_type: OutputType,
+                                progress_callback=None) -> str:
+        """
+        Generate an exhaustive output section by section.
+
+        Steps:
+        1. Fetch a sample of context (top 10 chunks) to ground the outline.
+        2. Ask the model for an outline (a JSON list of sections).
+        3. For each section: run a section-specific RAG query, then
+           generate that section.
+        4. Concatenate the sections under a header and save the result.
+
+        Args:
+            session_id: Research session to generate from.
+            output_type: Output type whose base (value without the
+                ``_extended`` suffix) is ``report``, ``blog`` or ``podcast``.
+            progress_callback: Optional async callable awaited with
+                human-readable status strings.
+
+        Returns:
+            The full generated document (header plus all sections).
+
+        Raises:
+            ValueError: If the session does not exist, no chunks are
+                available, the output type has no outline prompt, or the
+                outline cannot be parsed into a non-empty list of sections.
+        """
+        session = await self.db.get_session(session_id)
+        if not session:
+            raise ValueError(f"Session {session_id} not found")
+        topic = session["topic"]
+
+        # Step 1: summary context for the outline (top 10 chunks)
+        top_chunks = await self.db.get_top_chunks(session_id, limit=10)
+        if not top_chunks:
+            raise ValueError("No processed content available. Run /process first.")
+        context_summary = "\n\n".join(
+            f"- {c.get('title', '')}: {c['content'][:300]}"
+            for c in top_chunks
+        )
+
+        if progress_callback:
+            await progress_callback("🗂️ Generando estructura del documento…")
+
+        # Step 2: outline
+        base_type = output_type.value.removesuffix("_extended")
+        outline_prompts = {
+            "report": OUTLINE_REPORT,
+            "blog": OUTLINE_BLOG,
+            "podcast": OUTLINE_PODCAST,
+        }
+        if base_type not in outline_prompts:
+            # Fail with a clear message instead of a bare KeyError.
+            raise ValueError(
+                f"Extended generation is not supported for output type "
+                f"'{output_type.value}'"
+            )
+        outline_prompt = outline_prompts[base_type].format(
+            topic=topic, context_summary=context_summary
+        )
+
+        outline_json = await self._generate_raw(outline_prompt, session_id)
+        try:
+            sections = self._parse_outline_sections(outline_json)
+        except (ValueError, TypeError) as e:
+            logger.error("Failed to parse outline", error=str(e), raw=outline_json[:200])
+            raise ValueError(f"No se pudo generar el outline: {e}") from e
+
+        if progress_callback:
+            await progress_callback(
+                f"✍️ Generando {len(sections)} secciones… (esto tardará varios minutos)"
+            )
+
+        # Step 3: generate each section
+        base_output_type = OutputType(base_type)
+        system = self._get_system(base_output_type)
+        sections_text = []
+
+        for i, section in enumerate(sections, 1):
+            # The outline is model output: fall back to defaults when a key
+            # is missing, null or empty rather than crashing mid-document.
+            title = str(section.get("title") or f"Sección {i}")
+            query = str(section.get("query") or topic)
+            target_words = section.get("words") or 600
+
+            if progress_callback:
+                await progress_callback(
+                    f"✍️ Sección {i}/{len(sections)}: {title[:40]}…"
+                )
+
+            section_context = await self.processor.rag_query(session_id, query, top_k=40)
+            if not section_context:
+                # No section-specific hits: reuse the outline summary.
+                section_context = context_summary
+
+            section_prompt = (
+                f"Escribe la sección '{title}' del {base_type} sobre: '{topic}'\n\n"
+                f"REGLAS:\n"
+                f"- Esta es UNA sección de un documento más largo — no repitas introducción ni conclusión general\n"
+                f"- No incluyas encabezados del documento completo, solo el contenido de esta sección\n"
+                f"- Objetivo: aproximadamente {target_words} palabras\n"
+                f"- Usa SOLO información del material siguiente — no inventes datos\n"
+                f"- Escribe en español\n\n"
+                f"MATERIAL:\n{section_context}"
+            )
+
+            section_text = await self._generate(
+                section_prompt, system, base_output_type, session_id
+            )
+            sections_text.append(f"## {title}\n\n{section_text}")
+
+        # Step 4: concatenate
+        full_content = "\n\n---\n\n".join(sections_text)
+        stats = await self.db.get_session_stats(session_id)
+        header = self._build_header(topic, output_type, session, stats)
+        full_output = header + "\n\n" + full_content
+
+        await self.db.save_output(session_id, output_type, full_output)
+        logger.info("Extended output generated", type=output_type,
+                    sections=len(sections), length=len(full_output))
+        return full_output
+
+    @staticmethod
+    def _parse_outline_sections(raw: str) -> list[dict]:
+        """Parse the model's outline reply into a non-empty list of section dicts.
+
+        Raises:
+            ValueError: If no JSON list can be found or decoded, or the list
+                holds no JSON objects.
+        """
+        import json as _json
+
+        # Replies may be wrapped in ``` fences or surrounded by a sentence;
+        # slicing from the first "[" to the last "]" tolerates both.
+        start, end = raw.find("["), raw.rfind("]")
+        if start == -1 or end < start:
+            raise ValueError("la respuesta no contiene una lista JSON")
+        parsed = _json.loads(raw[start:end + 1])
+
+        # Keep only object entries; strings or numbers cannot describe a section.
+        sections = [item for item in parsed if isinstance(item, dict)]
+        if not sections:
+            raise ValueError("el outline no contiene secciones válidas")
+        return sections
+
+    async def _generate_raw(self, prompt: str,
+                             session_id: int | None = None) -> str:
+        """
+        Send a single user prompt to Claude (no system prompt) and return its text.
+
+        Args:
+            prompt: Full prompt sent as the only user message.
+            session_id: When given, token usage is recorded through
+                ``self.db.log_api_call`` with the label ``"outline"``.
+
+        Returns:
+            The stripped text of the first text block in the response.
+
+        Raises:
+            ValueError: If no Anthropic API key is configured, or the
+                response contains no text.
+            Exception: Any error raised by the Anthropic client is logged
+                and re-raised unchanged.
+        """
+        if not settings.anthropic_api_key:
+            raise ValueError("Claude API key required for extended generation")
+
+        import anthropic
+        try:
+            client = anthropic.AsyncAnthropic(api_key=settings.anthropic_api_key)
+            msg = await client.messages.create(
+                model=settings.claude_model,
+                max_tokens=2048,
+                messages=[{"role": "user", "content": prompt}],
+            )
+            if session_id is not None:
+                try:
+                    await self.db.log_api_call(
+                        session_id, "outline", settings.claude_model,
+                        msg.usage.input_tokens, msg.usage.output_tokens
+                    )
+                except Exception as log_err:
+                    # Usage accounting is best-effort: never fail generation
+                    # over it, but leave a trace instead of hiding the error.
+                    logger.warning("Failed to log outline API usage", error=str(log_err))
+
+            # Guard against an empty response instead of an opaque IndexError.
+            text = next(
+                (block.text for block in msg.content if getattr(block, "text", None)),
+                None,
+            )
+            if text is None:
+                raise ValueError("Claude returned no text content")
+            return text.strip()
+        except Exception as e:
+            logger.warning("Claude outline failed", error=str(e))
+            raise
+
    def _build_header(self, topic: str, output_type: OutputType,
                      session: dict, stats: dict) -> str:
        from datetime import datetime