From ba0853633734b6f5ba23485eaea25a948eef71e3 Mon Sep 17 00:00:00 2001 From: ChemaVX Date: Mon, 27 Apr 2026 13:49:07 +0000 Subject: [PATCH] feat: initial ResearchOwl --- .env.example | 29 ++ .gitea/workflows/build.yml | 49 ++ CLAUDE.md | 326 ++++++++++++ Dockerfile | 22 + README.md | 108 ++++ k8s/argocd-app.yaml | 20 + k8s/deployment.yaml | 96 ++++ main.py | 12 + requirements.txt | 31 ++ src/__init__.py | 0 src/__pycache__/__init__.cpython-310.pyc | Bin 0 -> 134 bytes src/__pycache__/config.cpython-310.pyc | Bin 0 -> 2259 bytes src/bot/__init__.py | 0 src/bot/__pycache__/__init__.cpython-310.pyc | Bin 0 -> 138 bytes src/bot/__pycache__/bot.cpython-310.pyc | Bin 0 -> 13494 bytes src/bot/bot.py | 467 +++++++++++++++++ src/config.py | 49 ++ src/db/__init__.py | 0 src/db/__pycache__/__init__.cpython-310.pyc | Bin 0 -> 137 bytes src/db/__pycache__/database.cpython-310.pyc | Bin 0 -> 10724 bytes src/db/database.py | 265 ++++++++++ src/generator/__init__.py | 0 .../__pycache__/__init__.cpython-310.pyc | Bin 0 -> 144 bytes .../__pycache__/generator.cpython-310.pyc | Bin 0 -> 6560 bytes src/generator/generator.py | 188 +++++++ src/processor/__init__.py | 0 .../__pycache__/__init__.cpython-310.pyc | Bin 0 -> 144 bytes .../__pycache__/processor.cpython-310.pyc | Bin 0 -> 8769 bytes src/processor/processor.py | 251 +++++++++ src/scraper/__init__.py | 0 .../__pycache__/__init__.cpython-310.pyc | Bin 0 -> 142 bytes .../__pycache__/exhaustive.cpython-310.pyc | Bin 0 -> 16160 bytes src/scraper/exhaustive.py | 490 ++++++++++++++++++ tests/__init__.py | 0 tests/__pycache__/__init__.cpython-310.pyc | Bin 0 -> 136 bytes .../test_scraper.cpython-310-pytest-9.0.3.pyc | Bin 0 -> 4190 bytes tests/test_scraper.py | 28 + 37 files changed, 2431 insertions(+) create mode 100644 .env.example create mode 100644 .gitea/workflows/build.yml create mode 100644 CLAUDE.md create mode 100644 Dockerfile create mode 100644 README.md create mode 100644 k8s/argocd-app.yaml create mode 100644 
k8s/deployment.yaml create mode 100644 main.py create mode 100644 requirements.txt create mode 100644 src/__init__.py create mode 100644 src/__pycache__/__init__.cpython-310.pyc create mode 100644 src/__pycache__/config.cpython-310.pyc create mode 100644 src/bot/__init__.py create mode 100644 src/bot/__pycache__/__init__.cpython-310.pyc create mode 100644 src/bot/__pycache__/bot.cpython-310.pyc create mode 100644 src/bot/bot.py create mode 100644 src/config.py create mode 100644 src/db/__init__.py create mode 100644 src/db/__pycache__/__init__.cpython-310.pyc create mode 100644 src/db/__pycache__/database.cpython-310.pyc create mode 100644 src/db/database.py create mode 100644 src/generator/__init__.py create mode 100644 src/generator/__pycache__/__init__.cpython-310.pyc create mode 100644 src/generator/__pycache__/generator.cpython-310.pyc create mode 100644 src/generator/generator.py create mode 100644 src/processor/__init__.py create mode 100644 src/processor/__pycache__/__init__.cpython-310.pyc create mode 100644 src/processor/__pycache__/processor.cpython-310.pyc create mode 100644 src/processor/processor.py create mode 100644 src/scraper/__init__.py create mode 100644 src/scraper/__pycache__/__init__.cpython-310.pyc create mode 100644 src/scraper/__pycache__/exhaustive.cpython-310.pyc create mode 100644 src/scraper/exhaustive.py create mode 100644 tests/__init__.py create mode 100644 tests/__pycache__/__init__.cpython-310.pyc create mode 100644 tests/__pycache__/test_scraper.cpython-310-pytest-9.0.3.pyc create mode 100644 tests/test_scraper.py diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..9013d98 --- /dev/null +++ b/.env.example @@ -0,0 +1,29 @@ +# ResearchOwl — Environment Variables +# Copy to .env and fill in values + +# Required +TELEGRAM_BOT_TOKEN=your_bot_token_here +TELEGRAM_ALLOWED_USERS=123456789 # your Telegram user ID + +# Ollama (default points to your existing instance) +OLLAMA_URL=http://ollama.chemavx.xyz 
+OLLAMA_MODEL=qwen2.5:3b + +# Claude fallback (optional, only for premium generation) +# ANTHROPIC_API_KEY=sk-ant-... +# CLAUDE_MODEL=claude-haiku-4-5 + +# Storage +DB_PATH=/data/researchowl.db + +# Scraping tuning +MAX_DEPTH=3 # how deep to follow links (1-5) +MAX_SOURCES=150 # hard cap on total sources +MAX_PAGES_PER_SEARCH=5 +REQUEST_DELAY=1.0 # seconds between requests (be polite) +MIN_CONTENT_LENGTH=200 + +# Processing +CHUNK_SIZE=800 +CHUNK_OVERLAP=100 +QUALITY_THRESHOLD=0.4 # 0-1, lower = more permissive diff --git a/.gitea/workflows/build.yml b/.gitea/workflows/build.yml new file mode 100644 index 0000000..20866f4 --- /dev/null +++ b/.gitea/workflows/build.yml @@ -0,0 +1,49 @@ +name: Build & Deploy ResearchOwl + +on: + push: + branches: [main] + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Validate manifests + run: | + for f in k8s/*.yaml; do + python3 -c "import yaml; list(yaml.safe_load_all(open('$f')))" && echo "✅ $f OK" + done + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to Gitea Registry + uses: docker/login-action@v3 + with: + registry: git.chemavx.xyz + username: ${{ secrets.REGISTRY_USER }} + password: ${{ secrets.REGISTRY_PASSWORD }} + + - name: Build and push + uses: docker/build-push-action@v5 + with: + context: . 
+ push: true + tags: git.chemavx.xyz/chemavx/researchowl:latest + cache-from: type=registry,ref=git.chemavx.xyz/chemavx/researchowl:cache + cache-to: type=registry,ref=git.chemavx.xyz/chemavx/researchowl:cache,mode=max + + - name: Notify Telegram + if: always() + env: + TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }} + CHAT: ${{ secrets.TELEGRAM_CHAT_ID }} + run: | + STATUS="${{ job.status }}" + EMOJI="✅" + if [ "$STATUS" != "success" ]; then EMOJI="❌"; fi + MSG="${EMOJI} ResearchOwl build ${STATUS} — $(git log -1 --pretty='%s')" + curl -s -X POST "https://api.telegram.org/bot${TOKEN}/sendMessage" \ + -d chat_id="${CHAT}" -d text="${MSG}" diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..db4bea1 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,326 @@ +# ResearchOwl — Instrucciones para Claude Code + +## Contexto del proyecto + +Eres el agente de construcción e implementación de **ResearchOwl**, un bot de Telegram que realiza investigación exhaustiva sobre cualquier tema usando scraping recursivo y Ollama (qwen2.5:3b) para procesamiento y generación de contenido. + +El homelab donde se desplegará tiene: +- **k3s** con Traefik + cert-manager + Cloudflare DNS +- **ArgoCD** para GitOps (repo: `k8s-manifests` en Gitea) +- **Gitea** en `git.chemavx.xyz` + Container Registry +- **Ollama** en `http://ollama.chemavx.xyz` con modelo `qwen2.5:3b` +- **Telegram bot** ya existente en `@chemavx_bot` +- Dominio base: `chemavx.xyz` + +--- + +## Objetivo + +Construir el proyecto completo, corregir todos los bugs, y dejarlo listo para desplegar en k3s. + +--- + +## Tareas a realizar — en orden + +### 1. 
Crear estructura del proyecto + +``` +researchowl/ +├── src/ +│ ├── __init__.py +│ ├── config.py +│ ├── scraper/ +│ │ ├── __init__.py +│ │ └── exhaustive.py +│ ├── processor/ +│ │ ├── __init__.py +│ │ └── processor.py +│ ├── generator/ +│ │ ├── __init__.py +│ │ └── generator.py +│ ├── bot/ +│ │ ├── __init__.py +│ │ └── bot.py +│ └── db/ +│ ├── __init__.py +│ └── database.py +├── k8s/ +│ ├── deployment.yaml +│ └── argocd-app.yaml +├── .gitea/ +│ └── workflows/ +│ └── build.yml +├── tests/ +│ └── test_scraper.py +├── main.py +├── requirements.txt +├── Dockerfile +├── .env.example +└── README.md +``` + +### 2. Corregir bug crítico en database.py + +La tabla `source_contents` está referenciada en `processor.py` pero no existe en el schema. + +**Añadir al SCHEMA en `database.py`:** + +```sql +CREATE TABLE IF NOT EXISTS source_contents ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + source_id INTEGER NOT NULL UNIQUE REFERENCES sources(id), + content TEXT NOT NULL, + created_at REAL NOT NULL +); + +CREATE INDEX IF NOT EXISTS idx_source_contents ON source_contents(source_id); +``` + +**Añadir método en la clase `ResearchDB`:** + +```python +async def save_source_content(self, source_id: int, content: str): + await self.db.execute( + """INSERT OR REPLACE INTO source_contents (source_id, content, created_at) + VALUES (?, ?, ?)""", + (source_id, content, time.time()) + ) + await self.db.commit() + +async def get_source_content(self, source_id: int) -> Optional[str]: + cursor = await self.db.execute( + "SELECT content FROM source_contents WHERE source_id = ?", (source_id,) + ) + row = await cursor.fetchone() + return row[0] if row else None +``` + +### 3. Corregir bug en exhaustive.py — guardar contenido + +En el método `_mark_scraped` del `ExhaustiveScraper`, después de validar el contenido, hay que guardarlo en `source_contents`. 
Cambiar el método a: + +```python +async def _mark_scraped(self, source_id: int, content: Optional[str], + title: Optional[str], url: str): + if not content or len(content) < settings.min_content_length: + await self.db.update_source(source_id, status="skipped", + error="Content too short or empty") + return + + word_count = len(content.split()) + + # Guardar contenido raw + await self.db.save_source_content(source_id, content) + + await self.db.update_source( + source_id, + status="scraped", + title=title or url, + word_count=word_count, + scraped_at=time.time(), + quality_score=min(1.0, word_count / 1000) + ) +``` + +### 4. Corregir bug en processor.py — usar save/get content + +En `_process_source`, la consulta a `source_contents` usa `self.db.db.execute` directamente pero ahora debería usar el método del DB: + +```python +async def _process_source(self, session_id: int, topic: str, source: dict) -> int: + source_id = source["id"] + + # Usar el método correcto + content = await self.db.get_source_content(source_id) + if not content: + return 0 + + chunks = simple_chunk(content, settings.chunk_size, settings.chunk_overlap) + stored = 0 + + for i, chunk in enumerate(chunks): + if len(chunk.split()) < 30: + continue + + quality = await self._score_quality(chunk, topic) + if quality < settings.quality_threshold: + continue + + embedding = await self.ollama.embed(chunk[:1000]) + + await self.db.add_chunk( + session_id=session_id, + source_id=source_id, + content=chunk, + chunk_index=i, + token_count=len(chunk.split()), + quality_score=quality, + embedding=embedding + ) + stored += 1 + + return stored +``` + +### 5. 
Añadir comando /outputs al bot + +En `bot.py`, añadir este handler: + +```python +async def cmd_outputs(update: Update, ctx: ContextTypes.DEFAULT_TYPE): + if not is_authorized(update.effective_user.id): + return + + chat_id = update.effective_chat.id + db_conn = await get_db() + db = ResearchDB(db_conn) + + try: + cursor = await db_conn.execute( + "SELECT * FROM research_sessions WHERE telegram_chat_id = ? ORDER BY created_at DESC LIMIT 1", + (chat_id,) + ) + row = await cursor.fetchone() + if not row: + await update.message.reply_text("No sessions found.") + return + + outputs = await db.get_outputs(row["id"]) + if not outputs: + await update.message.reply_text( + "No outputs generated yet. Use `/generate podcast|blog|report|thread`", + parse_mode=ParseMode.MARKDOWN + ) + return + + lines = [f"📄 *Outputs for: {row['topic']}*\n"] + for o in outputs: + from datetime import datetime + dt = datetime.utcfromtimestamp(o['created_at']).strftime("%Y-%m-%d %H:%M") + lines.append(f"• `{o['output_type']}` — {dt} ({len(o['content'])} chars)") + + await update.message.reply_text( + "\n".join(lines), + parse_mode=ParseMode.MARKDOWN + ) + finally: + await db_conn.close() +``` + +Y registrarlo en `create_bot()`: +```python +app.add_handler(CommandHandler("outputs", cmd_outputs)) +``` + +### 6. Instalar dependencias y verificar que importa correctamente + +```bash +pip install -r requirements.txt +python -c "from src.bot.bot import create_bot; print('OK')" +python -c "from src.scraper.exhaustive import ExhaustiveScraper; print('OK')" +python -c "from src.processor.processor import ContentProcessor; print('OK')" +python -c "from src.generator.generator import OutputGenerator; print('OK')" +``` + +Si hay errores de importación, corrígelos. + +### 7. 
Escribir test básico + +En `tests/test_scraper.py`: + +```python +import pytest +import asyncio +from src.scraper.exhaustive import ( + detect_source_type, is_blacklisted, normalize_url, simple_chunk +) + +def test_detect_source_type(): + assert detect_source_type("https://youtube.com/watch?v=abc123") == "youtube" + assert detect_source_type("https://reddit.com/r/test/comments/abc") == "reddit" + assert detect_source_type("https://en.wikipedia.org/wiki/Roswell") == "wikipedia" + assert detect_source_type("https://example.com/doc.pdf") == "pdf" + assert detect_source_type("https://example.com/article") == "web" + +def test_is_blacklisted(): + assert is_blacklisted("https://facebook.com/something") == True + assert is_blacklisted("https://en.wikipedia.org/wiki/Test") == False + +def test_normalize_url(): + assert normalize_url("https://example.com/page#section") == "https://example.com/page" + assert normalize_url("https://example.com/page/") == "https://example.com/page" +``` + +Nota: importar `simple_chunk` desde `processor.py`: + +```python +from src.processor.processor import simple_chunk + +def test_simple_chunk(): + text = "\n\n".join([f"Paragraph {i} with some content here." for i in range(50)]) + chunks = simple_chunk(text, chunk_size=100, overlap=20) + assert len(chunks) > 1 + assert all(isinstance(c, str) for c in chunks) +``` + +Ejecutar: `pytest tests/ -v` + +### 8. Build Docker y verificar + +```bash +docker build -t researchowl:test . +docker run --rm researchowl:test python -c "from src.bot.bot import create_bot; print('Docker OK')" +``` + +### 9. Preparar para despliegue + +Verificar que estos ficheros están correctos y completos: +- `k8s/deployment.yaml` — Deployment + PVC + Secret template +- `k8s/argocd-app.yaml` — ArgoCD Application apuntando a `k8s-manifests` +- `.gitea/workflows/build.yml` — CI con build → push → notificación Telegram + +### 10. 
Instrucciones finales para el usuario + +Al finalizar, mostrar: + +``` +✅ ResearchOwl listo para desplegar. + +Pasos para desplegar: +1. Crear secret en k3s: + kubectl create namespace researchowl + kubectl create secret generic researchowl-secrets \ + --from-literal=telegram-bot-token=TU_TOKEN \ + --from-literal=telegram-allowed-users=TU_USER_ID \ + -n researchowl + +2. Subir código a Gitea: + git init && git remote add origin https://git.chemavx.xyz/chemavx/researchowl + git add . && git commit -m "feat: initial ResearchOwl" + git push -u origin main + +3. Gitea Actions construirá la imagen automáticamente. + +4. Copiar manifests k8s/ a tu repo k8s-manifests/researchowl/ + y aplicar el ArgoCD app: + kubectl apply -f k8s/argocd-app.yaml + +5. ArgoCD desplegará automáticamente. + +Uso desde Telegram: + /research Incidente Roswell + /status + /finish + /generate podcast +``` + +--- + +## Notas importantes + +- **No crear un bot de Telegram nuevo** — el usuario ya tiene `@chemavx_bot`. Solo necesita configurar el token en el secret de k3s. +- **No modificar** los manifests de k8s para añadir Ingress — el bot usa polling de Telegram, no necesita exponer ningún puerto. +- **Ollama** ya está corriendo en el cluster. La URL `http://ollama.chemavx.xyz` es correcta. +- Si `qwen2.5:3b` es lento para scoring de calidad, se puede desactivar el scoring con `QUALITY_THRESHOLD=0` y todos los chunks pasan directamente. +- El proyecto usa **SQLite** (coherente con el resto del homelab). +- Respetar el `REQUEST_DELAY=1.0` para no hacer ban en las fuentes. diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..af116ec --- /dev/null +++ b/Dockerfile @@ -0,0 +1,22 @@ +FROM python:3.12-slim + +WORKDIR /app + +# System dependencies for lxml, pdfplumber +RUN apt-get update && apt-get install -y \ + gcc g++ \ + libxml2-dev libxslt-dev \ + && rm -rf /var/lib/apt/lists/* + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY . . 
+ +# Data directory +RUN mkdir -p /data + +ENV PYTHONUNBUFFERED=1 +ENV PYTHONPATH=/app + +CMD ["python", "main.py"] diff --git a/README.md b/README.md new file mode 100644 index 0000000..345e19f --- /dev/null +++ b/README.md @@ -0,0 +1,108 @@ +# 🦉 ResearchOwl + +**Exhaustive research engine with Telegram interface.** + +Recursively discovers, scrapes, and processes sources from across the web, +then generates podcast scripts, blog posts, reports, or social threads using Ollama. + +## Architecture + +``` +Telegram (/research ) + ↓ +ExhaustiveScraper + ├── DuckDuckGo (8 queries × 5 results) + ├── Wikipedia + recursive internal links + ├── Reddit (top posts + top comments) + ├── YouTube (transcripts) + ├── PDFs (public documents) + └── Web scraping (trafilatura) + ↓ recursive expansion (depth 1-3) +ContentProcessor (Ollama qwen2.5:3b) + ├── Chunking (800 token chunks, 100 overlap) + ├── Quality scoring (0-10 per chunk) + ├── Embeddings (cosine similarity RAG) + └── Deduplication + ↓ +OutputGenerator (Ollama) + ├── 🎙️ Podcast script (20-30 min) + ├── 📝 Blog post (1500-2500 words) + ├── 📊 Research report (structured) + └── 🐦 Social thread (15-25 tweets) +``` + +## Telegram Commands + +| Command | Description | +|---------|-------------| +| `/research ` | Start exhaustive research | +| `/status` | Check progress | +| `/finish` | Stop early, proceed to generation | +| `/generate podcast\|blog\|report\|thread` | Generate output | +| `/sources` | List all sources found | +| `/cancel` | Cancel current research | + +## Local Development + +```bash +# 1. Clone and setup +git clone https://git.chemavx.xyz/chemavx/researchowl +cd researchowl + +# 2. Create virtualenv +python3 -m venv venv && source venv/bin/activate +pip install -r requirements.txt + +# 3. Configure +cp .env.example .env +# Edit .env with your values + +# 4. Run +python main.py +``` + +## Deploy to k3s + +```bash +# 1. 
Create namespace and secrets +kubectl create namespace researchowl +kubectl create secret generic researchowl-secrets \ + --from-literal=telegram-bot-token=YOUR_TOKEN \ + --from-literal=telegram-allowed-users=YOUR_USER_ID \ + -n researchowl + +# 2. Copy manifests to your k8s-manifests repo +cp k8s/*.yaml /path/to/k8s-manifests/researchowl/ + +# 3. Apply ArgoCD app +kubectl apply -f k8s/argocd-app.yaml + +# 4. Push to Gitea → Gitea Actions builds → ArgoCD deploys +git add . && git commit -m "feat: add researchowl" && git push +``` + +## Tuning + +| Variable | Default | Description | +|----------|---------|-------------| +| `MAX_SOURCES` | 150 | Hard cap on sources | +| `MAX_DEPTH` | 3 | Link recursion depth | +| `QUALITY_THRESHOLD` | 0.4 | Min chunk quality (0-1) | +| `REQUEST_DELAY` | 1.0s | Delay between requests | + +**Want more thoroughness?** +- Increase `MAX_SOURCES` to 300+ +- Increase `MAX_DEPTH` to 4-5 +- Lower `QUALITY_THRESHOLD` to 0.3 + +**Want faster results?** +- Lower `MAX_SOURCES` to 50 +- Set `MAX_DEPTH` to 1-2 +- Higher `QUALITY_THRESHOLD` to 0.6 + +## Notes + +- Uses **qwen2.5:3b** (your existing Ollama) for all AI tasks — zero API cost +- Optionally add `ANTHROPIC_API_KEY` for Claude fallback on generation +- SQLite database stored in `/data/researchowl.db` +- All outputs saved to DB and available via `/outputs` diff --git a/k8s/argocd-app.yaml b/k8s/argocd-app.yaml new file mode 100644 index 0000000..42b9018 --- /dev/null +++ b/k8s/argocd-app.yaml @@ -0,0 +1,20 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: researchowl + namespace: argocd +spec: + project: default + source: + repoURL: https://git.chemavx.xyz/chemavx/k8s-manifests + targetRevision: HEAD + path: researchowl + destination: + server: https://kubernetes.default.svc + namespace: researchowl + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true diff --git a/k8s/deployment.yaml b/k8s/deployment.yaml new file mode 
100644 index 0000000..eba91da --- /dev/null +++ b/k8s/deployment.yaml @@ -0,0 +1,96 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: researchowl + +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: researchowl-data + namespace: researchowl +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: researchowl + namespace: researchowl + labels: + app: researchowl +spec: + replicas: 1 + selector: + matchLabels: + app: researchowl + template: + metadata: + labels: + app: researchowl + spec: + containers: + - name: researchowl + image: git.chemavx.xyz/chemavx/researchowl:latest + imagePullPolicy: Always + env: + - name: TELEGRAM_BOT_TOKEN + valueFrom: + secretKeyRef: + name: researchowl-secrets + key: telegram-bot-token + - name: TELEGRAM_ALLOWED_USERS + valueFrom: + secretKeyRef: + name: researchowl-secrets + key: telegram-allowed-users + - name: OLLAMA_URL + value: "http://ollama.chemavx.xyz" + - name: OLLAMA_MODEL + value: "qwen2.5:3b" + - name: DB_PATH + value: "/data/researchowl.db" + - name: MAX_SOURCES + value: "150" + - name: MAX_DEPTH + value: "3" + - name: QUALITY_THRESHOLD + value: "0.4" + volumeMounts: + - name: data + mountPath: /data + resources: + requests: + memory: "256Mi" + cpu: "100m" + limits: + memory: "1Gi" + cpu: "500m" + volumes: + - name: data + persistentVolumeClaim: + claimName: researchowl-data + imagePullSecrets: + - name: gitea-registry + +--- +# Secret template — fill with real values and apply manually +# kubectl create secret generic researchowl-secrets \ +# --from-literal=telegram-bot-token=YOUR_TOKEN \ +# --from-literal=telegram-allowed-users=YOUR_USER_ID \ +# -n researchowl +apiVersion: v1 +kind: Secret +metadata: + name: researchowl-secrets + namespace: researchowl +type: Opaque +stringData: + telegram-bot-token: "REPLACE_ME" + telegram-allowed-users: "REPLACE_ME" diff --git a/main.py b/main.py new file mode 100644 
index 0000000..a609bcb --- /dev/null +++ b/main.py @@ -0,0 +1,12 @@ +import structlog +from src.bot.bot import run + +structlog.configure( + processors=[ + structlog.stdlib.add_log_level, + structlog.dev.ConsoleRenderer(), + ] +) + +if __name__ == "__main__": + run() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..7ceab3e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,31 @@ +# Core +fastapi==0.115.0 +uvicorn==0.30.0 +python-telegram-bot==21.5 +httpx==0.27.0 +aiohttp==3.10.0 + +# Scraping +beautifulsoup4==4.12.3 +lxml==5.2.2 +trafilatura==1.12.0 +youtube-transcript-api==0.6.2 +pdfplumber==0.11.3 +feedparser==6.0.11 +duckduckgo-search==6.2.6 + +# Storage & Embeddings +sqlite-vec==0.1.6 +aiosqlite==0.20.0 + +# Processing +tiktoken==0.7.0 +numpy==1.26.4 +scikit-learn==1.5.1 + +# Utilities +pydantic==2.8.0 +pydantic-settings==2.4.0 +tenacity==9.0.0 +structlog==24.4.0 +python-dotenv==1.0.1 diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/__pycache__/__init__.cpython-310.pyc b/src/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9861eb5e1e736eb903f18178c01e6ae6c7586497 GIT binary patch literal 134 zcmd1j<>g`kf<0;PGePuY5P=LBfgA@QE@lA|DGb33nv8xc8Hzx{2;!Hfenx(7s(x}t zYHng#g?>?LacW{waz=i6j(%}bvVMGgW?p7Ve7s&k>3Xs!~&H%~;xaU*26A zq#}oux#o~djwv|io8&XhDOEZ7l$<%GrzI=SYpeFJ*U}gD}Dri`YtAQDj@SR9kCn*w(Cw9c@PyvPM{hMc)z@<;sE9j`7$5 zX-~0OgXE_UAqL4SBDrD&yuriJbh-gNN1vM9V$g`!ywG$V!|F)S;}7u>@>1PWCd14b_UrkWS_Cy$mTKL z9b|XeJ!EHb9QT2pW1sW$JUvf#RQ3gX@HTRwv@bxtAF@YK??(q}`yxw2otH%XsJI1@ zZR!<$tzO!sE43zV)}HBAD740%eUS~h#k;;?(``4Tp}Wf+LB!moL#a}!eW|a~Rzt5h z#N5Q&uq^i#XS5%1KN!yU!q9tIEV`Cu*hZn#K?y6}216p31;QraxhR=pw;BRlQ; z)-bX6iaWOp_a5Ha7Rj;CW^Glkh~(JIb{V%sc5Kr(S0Ig%M|SMN?QKW_{i-5Usy5f_ zwXKaZEp2VkXZlN#F`TgHyPnyhhG)`UJ{YDumeFT?v1gdO{l&YB_e8Q>DYaJhap>g8 zJ_m8f+6Wy;=H-uIgK0n6{1A zlyMJCKT6Y#G&O3idRcFX8EFjMzTe@&VN{xP(!5n#(;IY4uhWKJs+ZSAPWpL9mj~42 
zJ{_fScu{tkt?SQQdIM^?v8mTuO)=~9y*>}1f~L*ge%J*sxc+!7Qb(>ZwWXILb>s+h zYYYy50Vk2&+^EuWt=iP9OA~?8B;v`8@3VP6rO6$tBzJh$_P! zEP%Wr-;f~MB@H+-nfex=-ziLcudI>-1vW{f`nLco^pW*wz>NXA(BSBpK@=Ed=*IKG&Qf!`^@ zB=4Brf;SL}CCd!Lj%$0%H)IMjp9L~poNVOdn~s_+C@fiS$FPEBXdLzYpxn%>{tR@5 z%siM<*y)jgTRVScSUR21OIIlp2oXhKBs!1(f2f`6=l%Dg`kf<0;PGePuY5P=LBfgA@QE@lA|DGb33nv8xc8Hzx{2;!Hnenx(7s(x}t zYHng#g?>?LacW{waz=i6j(%}bvVKy2iGF;1W?p7Ve7s&k7|E+uPea9*@T(CCZY#k|kM3+mz%uv8^a}WQn%bNR%MbRmCJdo^F=p zo$i-r_eAl!Y-P}n=^}>Q+9^`of{!VXR!!2PDB1?;KP`d+0fH7qTNLP`Eef>->a<0S zrfy@|mixWg-TOh)aT1io4tL+q&d)dReeauLePFvL9k=ck$@6?J~jm}2Jy;9L;HOQisSXH0ZtMS>m zkZTpAnwU*gle5X{!0bRZHJhpq&JK!qu}Zo+G&>~j^-88XJUcAz! z9!oCooma*rYabgpB0+k$knRa&sUs2_wDz#HwSQh^LoD-*I-6s|kYsU<;2O2=!rSqA zgKc43@%A3}aW-bf**3^jb_d)3Omy~Mw!_ldPHV7yA7pnj6Zt0kWV_gI$PV<$?qYi& zJJ=`N%l1Kbs84n`+Yj0OeX<-IhwNj0vU}LQkUh{RyN^vk_VGU10d^3wPoP(cd4(Nf z_v7h7_A&OrGt%rsOqrH)pJ<+hCKzWe+bZzl;_0P|Ib&6<1zxC{kJTOHRH0Neo3_Qx zQq8gWe4%KWH?O{I7Ah69Sg%$KHD;TONGldI7cFiXZ=gJB>W1v;tl(HqscPkLf4bq6 z>a{{8r+C^^4SM6nPFc2HSg>+YFLAWdsFaEY%HySu`Q<(-O66WE$QM4x=1Ubc#rBfN z>S)syXJ)x!*{CFbroe6MRGo<$ZOd^=wFNt;dfI~JuNHjSmb9(aP zCCr6CaZ#u46e?b%W-WP98^dZBDR1}_!wof4^kxb-Zlo;?&$_ariu(vta32--s<_u! z)RkV9nC8kzW3Gx+ccVz-u7=cb6{Lyl15oAUlozwbWHC;o=xOtX64cU*HLMzgDm2rO zzWMsg&zR#U!bY3TEf9_yZ@&K0E9P{)$uXDae7#v?ht0XS=z}@q=4;QIz7~+MIYdI! 
zHE&-3qWQR}fuxyGdhPekGk$#}rs1DM>0kJgIb|)G&SJ?nc~JZE+j}n?XKiP>pFF9Xz@(o^J!1URa)h2OLre9rAWc7VTQEzvyi1wcH zJ|CwqV^x3Uc`7ZRVg$1AN5f4N*SB%mb4Z?sPgsdK5>p%~-m2_IRwKuy&+o>R$*zju z$QJ2YSY1q;R94)mtFXum$_a_7PfD^BesfXf_c@Wa%A)IX`0i4IshBpc_24wd%dFIE z3nsM(mc}eDHftAcvp#P=aPZ(kJYr>=`#YAYQmU4mT&X>X0imhBA$vxH7Jt5q)jzcW z5q|#ZQJ9t}FVq$+FH*5;p24j~WjRlkU`g1^wPL9bO|4i~Bd76u@Cm<{k_jXjq{5Xv zeI=K?sA$gzCcs9!^Wv4o4&ob|lt8(vDp@%#XE62dO-*>6^Vs(YaEjRUQ7& zJ*^Wl9p zYB1#E!ccb&nHjz_Cu<9(nq?S)p|uYia~&IC9&zf8Qt^>F@imMicTB5e49(C~;0s#e zP210RY|$!SG>c8nVH!J`%uqX#d%jdF*^6PR^+q=n%sgc*TJRuF-CXdEC#)mB3LY%9 z3PwI?RA62$vo1`wxxZ0o#e(g)7b^7y7js;UQ$=NhF?r>gd_e96x<->W0D}BI(1FY_J6JVwL4F9o#vBw{{ANay2Qd zvL?d}80ok8BM|i|9Vr}LtGMj%AR(n&i}WfTo+4%CdAOOf%C|DP9i_WeP zl{Lj4N3K{|XAvh}HkjJ4nro7ixGeEUoy3|nCbg66lKm*GaFs5$FrYALzqElp|&s3~} zZIOmuE-}k2K*ylp6UO9~LbXw`V5Gacf3|KfSwJ==SOx}7o)gp6e2j{ErTzVHUhA%* zLWT5enT%DaCF#X(s*TFdLz!^X|4Uonq2ix=s7qHgYg$h_3FcnM%gygFC9SZd(Af_aaR-hjW% zN(qKlygq9&R&r=K%tqaIeBt?ebEbL0G98#rIMY%Ca$|n>D9Y zF>P1^9|Qzr;IDRC&Nh_~Y*|^JD5681p9JnG5P1og{a>(Oq=2guoCB0%g&c>p3ar!m z*VYwRZYyi*8n8j|R&f<4vZfq?!afi8v>-n(k%9o?!udcsA`Uqxd;@2t!cnR1=z3T( zDY1s{Nojl3)s42}EXriy4{a@4Hp&TL4ni2QHDHu6!XWzfsFN%Yuy|L=oz$wd5_4ni z!LWSZa?+?fwyLb?&qe2DX0QZW6nFLZkZ4hSJ>vI5mTGaTshwa+;S|MUI49MnRLRMd zhnHd0>6h9I*BM_!P_|C(&i1xDC}E&m!g;CoAW9fn{wrBpF=VN=%NZ?iVJT;;Yplv_ z@C8Mdg0IqnOkzXi!e5oHE8TL=N#`*aE5ni@Nwo+?y84$!#%D|1?IXNfH07gJv z-s6uT^eG$Jt-v&p0EELBAaRej~2A7is_2QFPik662a?vg_^HM;zd4h0x0pWa* zMu_2t;th6iaJ~q9=XlA`Eaa>9LJw?9)N38ob_L(s1TvMYIuNxYZwF|A(SE=6Aer>g z*-jj()Qg3R{m2B?-zL(Y5&1IN6q$ewj>Kv*G{* zkU8mtz~(+P0MrC!hfx&3Ffw(}vkiGG4Of!yAh1+&7EM2DnwXd{ z$UOG=^bdF2bLP^bRf9o<~O@DG}O&wJ_v;c7qXrAF@(uHeV(tFMmg2G1`Pk%CSURQV{=_7eWn5>=m*eMBI@fQSeNXbL*Y^(C}6 zK%|5NC?HT3?Y63>-)TMklc-l;sm(Hq4Vtiqkb`z~EbXV{eCdJpL zAMYU;h+3T2d2@uC3JB223cfrcO*hi3$jUcpC8S0=jmZ9wN5G5tvl!wv|hGF?IoVr&jY6;yHp| zfLGv8P~IRe=3WAhxlkdj=-9$Qcv@YM8eTkTP@QvPggn{0p?WdQsYC2U1UAe=d=KR0 zE8TJA;eHs}Ym;ga+dq$l@TU$0nxRwyU7)1&lsu{=aSvTYV5%V?Rkh^XS|TMIa#9(M 
zYY1A6_tmSZs0xWHCvT6cDU@(f)~_nJm8%(5c_$N9ZtqrFyFXfgbAtk8^TX8APa^Sp z;5PNkQww~-_SAB{R3p9uU}w`Q9HnnnRtKjeCMgl93aMO9?Lp;>A|r`?bI+(|e~fyY zI|Z>#R?Eley0T4-5Y{Jptg6`vjS$8mqA3`KSn`LNMDkct<_&z(X9g)iAUzd<t5|HVXB@gJ^^`Bz%wctIUq{`%%#~NORrNj zF$Cz9W|`n=SUhEerxEcq8a!mg zp>l@p>bjXtd*3%6fFrx7Zhdxo^5o>P8FSox{Osvd9fuNH1=~FL}u13pXIv zqxmWGQS27LmPh z(B1eai2cQKX%Z?Uw(PA4Zh#>I!L_G-!SyeKS&d=w*I?+qcn|`BZqt46Z!uw~_>=fL z_~NDJEpIX81reqJ*L=FJJrkBnWtXt58n|+L)pyE1>`_-=pD1nM4v>FtK zdVkFnAc0DnOpzEwM z#AgS@axkx6%(`L~o1oS8dCMs-!dk&*m5NS}N&75H@rLNz&_DQofD9b3QX zBzugTMbghv@_8iiC_aPjX&`+-e#1@HmjoaNBMfgRy5=RRf~K9fs`WBhH1TI6d0~)k z^z;1zO19^agqD$fh@zxn8C4=QW#e{A#gmeOh0JWp{`)BWN?O+7Q#7C(4PM2FsP7r7 z`mPaG-!U}xmJw6mHgxre$+-HzhN1qCnn?Zk?m_-6D(SPOr0jGFTS}JpLr6GbOTmP| z(h={#$6@8bhuHKtwoc)iRMyLJP<0eS_Jk$E{`v1Bo4w1aQ_Y`4y#nh@!_x}<6e7%@ z5IuKkw2&n8QFQAGm;`@rI+nqY*t<}5j-qzoq=W?9JlF=a0){QLA``jfrYb5lNhlGi z6LdqsnI0AF=+=#1$0>BHjxRRV4d0yn3tkI?JJb!r47$&7De3n#jr1dZo39~X-^`MS zB%)!J3P{72efZ1@rfUnO6!H^kKs*=d1g#vyrQ?d@GJ-44BClz*b>XU)WquelpC~7p z>I{@q8$%A_$%!L(20MOO15u#C5651VTc2qC06umIltZuI)!~map$7$p5Vtwi(N3~h z=s(HN!q;f+;V_>ck~>*9jqZJ2$l&Ss8Q(+uK}tVK*5P8!Kt&4mS;|A$?JOQRW}R zUa?f!zA}VSi!n-dmO&b4!!(X8TT)jtY{YL9lh`Ox5$zqc2P^#dRyX5D1v#Ov47)?+ zootM4qutmSBG=L@SvT9h(@n3-Sb5p9xiUiGzlz`02nZJakYvBTgm?6N9+$Jr9br3G zm2123C5_2$H|q|=n@8JstrK>*vokg(f=8OkT`#C1hUqIien@&zw)gjyzNfv{9ig_= z;?6!QAL9_9D4MC-1Zau7UzC?Gf|?l>)Xe^Jj_r2F-BFC!T|~_UGrT8|No?j^cn)!epY^EFi+5#N1FQ7#}jTMs9uwCZU>7Gh_I&Dl6q3&ruqwcATrApn4 zm1^^KUbULX@e#Ss6Sd2QN-5;ZO`_{Xzz_e|W$}MR#4kKwtNhdBA;awd`6zhAIa=-` z9SYKJzVhqD?*@Ad6uXYM^l_jiP9B^PNh>`E27}%Naqs@-ZisVan*~E!Ei5Ary=a;D z-}iB|TB^Yt+vd$Lf2jy)-ua<>^7uX6MGNG08x!BVsL5SGS>$D?iLbmQS(5lRe2Z<} zqxd6&La+e$;{Jxr=SkEh1CrRk=x_n|qFS4buX0t9M?;cHL{Z^3N0(&k{jl_D>{PVk zJ{3GsWi4n2xNO|SeZ8Pt!&i+72m5Nt&}CsWIsXPlKm1W7o}BmOrB(#lAJ0WRmG#LiRj`mo8*AVcn+O`RPPAzqX= z+&QN`VJ~*(^zmayr)NC%v6H8t@U*j&XHK7;@wAyI&rTjazL^*7Vmkf{C;{dj3BfeJ zLdgIUFX;#9VrR1=_yoZqa3VK69dkw7?-oBO9v!RNpuPi`W2+N~HN4CehJnpMUgrhA 
z{5ZXgE^)wRPZcAi*0DYAr6~V$zB7W+W&?*FJf&`90yIU~fZge;G-LC$`FJ1UNJJ)o$r= zio;bU`JNip!CL@pC<28l#(Tt2WRNrSp#EKw)4hg>J^5}jrfEPXa#l%2=nWxLVAFT^ zYZ*lIy1e~%QuWz5)TgB0+4+7_&nW8EG|K4VIlysnLc@JXhRhTpnQ!B={{t)f!;#FU z@&4ZuR^1TWBYG#?+Ku$>N8o{UY=&%%<&7|xk1$+~;&`wDYCGX}@W_f@LiFzHI3X}7 z(1$9}N4%YM6ARM1g81Jc|9EK%2RmZ$x7zi{g1nM+vH4kkZy%kVq;XsN``86hfw}DH-i+ohHXdm zzvH?ldU4IKpAF!A!{3M&?hGYc1)8`X5&25aFpVlR+gacwnNSySMQ0;Oumu~(H`4Br z2)2MCKvog_j&~V-9)4%ubxRpH`rML&{J_SNO_zz} zX*i_MC+M&d!`>~EFDhJN%Fn_XUBV_C_E}XC*^3DJ4d|jYC6*f)Yh~h~C)*pLeEbNl zyekd<$YtcVDb`Z1?vz?PMfm4$(RW=?Cv!_N##@nbbN?;*Kr0?@N95j!;(Kr4QWKcN zSTQ>C-o!Gs{&Ufiz z5tt8(puPff)*lRFaUEvO7qB_wX$7>h#=I!(cGU*=+pO~6rpA%k_rrENWWp~}(h0J1 z!y7E~re>EcE(Ri~HFg09WXMAQF^Odd9u8+kY$8Y@^;Db>3z*|rF#291i1JGyp!uIt z3xwhALXkwz9M971(`biw*N^d4vb7?^8c%yB>O0AWeD$xLAB?|9v}u%#g=l<2GwS^ zfVQGpOhjzDu$Q_UgMGUjJZYR6l~grF8@;e%gKnH6JDkx-)21@wK5RR)0Z3Eg>p{Vp zrkxBVLu0V4uxT0Ja$dR^v}m{ujubYEwz@x}T^Of+8X104mgAGVTX8vQyAp_d}Mf9ocp13dI0AFoAYl{jhJeQlO^+vE+^Vb?LB|r z-s*jOnYs7L!+TFP$#l^6-W>m3+zIpGAJ%Cp=6(Q1fA6vt^G}`S_@5zHF8k9i>jW+9 zUa@`ebp6i+^g9`gJm}bn5I|R67m1ay2XSf28_wb$)1pkoq6P zW&aiuM{3LKQX9u<#L2GcI*#7pn1<$H56ID1fdOGx{Ab09_dSEf?0Z(7MBg(G%k({K z4!EP9@?tnV)BCK08|!_JJK#?I{NhxXI8qEALZyCsdKhL3&yAzL;&>td3i`yqOUd6+ zLQ|I;;xYX3Vi%gF3PVhTZOn^S%@3|Jot+h&cg~9!UQ}fC5(S1a_0OPlol1Lz5*kyn z6(=}kK{pDdF5Hk%e=*J2gpR;xe^62UZD!Q=Cp1+w=Qu1161)FbRrvSt>I0J(fly4| zZrsr1(Ru)_rGTKrM3r&!sT>9E3|X)aT3e@Sr-x{Za2uRLW;6gtMFu+u#>LrG8>e6? 
def _split_text(text: str, max_len: int) -> list[str]:
    """Cut *text* into consecutive pieces of at most *max_len* characters.

    The split is lossless: joining the returned pieces yields *text* again.
    Each cut is moved back to just after the last newline inside the window,
    or failing that the last space, so lines and words are kept intact
    whenever possible. A window without any separator is cut at *max_len*.
    """
    pieces: list[str] = []
    total = len(text)
    start = 0
    while start < total:
        end = min(start + max_len, total)
        if end < total:
            # Prefer a line boundary, then a word boundary. A separator at
            # the very start of the window is useless (it would produce a
            # one-character piece), hence the `> start` checks.
            cut = text.rfind("\n", start, end)
            if cut <= start:
                cut = text.rfind(" ", start, end)
            if cut > start:
                end = cut + 1  # keep the separator with the preceding piece
        pieces.append(text[start:end])
        start = end
    return pieces


async def send_chunked(message: "Message", text: str, parse_mode=None, *,
                       max_len: int = 4000, delay: float = 0.5):
    """Reply to *message* with *text*, split into several messages if needed.

    Args:
        message: Object whose awaitable ``reply_text(text, parse_mode=...)``
            is called once per piece.
        text: Full text to send. An empty string sends nothing.
        parse_mode: Passed through unchanged to every ``reply_text`` call.
        max_len: Maximum characters per message. The default of 4000 stays
            below Telegram's per-message text limit.
        delay: Seconds to pause between two consecutive messages. No pause
            happens before the first or after the last message.

    Raises:
        ValueError: If *max_len* is smaller than 1.
    """
    if max_len < 1:
        raise ValueError("max_len must be a positive integer")

    # Whitespace-only pieces are dropped: the Bot API refuses messages whose
    # text is empty, and one rejected piece would abort the remaining ones.
    pieces = [piece for piece in _split_text(text, max_len) if piece.strip()]

    for index, piece in enumerate(pieces):
        if index and delay > 0:
            # Pace multi-part replies; sleeping only *between* sends avoids
            # the useless wait after the final piece.
            await asyncio.sleep(delay)
        await message.reply_text(piece, parse_mode=parse_mode)
Use /finish to stop early.", + parse_mode=ParseMode.MARKDOWN + ) + + async def run_research(): + db_conn = await get_db() + db = ResearchDB(db_conn) + try: + session_id = await db.create_session(topic, chat_id) + _active_sessions[chat_id] = session_id + + progress_msg = msg + iteration_count = [0] + + async def on_progress(iteration, total, new_this_round, stats): + iteration_count[0] = iteration + text = fmt_progress(iteration, total, new_this_round, stats) + try: + await progress_msg.edit_text(text, parse_mode=ParseMode.MARKDOWN) + except Exception: + pass + + scraper = ExhaustiveScraper(db, session_id, topic, on_progress) + final_stats = await scraper.run() + + await db.update_session(session_id, status=ResearchStatus.SATURATED) + + scraped = final_stats.get("scraped", 0) + await update.message.reply_text( + f"✅ *Research complete!*\n\n" + f"📊 Results:\n" + f"• Sources found & scraped: `{scraped}`\n" + f"• Iterations: `{iteration_count[0]}`\n\n" + f"Now processing content with Ollama...\n" + f"Use `/generate podcast|blog|report|thread` when ready.", + parse_mode=ParseMode.MARKDOWN + ) + + # Auto-process after scraping + ollama = OllamaClient() + if await ollama.is_available(): + processor = ContentProcessor(db, ollama) + + async def proc_progress(total_chunks, total_words): + await update.message.reply_text( + f"🧠 *Processing complete!*\n" + f"• Chunks stored: `{total_chunks}`\n" + f"• Words researched: `{total_words:,}`\n\n" + f"Ready! Use `/generate podcast|blog|report|thread`", + parse_mode=ParseMode.MARKDOWN + ) + + await processor.process_session(session_id, topic, proc_progress) + else: + await update.message.reply_text( + "⚠️ Ollama not reachable — skipping processing.\n" + "You can still use `/generate` (will use raw content)." 
+ ) + + except asyncio.CancelledError: + await db.update_session( + _active_sessions.get(chat_id, 0), + status=ResearchStatus.FINISHED + ) + await update.message.reply_text("🛑 Research cancelled.") + except Exception as e: + logger.error("Research task failed", error=str(e)) + await update.message.reply_text(f"❌ Research failed: {str(e)[:200]}") + finally: + await db_conn.close() + + task = asyncio.create_task(run_research()) + _active_tasks[chat_id] = task + + +async def cmd_status(update: Update, ctx: ContextTypes.DEFAULT_TYPE): + if not is_authorized(update.effective_user.id): + return + + chat_id = update.effective_chat.id + db_conn = await get_db() + db = ResearchDB(db_conn) + + try: + session = await db.get_active_session(chat_id) + if not session: + # Try to find last session + cursor = await db_conn.execute( + "SELECT * FROM research_sessions WHERE telegram_chat_id = ? ORDER BY created_at DESC LIMIT 1", + (chat_id,) + ) + row = await cursor.fetchone() + session = dict(row) if row else None + + if not session: + await update.message.reply_text("No research sessions found. 
Start with /research ") + return + + stats = await db.get_session_stats(session["id"]) + is_active = chat_id in _active_tasks and not _active_tasks[chat_id].done() + + status_emoji = {"running": "🔄", "saturated": "✅", "finished": "🏁", "error": "❌"} + emoji = status_emoji.get(session["status"], "❓") + + await update.message.reply_text( + f"{emoji} *Research Status*\n\n" + f"📝 Topic: `{session['topic']}`\n" + f"🔁 Status: `{session['status']}`\n" + f"🔢 Iterations: `{session.get('iterations', 0)}`\n" + f"📚 Total sources: `{stats.get('total', 0)}`\n" + f"✅ Scraped: `{stats.get('scraped', 0)}`\n" + f"❌ Failed: `{stats.get('failed', 0)}`\n" + f"⏳ Pending: `{stats.get('pending', 0)}`\n" + f"💬 Chunks: `{session.get('total_chunks', 0)}`\n" + f"📖 Words: `{session.get('total_words', 0):,}`\n" + f"{'🟢 Active' if is_active else '⚫ Idle'}", + parse_mode=ParseMode.MARKDOWN + ) + finally: + await db_conn.close() + + +async def cmd_finish(update: Update, ctx: ContextTypes.DEFAULT_TYPE): + if not is_authorized(update.effective_user.id): + return + + chat_id = update.effective_chat.id + task = _active_tasks.get(chat_id) + + if task and not task.done(): + task.cancel() + await update.message.reply_text( + "🛑 Stopping research...\n" + "Use `/generate podcast|blog|report|thread` to generate output.", + parse_mode=ParseMode.MARKDOWN + ) + else: + await update.message.reply_text( + "No active research. 
Use `/generate` to create output from last session.", + parse_mode=ParseMode.MARKDOWN + ) + + +async def cmd_generate(update: Update, ctx: ContextTypes.DEFAULT_TYPE): + if not is_authorized(update.effective_user.id): + return + + chat_id = update.effective_chat.id + output_arg = ctx.args[0].lower() if ctx.args else "" + + type_map = { + "podcast": OutputType.PODCAST, + "blog": OutputType.BLOG, + "report": OutputType.REPORT, + "thread": OutputType.THREAD, + "hilo": OutputType.THREAD, + "informe": OutputType.REPORT, + } + + if output_arg not in type_map: + await update.message.reply_text( + "❌ Invalid output type.\n" + "Use: `/generate podcast|blog|report|thread`", + parse_mode=ParseMode.MARKDOWN + ) + return + + output_type = type_map[output_arg] + + db_conn = await get_db() + db = ResearchDB(db_conn) + + try: + # Find last session for this chat + cursor = await db_conn.execute( + """SELECT * FROM research_sessions WHERE telegram_chat_id = ? + ORDER BY created_at DESC LIMIT 1""", + (chat_id,) + ) + row = await cursor.fetchone() + if not row: + await update.message.reply_text("No research sessions found. 
Start with /research ") + return + + session = dict(row) + session_id = session["id"] + + msg = await update.message.reply_text( + f"⚙️ Generating *{output_type}* for: `{session['topic']}`\n" + f"Using Ollama ({settings.ollama_model})...\n" + f"This may take 2-5 minutes ☕", + parse_mode=ParseMode.MARKDOWN + ) + + async def gen_progress(text): + try: + await msg.edit_text(text) + except Exception: + pass + + ollama = OllamaClient() + processor = ContentProcessor(db, ollama) + generator = OutputGenerator(db, ollama, processor) + + output = await generator.generate(session_id, output_type, gen_progress) + + # Send as file if very long + if len(output) > 8000: + import tempfile + ext_map = { + OutputType.PODCAST: "script.md", + OutputType.BLOG: "post.md", + OutputType.REPORT: "report.md", + OutputType.THREAD: "thread.txt", + } + filename = f"researchowl_{session['topic'][:30].replace(' ', '_')}_{ext_map[output_type]}" + + with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f: + f.write(output) + tmp_path = f.name + + with open(tmp_path, "rb") as f: + await update.message.reply_document( + document=f, + filename=filename, + caption=f"📄 *{output_type.upper()}* — {session['topic']}\n" + f"Generated by ResearchOwl 🦉", + parse_mode=ParseMode.MARKDOWN + ) + os.unlink(tmp_path) + else: + await send_chunked(update.message, output) + + except Exception as e: + logger.error("Generate failed", error=str(e)) + await update.message.reply_text(f"❌ Generation failed: {str(e)[:200]}") + finally: + await db_conn.close() + + +async def cmd_sources(update: Update, ctx: ContextTypes.DEFAULT_TYPE): + if not is_authorized(update.effective_user.id): + return + + chat_id = update.effective_chat.id + db_conn = await get_db() + db = ResearchDB(db_conn) + + try: + cursor = await db_conn.execute( + "SELECT * FROM research_sessions WHERE telegram_chat_id = ? 
ORDER BY created_at DESC LIMIT 1", + (chat_id,) + ) + row = await cursor.fetchone() + if not row: + await update.message.reply_text("No sessions found.") + return + + session_id = row["id"] + sources = await db.get_all_sources(session_id) + + by_type: dict = {} + for s in sources: + t = s["source_type"] + by_type.setdefault(t, []).append(s) + + lines = [f"📚 *Sources for session #{session_id}*\n"] + for stype, srcs in by_type.items(): + scraped = sum(1 for s in srcs if s["status"] == "scraped") + lines.append(f"\n*{stype.upper()}* ({scraped}/{len(srcs)} scraped)") + for s in srcs[:5]: # show top 5 per type + quality = s.get("quality_score", 0) + status_icon = {"scraped": "✅", "failed": "❌", "pending": "⏳", "skipped": "⏭️"}.get(s["status"], "❓") + title = (s.get("title") or s["url"])[:50] + lines.append(f"{status_icon} {title} (q:{quality:.1f})") + if len(srcs) > 5: + lines.append(f" ... and {len(srcs)-5} more") + + await send_chunked(update.message, "\n".join(lines), parse_mode=ParseMode.MARKDOWN) + finally: + await db_conn.close() + + +async def cmd_outputs(update: Update, ctx: ContextTypes.DEFAULT_TYPE): + if not is_authorized(update.effective_user.id): + return + + chat_id = update.effective_chat.id + db_conn = await get_db() + db = ResearchDB(db_conn) + + try: + cursor = await db_conn.execute( + "SELECT * FROM research_sessions WHERE telegram_chat_id = ? ORDER BY created_at DESC LIMIT 1", + (chat_id,) + ) + row = await cursor.fetchone() + if not row: + await update.message.reply_text("No sessions found.") + return + + outputs = await db.get_outputs(row["id"]) + if not outputs: + await update.message.reply_text( + "No outputs generated yet. 
Use `/generate podcast|blog|report|thread`", + parse_mode=ParseMode.MARKDOWN + ) + return + + lines = [f"📄 *Outputs for: {row['topic']}*\n"] + for o in outputs: + from datetime import datetime + dt = datetime.utcfromtimestamp(o['created_at']).strftime("%Y-%m-%d %H:%M") + lines.append(f"• `{o['output_type']}` — {dt} ({len(o['content'])} chars)") + + await update.message.reply_text( + "\n".join(lines), + parse_mode=ParseMode.MARKDOWN + ) + finally: + await db_conn.close() + + +async def cmd_cancel(update: Update, ctx: ContextTypes.DEFAULT_TYPE): + if not is_authorized(update.effective_user.id): + return + + chat_id = update.effective_chat.id + task = _active_tasks.get(chat_id) + if task and not task.done(): + task.cancel() + await update.message.reply_text("🛑 Research cancelled.") + else: + await update.message.reply_text("No active research to cancel.") + + +async def cmd_help(update: Update, ctx: ContextTypes.DEFAULT_TYPE): + await cmd_start(update, ctx) + + +# ─── Bot setup ──────────────────────────────────────────────────────────────── + +def create_bot() -> Application: + app = Application.builder().token(settings.telegram_bot_token).build() + + app.add_handler(CommandHandler("start", cmd_start)) + app.add_handler(CommandHandler("help", cmd_help)) + app.add_handler(CommandHandler("research", cmd_research)) + app.add_handler(CommandHandler("status", cmd_status)) + app.add_handler(CommandHandler("finish", cmd_finish)) + app.add_handler(CommandHandler("generate", cmd_generate)) + app.add_handler(CommandHandler("sources", cmd_sources)) + app.add_handler(CommandHandler("outputs", cmd_outputs)) + app.add_handler(CommandHandler("cancel", cmd_cancel)) + + return app + + +def run(): + logger.info("Starting ResearchOwl bot") + app = create_bot() + app.run_polling(allowed_updates=Update.ALL_TYPES) diff --git a/src/config.py b/src/config.py new file mode 100644 index 0000000..81cf9f9 --- /dev/null +++ b/src/config.py @@ -0,0 +1,49 @@ +from pydantic_settings import 
BaseSettings +from pydantic import Field +from typing import Optional + + +class Settings(BaseSettings): + # Telegram + telegram_bot_token: str = Field(..., env="TELEGRAM_BOT_TOKEN") + telegram_allowed_users: str = Field("", env="TELEGRAM_ALLOWED_USERS") # comma-separated user IDs + + # Ollama + ollama_url: str = Field("http://ollama.chemavx.xyz", env="OLLAMA_URL") + ollama_model: str = Field("qwen2.5:3b", env="OLLAMA_MODEL") + ollama_embed_model: str = Field("qwen2.5:3b", env="OLLAMA_EMBED_MODEL") + + # Claude fallback (optional) + anthropic_api_key: Optional[str] = Field(None, env="ANTHROPIC_API_KEY") + claude_model: str = Field("claude-haiku-4-5", env="CLAUDE_MODEL") + + # Database + db_path: str = Field("/data/researchowl.db", env="DB_PATH") + + # Scraping + max_depth: int = Field(3, env="MAX_DEPTH") # recursion depth + max_sources: int = Field(150, env="MAX_SOURCES") # hard cap + max_pages_per_search: int = Field(5, env="MAX_PAGES_PER_SEARCH") + request_timeout: int = Field(30, env="REQUEST_TIMEOUT") + request_delay: float = Field(1.0, env="REQUEST_DELAY") # seconds between requests + min_content_length: int = Field(200, env="MIN_CONTENT_LENGTH") # chars + + # Processing + chunk_size: int = Field(800, env="CHUNK_SIZE") # tokens per chunk + chunk_overlap: int = Field(100, env="CHUNK_OVERLAP") + quality_threshold: float = Field(0.5, env="QUALITY_THRESHOLD") # 0-1, chunks below discarded + + # App + log_level: str = Field("INFO", env="LOG_LEVEL") + + @property + def allowed_user_ids(self) -> list[int]: + if not self.telegram_allowed_users: + return [] + return [int(uid.strip()) for uid in self.telegram_allowed_users.split(",") if uid.strip()] + + class Config: + env_file = ".env" + + +settings = Settings() diff --git a/src/db/__init__.py b/src/db/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/db/__pycache__/__init__.cpython-310.pyc b/src/db/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..d0a25f527f91ceba24d75ca82402becdea23706f GIT binary patch literal 137 zcmd1j<>g`kf<0;PGePuY5P=LBfgA@QE@lA|DGb33nv8xc8Hzx{2;!HHenx(7s(x}t zYHng#g?>?LacW{waz=i6j(%}bvVKaEetdjpUS>&ryk0@&Ee@O9{FKt1R6CF<#Y{kg Gg#iGlZyo;t literal 0 HcmV?d00001 diff --git a/src/db/__pycache__/database.cpython-310.pyc b/src/db/__pycache__/database.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..73791ad78c3ce5fe2065566da6fc503881efcca6 GIT binary patch literal 10724 zcmcgyOK=Iyrf!Gd-x6x{lo6|Jw(4mw+Uv zGhJyv?tbs@|M>kMi|)ikPQvwX>;KvO>D!X@dwPjpDZE_4ZT}4lCNa4st$KguRav|% zs|x-qEw!z!YBIH{tyEiI)y1>cO1Cqs8S$KIW!t&coGe|Jn9kBqB$l?utjuF=bsRMr zmPJk0ubE)E)jS)sq^%?C3L9q=PqfuZ^vbg%=yl|=vU-%sRjDxfIrb%$3$m--GM!EQ z%`ERY&2GnRQC+FiYg3(VIZm^)VK=CE^pf#%1^2U4C@hK2g!9O&DpOW9rmm)#wyLuf zR?^*k#j-7vH#Vz|>GW(j&3l~=W_7bR-nr>m%+0JfJ575NPbrJ@E-x3-Zl+%Em~E?G zcXRc6yUThldLFOW@AS-;-;%DZl*^0d>u$C>S6ivf)k+I)=GtO;v3dhfsZyn~Tse^V zG3-(lUlh-8cH7qZ#-`Oa@7_Dl{jGI(Tjy=wIM3G3Gt)8GOxv2>xzE!WH%7-KZf7J- zrhge()Q_YI*U=g0&Mo(xot{&>zhk-Soi1ybw&QARt?q`ab8Dx|9ancYxn(k(mXBi) zpP(X-!cE^=UYMV&)?DrC((-jzuas^rS8A?ayHP34EesCiJx$ZN85C3W@LKR@{v;zw zxp|DJm5kcl)uocLc+Ds;*NoCDt`NQq4H+ zi@<4r602oxaI;-+Y?@9zvN&yZFr^c8Yh$ueAx(ADF>vOk#EyD9VC~4( zrei@A1r36u2fMux9Mv&fb-UZ+4a*McGqh7vNuS*=XCu3`EypxIt}d4cB>U-u z)49S2xfiiB*xT1-qR;&ZJwwugkk?9;QhB~KL|V=?S;3=CkGJBi_DR-sT9#<=-n|Xh z9jJs?OTy+|=gt|s&F$un#hPXjg2`CZDH`{?J*T&36^$LXUNm;CH8k0_kUVDXIGdx` zKoHS%?$_-`ms_4NBp5~HuQ$5Ajx%ZwyTMItCy*xp1QD?gW<0%Pbr=BcbkQ(y4F9cY z_!AV3b+g&B7%Ff+%qX@mU@k(MmGa_`R!V0EtSA~FYT;!p_a*KBpW{9&hbU`QWPFJn z2uh>daV*e$2x5q%)SDe<-3wWg0D$hc)rm9BzCOqfgyIlkqQpLg<^!^O zD-RSe)NhY|FXGAwOnuo|efjWb{v*yTaCRspd8!gU`PUM&S2)mOd7<=K!d5ofy#V|J zb7=sdgQW*N0!aHNchoeV(ne?TY9iYI%9Hq_Pg^-ONzsTa#zLt&|MH2GoC^;L%@fLN zpYgi@{y)f(GXpoS6Z412Kh$vl8n<0T@kn+gN8VB%%UkM}#$^0cey+%pliJs*{;9kO zANYp!05x0bzEscjrTwgv3+iPF?_*CTuNS?+w^Yluf&$;lOg~b(>ah`g{?)rF>=IQeZz9Wld(9MD!L{E<0~nq)^&lg}PwKNPjoSm`y~Gno5z{M9Xt 
z#QfvzI6LtqwR(h|6f?cSPGQ_6`#_9(OYG#R*on@*!3^}A@>lU{HTEa$n;3B{Tt&C0 zah7kfZ=?1#tba+gV0QL4*8NlbJ&pT}HNy(n!_gB6g0S)=JImg|?60#Ahpc>#&7$qN zzj7MqJI^j){u83+BIc#JX`T#wmwg8wFXM=V(CA@NowdYRrZ=i2OMG;)27zhnK^}K=yQqO(q zsq{eYOIz|jLb<-|K=%4dU+t^wsi%sso|7?@S~KB2Q-5GHqEu8mLTmR8`6p82R~4E`4oFltV-Z3)u3X@+HvR+Zpn_^is*DlP`KPkc=@|vF%0nF_$$gbqC*{4F2st91N1mTw!VGkh zU%c!jNvZHomJ}*Fr!iMv@TCr~Vd$BhV7an@aQEu%h!`UHs?Hlri#Hc*#zp=iZOx|( z35=i%pQEQRzrck7n4+V?fHdz~0eSY&%{K-lFPfVBGOvR`Zzc#79~Gk$DJ|W^6SzAQeSO9#ZrCsZvmvgRQp;W zUu09!rUw#6xhF_fy6ol#%J_u>H6`O7Tm(aeEQ0+f@$d-K{5bOf3t|E3m&zxSEZxBd z0dmToM*JuXavdnHGJC<5x0A%nUfQra*1a8m`L8kEGtvy@;c;oN7@?H6p@6otms;J1 z*|IOshC}z>T)DMCSr6_fc!IvRQ7zSsFD8trCLsOtJuKsDTis^I)mrc(T>Y-u>RI+c zMFfBZs6l$+>f5^}->?a?9h?6MRmrpD;~RZ~EFS9l*oU;)KiM3K`F=JqNHVL-2VbEkL0~?4J-;wI<&v9MU1*GT0`3cB+$2z$=%lY zH}Rx>3Xx$Fl^VSB`?E4sT9$4LeAp!~e9!Ss|8m7xyk1_0i4jgfK)Fab73EgmuFGy3+6B_DfV5mf1hrf@ryV=sc1~NrR>_N*1HtiF%@lUDv zjEdV-key4iF=sO7hkbvCW_t#O@=#HTgX6;JABPU(?W?`lB77W-LY6y-WMirvLvz~0 zBiQ?Wpg3IpD1ytL)sG>#y00-!Ab9E!f~)%x=S(2DUgp;@|47*8*U`#vP$BHtV)!Ip z$)fQk6n{?-H&N`p6FF&sR^pJukocI7#DOpn{_--W0qd4&%3D+j^dy&=6RgFs_QzCF zp`uE~Fr1x;?cxC4goCX-B#=D?sNFk0YRjLKC+?q=la$rXz@ zW}44Jjf^l1s8-&yO-`679B`0KkXA01*h*$pD# zGh*llS!w+Ny#BuYSpJ1_N2z+;KalxHXg^T+$N1yQhdkljB{Gkc47>a*nvEQQX=(4> z2z?2}{EY`0lZar1!w=Mf>iYht*=im5e!1{Ge}c(9_fvyb+ezg-He{uLz>o;a>d?`A zDlhN7_2Lc2bTevZpOLHgY96`Icy++|YhQ!$AxjVEY)0~un?xX?rfc-n2ww-PYoYF+ zseT>B-fV&lp!kb;#5XYLP%yx^Xg;w-xQ=^-m=T%opU~%7#N5fb=Xni`I5~O;G5sSZ zg!)Ipag+(8^^ZwC(mI3?@A<^P=mkSB)jIlyL19ruEcs=u8~FS}B8A-;W+*0nm=`%H z`(H554x%B2;-TkLS`36@-9-k9^5X7TBn$42IfI0^I}wu$HydWO8)WQ*D~ktOSD+T5 zY@BLj_+kF_ewt4?2o3t_r*a1$Zyjx4dn&DGLQF&Y4Iya6l=L$}od1u}IN!@T8Zis+ zAFYWm9)89U5A;RMOYR?XCL^azWGstDc*{+;8 zvygZC6S!itKHJ2{IpO&Fylo_n&&T)(9OS(XTV$St!6LI@&o;epT;#j?muKr5vqP+1 zV>Tw)#$_mBa>)=Jlbs_G`e>8)sxbNQAh8FN=OPRuTtn{4?vO$0A>_ z8e|%Ri^v^DJKx7k^t_b51k53_Sr79ZA_pQ-RwVqEplxZM$@nA+ZQ<9tfy@?-$Tp(r z7JgGI3<*{WG5MfOLj-c7?#^FQlYpFI@G@qb$R~Ll_z(1q2uU9HWl{sMMDuNt?lDwR 
zF_vOPfhtMwrmy()3UNi`g<|vyvW**qs1H9+S(=-Vg?-`IA{d7t0vb-Tq_QW_yZ{3{ zP_K}D1=R!V0Zu2x?wBBezqW6P{)^}~u*(#|O{ZeM;gCV2^^0PasJy*MzxathVrtSc zK|ko(`vD&N-^3%?h5_+>eG4ckk`9y;5b74VCV~b+X6C3VK?sqKjML#u>Nbz!Py#aE zZ{*Q=3xg@PnfDLjrRHvwxcfMLgg>n*b4iKf_R4__n+~gkU1C z?T=Jx>Ml_#s#}C;euskM))>~cE;rFSn zPsIZ&{)&o+R6L^MXH*PCMLbPY-FXxZ+%|=P8EI0UNzLSD()rv}R+*gqN+(`Hbr@^D7ojSj!cc6%uZ!yGAGkw-olii0pF#CNSEpJt+Ux`u8B{! z_(k0P9sEi|(7@F!{0@;5+T-I1Z_Gl8*P9#Svo3!^6A+Av-@Wm(^z@htvUy^uVP|tQ im%LvRUItqw`#6e>l)<;zaqpiprB0=_^yJ)B_J08(Ok(N) literal 0 HcmV?d00001 diff --git a/src/db/database.py b/src/db/database.py new file mode 100644 index 0000000..cf099fa --- /dev/null +++ b/src/db/database.py @@ -0,0 +1,265 @@ +import aiosqlite +import json +import time +from pathlib import Path +from typing import Optional +from enum import Enum + +from src.config import settings + + +class ResearchStatus(str, Enum): + RUNNING = "running" + SATURATED = "saturated" + FINISHED = "finished" + ERROR = "error" + + +class OutputType(str, Enum): + PODCAST = "podcast" + BLOG = "blog" + REPORT = "report" + THREAD = "thread" + + +SCHEMA = """ +CREATE TABLE IF NOT EXISTS research_sessions ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + topic TEXT NOT NULL, + status TEXT NOT NULL DEFAULT 'running', + telegram_chat_id INTEGER NOT NULL, + telegram_message_id INTEGER, + created_at REAL NOT NULL, + updated_at REAL NOT NULL, + iterations INTEGER DEFAULT 0, + total_sources INTEGER DEFAULT 0, + total_chunks INTEGER DEFAULT 0, + total_words INTEGER DEFAULT 0, + meta JSON DEFAULT '{}' +); + +CREATE TABLE IF NOT EXISTS sources ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + session_id INTEGER NOT NULL REFERENCES research_sessions(id), + url TEXT NOT NULL, + title TEXT, + source_type TEXT, -- wikipedia, reddit, youtube, pdf, web, rss + depth INTEGER DEFAULT 0, + quality_score REAL DEFAULT 0, + word_count INTEGER DEFAULT 0, + scraped_at REAL, + status TEXT DEFAULT 'pending', -- pending, scraped, failed, skipped + error TEXT, + UNIQUE(session_id, 
url) +); + +CREATE TABLE IF NOT EXISTS chunks ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + session_id INTEGER NOT NULL REFERENCES research_sessions(id), + source_id INTEGER NOT NULL REFERENCES sources(id), + content TEXT NOT NULL, + chunk_index INTEGER NOT NULL, + token_count INTEGER, + quality_score REAL DEFAULT 0, + embedding JSON, -- stored as JSON array for sqlite-vec compat + created_at REAL NOT NULL +); + +CREATE TABLE IF NOT EXISTS outputs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + session_id INTEGER NOT NULL REFERENCES research_sessions(id), + output_type TEXT NOT NULL, + content TEXT NOT NULL, + created_at REAL NOT NULL +); + +CREATE TABLE IF NOT EXISTS source_contents ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + source_id INTEGER NOT NULL UNIQUE REFERENCES sources(id), + content TEXT NOT NULL, + created_at REAL NOT NULL +); + +CREATE INDEX IF NOT EXISTS idx_sources_session ON sources(session_id); +CREATE INDEX IF NOT EXISTS idx_chunks_session ON chunks(session_id); +CREATE INDEX IF NOT EXISTS idx_chunks_quality ON chunks(session_id, quality_score DESC); +CREATE INDEX IF NOT EXISTS idx_source_contents ON source_contents(source_id); +""" + + +async def get_db() -> aiosqlite.Connection: + Path(settings.db_path).parent.mkdir(parents=True, exist_ok=True) + db = await aiosqlite.connect(settings.db_path) + db.row_factory = aiosqlite.Row + await db.executescript(SCHEMA) + await db.commit() + return db + + +class ResearchDB: + def __init__(self, db: aiosqlite.Connection): + self.db = db + + # --- Sessions --- + + async def create_session(self, topic: str, chat_id: int) -> int: + now = time.time() + cursor = await self.db.execute( + """INSERT INTO research_sessions (topic, status, telegram_chat_id, created_at, updated_at) + VALUES (?, ?, ?, ?, ?)""", + (topic, ResearchStatus.RUNNING, chat_id, now, now) + ) + await self.db.commit() + return cursor.lastrowid + + async def get_session(self, session_id: int) -> Optional[dict]: + cursor = await self.db.execute( + "SELECT 
* FROM research_sessions WHERE id = ?", (session_id,) + ) + row = await cursor.fetchone() + return dict(row) if row else None + + async def get_active_session(self, chat_id: int) -> Optional[dict]: + cursor = await self.db.execute( + """SELECT * FROM research_sessions + WHERE telegram_chat_id = ? AND status = 'running' + ORDER BY created_at DESC LIMIT 1""", + (chat_id,) + ) + row = await cursor.fetchone() + return dict(row) if row else None + + async def update_session(self, session_id: int, **kwargs): + kwargs["updated_at"] = time.time() + sets = ", ".join(f"{k} = ?" for k in kwargs) + values = list(kwargs.values()) + [session_id] + await self.db.execute( + f"UPDATE research_sessions SET {sets} WHERE id = ?", values + ) + await self.db.commit() + + async def get_session_stats(self, session_id: int) -> dict: + cursor = await self.db.execute( + """SELECT + COUNT(*) as total, + SUM(CASE WHEN status='scraped' THEN 1 ELSE 0 END) as scraped, + SUM(CASE WHEN status='failed' THEN 1 ELSE 0 END) as failed, + SUM(CASE WHEN status='pending' THEN 1 ELSE 0 END) as pending + FROM sources WHERE session_id = ?""", + (session_id,) + ) + row = await cursor.fetchone() + return dict(row) if row else {} + + # --- Sources --- + + async def add_source(self, session_id: int, url: str, source_type: str, + depth: int = 0, title: str = None) -> Optional[int]: + try: + cursor = await self.db.execute( + """INSERT OR IGNORE INTO sources (session_id, url, title, source_type, depth) + VALUES (?, ?, ?, ?, ?)""", + (session_id, url, title, source_type, depth) + ) + await self.db.commit() + return cursor.lastrowid if cursor.rowcount > 0 else None + except Exception: + return None + + async def update_source(self, source_id: int, **kwargs): + sets = ", ".join(f"{k} = ?" 
for k in kwargs) + values = list(kwargs.values()) + [source_id] + await self.db.execute(f"UPDATE sources SET {sets} WHERE id = ?", values) + await self.db.commit() + + async def get_pending_sources(self, session_id: int, limit: int = 10) -> list[dict]: + cursor = await self.db.execute( + """SELECT * FROM sources WHERE session_id = ? AND status = 'pending' + ORDER BY depth ASC, id ASC LIMIT ?""", + (session_id, limit) + ) + rows = await cursor.fetchall() + return [dict(r) for r in rows] + + async def get_all_sources(self, session_id: int) -> list[dict]: + cursor = await self.db.execute( + "SELECT * FROM sources WHERE session_id = ? ORDER BY quality_score DESC", + (session_id,) + ) + rows = await cursor.fetchall() + return [dict(r) for r in rows] + + async def source_exists(self, session_id: int, url: str) -> bool: + cursor = await self.db.execute( + "SELECT 1 FROM sources WHERE session_id = ? AND url = ?", + (session_id, url) + ) + return await cursor.fetchone() is not None + + # --- Chunks --- + + async def add_chunk(self, session_id: int, source_id: int, content: str, + chunk_index: int, token_count: int, quality_score: float, + embedding: Optional[list] = None) -> int: + cursor = await self.db.execute( + """INSERT INTO chunks (session_id, source_id, content, chunk_index, + token_count, quality_score, embedding, created_at) + VALUES (?, ?, ?, ?, ?, ?, ?, ?)""", + (session_id, source_id, content, chunk_index, + token_count, quality_score, + json.dumps(embedding) if embedding else None, + time.time()) + ) + await self.db.commit() + return cursor.lastrowid + + async def get_top_chunks(self, session_id: int, limit: int = 50) -> list[dict]: + cursor = await self.db.execute( + """SELECT c.*, s.url, s.title, s.source_type FROM chunks c + JOIN sources s ON c.source_id = s.id + WHERE c.session_id = ? AND c.quality_score >= ? 
+ ORDER BY c.quality_score DESC LIMIT ?""", + (session_id, settings.quality_threshold, limit) + ) + rows = await cursor.fetchall() + return [dict(r) for r in rows] + + async def get_chunks_count(self, session_id: int) -> int: + cursor = await self.db.execute( + "SELECT COUNT(*) FROM chunks WHERE session_id = ?", (session_id,) + ) + row = await cursor.fetchone() + return row[0] + + # --- Outputs --- + + async def save_output(self, session_id: int, output_type: str, content: str) -> int: + cursor = await self.db.execute( + "INSERT INTO outputs (session_id, output_type, content, created_at) VALUES (?, ?, ?, ?)", + (session_id, output_type, content, time.time()) + ) + await self.db.commit() + return cursor.lastrowid + + async def save_source_content(self, source_id: int, content: str): + await self.db.execute( + """INSERT OR REPLACE INTO source_contents (source_id, content, created_at) + VALUES (?, ?, ?)""", + (source_id, content, time.time()) + ) + await self.db.commit() + + async def get_source_content(self, source_id: int) -> Optional[str]: + cursor = await self.db.execute( + "SELECT content FROM source_contents WHERE source_id = ?", (source_id,) + ) + row = await cursor.fetchone() + return row[0] if row else None + + async def get_outputs(self, session_id: int) -> list[dict]: + cursor = await self.db.execute( + "SELECT * FROM outputs WHERE session_id = ? 
ORDER BY created_at DESC", + (session_id,) + ) + rows = await cursor.fetchall() + return [dict(r) for r in rows] diff --git a/src/generator/__init__.py b/src/generator/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/generator/__pycache__/__init__.cpython-310.pyc b/src/generator/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7ed6028bec4073fd7457d2fbedd7d97d62de1a68 GIT binary patch literal 144 zcmd1j<>g`kf<0;PGePuY5P=LBfgA@QE@lA|DGb33nv8xc8Hzx{2;!Hqenx(7s(x}t zYHng#g?>?LacW{waz=i6j(%}bvVMAMUTRTdNq&)je0*kJW=VX!UP0w84x8Nkl+v73 NJCI?;OhAH#0RXMmAqM~e literal 0 HcmV?d00001 diff --git a/src/generator/__pycache__/generator.cpython-310.pyc b/src/generator/__pycache__/generator.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0e6e594de0d87bf9f69ffc7ee8ccae4faebff7f0 GIT binary patch literal 6560 zcmb_g%WovddGFWs%ruA3rT5_%3W`_Rfyg13D<#0=3_ieY)iRikxOol5)udy_?C--++)Cp{0DtZ4uMY2*>R%$zUrPKm(oee5M8XU zuE+QMecxBn__(9t_xt=e{;$5NY5z_ySAP|}Jj4@AG+f)!xXz70-_h~k42-V1V|J|_ zOQ&%wD0S@}yIbBV>zdHdD;L^Mm6vuLZtslo@=lFccE&~dV1ifCPI3qB6dyx7&1=53 zGxJ2_<9y;%jZb`H?A+i+ORG=zKXSH2EL`bzHjV@KM1(@RNhD)uQ${@XM9kture2at z!C90hy)=p0o{YMzCnJ20F)WK6SQ`7`KHCTax9d8egC%XfZe+E5U^VbXm}FC{QJA2E z{b_k5F=Ir-<42jZp$^$T?Fn~E*R+qKl(|wcH)P_ZCuG8U5%=6UVX-IuUUDq`M92nP zmBLK~6XCwQ4=U!^u?T|sE5zs1W9B3sH(`511T1vHmm9F-jtE)Yiw^P64Wg7c*z;I0 zFA9%@jNQbKLd*)>a6fhT#T=6z0U7xo20uhajfW z&%P{?-5}a$xggZKgD91t*pHKuTw2YI`KXupUB53lXuU2SS0+B_kJ+)GbXd+wN4OkR zgSDYlN$D{JK*XU7buiCQlz`#}b4X}JCy$Opwin5+nG(6Gh|Hqek8yS}ij!v&|L}HZ!nv4A-kQD762roew=GcoQtkhAgh!f$ZYCKN6 zAntUIagn%w00Q>>uwXET#5^)GI>QIY&^;8%eQXKkg7h>bvE}+Cy~Q-1+sPFp8*#7B zPTXB3(vBbl?vcc#57w_T%HqiLVMtxUeYY_2k=90X-^PA0^MyZUNe6~bMy0%{;WS;Z z!;)hm64p)O6}y6gR48WY`-{x$xYG5ou%I+dySsQ{_wFyO&#|S2^+uynpJSaUIwYP0 ze{Zh?E2sI+ntLdi4>!Xef*4U+A5wH_y;20?$PE+LapO+n?#B%7m2}7zdBgoRT=NTf18%EK#Z_Jt2w2VQ?%r^m))0`Eoq6_beS2$z 
z-6IazTTJc3e5fu0d-D%i*AMHM)oMOjYp!pz`^)T+>mBY(QYU+pDJ(ikuFP0kW87tX>tC+A~BXGI6B%JnZgK9I<$Kj z(PuwoFLq%9ha!nzLUvotR&!-*^(k9h*=}w%wK>w>dTNJWOpCNS~`a z$1O4geERS@^pifwjg?nEU0QhhfYJ6)6=svZ_msXn{(dLG18~ClS{fuiq`&}kBb5U7 z5bdBjngL6Upgah$0EGO21fJs_yQj4Ihmj;_O~YiF-G66saefI6JH{w|-Ht-B487%8 zNv5$2Zy5U6*HG6;wxszX7%jX1@c*N^k?-aZ`?@dQZ?NVFYg;+ivU^KIZtKnxVFn}y z_$O_^Yg!&ci1VP4yIcXr>|Tyig4dn5vFFeOR|;%)-f6H_4%dH@BFH7DYz1jZ9Fq|_ ze*o-!9lFUmW27yBpw+HdBZ{=bHC?rQiceoxZzQW5|3BrADt1kP|v-jU` zZmAfM#}U}&{dZue`CAMOO56a0S0jykGKw!`$LlJt5WaUQwk$1Da9LWU=z_1qJSR>i z;-AV9mA~QBqtp*Li&OYlh~!~cP$Wa3pm;_bHv!YiX%S-{zE+R&(H`B*}Jx5a9x<(5EeH@Y0u^s zFMVq5*j1Z0oRlju=8pBa2Nvm=$5XI11OdFt(1Ab9wrzJX^IPka;2Kp$vr zZD63)2PRsBYc1^;`g+}zWqg-Zy~rw>x|Lb62==ZG|3Z^BeEsd)3!SJd7Ccyodvvle zM3Lw?Scs*!kf(vjj4ykQ-f328xBbvh+U-5;5!+~N-PRqwfBPD9jp4-g%pn{AwzPeo zjUlms+qV-`5}7RpQvL8XhNoytJpV%b{}mkXr(pO%JJ$#Lh5igB*Hi7x8d&jcVsK-i z|6JqdXXYpR&-8)CtvlLTDKQVARtTuX?N_F*4NN&dFnM`kU~Yxx=DXmFcCO}K7-u%l zun)=*uFb35d1V$Sl?Ubc13LepGSK-Lue~ygFBLw1UgZ;f@>6Y4d8~Z`>dt8PpMkC^ zMNj$MxzNt4$=IO!iO#1FE%`B@Ij;_?C>y8$(zv5t=={bl4XaG~=b&&_QTUB(6xId? 
zc6O5S0ahGL3`&DCzxm2I8v|972UCNwfx~a{+aro=N?Oxbq%{scXRyu~ScCBk>uh2$ z!SCGC1~r^Lsb=2*-^O6_qTpKBlnlQ)FhKhxX#bXy<0Opu+x>cBzLWtJrDtA3To#;# zy4U-aR<3x~cPR78^YyHx^1=Reu@NO&xly3zw8$>x40NBqh2}s1{%>9Z*c0iCBbv7h zT)#XAg^WPv9;oyCCUEWZ?_yD1S4XrDKc6k$0u)$Dzw9{k^Yf15!Kg+*@+}ijOwVmJ zKZRMmET6&LxOPYTK-0C4ff``l$jT}w3zK!_Ko~F@i<$1Jx4aq023hr8@|}~OeD5!4 zQ{}-wxvBNvy6z!`i;2d&L4>L`ey=fF*?+JeU8&`ZgPD7T5{Uv~gKecDTPQFCiK;~6 z=J#&lc%Aj@j`IT2U@qsXNo%V*TZWDl5_uiIlkw2 z1hmz?q~jy#Acu;P!oQj9{i&S$qP#@h z$Lbp-E;~SwOgi;hg;UwszDU}I1!c|$Zjg$ml##@|2ytCzmXE|Nn`qNK>F&2t>&sIG zcv*Gyo=wna7+Txwq~T$lSqG6HW~CUJM3R{}G8B)>RxX&# zWmwAD_>PiDLG!p$`$^JPiK!&p&TMSd?Ij2s3P-__BE}w4v6luxLB%z@lntyu#1k*0 zF@7>*TKbe;F{X6OxT)97+w|XS;P7Yq#sI2apI9T z4S!_&6pB>zdWcS<9RO+o#QJ9j`iANos&A^ksrr`cTb~)eb!HAsz%*7}(A`*Rz5hmj zVS{@jiCZyB&+|IrO6Au7Hai^E4}gxmS{&XZ^cfqCaTg^_w72K0a({UB(0>!z&+BOs zD(=WTBIGeZYCDmQevYmCl!boYCv4V^`;WBvEIKsy756p7W80Lm#FI^mc!`61X-lw zUcQUnvx!0k?bb)F?dF>NBN`kdaSTV^qml7kG{dn!R%5xChGP#ghGa&0kaC`K?xbH5 z6h(-C#G?dd8Lz_|8^|vcj_rw z|K-@uZ~Nc*l2u$qa7hJ2eUdWV(CvzLJ9FCYF6!cd`n5I!jT_`&B;lGQZ2bYvHNXmExo|r(sqTSRDyY_{lJ73& str: + """Generate an output for a research session""" + session = await self.db.get_session(session_id) + if not session: + raise ValueError(f"Session {session_id} not found") + + topic = session["topic"] + logger.info("Generating output", type=output_type, topic=topic) + + if progress_callback: + await progress_callback(f"🔍 Retrieving best research material for {output_type}...") + + # RAG: get most relevant context for this output type + query = self._get_rag_query(output_type, topic) + context = await self.processor.rag_query(session_id, query, top_k=30) + + if not context: + # Fallback: use raw top chunks + chunks = await self.db.get_top_chunks(session_id, limit=20) + context = "\n\n---\n\n".join(c["content"] for c in chunks) + + if not context: + raise ValueError("No processed content available. 
Run /process first.") + + # Truncate context to avoid Ollama context limits + context_words = context.split() + if len(context_words) > 6000: + context = " ".join(context_words[:6000]) + "\n\n[... additional material truncated ...]" + + if progress_callback: + await progress_callback(f"✍️ Generating {output_type} with Ollama... (this takes 2-5 min)") + + # Build prompt + system = self._get_system(output_type) + prompt = PROMPTS[output_type].format(topic=topic, context=context) + + # Generate — may take a while with local LLM + output = await self.ollama.generate(prompt, system=system, timeout=300) + + # Add metadata header + stats = await self.db.get_session_stats(session_id) + header = self._build_header(topic, output_type, session, stats) + full_output = header + "\n\n" + output + + # Save to DB + await self.db.save_output(session_id, output_type, full_output) + + logger.info("Output generated", type=output_type, length=len(full_output)) + return full_output + + def _get_rag_query(self, output_type: OutputType, topic: str) -> str: + queries = { + OutputType.PODCAST: f"{topic} story narrative facts interesting", + OutputType.BLOG: f"{topic} key facts evidence analysis", + OutputType.REPORT: f"{topic} evidence data official findings", + OutputType.THREAD: f"{topic} surprising facts shocking revelations", + } + return queries.get(output_type, topic) + + def _get_system(self, output_type: OutputType) -> str: + systems = { + OutputType.PODCAST: PODCAST_SYSTEM, + OutputType.BLOG: BLOG_SYSTEM, + OutputType.REPORT: REPORT_SYSTEM, + OutputType.THREAD: THREAD_SYSTEM, + } + return systems.get(output_type, "You are a helpful research assistant.") + + def _build_header(self, topic: str, output_type: OutputType, + session: dict, stats: dict) -> str: + from datetime import datetime + dt = datetime.utcnow().strftime("%Y-%m-%d %H:%M UTC") + return f"""--- +ResearchOwl | {output_type.upper()} OUTPUT +Topic: {topic} +Generated: {dt} +Sources: {stats.get('scraped', 0)} scraped | 
{stats.get('failed', 0)} failed +Iterations: {session.get('iterations', 0)} +Total words researched: {session.get('total_words', 0):,} +--- +""" diff --git a/src/processor/__init__.py b/src/processor/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/processor/__pycache__/__init__.cpython-310.pyc b/src/processor/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..33116638ac00fb7b62139d5b632e8f51126073d5 GIT binary patch literal 144 zcmd1j<>g`kf<0;PGePuY5P=LBfgA@QE@lA|DGb33nv8xc8Hzx{2;!Hqenx(7s(x}t zYHng#g?>?LacW{waz=i6j(%}bvVK8PesXGYaek40e0*kJW=VX!UP0w84x8Nkl+v73 NJCI?;OhAH#0RXYbAs_$% literal 0 HcmV?d00001 diff --git a/src/processor/__pycache__/processor.cpython-310.pyc b/src/processor/__pycache__/processor.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..797514e5d81b9f97369a387325eb601c31e96874 GIT binary patch literal 8769 zcma)BOK{vqdPd_tnBhYs>OuL1D00?QmMD3x&8`($57V+&dCP5c|N)wbttR*ZG#9o6T~yZATbO52Lfu-_i31AG~qKQhZ}hPwHA#Xy()OPf|7oKD%3z`jYtgg>fE@e&6^doV(~-V4lVAW zxS`D}p#`FNyb7W?AeDs{H>`x-v<7mSe2kBSAXT*17Ii+sC-F4KO%S9mYsIN%<0wqB z-cwg`jF#V0PyS*@3=Vzu2iIE+I~rVcZ=lSgq$f~i+6vpoMt4}QH4QwO__w%*m*{g} zu(J(Y(0J)}6cw9mB9^vDGZA&AyAY(IYN?gI?G6uPX{kRD&rHL3@iR?~V}&oypIz>> z!?UgBupQjlI4eqKcGlywsc4<;4j8(3H`CpeZnGIBQPyl8qaB;5##xQ^j*K3EW-w-M zcZ2q7W36^KlXkk9W?@@8S=0_YYiJN*wkDDmEi@(Q+jx+~DS? 
zI=4PG`)00-damatw=ZfLThViaJ0G+2+I2|R_q9*H)wgnM#lT$iZ#3>cq>+`6^`f_L z=k~U>qcIKt_Kvpg5Zib3ZFfg|SJSbAiQkq{jAo$zOXa%h^>xNDk7@6rD8AI@-dOtN zXLS9;BLLRlUhAC<|l>uK{62rAV^06JQau0`WX{b z`0I_G4Z6|Ur7)poLcrZzNjr&j!KiL0NkgeMDrzlhEoWJGLsk{&3Is~C+ri?bbmXK6 zBH#ijY^E7@l!~XR*;ooQX({A195IF_wJEg+wFBvNgUz@TaA_AnmL|Php{J-OXdQR3 z#i#J3Fs$D-4V~%CW%mD?w#$y{PvgGM&V6|RCxe;N%)*VV1=0;K8+WvO1%rRW%^nOs z)%$wyRbn~Ga>;TNEPuXlq{>@dY_bI5FrV*o(OZak&0 zjru8kR^(M2*ipSlonp<0p07moQ!??cn#}#np8&nZ(#Yw6N4B|IY$;!2jR;Z9D?YOhF6bk9A2O@#Gl?H$U-J@XA zpc&6o^;=YZi{`TMATBJmyBE<}f}*DD>?k8B)Y$ni$IUt$XYO77t6qJd@=(+EDMUVQHcaOo!teWDr+(I#7g^g_^w{~EQVJCFJ#}t>)hr+FC zZHe_Qtbi4#@g}v3o$S5C0uZ5K$in3AceTEKyNm4|Z?E^Ksqm!dX&^zz5Ac zetG0Bc}|KHslTx42VL0Hji?Q);IDTC_gkH{B%4tyz$NQKsib~45W$iNy376odU+tC zF!k4$!^DSSNF%_hn6OXKh`(fuZxSU*FD!wI?|VwDa6+L&yzu{*`fYfDip>osPGKA@ zo<4ImJ2r7Ma6@$fVsH8VMdkV`%2uh{XS`z}EnH(!`nnbx2 zW)gHNW)v+4z$?RzW!e7mx)zGdJm;qNkGqz3indHmnTG6@# zn3wIAYFE(T0(@cn3~sOL=>>9^R&{Za)`s(B=Qf}#}_O&k)^E7?9sQB^Mtj4MR7FC4th9$1!4>oqKEsY*L zc@x6D40rvu$k0jRDBe~hcn4l{(h==uQ2s9*B;%pS$n{5bQfczw zs^W%p)D}wmU#LD*zE-TYp-Y(d0KWIO@c_mZ{*?jW8b%O+&~i%|TRSsXEN*8AvfvXt zMeNT9 z@2TV!nC@!d%e>sns$07FVeaG}%&T+zjjW~;8Zfw}mUL;KD`O7~jB)wo#}v+Cw=l_u z5YT-^D}{tbt9t&F5*&wmjS^LU5XT2`RXXe?9hC?RqTbwNdVyM>U`#pk37kO>Ls1sS z6b2N*AO&aADN#JCmLy9LGM&t6jw0Sb)jR(f!{H-PLZ(^nazthB8QFSyNb)aAnW4BL z>tsQ^OF(_>K%^V~3MC~y`5}9-7jGW}EVOOyfevzOJ7wfd8uv*q_eeZ_YSiK0!#|S~ z2${O@fd)Bw`)HuMkqnFl~;A}{b+#LiSa1<3@kycYNpP=m=gcjMm zzo7Q_Q1xE^<}?}_aPX?8P*iw$De@tNslvrVBr<^v(+ppj;XxKq(n`=-l*W@2NJ}&! 
zbj=cwVg_B(j1ol2*Xh;?D7O}MWaZ|7_tIX1y+pX`Meto9M@cIbze7vDPcz2Xx_D7I z5QRpyh`NS=k%z$CC1~@s2|qMm2wJOZLlDKnkcd@UYlEsyTF}ARwKz*96N+}Q(Tblc zIwqEkfKlOaVt}4f8rLx38co!aMb=eQDZt zjOh9QA3wxxEZibY4lc?#4U;Lqzyp#g24~Zh6hUrno8_ibZ&h;HqtF#^|YGcJi??vie zsczTu3P%p_PxWbS_|_`zV`U7s(8O$*iNU(LmsgR*af;QTx*hH)b?@g6knQ$nm2p3< zGR)!W!chA`0w;hBv>(8cO_pN#vx=27{tcCKOpAD`44tGN@wp=0cox)&{u8n=^*3xMe4uq^KLytr#50Q4uI;&w)$d(){U4bSgyhB0Rk6q>BjTh*y6S~V4%hgnz zppv?RVi-gyD(Ed^KO}jOJya<`1%L5<8Z%z3uL5`YF7nXewb?SDx!j4l@*Ilryb^GT zJ}rt|kb>wC`3fq66p#2bs{Vi$hatk=xnz;F)HLao%d7Jo)D~UL`4pwZvkMFpmM2k9 zA`sGnZ4dPc-A4YW_|Y>^{y$#Dd=j1yD1PzRSbd)lMLA*@<>&O;JE+J#$#o=sU{EPX zOxVJc_0Z}Yy^|%z_uLgTgZsj5pk61ui0YIdt>-cY(u$s7yOgG z<&z#f-s_}95n90e^Ld!07$9G8+9y?f8fk|V4sDh;Jh;EJl(Jrklz03>(#d?v zEv&_A{%M~YkQZ*PE(B55@VvK?zr5hhMgI-F_g?Yn;|%iDy!5N*e*Zn%av_UA^g<}q zzGwWoj*p-(4idHQOz-fmg%@t|7hkx1;nocP8t8QJ zSgBEEsicmSyR!)4gf17aqFEa4C@B(dWulu9Rrm^&PZQ73Ag9<>id2J=-Mv(n3IczM zG4$y!^-&=M2Q_Uz@@)4D-K!)1r*LYL*{D@M?zzX5^nQumB~L+Gx7R|ksl4#!>OPH4 zVp>5_2>%`^TBN_Bsf(m0QwePi?yUjDBU7c)+|+92rig**7}+kH9CFHZAJYVWL2gOc8QeA9$9*Hz15KD7f$e`xk=c@?{;q2~y!!cmg=hmdi+{|-O4 zJ)T#0kmP6_ACXZ=8S-FvLcfiVBLnFKii_~DNW&|Z_-lmJk*Lpq(O(TWRkm%hbPfvcfs>I-gZye5?%SM! 
z6^9|ro^M;`c=Ut5Edte9%#f@pS?i#gAP@J~C}Is&Z|0kAa*z)Eg8f)lOS746c0*~b zhYLOZeIK$DWijOaN6Bi^Sxp5u9E&YbbQ#>Ac}ENt*N*OD*jV~F3qSHvUH4&!kF%O#8ZJAH<~D zWv9G*@u@($+eAQV(sxer6YA+wMJia3`V@ap)t+8ecS^a2=vYBf0Tg{AsJpsF9|-Vq z;Q6B`{S$&lASrU0R=7g<$BJF_sexD~npAy)O4@)3#8-W;P#;A=JpxAY;Y&pw>YGFX3U#`RH1Y;j>PVdKXkSk{5@fUZ!g7g(I3>5z c)LhdAH|YD93l7zgJa+YQ!$1A(v-R5l0gk3oVE_OC literal 0 HcmV?d00001 diff --git a/src/processor/processor.py b/src/processor/processor.py new file mode 100644 index 0000000..0fda083 --- /dev/null +++ b/src/processor/processor.py @@ -0,0 +1,251 @@ +""" +ResearchOwl Processor +Chunking → Quality scoring via Ollama → Embeddings → RAG synthesis +""" +import asyncio +import json +import math +import re +from typing import Optional + +import httpx +import structlog + +from src.config import settings +from src.db.database import ResearchDB + +logger = structlog.get_logger() + + +class OllamaClient: + """Async client for Ollama API""" + + def __init__(self): + self.base_url = settings.ollama_url.rstrip("/") + self.model = settings.ollama_model + + async def generate(self, prompt: str, system: str = None, + timeout: int = 120) -> str: + payload = { + "model": self.model, + "prompt": prompt, + "stream": False, + "options": {"temperature": 0.1, "num_predict": 512} + } + if system: + payload["system"] = system + + async with httpx.AsyncClient(timeout=timeout) as client: + resp = await client.post(f"{self.base_url}/api/generate", json=payload) + resp.raise_for_status() + return resp.json().get("response", "").strip() + + async def embed(self, text: str) -> Optional[list[float]]: + """Get embedding vector for a text""" + payload = {"model": self.model, "prompt": text} + try: + async with httpx.AsyncClient(timeout=60) as client: + resp = await client.post(f"{self.base_url}/api/embeddings", json=payload) + resp.raise_for_status() + return resp.json().get("embedding") + except Exception as e: + logger.warning("Embedding failed", error=str(e)) + return None + + async def 
is_available(self) -> bool: + try: + async with httpx.AsyncClient(timeout=5) as client: + resp = await client.get(f"{self.base_url}/api/tags") + return resp.status_code == 200 + except Exception: + return False + + +def simple_chunk(text: str, chunk_size: int = 800, overlap: int = 100) -> list[str]: + """ + Split text into overlapping chunks by approximate word count. + Respects paragraph boundaries when possible. + """ + paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()] + chunks = [] + current = [] + current_words = 0 + + for para in paragraphs: + para_words = len(para.split()) + if current_words + para_words > chunk_size and current: + chunks.append("\n\n".join(current)) + # overlap: keep last paragraph + if overlap > 0 and current: + current = [current[-1]] + current_words = len(current[0].split()) + else: + current = [] + current_words = 0 + current.append(para) + current_words += para_words + + if current: + chunks.append("\n\n".join(current)) + + return chunks + + +def cosine_similarity(a: list[float], b: list[float]) -> float: + """Simple cosine similarity""" + if not a or not b or len(a) != len(b): + return 0.0 + dot = sum(x * y for x, y in zip(a, b)) + norm_a = math.sqrt(sum(x * x for x in a)) + norm_b = math.sqrt(sum(x * x for x in b)) + if norm_a == 0 or norm_b == 0: + return 0.0 + return dot / (norm_a * norm_b) + + +class ContentProcessor: + """ + Processes scraped sources: + 1. Chunks content + 2. Scores quality with Ollama + 3. Generates embeddings + 4. 
Stores high-quality chunks + """ + + def __init__(self, db: ResearchDB, ollama: OllamaClient): + self.db = db + self.ollama = ollama + + async def process_session(self, session_id: int, topic: str, + progress_callback=None) -> dict: + """Process all scraped sources for a session""" + from src.db.database import ResearchDB + sources = await self.db.get_all_sources(session_id) + scraped = [s for s in sources if s["status"] == "scraped"] + + logger.info("Processing sources", total=len(scraped)) + total_chunks = 0 + total_words = 0 + + semaphore = asyncio.Semaphore(3) # process 3 sources at once + + async def process_one(source): + async with semaphore: + n = await self._process_source(session_id, topic, source) + return n + + results = await asyncio.gather(*[process_one(s) for s in scraped], + return_exceptions=True) + + for r in results: + if isinstance(r, int): + total_chunks += r + + total_words = sum(s.get("word_count", 0) for s in scraped) + await self.db.update_session( + session_id, + total_chunks=total_chunks, + total_words=total_words + ) + + if progress_callback: + await progress_callback(total_chunks=total_chunks, total_words=total_words) + + return {"total_chunks": total_chunks, "total_words": total_words} + + async def _process_source(self, session_id: int, topic: str, source: dict) -> int: + """Chunk, score, embed and store a single source. 
Returns chunk count.""" + source_id = source["id"] + + content = await self.db.get_source_content(source_id) + if not content: + return 0 + + chunks = simple_chunk(content, settings.chunk_size, settings.chunk_overlap) + stored = 0 + + for i, chunk in enumerate(chunks): + if len(chunk.split()) < 30: + continue + + quality = await self._score_quality(chunk, topic) + if quality < settings.quality_threshold: + continue + + embedding = await self.ollama.embed(chunk[:1000]) + + await self.db.add_chunk( + session_id=session_id, + source_id=source_id, + content=chunk, + chunk_index=i, + token_count=len(chunk.split()), + quality_score=quality, + embedding=embedding + ) + stored += 1 + + return stored + + async def _score_quality(self, chunk: str, topic: str) -> float: + """ + Ask Ollama to score relevance and quality of a chunk. + Returns 0.0-1.0 + """ + prompt = f"""Rate this text chunk on a scale of 0-10 for: +1. Relevance to topic: "{topic}" +2. Information density (facts, data, insights) +3. Credibility (not speculation, not clickbait) + +Text: +{chunk[:500]} + +Respond with ONLY a single number 0-10. 
No explanation.""" + + try: + response = await self.ollama.generate(prompt) + # Extract number from response + numbers = re.findall(r'\b(\d+(?:\.\d+)?)\b', response) + if numbers: + score = float(numbers[0]) + return min(1.0, score / 10.0) + return 0.5 + except Exception: + return 0.5 # default on error + + async def rag_query(self, session_id: int, query: str, top_k: int = 20) -> str: + """ + Retrieve most relevant chunks for a query using embeddings + keyword fallback + """ + # Get query embedding + query_embedding = await self.ollama.embed(query) + + # Get top quality chunks + chunks = await self.db.get_top_chunks(session_id, limit=100) + + if query_embedding and chunks: + # Rank by embedding similarity + scored = [] + for chunk in chunks: + emb = chunk.get("embedding") + if emb and isinstance(emb, str): + try: + emb = json.loads(emb) + except Exception: + emb = None + sim = cosine_similarity(query_embedding, emb) if emb else 0.5 + scored.append((sim * 0.7 + chunk["quality_score"] * 0.3, chunk)) + + scored.sort(key=lambda x: x[0], reverse=True) + top_chunks = [c for _, c in scored[:top_k]] + else: + # Fallback: just use quality score + top_chunks = chunks[:top_k] + + # Build context + context_parts = [] + for chunk in top_chunks: + source_label = f"[{chunk.get('source_type', 'web').upper()}] {chunk.get('title', 'Unknown')}" + context_parts.append(f"{source_label}:\n{chunk['content']}") + + return "\n\n---\n\n".join(context_parts) diff --git a/src/scraper/__init__.py b/src/scraper/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/scraper/__pycache__/__init__.cpython-310.pyc b/src/scraper/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5d42cfb77fa8f2292d1db7131cd25ae02b71e857 GIT binary patch literal 142 zcmd1j<>g`kf<0;PGePuY5P=LBfgA@QE@lA|DGb33nv8xc8Hzx{2;!Haenx(7s(x}t zYHng#g?>?LacW{waz=i6j(%}bvVL)LQDQ-Ak$!x9W?p7Ve7s&kI^f`tJ9VdWq$ literal 0 HcmV?d00001 diff --git 
a/src/scraper/__pycache__/exhaustive.cpython-310.pyc b/src/scraper/__pycache__/exhaustive.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6ee0e8352471f62eb3490277b8509da2e3ec25f1 GIT binary patch literal 16160 zcma)jX>c4@eqUeH)6;WcFaSaDkTg7(Ba#5AV|9_DD3Myxk|3FYRuPuUq%p4n^k8Os z___xm+A|7ia=lu+xT|%%_O5KXfVH=@T3<<3Dj#y%UMF^{;;Oh@KB{xLvboGpvCH}3 zq9roF|Lg7n1_ZeQYF@v7@Ad0<{D1%RwVThI68`>d{{J|C`kEyDce>g9r{U%W{Ja-r zNn+Bh#AK#4CD+P+-i9>?zH{0Z6_R_ayJTIb~`PGm!!Du^VP8A`EI*v&HFa5 zTE1^L+kRMFu^hjOS-35`3-@3a=^_5F1@_hNA7bvJMt4Qukri4)}`HMiNe zd}pp;z3?6(bj&RUIgz8B?8t7!#pOO%?vv*f!= zBA@kF94r_YH@SJMX3x2CRj%Q*mTcxk59OVf=UWThYKo@$w$s96w&#o6g2lXv1=n3@ z*y4VOxgD%u4J)xU(X##9qfsTA*?idn0a3Hy=_6;JU3EJ?Hv1A4Os-gdZSm5X<ddwll9@n@>X-~EY`CwrA3dugK0=8s%D(>+)TK#Ytze$>SgK6z>7 z=p=e;V#PeVsRZLI&XUu{WUNaQE?=0W+sPvnM=PNU>I`*|QHQr`U^)Fi$`dnE@$=@8 z_|lsEzO*j$G?S+#Uy1Ju1TGj13x zb56Q0UzhnLGtnvwxaOw9bi5v+iuuvNXl7xiKmH(N@hi?U%_2;R?7*m%c*Kw6`Zwc9 zu1Q_#-E>dxV(iP(7bRJ`tT1U>`b3`kL;~29i4&pBK9TvuXv{x{fdXy(`KRgCv~tSTF)292Y~QZ=)rfCY{nfTT zajN0gtcLgeME@B;S>>J=YV)?uLe28sCZ9kTp}JzvRrJt&=Z&{!-g@zT_2T(ZkA@TK zZ=QRZ?lTw9pF4M9hO)W0FT8r;&GY9joTWR}YOV4|(TkW?lsihDmhK(Xag=+zk(jb3 zZ~hs$AHz?RIZbR}v57=oLrxi__=F5*Ed4V!C%V3}5Rp%O3ZK>gSzRkATjhRZC(*}k*po_fJuH#n;Zh^y+IDre~_>-gy1&g{kQ;4?QS*&|u@Xe=D;pLDh zSmALwC+o6um%oUDfmNo}rO~{EpLY;Rw8lizb5UfB*GM{7S5CLU~B&+H1XV4*FU?i==)a8!NS8vOc6)jo>qZ&^;)E9_N5WDbv zh^2}uKojahy0I`-YuHx2?zAn@r1LG8H?4*f*i}4xo+hCqF_eN_kol{~*6V z{5&5NW=qh0B+`3k)r`VUFxA$WW~W%n*5`Gmv-FMBEEo#Q+>mAsX4;v$Ij@ig0-YdB zS_sN>Y={+c&9h-v!nMFg*eI?;YzG^|wa9j|aa@P-dp&-IlsyXsC3JG)oG(ea~arb30yg{EhF9w1e9Kw{8yoLViliCid8yg|U@31qr*SE@=~1*zOs zkt$scDVPybO{6JCwK@Z{s-{UARMaU>-0bCw!C%LfPf_v)l5n`q-37=1uL_>sm;-vk zj04dxMjhsT*S8umclW|nl{7J2NK2}uLd~-q^ILl4Z&Kd^AR8z#s#T{2eXV*Cm0l5v zuIOM0$-gNFyYJbNiR7UHenIn~1^EH>MC|1ncmyU2YXTEhre-QKpGGlc-dnhA9f?LW zc$c0frlVZNo5${bGd1Fmpuxba>9zEfcs1?CHN3bkaf8yj%y;@qUG=qkipe+So{T%) zPj>|aqXAMYu@~Ut*l8>*I8%Djwiq<>Fzq`{8_E*4Xa-$}dadDlkd|r7acQ~3{7Vf7 
zls6MKG^ZaW3aMEZxqNyMu9WBNsJhXL1GF9+Bv7`S0wH|RN zkbMNBx+Yc93}y59c_f@^sH3uRH`saaKt&e=gQOiKM2Q`I1(|EIkDVg|sw*UDWbD$K zx=yJk(v(Q`b!kDvj;8Ad_7j5Mk-sKmA}KKu2=Gthapm)_PWM|X(|nzp>vR?TrC_$+_X<}s0gHZ5<-+p_9}b(=M? zWCFg*)uNI>So?IhzAcFGz`*L$ei35D>VE)&iAaP9;}>D%e*~^2a!Nm^_H$Z4m+I&A zelGoyvS9S&;JPngm%iTVDP8%T^lr5W{#aM|=Vhri7o76du2R>!s-LRs%($ssmwQ@Q zL%V~$6mXhagOFwBI+g2PoiF&Au6|wi7SM-T&mzrrbw5vKEGt^))@7EznF1XYhz>5l zZ4!>eDDrCwT=f}&rlkFX((Vziq^PsN+6EWJ(YVWF3PFMWA3 zx~|A8C}jl=4ClIN!3xvcl$?~jUtN(C6_a|RcH)h{zv+!T*l_&564Cg^`>6HC8!ID0 z`SpZSVXl`+S#sbG0Lg)@!KwVkpa2O5{>WF$uqS;lD8YT;FuPT=At0^Bs^@q?vFy&z zJ2gTBv@4hUc*yQ#>@zr3c4~eqpci1YpWwvQuftu!F zXfPrcxjiowJ0a6Ru8!ZZKT*OIgL1maJy6(GP)vqT!*m)p19hK6O_;JdcX?2vdLn7U zVAKK57sF4#OV!NX>o;D0~uL@HMs3Hf9Q1(I<%v313LM(=+ z#aJvpBcV%H1}7MWDX#&gAxk8Mk6x>7Z20%wuBmE z3tluWDDTno$UIV8WqS*P`67BAZC!qXqz~GB_+&3d5bi&tM_^~>lEyD|b!P68TJi!Q+f9Q(lzK+j;MGlIFt(22 z^6P+JI)ZC5E-IOpPR8%__!gj|4nQ?mzX_niqY2iKt>;)4EG7fSkn00A-OId}p`Mt? z5o@UAgJUn-aHf>sPCOrFi~9~tl=9Is6gi*`N*v$_t+Tb{-ImLtTn!#hyjyl_q7T0f zj|o43{Scr66gtRH1*h)+#H7`BCfbYbGvES1^l7(kw<07QjxMKz9fyU^d-~*w!{8uI z%RgQB+}14>T?e~1Mt=6q3j&HU4mlBNrNBv0M^HRNIia|ADJQho8s&ZhNvKh$q1v!p z8|dNxkgBgDshAOHpvpqGu7Xdw3~H4@^Cy!H{DictG-eas0!VEj%kr&IL%kQdPQFb2 zZBaxVlzkgNk9--*)skY$du2r}$ch3O-Bs?6Y6V!nLCQ{5P1W@ggouwQZSN*0UP=IGX zd=~A;F?*$l@J3gJWj$fsUv(KZ?Y`v2_rR@>ly;F z8K8yKC8?h2X2?zzEr%d7i$Y}LEfATffDyTm8c{|iLeyVpnif19<{vra%FB*bb{lLWQk=+lS^lxRe7 zLkS+pg|ksR8BzQMQR11)r-RaA4?^!WNG?XeV^8uSY~k&piyQ8G@;C!%Uqbnx-pLz{*B8#n-60a4&v9IWdV;+x19P(*^TDS|vd7U!eCGhzT?Z zYkWIU)zRd)@bk#P0%+h!Dai*Rr-YsliIfAJOvogYWYQh9_}j6&iuTy2g?tRHhwlfi zXuk$k`rNMDiR5JWih1}s~0UH23}S1&-xDzKs0%B8ofK3t4!_=eO~ zBL{a6BgCkUs9!O11MgASibzw!0;1<=tZIFj%J8GP8DPe0G$%3t68|Vvae?W&|rRn!GbwRi4F4??}X>DBC(V2r^ji z;~io*!iz@^z}Mki2if-nsSVd{KijW8IZ>V_mP+0%vTovdX}|uliSnD=MF7kz&rz_e z{KKF6S@b4+h2_L|c(mNISITc)e9h}W_xJ=vPusTqLC+!H1dwhx4*Tt=ZV|F>jUuVQ znbtRbstS$>dPsZP&nqoW) zvW~<=U_Q|!p(*aF9vt4cQYaW8cXF=~xsTxIjU!orM*l7hb0T_#sN{9!8_Hz_)PyF5 
zfe%&Tex%ikZ6v(q2p$V%?~}-56=podVcUzCQlx!}NP~HbEHX&t_Va&@=6^%)9hZX2 zz0l;H)}{zZ|0!qjP)v~csdMD$ajx_6?M00#e$LP4gYi$Gb7Rnv-Dg{H!y}>>TsG8 z(KT`6aD-fjF^MHjBek4NSH3_AjVCgfh4~(L$bW{4g#nzI?r&`qg|Krw&2X4{PF&(* zz+s`&hVo!1rICZVN8P+f2}6R27CC#vbUY^Bq8cFu{t4v>;bC^(fk6ZpHm!o#hL5RE zP?QQn`E%TPuOWd?F|9-I7?(Ar2#Jsr>6l`wqX=b{l#)CO?==MtH97a0u17hAT7FtG zp#Gzc{-=dwqo@ydY`dLMw!}#b{yn^}rXorHvdGZy2l#n2*e5v2`WpKO@`pIHLN4W9 zV9Df)#8typ!!;HBVOM1-=<#=COv@wHki5ZLy&cHXx zqQ1{8zWsn%{F?{PV%yo%oVU)HG&1OK*$?E^J+XCi3_>=~@?Fg@2>YZ!Vp**E5X9e5 zTnbz1%h+i)ydf}!>rPlM+rMYSs!1%R4`lDaW_^-upGqls>--j}G|yq>Sg|+|v99E} zP+we;kfM_m9z+=;TyXOBtDsGS$4pFx*@&>kQGm!3r8q271`mLwg^__TmUA~KtkW5U(z?N zw^%n}zS^|-QZ+jIzyyneXi7SY%m(<yX0|eyg6M=u2Tcj!W!$s6`EwxTs zYKq#efi-n;1S4tFe-;kE_6gD?{q$OT9i|flqhk6V>AJ$wH}#~w2pJkUmE+;~5Y7(; zyI+dKyT0p|J!l=EabBhaXRG|TF<`V5$72{tIQ;$98Rsv-yep}Q2F0G4D=ueM0T2Xt zqvkla7UnN^V9figATww)l}scF^XMfkHJuio5Rv67{8$TcV+)Jt_o+7J;mDaEj(FBG zgkVG^abq~85e^JhJYR|JpKUgQ{~@~B>bxTP@*L4j8JWm=1^IPXQ*sKVn?llz{JuJ9 zr+7Qq&gvP!47m#jIGx#qZ*?9=sla>XL`4_YH1w<3OAA@@ z;Susf)m5e}sW?UzKXo6OqSZBFF2yphm%5ifwz3IN^j_J_5h)-9@&JL*d5fD`kHWSo zW?Wauzg0Kj8$(*pteNZZ3U}q60nc!z3zMpz1Id05J|@hoULbFCoF9S@OzoL?t_bg# z4x3+rWIf!=c5!k8vO4Dv`=zeg&0VK(Q@)#D8)11IKKmGJlt<9%W03Ob+76^+-FzbF zP?}lWiF*WmE?WC! z5gYMdp22}FoCvHnI?S#L2Xa_QZhZ?U|GW@~371`tpbQ-wK!gORAOjWWCPUeRPm80! 
zC?W6AEn;w$@hB`(qbgz33%0$A+b|1`J?}JxHX5d5Hp_`!L~IbADNd0PXs|e#6DgG$ ziorw?A5xv6P|=Z1o71r)xR{(4t72p#D8m6!ST0cfxbNXWWCX4-A7jbB0%O7x>T9Uu zd?zd?c%Mn1j6@ERHWywla&VEqxY-bF4lz+2ZH>Hc|6WwhS%_KxCu%QN;Fl;TG)<9^ zrpf;uk{>7By?~5z)qvBupo|J{*4Uj<^~h(MPS_Lp`_wdWI1Md!!v|)-14e!?xRn*< zj;Ru6gNL`_%JKSDA((2U=O=$d$n+7}5{Lx;Hay65#tPIufIspN!!ZH;9sUvcQ-M=* zS0wn;9*Dm)1NZ~VSqf;Qm<*v1=baqbiXsgA#jajg1?nh7n?i^-%}h}Xjn!K~=sq3u z5@StYz(LFJU{tLu>v|wEs#rA7EbVZ{%XSxWpf5bBus2NEr*aU|8LZ}px^&YIU?df zkB-ycmY)sA6A$#(_z_=6%gPYn2HXP1IQa=c^8bNWVTRdcx#M(4Xppf-!{h&xUiT^` zLhk)9%1u%7zmcHh4K((F$p4b+Zy^aYA_QEe1I$7MXA@b9r3h6j7Fr2m^hcocchRIn z==?k~umcB>X{L&FEJh{&a|zMYf}*K}%Tk11qym!%AB4#yTsMHsG+x&CLVpK0Br78< zB8sDUB3?l!AaUMQ?y12ZpHw6j)UJL9wjXF3_=%7M2OW+yTn7dSP0Z-3M7i@y|2*mw z$zcJq17MY;SKow9qQk2^>_f(ppBhw7ha7~fP}m#Up8lRLVkROd{K`2xu@(8X5}z*a zVmI<9%G8ul@+(@XAe5olUXT)J;Kk(&FT%}`fn!;mZuj_4VaoS7V~A{soO32aM&TqurUaLb{VbvEZ)LjVW%(6sYji~R$p>suY&C3h8YS+QAQR8?*>bS! z-YLg+3h-nI?=>m%WRpxm>3z8RD4uo`MsH-p-VWT^y0I zkU}gKCl2mXH$xlC4p|KeNei;AneaQP_!bfOL1b|BO(DmH2~|%65$ktH)hOUkftCUV z`R{BS)KA1qyix;%Od_UF$p09xf`%`z!5~K1L)j@&7(Z6{RPreYzeeIkgecW*LX>Fp z;X$DSY6wxP{D}-1*J|ILfO>J_#egEIkzD}2p86%#Z%eCK^+T=q>Soach-6@3Bizrj z_{Dh<;uKzS2%$8>o#YLN8Wt&jaBxuAWe9G-@8S8OP0y>5!483P6w058=S&eoRd6tu zo`?F!KZaN+VKyJ5pDg+*qMwfk+K#Xc+LAyT6?G6}U3lvuR=R9QHw`s&jDoCe=SL|v z4kt-wUB=o%aCOstpl4S946rT;SmX2(+XV+^j)WK6O|Aw3YqeK+ufVW+OymTt_iU(; zaR`y3chDpe%Y>|ozPb=0BnpU@5xVE+vFX8h6#Go#{U`bX&?yo4sY)mip6_ z`x#0|9&vIO|4UzQ?diq03#AFf6qkwNbRGfJfpdhjQa%Vanw}7o07i)nEy7$aSgT^zT*SH#T7D2Tp^bLcpRq$Fcku3@SD)# zVBE(#Cdxj1p1?+-E0JzczIO+w!@tF<4g3uSDtvFmOj_AQMgEv#-uipO|05>|} zL+>`g6t?snGX>Ooa|294g&Y)T?r$nzFA6=<>>32XZl<2?nHxPJ#AYB~du{X6!RH~V zb6rGwzrL%NrLz%`$ys>&z)(mR6hSuVqRBI?hT-p*M=ChN@6qwZ?L)| z^-e1z7+-w1LehMA^)LFcr-Oz?zys^t3fAM#TS9G zXvar5WdTHNGNf!DyFnq*E+cn^BS~d7(CDkJmmvamuY>b{SMaflK8r+9v>}bOA3^re#TgBI(p*)W{G}}1Z2pf4H^^GrJrTZ%s{}xUD z16uTLS@Jp-piq!K3l?;zq{0gsv!bFw@$|iS6nG%%bD+Jb-K~&@9W$W^*|Hd8bVj@! 
z;#&e*pK3KUtJS8|RbHS;h^4=UD$ zOUWW7AZLl!DOo~-(_l``=l`7U8 zb6TrS1iQh@Uqc1`yc0+a30zzsziVWRs-fdQYwS11j2(s1@n;H;lJT#|=G|jsM-J{k zuz%$6K2A?oj`IVwPc*hrflT~k%INqQ5B8uKVwi&yx#7%Bh;NyMKSX@}6rHi7wFpi6 zHZQ7%Md@c&A6>YVpPWXBC}oaRi9y{4(fSgZ|JHz}hyQ705#q0$mKT2>AI zPfiLV&r*`3M@>KlM@L%o&O-cA0~N5j2|6((94fSlp-I|ubO=5o`#I|21**bvlx7?F z5qfrLUiiY)8yC;NbawhYe}dXPNeS6={3<0+BOz_VZA1>KSE-Peal73KjZ+bcJ&y&J uyi-Ut$ str: + if YOUTUBE_RE.search(url): + return "youtube" + if PDF_RE.search(url): + return "pdf" + if REDDIT_RE.search(url): + return "reddit" + if WIKIPEDIA_RE.search(url): + return "wikipedia" + if "arxiv.org" in url: + return "arxiv" + if any(d in url for d in ["rss", "feed", "atom"]): + return "rss" + return "web" + + +def is_blacklisted(url: str) -> bool: + try: + domain = urlparse(url).netloc.lower().replace("www.", "") + return any(bl in domain for bl in BLACKLIST_DOMAINS) + except Exception: + return True + + +def normalize_url(url: str) -> str: + """Remove fragments and tracking params""" + parsed = urlparse(url) + clean = parsed._replace(fragment="", query="") + return clean.geturl().rstrip("/") + + +class ExhaustiveScraper: + """ + Recursive source discoverer and content extractor. + Keeps expanding until saturation or limits hit. 
+ """ + + def __init__(self, db: ResearchDB, session_id: int, topic: str, + progress_callback=None): + self.db = db + self.session_id = session_id + self.topic = topic + self.progress_callback = progress_callback + self.iteration = 0 + self.total_sources = 0 + self._stop = False + self._http: Optional[aiohttp.ClientSession] = None + + async def stop(self): + self._stop = True + + async def _get_http(self) -> aiohttp.ClientSession: + if not self._http or self._http.closed: + timeout = aiohttp.ClientTimeout(total=settings.request_timeout) + self._http = aiohttp.ClientSession(headers=HEADERS, timeout=timeout) + return self._http + + async def close(self): + if self._http and not self._http.closed: + await self._http.close() + + # ─── Seed discovery ─────────────────────────────────────────────────────── + + async def seed(self): + """Initial broad search across multiple sources""" + logger.info("Seeding research", topic=self.topic) + tasks = [ + self._seed_duckduckgo(), + self._seed_wikipedia(), + self._seed_reddit(), + self._seed_youtube(), + ] + await asyncio.gather(*tasks, return_exceptions=True) + + async def _seed_duckduckgo(self): + """Multiple DDG queries for breadth""" + queries = [ + self.topic, + f"{self.topic} history facts", + f"{self.topic} evidence analysis", + f"{self.topic} official report", + f"{self.topic} investigation", + f"{self.topic} wikipedia", + f"{self.topic} documentary", + f"{self.topic} research study", + ] + try: + with DDGS() as ddgs: + for query in queries: + if self._stop: + break + try: + results = list(ddgs.text(query, max_results=settings.max_pages_per_search)) + for r in results: + url = normalize_url(r.get("href", "")) + if url and not is_blacklisted(url): + await self.db.add_source( + self.session_id, url, + detect_source_type(url), + depth=0, + title=r.get("title") + ) + await asyncio.sleep(settings.request_delay) + except Exception as e: + logger.warning("DDG query failed", query=query, error=str(e)) + except Exception as e: + 
logger.error("DDG seeding failed", error=str(e)) + + async def _seed_wikipedia(self): + """Fetch Wikipedia article + all internal links""" + topic_encoded = quote_plus(self.topic.replace(" ", "_")) + wiki_url = f"https://en.wikipedia.org/wiki/{topic_encoded}" + await self.db.add_source(self.session_id, wiki_url, "wikipedia", depth=0) + + # Also search Wikipedia API for related articles + try: + http = await self._get_http() + api_url = ( + f"https://en.wikipedia.org/w/api.php?action=opensearch" + f"&search={quote_plus(self.topic)}&limit=10&format=json" + ) + async with http.get(api_url) as resp: + data = await resp.json() + urls = data[3] if len(data) > 3 else [] + for url in urls: + if url: + await self.db.add_source(self.session_id, url, "wikipedia", depth=0) + except Exception as e: + logger.warning("Wikipedia API seed failed", error=str(e)) + + async def _seed_reddit(self): + """Search Reddit via old.reddit.com JSON""" + try: + http = await self._get_http() + url = f"https://www.reddit.com/search.json?q={quote_plus(self.topic)}&sort=top&limit=25" + async with http.get(url, headers={**HEADERS, "User-Agent": "ResearchOwl/1.0"}) as resp: + if resp.status == 200: + data = await resp.json() + posts = data.get("data", {}).get("children", []) + for post in posts: + post_data = post.get("data", {}) + permalink = post_data.get("permalink", "") + if permalink: + full_url = f"https://www.reddit.com{permalink}" + await self.db.add_source( + self.session_id, full_url, "reddit", depth=0, + title=post_data.get("title") + ) + except Exception as e: + logger.warning("Reddit seed failed", error=str(e)) + + async def _seed_youtube(self): + """Search YouTube via DDG for video transcripts""" + try: + with DDGS() as ddgs: + results = list(ddgs.videos( + f"{self.topic} documentary explanation", + max_results=10 + )) + for r in results: + url = r.get("content", "") + if "youtube.com" in url or "youtu.be" in url: + await self.db.add_source( + self.session_id, url, "youtube", depth=0, + 
title=r.get("title") + ) + except Exception as e: + logger.warning("YouTube seed failed", error=str(e)) + + # ─── Main pipeline ──────────────────────────────────────────────────────── + + async def run(self) -> dict: + """ + Main exhaustive loop: + 1. Seed initial sources + 2. Process batch → extract content + new URLs + 3. Repeat until saturated or limits hit + """ + await self.seed() + + while not self._stop: + self.iteration += 1 + pending = await self.db.get_pending_sources(self.session_id, limit=20) + + if not pending: + logger.info("No more pending sources — saturated", iteration=self.iteration) + break + + if self.total_sources >= settings.max_sources: + logger.info("Max sources reached", total=self.total_sources) + break + + logger.info("Processing batch", iteration=self.iteration, batch_size=len(pending)) + + # Process sources concurrently (but not too many at once) + semaphore = asyncio.Semaphore(5) + tasks = [self._process_source(s, semaphore) for s in pending] + results = await asyncio.gather(*tasks, return_exceptions=True) + + new_sources = sum(1 for r in results if r and isinstance(r, int) and r > 0) + self.total_sources += len(pending) + + stats = await self.db.get_session_stats(self.session_id) + await self.db.update_session( + self.session_id, + iterations=self.iteration, + total_sources=self.total_sources + ) + + if self.progress_callback: + await self.progress_callback( + iteration=self.iteration, + total=self.total_sources, + new_this_round=new_sources, + stats=stats + ) + + # Saturation check: if we found very few new URLs, we're done + if new_sources < 3 and self.iteration > 2: + logger.info("Saturation detected", new_sources=new_sources) + break + + await asyncio.sleep(settings.request_delay) + + await self.close() + final_stats = await self.db.get_session_stats(self.session_id) + return final_stats + + async def _process_source(self, source: dict, semaphore: asyncio.Semaphore) -> int: + """Extract content from a source and discover new 
URLs. Returns count of new URLs found.""" + async with semaphore: + source_type = source["source_type"] + url = source["url"] + source_id = source["id"] + + try: + if source_type == "youtube": + content, title = await self._extract_youtube(url) + elif source_type == "wikipedia": + content, title, new_urls = await self._extract_wikipedia(url) + for new_url in (new_urls or []): + await self.db.add_source( + self.session_id, new_url, "wikipedia", + depth=source["depth"] + 1 + ) + await self._mark_scraped(source_id, content, title, url) + return len(new_urls or []) + elif source_type == "reddit": + content, title = await self._extract_reddit(url) + elif source_type == "pdf": + content, title = await self._extract_pdf(url) + else: + content, title, new_urls = await self._extract_web(url, source["depth"]) + for new_url in (new_urls or []): + await self.db.add_source( + self.session_id, new_url, + detect_source_type(new_url), + depth=source["depth"] + 1 + ) + await self._mark_scraped(source_id, content, title, url) + return len(new_urls or []) + + await self._mark_scraped(source_id, content, title, url) + return 0 + + except Exception as e: + logger.warning("Source extraction failed", url=url, error=str(e)) + await self.db.update_source(source_id, status="failed", error=str(e)[:200]) + return 0 + + async def _mark_scraped(self, source_id: int, content: Optional[str], + title: Optional[str], url: str): + if not content or len(content) < settings.min_content_length: + await self.db.update_source(source_id, status="skipped", + error="Content too short or empty") + return + + word_count = len(content.split()) + + await self.db.save_source_content(source_id, content) + + await self.db.update_source( + source_id, + status="scraped", + title=title or url, + word_count=word_count, + scraped_at=time.time(), + quality_score=min(1.0, word_count / 1000) + ) + + # ─── Extractors ─────────────────────────────────────────────────────────── + + async def _extract_web(self, url: str, 
depth: int) -> tuple[Optional[str], Optional[str], list[str]]: + """Extract text + discover internal/external links""" + if is_blacklisted(url): + return None, None, [] + + http = await self._get_http() + async with http.get(url) as resp: + if resp.status != 200: + return None, None, [] + html = await resp.text(errors="replace") + + # Extract main content with trafilatura (much better than BS4 for articles) + content = trafilatura.extract( + html, + include_links=False, + include_tables=True, + favor_recall=True + ) + + # Extract title and new URLs with BS4 + soup = BeautifulSoup(html, "lxml") + title = soup.title.string.strip() if soup.title else url + + new_urls = [] + if depth < settings.max_depth: + base = f"{urlparse(url).scheme}://{urlparse(url).netloc}" + for a in soup.find_all("a", href=True): + href = a["href"] + full_url = normalize_url(urljoin(base, href)) + if (full_url.startswith("http") and + not is_blacklisted(full_url) and + not await self.db.source_exists(self.session_id, full_url)): + new_urls.append(full_url) + + return content, title, new_urls[:30] # cap links per page + + async def _extract_wikipedia(self, url: str) -> tuple[Optional[str], Optional[str], list[str]]: + """Wikipedia: extract content + follow internal wiki links""" + http = await self._get_http() + async with http.get(url) as resp: + if resp.status != 200: + return None, None, [] + html = await resp.text(errors="replace") + + soup = BeautifulSoup(html, "lxml") + title_tag = soup.find("h1", {"id": "firstHeading"}) + title = title_tag.text if title_tag else url + + # Get clean content + content_div = soup.find("div", {"id": "mw-content-text"}) + if not content_div: + return None, title, [] + + # Remove navboxes, references, etc. 
+ for tag in content_div.find_all(["table", "sup", "style"]): + tag.decompose() + + content = content_div.get_text(separator="\n", strip=True) + + # Extract Wikipedia internal links (only "See also" and body links) + new_urls = [] + for a in content_div.find_all("a", href=True): + href = a["href"] + if href.startswith("/wiki/") and ":" not in href: + full_url = f"https://en.wikipedia.org{href}" + full_url = normalize_url(full_url) + if not await self.db.source_exists(self.session_id, full_url): + new_urls.append(full_url) + + return content, title, new_urls[:20] + + async def _extract_youtube(self, url: str) -> tuple[Optional[str], Optional[str]]: + """Extract YouTube transcript""" + match = YOUTUBE_RE.search(url) + if not match: + return None, None + + video_id = match.group(1) + try: + transcript_list = YouTubeTranscriptApi.get_transcript( + video_id, languages=["en", "es", "en-US", "en-GB"] + ) + text = " ".join(t["text"] for t in transcript_list) + return text, f"YouTube: {video_id}" + except NoTranscriptFound: + return None, None + except Exception as e: + logger.warning("YouTube transcript failed", video_id=video_id, error=str(e)) + return None, None + + async def _extract_reddit(self, url: str) -> tuple[Optional[str], Optional[str]]: + """Extract Reddit post + top comments via JSON API""" + json_url = url.rstrip("/") + ".json?limit=100&sort=top" + http = await self._get_http() + try: + async with http.get( + json_url, + headers={**HEADERS, "User-Agent": "ResearchOwl/1.0"} + ) as resp: + if resp.status != 200: + return None, None + data = await resp.json() + + post = data[0]["data"]["children"][0]["data"] + title = post.get("title", "") + selftext = post.get("selftext", "") + + comments = [] + if len(data) > 1: + for child in data[1]["data"]["children"][:50]: + body = child.get("data", {}).get("body", "") + if body and body != "[deleted]" and len(body) > 50: + score = child.get("data", {}).get("score", 0) + if score > 5: # only upvoted comments + 
comments.append(body) + + content = f"# {title}\n\n{selftext}\n\n## Top Comments\n\n" + "\n\n---\n\n".join(comments) + return content, title + + except Exception as e: + logger.warning("Reddit extraction failed", url=url, error=str(e)) + return None, None + + async def _extract_pdf(self, url: str) -> tuple[Optional[str], Optional[str]]: + """Download and extract PDF text""" + import pdfplumber + import tempfile + import os + + http = await self._get_http() + try: + async with http.get(url) as resp: + if resp.status != 200: + return None, None + content_length = int(resp.headers.get("content-length", 0)) + if content_length > 50 * 1024 * 1024: # skip PDFs > 50MB + return None, None + pdf_bytes = await resp.read() + + with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f: + f.write(pdf_bytes) + tmp_path = f.name + + try: + with pdfplumber.open(tmp_path) as pdf: + pages = [p.extract_text() or "" for p in pdf.pages[:50]] # max 50 pages + text = "\n\n".join(pages) + return text, url.split("/")[-1] + finally: + os.unlink(tmp_path) + + except Exception as e: + logger.warning("PDF extraction failed", url=url, error=str(e)) + return None, None diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/__pycache__/__init__.cpython-310.pyc b/tests/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..50295268db25ccb9a0da31c92dc10f61a6399400 GIT binary patch literal 136 zcmd1j<>g`k0_F7gnIQTxh(HF6K#l_t7qb9~6oz01O-8?!3`HPe1o2B-KO;XkRX;f+ zH8-)WLcb`rI5n{-IU~P3N53SsxTIJ=K0Y%qvm`!Vub}c4hfQvNN@-529mtSkCLqDW F002f%9b5na literal 0 HcmV?d00001 diff --git a/tests/__pycache__/test_scraper.cpython-310-pytest-9.0.3.pyc b/tests/__pycache__/test_scraper.cpython-310-pytest-9.0.3.pyc new file mode 100644 index 0000000000000000000000000000000000000000..23d30496b36f9ec6bac8192ff2429e469567c133 GIT binary patch literal 4190 
zcmeHK&2Qtz73YwYL{YNWUhjuDUsaRcWLX5#et16|*iM3MQlJMTZGyHjP!Kd{Br&E$ z?aaus8476QJ@ir_1$xT{uyY9DOMv_Zz2vW$V|v?&d8^*tIQ2d!Ncng|ngP;Z_v9UpPZ}u$HAV$yLD3N@o zyitZ3Gp_WU4X0P#sP^0qSI=2-tykZugKpE(J!4~rmT3k0S?bU#^rxsxYtYY8@`2&i zAAl`m-Luqb$|MV=FOne-nJ;$-OwC5Z-;9HBD~^O@RL%4g-V5R=W&V)I9#M4>^#(EX z!|t%Z72>TXzcMHlXzxH1FIezEWNb(yGxo@sP;+c%Br~_nv6WfkJxTUW3R;^`8?U#` z0{}0refz-JHpDA1Uy{ZF;g>-xqt+)d4|)c=?#WSa9uQi-Z=W;9Hmy)6vm*j}NmgK$ zla&t09{CiUY8zv01e!HAX|;gOJ%;Tcu&Jj1ZnGAe=ja@ZM7q1_gxeXN1sz*KmG4eXT3y zK)lgv?IuGx++@u#>9t0I47+b`uhQR+mPX5e==?GKNL7v&9Fm9T>gpj$uUxu3*j@63 z`^k~gSZ$zo#S@pYzvc zb%KpEN+YSE@s?ylwg6?1^`&S9o1rS_^Xa9@de(1_qOEAaXcROP-f7{e^}9rjSRAYB zbS`~~l8Y z0CW&=rs^>5Zw{kaMtz~maRMuZTJ%LX8OBsE^*eF08N_}+=rL9GMbKt=2Q~Y)_Dz)Z z-{m~vYQb+~K9oM&8N@+9kT7#tRyDMcyYXsqaJ@LVSsdKR2l?G@=MeM3)hmYvzX-?m zu+{1&J=O}ltQTzWv^Wzi;9)lz#rhzGmc9siFkrkn*j4AX6CTa#n3)AuqWc8@Uy`o~ ze-(z0Pv0x#|LGe_&3{%}@b#{(r!P$cyd8vWGfB2|gbO%Q*^T<04>=Y~UPS@L(%^VN z{1l2g5b4X4?LYlSeF*XX9=rg0>sj1&4#jyCizqIjcn-z$C|*GEA_|PHTwDT8E$E;* zUiN2U;s+os5HS0MzXrdy zb~1`CJZ@hLVn=WIZ(`@Pj-6u^aREVGM&X?d8w~I%LA;|g%swz$YTYv=(7-)%2dLr! 
import pytest
from src.scraper.exhaustive import detect_source_type, is_blacklisted, normalize_url
from src.processor.processor import simple_chunk


def test_detect_source_type():
    """Each supported URL family maps to its source-type label."""
    assert detect_source_type("https://youtube.com/watch?v=dQw4w9WgXcY") == "youtube"
    assert detect_source_type("https://reddit.com/r/test/comments/abc") == "reddit"
    assert detect_source_type("https://en.wikipedia.org/wiki/Roswell") == "wikipedia"
    assert detect_source_type("https://example.com/doc.pdf") == "pdf"
    assert detect_source_type("https://example.com/article") == "web"


def test_is_blacklisted():
    """A Facebook URL is rejected while a Wikipedia article is allowed."""
    # Plain truthiness checks instead of ``== True`` / ``== False`` (PEP 8).
    assert is_blacklisted("https://facebook.com/something")
    assert not is_blacklisted("https://en.wikipedia.org/wiki/Test")


def test_normalize_url():
    """Fragments and trailing slashes are stripped from URLs."""
    assert normalize_url("https://example.com/page#section") == "https://example.com/page"
    assert normalize_url("https://example.com/page/") == "https://example.com/page"


def test_simple_chunk():
    """Text longer than one chunk is split into several string chunks."""
    text = "\n\n".join([f"Paragraph {i} with some content here." for i in range(50)])
    chunks = simple_chunk(text, chunk_size=100, overlap=20)
    assert len(chunks) > 1
    assert all(isinstance(c, str) for c in chunks)