diff --git a/.claude/settings.local.json b/.claude/settings.local.json index aef8b44..998b54c 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -4,7 +4,24 @@ "Bash(poetry:*)", "Bash(pip:*)", "Bash(python:*)", - "Bash(git:*)" + "Bash(git:*)", + "Bash(docker-compose -f docker-compose.dev.yml ps)", + "Bash(docker-compose -f docker-compose.dev.yml logs api --tail=50)", + "Bash(docker-compose -f docker-compose.dev.yml logs chat --tail=50)", + "Bash(docker-compose -f docker-compose.dev.yml down)", + "Bash(docker-compose -f docker-compose.dev.yml up --build -d)", + "Bash(docker-compose -f docker-compose.dev.yml logs --tail=20)", + "Bash(docker-compose -f docker-compose.dev.yml logs --tail=30 api chat worker)", + "Bash(docker-compose -f docker-compose.dev.yml logs chat --tail=20)", + "Bash(docker-compose -f docker-compose.dev.yml logs worker --tail=20)", + "Bash(docker-compose -f docker-compose.dev.yml logs api --tail=20)", + "Bash(docker-compose -f docker-compose.dev.yml stop chat worker)", + "Bash(docker-compose -f docker-compose.dev.yml rm -f chat worker)", + "Bash(docker-compose -f docker-compose.dev.yml up --build -d api)", + "Bash(docker-compose -f docker-compose.dev.yml logs api --tail=30)", + "Bash(curl -s http://localhost:8000/health)", + "Bash(docker-compose -f docker-compose.dev.yml logs api --tail=10)", + "Bash(docker-compose -f docker-compose.dev.yml logs api --tail=15)" ], "deny": [], "ask": [], diff --git a/.env.example b/.env.example index ca0d852..c970cd4 100644 --- a/.env.example +++ b/.env.example @@ -1,22 +1,92 @@ -# MongoDB +# ============================================================================= +# Datacenter Documentation System - Configuration Template +# Copy this file to .env and fill in your actual values +# ============================================================================= + +# ============================================================================= +# MongoDB Configuration +# ============================================================================= MONGO_ROOT_USER=admin MONGO_ROOT_PASSWORD=changeme_secure_mongo_password MONGODB_URL=mongodb://admin:changeme_secure_mongo_password@mongodb:27017 MONGODB_DATABASE=datacenter_docs -# Redis +# ============================================================================= +# Redis Configuration +# ============================================================================= REDIS_PASSWORD=changeme_redis_password +REDIS_URL=redis://redis:6379/0 -# MCP Server +# ============================================================================= +# MCP Server Configuration +# ============================================================================= MCP_SERVER_URL=https://mcp.company.local MCP_API_KEY=your_mcp_api_key_here -# Anthropic API -ANTHROPIC_API_KEY=your_anthropic_api_key_here +# ============================================================================= +# LLM Configuration (OpenAI-compatible API) +# Choose one of the configurations below and uncomment it +# ============================================================================= -# CORS +# --- OpenAI (Default) --- +LLM_BASE_URL=https://api.openai.com/v1 +LLM_API_KEY=sk-your-openai-api-key-here +LLM_MODEL=gpt-4-turbo-preview +# Alternative models: gpt-4, gpt-3.5-turbo + +# --- Anthropic Claude (OpenAI-compatible) --- +# LLM_BASE_URL=https://api.anthropic.com/v1 +# LLM_API_KEY=sk-ant-your-anthropic-key-here +# LLM_MODEL=claude-sonnet-4-20250514 +# Alternative models: claude-3-opus-20240229, claude-3-sonnet-20240229 
+ +# --- LLMStudio (Local) --- +# LLM_BASE_URL=http://localhost:1234/v1 +# LLM_API_KEY=not-needed +# LLM_MODEL=your-local-model-name + +# --- Open-WebUI (Local) --- +# LLM_BASE_URL=http://localhost:8080/v1 +# LLM_API_KEY=your-open-webui-key +# LLM_MODEL=llama3 +# Alternative models: mistral, mixtral, codellama + +# --- Ollama (Local) --- +# LLM_BASE_URL=http://localhost:11434/v1 +# LLM_API_KEY=ollama +# LLM_MODEL=llama3 +# Alternative models: mistral, mixtral, codellama, phi3 + +# LLM Generation Settings +LLM_TEMPERATURE=0.3 +LLM_MAX_TOKENS=4096 + +# ============================================================================= +# API Configuration +# ============================================================================= +API_HOST=0.0.0.0 +API_PORT=8000 +WORKERS=4 + +# ============================================================================= +# CORS Configuration +# ============================================================================= CORS_ORIGINS=http://localhost:3000,https://docs.company.local -# Optional +# ============================================================================= +# Application Settings +# ============================================================================= LOG_LEVEL=INFO DEBUG=false + +# ============================================================================= +# Celery Configuration +# ============================================================================= +CELERY_BROKER_URL=redis://redis:6379/0 +CELERY_RESULT_BACKEND=redis://redis:6379/0 + +# ============================================================================= +# Vector Store Configuration +# ============================================================================= +VECTOR_STORE_PATH=./data/chroma_db +EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2 diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml index 8f3c5a7..b3807ea 100644 --- a/.gitea/workflows/ci.yml +++ b/.gitea/workflows/ci.yml @@ -12,7 +12,7 @@ on: env: POETRY_VERSION: 1.8.0 - PYTHON_VERSION: "3.14" + PYTHON_VERSION: "3.12" REGISTRY: ${{ vars.PACKAGES_REGISTRY }} IMAGE_NAME: ${{ gitea.repository }} diff --git a/.github/workflows/build-deploy.yml b/.github/workflows/build-deploy.yml index c4f90f2..8879173 100644 --- a/.github/workflows/build-deploy.yml +++ b/.github/workflows/build-deploy.yml @@ -26,7 +26,7 @@ on: env: DOCKER_REGISTRY: ghcr.io IMAGE_NAME: ${{ github.repository }}/docs-server - PYTHON_VERSION: '3.14' + PYTHON_VERSION: '3.12' jobs: # Job 1: Linting e validazione diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 4e5a81c..2b73efc 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -9,7 +9,7 @@ stages: variables: POETRY_VERSION: "1.8.0" - PYTHON_VERSION: "3.14" + PYTHON_VERSION: "3.12" DOCKER_DRIVER: overlay2 DOCKER_TLS_CERTDIR: "/certs" PIP_CACHE_DIR: "$CI_PROJECT_DIR/.cache/pip" diff --git a/ARCHITECTURE_STATUS.md b/ARCHITECTURE_STATUS.md new file mode 100644 index 0000000..fab89f9 --- /dev/null +++ b/ARCHITECTURE_STATUS.md @@ -0,0 +1,276 @@ +# Architecture Status Overview + +## πŸ—οΈ Struttura Moduli - Stato Attuale vs Target + +``` +src/datacenter_docs/ +β”œβ”€β”€ __init__.py βœ… Presente +β”‚ +β”œβ”€β”€ api/ βœ… COMPLETO (80%) +β”‚ β”œβ”€β”€ __init__.py βœ… +β”‚ β”œβ”€β”€ main.py βœ… FastAPI app principale +β”‚ β”œβ”€β”€ main_enhanced.py βœ… Versione enhanced +β”‚ β”œβ”€β”€ models.py βœ… Pydantic models +β”‚ β”œβ”€β”€ auto_remediation.py βœ… Auto-remediation engine +β”‚ └── reliability.py βœ… Reliability scoring +β”‚ +β”œβ”€β”€ chat/ ⚠️ PARZIALE (40%) +β”‚ β”œβ”€β”€ 
__init__.py βœ… +β”‚ β”œβ”€β”€ agent.py βœ… DocumentationAgent +β”‚ └── main.py ❌ MANCA - WebSocket server +β”‚ +β”œβ”€β”€ workers/ ❌ DIRECTORY NON ESISTE (0%) +β”‚ β”œβ”€β”€ __init__.py ❌ Da creare +β”‚ β”œβ”€β”€ celery_app.py ❌ Da creare - Celery config +β”‚ └── tasks.py ❌ Da creare - Celery tasks +β”‚ +β”œβ”€β”€ collectors/ ⚠️ SKELETON (5%) +β”‚ β”œβ”€β”€ __init__.py βœ… +β”‚ β”œβ”€β”€ base.py ❌ Da creare - Base collector +β”‚ β”œβ”€β”€ vmware_collector.py ❌ Da creare +β”‚ β”œβ”€β”€ kubernetes_collector.py ❌ Da creare +β”‚ β”œβ”€β”€ network_collector.py ❌ Da creare +β”‚ β”œβ”€β”€ storage_collector.py ❌ Da creare +β”‚ β”œβ”€β”€ database_collector.py ❌ Da creare +β”‚ └── monitoring_collector.py ❌ Da creare +β”‚ +β”œβ”€β”€ generators/ ⚠️ SKELETON (5%) +β”‚ β”œβ”€β”€ __init__.py βœ… +β”‚ β”œβ”€β”€ base.py ❌ Da creare - Base generator +β”‚ β”œβ”€β”€ infrastructure_generator.py ❌ Da creare +β”‚ β”œβ”€β”€ network_generator.py ❌ Da creare +β”‚ β”œβ”€β”€ virtualization_generator.py ❌ Da creare +β”‚ β”œβ”€β”€ kubernetes_generator.py ❌ Da creare +β”‚ β”œβ”€β”€ storage_generator.py ❌ Da creare +β”‚ β”œβ”€β”€ database_generator.py ❌ Da creare +β”‚ β”œβ”€β”€ monitoring_generator.py ❌ Da creare +β”‚ β”œβ”€β”€ security_generator.py ❌ Da creare +β”‚ β”œβ”€β”€ runbook_generator.py ❌ Da creare +β”‚ └── troubleshooting_generator.py ❌ Da creare +β”‚ +β”œβ”€β”€ validators/ ⚠️ SKELETON (5%) +β”‚ β”œβ”€β”€ __init__.py βœ… +β”‚ β”œβ”€β”€ base.py ❌ Da creare +β”‚ β”œβ”€β”€ config_validator.py ❌ Da creare +β”‚ β”œβ”€β”€ security_validator.py ❌ Da creare +β”‚ └── compliance_validator.py ❌ Da creare +β”‚ +β”œβ”€β”€ mcp/ βœ… BASE (60%) +β”‚ β”œβ”€β”€ __init__.py βœ… +β”‚ β”œβ”€β”€ client.py βœ… MCP client +β”‚ └── server.py ❌ Da creare (se necessario) +β”‚ +β”œβ”€β”€ utils/ βœ… BASE (70%) +β”‚ β”œβ”€β”€ __init__.py βœ… +β”‚ β”œβ”€β”€ config.py βœ… Configuration management +β”‚ β”œβ”€β”€ database.py βœ… MongoDB utilities +β”‚ β”œβ”€β”€ logging.py ❌ Da creare +β”‚ └── helpers.py ❌ Da creare +β”‚ +└── cli.py ❌ MANCA (0%) - CLI tool principale +``` + +--- + +## πŸ“Š Completamento per Categoria + +| Categoria | Completamento | PrioritΓ  | Note | +|-----------|---------------|----------|------| +| **API Service** | 🟒 80% | βœ… Completato | Funzionante in produzione | +| **Database Layer** | 🟒 70% | βœ… Completato | MongoDB + Beanie OK | +| **MCP Integration** | 🟑 60% | Alta | Client base funzionante | +| **Chat Service** | 🟑 40% | Media | Agent OK, manca WebSocket server | +| **Auto-Remediation** | 🟒 75% | βœ… Completato | Engine + reliability OK | +| **CLI Tool** | πŸ”΄ 0% | **Critica** | Necessario per gestione | +| **Workers (Celery)** | πŸ”΄ 0% | **Critica** | Necessario per task async | +| **Collectors** | 🟑 5% | Alta | Solo skeleton | +| **Generators** | 🟑 5% | Alta | Solo skeleton | +| **Validators** | 🟑 5% | Media | Solo skeleton | +| **Frontend** | 🟑 20% | Bassa | Skeleton React + build | +| **CI/CD** | 🟒 90% | βœ… Completato | GitHub/GitLab/Gitea | +| **Docker** | 🟒 85% | βœ… Completato | Tutti i Dockerfile OK | + +**Overall Project Completion: ~35%** + +--- + +## πŸ”„ Data Flow - Stato Implementazione + +### Target Architecture +```mermaid +graph TD + A[External Trigger] -->|1| B[API/CLI] + B -->|2| C[Celery Task] + C -->|3| D[Collectors] + D -->|4| E[MCP Server] + E -->|5| F[Infrastructure] + F -->|6| E + E -->|7| D + D -->|8| G[Generators] + G -->|9| H[LLM Claude] + H -->|10| G + G -->|11| I[MongoDB] + I -->|12| J[API Response] +``` + +### Current Status +``` +βœ… [External Trigger] + ↓ +βœ… [API] β†’ ⚠️ [CLI 
- MANCA] + ↓ +❌ [Celery Task - MANCA] + ↓ +⚠️ [Collectors - SKELETON] β†’ βœ… [MCP Client] β†’ ❓ [MCP Server - External] + ↓ +⚠️ [Generators - SKELETON] β†’ βœ… [LLM Integration OK] + ↓ +βœ… [MongoDB Storage] + ↓ +βœ… [API Response] +``` + +**Blocchi Critici**: +- ❌ **Celery Workers** - Nessun task asincrono funzionante +- ❌ **CLI Tool** - Nessun modo di gestire il sistema da command line +- ⚠️ **Collectors** - Non puΓ² raccogliere dati dall'infrastruttura +- ⚠️ **Generators** - Non puΓ² generare documentazione + +--- + +## 🎯 Milestone per Completamento + +### Milestone 1: Core System (MVP) +**Target**: Sistema base funzionante end-to-end +**Completamento**: 35% β†’ 60% + +- [ ] CLI tool base (`cli.py`) +- [ ] Celery workers setup (`workers/celery_app.py`, `workers/tasks.py`) +- [ ] 1 Collector funzionante (es: VMware) +- [ ] 1 Generator funzionante (es: Infrastructure) +- [ ] Task scheduling per generazione periodica docs + +**Risultato**: Generazione automatica documentazione ogni 6 ore + +--- + +### Milestone 2: Complete Data Pipeline +**Target**: Tutti i collector e generator implementati +**Completamento**: 60% β†’ 80% + +- [ ] Tutti i 6+ collectors implementati +- [ ] Tutti i 10 generators implementati +- [ ] Base validators +- [ ] Logging completo +- [ ] Error handling robusto + +**Risultato**: Documentazione completa di tutta l'infrastruttura + +--- + +### Milestone 3: Advanced Features +**Target**: Chat + Auto-remediation completo +**Completamento**: 80% β†’ 95% + +- [ ] Chat WebSocket server (`chat/main.py`) +- [ ] Frontend React completato +- [ ] Auto-remediation testing esteso +- [ ] Analytics e dashboard +- [ ] Advanced validators + +**Risultato**: Sistema completo con UI e auto-remediation + +--- + +### Milestone 4: Production Ready +**Target**: Sistema production-ready +**Completamento**: 95% β†’ 100% + +- [ ] Testing completo (unit + integration) +- [ ] Performance optimization +- [ ] Security hardening +- [ ] Documentation completa +- [ ] Monitoring e alerting +- [ ] Backup e disaster recovery + +**Risultato**: Deploy in produzione + +--- + +## πŸ” Analisi Dipendenze Critiche + +### Per Avviare Generazione Docs (MVP) +**Dipendenze minime**: +1. βœ… API Service (giΓ  presente) +2. ❌ CLI tool β†’ **BLOCKING** +3. ❌ Celery workers β†’ **BLOCKING** +4. ❌ Almeno 1 collector β†’ **BLOCKING** +5. ❌ Almeno 1 generator β†’ **BLOCKING** +6. βœ… MongoDB (giΓ  configurato) +7. βœ… Redis (giΓ  configurato) +8. βœ… LLM integration (giΓ  presente) + +**Effort Stimato per MVP**: 3-5 giorni di sviluppo + +--- + +### Per Chat Service Completo +**Dipendenze**: +1. βœ… DocumentationAgent (giΓ  presente) +2. ❌ WebSocket server β†’ **BLOCKING** +3. ⚠️ Frontend chat UI (opzionale - puΓ² usare Postman/WebSocket client) +4. βœ… MongoDB (giΓ  configurato) +5. βœ… LLM integration (giΓ  presente) + +**Effort Stimato**: 1-2 giorni di sviluppo + +--- + +### Per Auto-Remediation Completo +**Dipendenze**: +1. βœ… Auto-remediation engine (giΓ  presente) +2. βœ… Reliability scoring (giΓ  presente) +3. ❌ Celery workers per execution β†’ **BLOCKING** +4. ⚠️ Testing infrastructure (importante per sicurezza) +5. ⚠️ Approval workflows (UI opzionale) + +**Effort Stimato**: 2-3 giorni di sviluppo + testing + +--- + +## πŸ’‘ Raccomandazioni + +### PrioritΓ  Sviluppo Immediato +1. **CLI Tool** (1 giorno) + - Essenziale per gestione sistema + - PermetterΓ  testing manuale + +2. **Celery Workers** (1-2 giorni) + - Necessario per task asincroni + - Fondamentale per generazione docs + +3. 
**1 Collector + 1 Generator** (2-3 giorni) + - Completa il ciclo base + - Permette testing end-to-end + +**Totale effort MVP**: ~5-6 giorni + +### Quick Wins +- βœ… Docker setup Γ¨ completo - infrastruttura OK +- βœ… API Γ¨ funzionante - puΓ² essere testata +- βœ… Database layer Γ¨ pronto - storage OK +- βœ… LLM integration Γ¨ pronta - generazione OK + +**Manca solo**: Logica business per collectors/generators e orchestrazione via Celery + +--- + +## πŸ“ˆ Progress Tracking + +**Last Updated**: 2025-10-19 + +**Current Sprint Focus**: Infrastructure setup βœ… COMPLETATO +**Next Sprint Focus**: Core business logic (Collectors/Generators/Workers) + +**Team Velocity**: N/A +**Estimated Completion**: 2-3 settimane per MVP diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..904b8a6 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,465 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +--- + +## Project Overview + +**LLM Automation - Docs & Remediation Engine**: AI-powered datacenter documentation generation with autonomous problem resolution capabilities. The system uses LLMs to automatically generate infrastructure documentation and can autonomously execute remediation actions on datacenter infrastructure. + +**Current Status**: ~35% complete - Infrastructure and API are functional, but CLI tool, Celery workers, collectors, and generators are not yet implemented. + +**Language**: Python 3.12 (standardized across entire project) + +**Database**: MongoDB with Beanie ODM (async, document-based) + +--- + +## Essential Commands + +### Development Environment Setup + +```bash +# Install dependencies +poetry install + +# Start Docker development stack (6 services: MongoDB, Redis, API, Chat, Worker, Frontend) +cd deploy/docker +docker-compose -f docker-compose.dev.yml up --build -d + +# Check service status +docker-compose -f docker-compose.dev.yml ps + +# View logs +docker-compose -f docker-compose.dev.yml logs -f api +docker-compose -f docker-compose.dev.yml logs -f --tail=50 api + +# Stop services +docker-compose -f docker-compose.dev.yml down + +# Restart single service after code changes +docker-compose -f docker-compose.dev.yml restart api +``` + +### Testing & Code Quality + +```bash +# Run all tests +poetry run pytest + +# Run specific test file +poetry run pytest tests/test_reliability.py + +# Run with coverage +poetry run pytest --cov=src/datacenter_docs --cov-report=html + +# Linting +poetry run black src/ +poetry run ruff check src/ +poetry run mypy src/ + +# Format code (100 char line length) +poetry run black src/ tests/ +``` + +### Running Services Locally + +```bash +# API server (development with auto-reload) +poetry run uvicorn datacenter_docs.api.main:app --reload --host 0.0.0.0 --port 8000 + +# CLI tool (NOT YET IMPLEMENTED - needs src/datacenter_docs/cli.py) +poetry run datacenter-docs --help + +# Celery worker (NOT YET IMPLEMENTED - needs src/datacenter_docs/workers/) +poetry run docs-worker + +# Chat server (NOT YET IMPLEMENTED - needs src/datacenter_docs/chat/main.py) +poetry run docs-chat +``` + +### Database Operations + +```bash +# Access MongoDB shell in Docker +docker exec -it datacenter-docs-mongodb-dev mongosh -u admin -p admin123 + +# Access Redis CLI +docker exec -it datacenter-docs-redis-dev redis-cli + +# Check database connectivity +curl http://localhost:8000/health +``` + +--- + +## High-Level Architecture + +### 1. 
**LLM Provider System (OpenAI-Compatible API)** + +**Location**: `src/datacenter_docs/utils/llm_client.py` + +**Key Concept**: All LLM interactions go through `LLMClient` which uses the OpenAI SDK and can connect to ANY OpenAI-compatible provider: +- OpenAI (GPT-4, GPT-3.5) +- Anthropic Claude (via OpenAI-compatible endpoint) +- LLMStudio (local models) +- Open-WebUI (local models) +- Ollama (local models) + +**Configuration** (in `.env`): +```bash +LLM_BASE_URL=https://api.openai.com/v1 +LLM_API_KEY=sk-your-key +LLM_MODEL=gpt-4-turbo-preview +``` + +**Usage**: +```python +from datacenter_docs.utils.llm_client import get_llm_client + +llm = get_llm_client() +response = await llm.chat_completion(messages=[...]) +json_response = await llm.generate_json(messages=[...]) +``` + +### 2. **Database Architecture (MongoDB + Beanie ODM)** + +**Location**: `src/datacenter_docs/api/models.py` + +**Key Characteristics**: +- Models inherit from `beanie.Document` +- MongoDB atomic operations +- Async operations: `await Ticket.find_one()`, `await ticket.save()` +- ObjectId for primary keys: `PydanticObjectId` +- Supports embedded documents and references + +**Example**: +```python +from beanie import Document, PydanticObjectId +from datetime import datetime + +class Ticket(Document): + ticket_id: str + status: TicketStatus + created_at: datetime = datetime.now() + + class Settings: + name = "tickets" # Collection name + indexes = ["ticket_id", "status"] + +# Usage +ticket = await Ticket.find_one(Ticket.ticket_id == "INC-123") +ticket.status = TicketStatus.RESOLVED +await ticket.save() +``` + +### 3. **Auto-Remediation Decision Flow** + +**Multi-layered safety system** that decides whether AI can execute infrastructure changes. + +**Flow** (`src/datacenter_docs/api/reliability.py` β†’ `auto_remediation.py`): + +``` +Ticket Created + ↓ +ReliabilityCalculator.calculate_reliability() + β”œβ”€ AI Confidence Score (25%) + β”œβ”€ Human Feedback History (30%) + β”œβ”€ Historical Success Rate (25%) + └─ Pattern Matching (20%) + ↓ +Overall Reliability Score (0-100%) + ↓ +AutoRemediationDecisionEngine.should_execute() + β”œβ”€ Check if enabled for ticket + β”œβ”€ Check minimum reliability (85%) + β”œβ”€ Check action risk level + β”œβ”€ Check rate limits + └─ Determine if approval needed + ↓ +AutoRemediationEngine.execute_remediation() + β”œβ”€ Pre-execution checks + β”œβ”€ Execute via MCP Client + β”œβ”€ Post-execution validation + └─ Log everything +``` + +**Key Classes**: +- `ReliabilityCalculator`: Calculates weighted reliability score +- `AutoRemediationDecisionEngine`: Decides if/how to execute +- `AutoRemediationEngine`: Actually executes actions via MCP + +### 4. **MCP Client Integration** + +**Location**: `src/datacenter_docs/mcp/client.py` + +MCP (Model Context Protocol) is the bridge to infrastructure. It's an external service that connects to VMware, Kubernetes, network devices, etc. + +**Important**: MCP Client is EXTERNAL. We don't implement the infrastructure connections - we call MCP's API. + +**Operations**: +- Read operations: Get VM status, list pods, check network config +- Write operations (auto-remediation): Restart VM, scale deployment, enable port + +### 5. 
**Documentation Agent (Agentic AI)** + +**Location**: `src/datacenter_docs/chat/agent.py` + +**Architecture Pattern**: RAG (Retrieval Augmented Generation) + +``` +User Query + ↓ +Vector Search (ChromaDB + HuggingFace embeddings) + ↓ +Retrieve Top-K Relevant Docs + ↓ +Build Context + Query β†’ LLM + ↓ +Generate Response with Citations +``` + +**Key Methods**: +- `search_documentation()`: Semantic search in vector store +- `resolve_ticket()`: Analyze problem + suggest resolution +- `chat_with_context()`: Conversational interface with doc search + +### 6. **Missing Critical Components** (TODO) + +**See `TODO.md` for comprehensive list**. When implementing new features, check TODO.md first. + +**High Priority Missing Components**: + +1. **CLI Tool** (`src/datacenter_docs/cli.py`): + - Entry point: `datacenter-docs` command + - Uses Typer + Rich for CLI + - Commands: generate, serve, worker, init-db, stats + +2. **Celery Workers** (`src/datacenter_docs/workers/`): + - `celery_app.py`: Celery configuration + - `tasks.py`: Async tasks (documentation generation, auto-remediation execution) + - Background task processing + +3. **Collectors** (`src/datacenter_docs/collectors/`): + - Base class exists, implementations missing + - Need: VMware, Kubernetes, Network, Storage collectors + - Pattern: `async def collect() -> dict` + +4. **Generators** (`src/datacenter_docs/generators/`): + - Base class exists, implementations missing + - Need: Infrastructure, Network, Virtualization generators + - Pattern: `async def generate(data: dict) -> str` (returns Markdown) + +**When implementing these**: +- Follow existing patterns in base classes +- Use `LLMClient` for AI generation +- Use `MCPClient` for infrastructure data collection +- All operations are async +- Use MongoDB/Beanie for storage + +--- + +## Code Patterns & Conventions + +### Async/Await + +All operations use asyncio: + +```python +async def my_function(): + result = await some_async_call() +``` + +### Type Hints + +Type hints are required (mypy configured strictly): + +```python +async def process_ticket(ticket_id: str) -> Dict[str, Any]: + ... +``` + +### Logging + +Use structured logging with module-level logger: + +```python +import logging + +logger = logging.getLogger(__name__) + +logger.info(f"Processing ticket {ticket_id}") +logger.error(f"Failed to execute action: {e}", exc_info=True) +``` + +### Configuration + +All config via `src/datacenter_docs/utils/config.py` using Pydantic Settings: + +```python +from datacenter_docs.utils.config import get_settings + +settings = get_settings() +mongodb_url = settings.MONGODB_URL +llm_model = settings.LLM_MODEL +``` + +### Error Handling + +```python +try: + result = await risky_operation() +except SpecificException as e: + logger.error(f"Operation failed: {e}", exc_info=True) + return {"success": False, "error": str(e)} +``` + +--- + +## Docker Development Workflow + +**Primary development environment**: Docker Compose + +**Services in `deploy/docker/docker-compose.dev.yml`**: +- `mongodb`: MongoDB 7 (port 27017) +- `redis`: Redis 7 (port 6379) +- `api`: FastAPI service (port 8000) +- `chat`: WebSocket chat server (port 8001) - **NOT IMPLEMENTED** +- `worker`: Celery worker - **NOT IMPLEMENTED** +- `frontend`: React + Nginx (port 80) - **MINIMAL** + +**Development cycle**: +1. Edit code in `src/` +2. Rebuild and restart affected service: `docker-compose -f docker-compose.dev.yml up --build -d api` +3. Check logs: `docker-compose -f docker-compose.dev.yml logs -f api` +4. 
Test: Access http://localhost:8000/api/docs + +**Volume mounts**: Source code is mounted, so changes are reflected (except for dependency changes which need rebuild). + +--- + +## CI/CD Pipelines + +**Three CI/CD systems configured** (all use Python 3.12): +- `.github/workflows/build-deploy.yml`: GitHub Actions +- `.gitlab-ci.yml`: GitLab CI +- `.gitea/workflows/ci.yml`: Gitea Actions + +**Pipeline stages**: +1. Lint (Black, Ruff) +2. Type check (mypy) +3. Test (pytest) +4. Build Docker image +5. Deploy (if on main branch) + +**When modifying Python version**: Update ALL three pipeline files. + +--- + +## Key Files Reference + +**Core Application**: +- `src/datacenter_docs/api/main.py`: FastAPI application entry point +- `src/datacenter_docs/api/models.py`: MongoDB/Beanie models (all data structures) +- `src/datacenter_docs/utils/config.py`: Configuration management +- `src/datacenter_docs/utils/llm_client.py`: LLM provider abstraction + +**Auto-Remediation**: +- `src/datacenter_docs/api/reliability.py`: Reliability scoring and decision engine +- `src/datacenter_docs/api/auto_remediation.py`: Execution engine with safety checks + +**Infrastructure Integration**: +- `src/datacenter_docs/mcp/client.py`: MCP protocol client +- `src/datacenter_docs/chat/agent.py`: Documentation AI agent (RAG) + +**Configuration**: +- `.env.example`: Template with ALL config options (including LLM provider examples) +- `pyproject.toml`: Dependencies, scripts, linting config (Black 100 char, Python 3.12) + +**Documentation**: +- `README.md`: User-facing documentation +- `TODO.md`: **CRITICAL** - Current project status, missing components, roadmap +- `deploy/docker/README.md`: Docker environment guide + +--- + +## Important Notes + +### Python Version +Use Python 3.12 (standardized across the project). + +### Database Queries +MongoDB queries look different from SQL: +```python +# Find +tickets = await Ticket.find(Ticket.status == TicketStatus.PENDING).to_list() + +# Find one +ticket = await Ticket.find_one(Ticket.ticket_id == "INC-123") + +# Update +ticket.status = TicketStatus.RESOLVED +await ticket.save() + +# Complex query +tickets = await Ticket.find( + Ticket.created_at > datetime.now() - timedelta(days=7), + Ticket.category == "network" +).to_list() +``` + +### LLM API Calls +Use the generic client: +```python +from datacenter_docs.utils.llm_client import get_llm_client + +llm = get_llm_client() +response = await llm.chat_completion(messages=[...]) +``` + +### Auto-Remediation Safety +When implementing new remediation actions: +1. Define action in `RemediationAction` model +2. Set appropriate `ActionRiskLevel` (low/medium/high/critical) +3. Implement pre/post validation checks +4. Add comprehensive logging +5. Test with `dry_run=True` first + +### Testing +Tests are minimal currently. When adding tests: +- Use `pytest-asyncio` for async tests +- Mock MCP client and LLM client +- Test reliability calculations thoroughly +- Test safety checks in auto-remediation + +--- + +## When Implementing New Features + +1. Check `TODO.md` first - component might be partially implemented +2. Follow existing patterns in similar components +3. Use type hints (mypy is strict) +4. Use `LLMClient` for AI operations +5. Use Beanie ORM for database operations +6. All operations are async (use async/await) +7. Test in Docker (primary development environment) +8. Update `TODO.md` when marking components as completed + +--- + +## Questions? 
Check These Files + +- **"How do I configure the LLM provider?"** β†’ `.env.example`, `utils/config.py`, `utils/llm_client.py` +- **"How does auto-remediation work?"** β†’ `api/reliability.py`, `api/auto_remediation.py` +- **"What's not implemented yet?"** β†’ `TODO.md` (comprehensive list with estimates) +- **"How do I run tests/lint?"** β†’ `pyproject.toml` (all commands), this file +- **"Database schema?"** β†’ `api/models.py` (all Beanie models) +- **"Docker services?"** β†’ `deploy/docker/docker-compose.dev.yml`, `deploy/docker/README.md` +- **"API endpoints?"** β†’ `api/main.py`, or http://localhost:8000/api/docs when running + +--- + +**Last Updated**: 2025-10-19 +**Project Status**: 35% complete (Infrastructure done, business logic pending) +**Next Priority**: CLI tool β†’ Celery workers β†’ Collectors β†’ Generators diff --git a/Dockerfile b/Dockerfile index b29da35..e52456e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ # Multi-stage Dockerfile per Datacenter Documentation System # Stage 1: Build MkDocs documentation -FROM python:3.11-slim as docs-builder +FROM python:3.12-slim as docs-builder WORKDIR /build @@ -24,7 +24,7 @@ COPY templates /build/docs/sections/ RUN mkdocs build --clean --strict # Stage 2: Runtime application -FROM python:3.11-slim +FROM python:3.12-slim LABEL maintainer="automation-team@company.com" LABEL description="Datacenter Documentation Server with FastAPI and MCP" diff --git a/README.md b/README.md index 883e8ca..62ab182 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ > AI-powered infrastructure documentation generation with autonomous problem resolution capabilities. [![Version](https://img.shields.io/badge/version-2.0.0-blue.svg)](https://github.com/yourusername/datacenter-docs) -[![Python](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/) +[![Python](https://img.shields.io/badge/python-3.12+-blue.svg)](https://www.python.org/downloads/) [![License](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE) --- @@ -85,7 +85,7 @@ ## πŸš€ Quick Start ### Prerequisites -- Python 3.10+ +- Python 3.12+ - Poetry 1.7+ - Docker & Docker Compose - MCP Server running @@ -141,6 +141,154 @@ kubectl apply -f deploy/kubernetes/ --- +## πŸ’» CLI Tool + +The system includes a comprehensive command-line tool for managing all aspects of the documentation and remediation engine. 
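+
+The `datacenter-docs` entry point is defined in `pyproject.toml` (`datacenter-docs = "datacenter_docs.cli:app"`) and is built with Typer and Rich. As an illustrative sketch only (the real commands live in `src/datacenter_docs/cli.py`), the entry point resolves to a Typer application shaped roughly like this:
+
+```python
+import typer
+from rich.console import Console
+
+# Illustrative sketch: the actual CLI implements serve, worker, init-db,
+# generate, generate-all, list-sections, stats, remediation and version.
+app = typer.Typer(help="Datacenter documentation & remediation engine CLI")
+console = Console()
+
+
+@app.command()
+def version() -> None:
+    """Show version information."""
+    console.print("datacenter-docs 2.0.0")
+
+
+if __name__ == "__main__":
+    app()
+```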
+ +### Available Commands + +```bash +# Initialize database with collections and default data +datacenter-docs init-db + +# Start API server +datacenter-docs serve # Production +datacenter-docs serve --reload # Development with auto-reload + +# Start Celery worker for background tasks +datacenter-docs worker # All queues (default) +datacenter-docs worker --queue documentation # Documentation queue only +datacenter-docs worker --concurrency 8 # Custom concurrency + +# Documentation generation +datacenter-docs generate vmware # Generate specific section +datacenter-docs generate-all # Generate all sections +datacenter-docs list-sections # List available sections + +# System statistics and monitoring +datacenter-docs stats # Last 24 hours +datacenter-docs stats --period 7d # Last 7 days + +# Auto-remediation management +datacenter-docs remediation status # Show all policies +datacenter-docs remediation enable # Enable globally +datacenter-docs remediation disable # Disable globally +datacenter-docs remediation enable --category network # Enable for category +datacenter-docs remediation disable --category network # Disable for category + +# System information +datacenter-docs version # Show version info +datacenter-docs --help # Show help +``` + +### Example Workflow + +```bash +# 1. Setup database +datacenter-docs init-db + +# 2. Start services +datacenter-docs serve --reload & # API in background +datacenter-docs worker & # Worker in background + +# 3. Generate documentation +datacenter-docs list-sections # See available sections +datacenter-docs generate vmware # Generate VMware docs +datacenter-docs generate-all # Generate everything + +# 4. Monitor system +datacenter-docs stats --period 24h # Check statistics + +# 5. Enable auto-remediation for safe categories +datacenter-docs remediation enable --category network +datacenter-docs remediation status # Verify +``` + +### Section IDs + +The following documentation sections are available: +- `vmware` - VMware Infrastructure (vCenter, ESXi) +- `kubernetes` - Kubernetes Clusters +- `network` - Network Infrastructure (switches, routers) +- `storage` - Storage Systems (SAN, NAS) +- `database` - Database Servers +- `monitoring` - Monitoring Systems (Zabbix, Prometheus) +- `security` - Security & Compliance + +--- + +## βš™οΈ Background Workers (Celery) + +The system uses **Celery** for asynchronous task processing with **4 specialized queues** and **8 task types**. + +### Worker Queues + +1. **documentation** - Documentation generation tasks +2. **auto_remediation** - Auto-remediation execution tasks +3. **data_collection** - Infrastructure data collection +4. 
**maintenance** - System cleanup and metrics + +### Available Tasks + +| Task | Queue | Schedule | Description | +|------|-------|----------|-------------| +| `generate_documentation_task` | documentation | Every 6 hours | Full documentation regeneration | +| `generate_section_task` | documentation | On-demand | Single section generation | +| `execute_auto_remediation_task` | auto_remediation | On-demand | Execute remediation actions (rate limit: 10/h) | +| `process_ticket_task` | auto_remediation | On-demand | AI ticket analysis and resolution | +| `collect_infrastructure_data_task` | data_collection | Every 1 hour | Collect infrastructure state | +| `cleanup_old_data_task` | maintenance | Daily 2 AM | Remove old records (90 days) | +| `update_system_metrics_task` | maintenance | Every 15 minutes | Calculate system metrics | + +### Worker Management + +```bash +# Start worker with all queues +datacenter-docs worker + +# Start worker for specific queue only +datacenter-docs worker --queue documentation +datacenter-docs worker --queue auto_remediation +datacenter-docs worker --queue data_collection +datacenter-docs worker --queue maintenance + +# Custom concurrency (default: 4) +datacenter-docs worker --concurrency 8 + +# Custom log level +datacenter-docs worker --log-level DEBUG +``` + +### Celery Beat (Scheduler) + +The system includes **Celery Beat** for periodic task execution: + +```bash +# Start beat scheduler (runs alongside worker) +celery -A datacenter_docs.workers.celery_app beat --loglevel=INFO +``` + +### Monitoring with Flower + +Monitor Celery workers in real-time: + +```bash +# Start Flower web UI (port 5555) +celery -A datacenter_docs.workers.celery_app flower +``` + +Access at: http://localhost:5555 + +### Task Configuration + +- **Timeout**: 1 hour hard limit, 50 minutes soft limit +- **Retry**: Up to 3 retries for failed tasks +- **Prefetch**: 1 task per worker (prevents overload) +- **Max tasks per child**: 1000 (automatic worker restart) +- **Serialization**: JSON (secure and portable) + +--- + ## πŸ“– Documentation ### Core Documentation diff --git a/TODO.md b/TODO.md new file mode 100644 index 0000000..8cdfe8d --- /dev/null +++ b/TODO.md @@ -0,0 +1,722 @@ +# TODO - Componenti da Sviluppare + +**Last Updated**: 2025-10-19 +**Project Completion**: ~55% (Infrastructure + CLI + Workers + VMware Collector complete, generators pending) + +--- + +## βœ… Completamenti Recenti + +### Infrastruttura (100% Complete) +- βœ… **Python 3.12 Migration** - Tutti i file aggiornati da 3.13/3.14 a 3.12 +- βœ… **Docker Development Environment** - Tutti i Dockerfile creati e testati + - `deploy/docker/Dockerfile.api` - Multi-stage build con Poetry + - `deploy/docker/Dockerfile.chat` - WebSocket server (codice da implementare) + - `deploy/docker/Dockerfile.worker` - Celery worker (codice da implementare) + - `deploy/docker/Dockerfile.frontend` - React + Nginx + - `deploy/docker/docker-compose.dev.yml` - Ambiente completo con 6 servizi +- βœ… **CI/CD Pipelines** - GitHub Actions, GitLab CI, Gitea Actions configurati per Python 3.12 +- βœ… **API Service** - FastAPI server funzionante e testato +- βœ… **Database Layer** - MongoDB + Beanie ODM configurato e funzionante +- βœ… **Redis** - Cache e message broker operativi +- βœ… **Auto-Remediation Engine** - Implementato e testato +- βœ… **MCP Client** - Integrazione base con Model Context Protocol +- βœ… **CLI Tool** - Strumento CLI completo con 11 comandi (2025-10-19) +- βœ… **Celery Workers** - Sistema completo task asincroni con 8 task 
(2025-10-19) +- βœ… **VMware Collector** - Collector completo per vSphere con BaseCollector (2025-10-19) + +### Servizi Operativi +```bash +# Servizi attualmente funzionanti in Docker +βœ… MongoDB (porta 27017) - Database principale +βœ… Redis (porta 6379) - Cache e message broker +βœ… API (porta 8000) - FastAPI con health check funzionante +βœ… Worker - Celery worker con 4 code e 8 task +❌ Chat (porta 8001) - Dockerfile pronto, codice mancante (main.py) +❌ Frontend (porta 80) - Build funzionante, app minima +``` + +--- + +## πŸ”΄ Componenti Critici Mancanti + +### 1. Chat Service (WebSocket Server) +**Stato**: ⚠️ Parziale - Solo agent.py presente +**File da creare**: `src/datacenter_docs/chat/main.py` + +**Descrizione**: +- Server WebSocket per chat in tempo reale +- Integrazione con DocumentationAgent esistente +- Gestione sessioni utente +- Conversational memory + +**Dipendenze**: +- βœ… `python-socketio` (giΓ  in pyproject.toml) +- βœ… `websockets` (giΓ  in pyproject.toml) +- βœ… `chat/agent.py` (giΓ  presente) + +**Riferimenti**: +- Script Poetry definito: `docs-chat = "datacenter_docs.chat.main:start"` (line 95 pyproject.toml) +- Dockerfile pronto: `deploy/docker/Dockerfile.chat` +- Porta configurata: 8001 + +--- + +### 2. Celery Worker Service +**Stato**: βœ… **COMPLETATO** +**Directory**: `src/datacenter_docs/workers/` + +**File implementati**: +- βœ… `src/datacenter_docs/workers/__init__.py` - Module initialization +- βœ… `src/datacenter_docs/workers/celery_app.py` - Configurazione Celery completa +- βœ… `src/datacenter_docs/workers/tasks.py` - 8 task asincroni implementati + +**Tasks implementati**: +1. βœ… **generate_documentation_task** - Generazione documentazione periodica (ogni 6 ore) +2. βœ… **generate_section_task** - Generazione sezione specifica +3. βœ… **execute_auto_remediation_task** - Esecuzione azioni correttive +4. βœ… **process_ticket_task** - Processamento ticket con AI +5. βœ… **collect_infrastructure_data_task** - Raccolta dati infrastruttura (ogni ora) +6. βœ… **cleanup_old_data_task** - Pulizia dati vecchi (giornaliero, 2 AM) +7. βœ… **update_system_metrics_task** - Aggiornamento metriche (ogni 15 min) +8. βœ… **Task base class** - DatabaseTask con inizializzazione DB automatica + +**Caratteristiche**: +- βœ… 4 code separate: documentation, auto_remediation, data_collection, maintenance +- βœ… Rate limiting configurato (10 auto-remediation/ora, 5 generazioni/ora) +- βœ… Scheduling periodico con Celery Beat +- βœ… Task lifecycle signals (prerun, postrun, success, failure) +- βœ… Timeout configurati (1h hard, 50min soft) +- βœ… Integrazione completa con MongoDB/Beanie +- βœ… Logging completo e audit trail +- ⚠️ Task skeletons pronti (richiedono Collectors/Generators per funzionalitΓ  complete) + +**Periodic Schedule**: +- Every 6 hours: Full documentation generation +- Every 1 hour: Infrastructure data collection +- Every 15 minutes: System metrics update +- Daily at 2 AM: Old data cleanup + +**Dipendenze**: +- βœ… `celery[redis]` (giΓ  in pyproject.toml) +- βœ… `flower` per monitoring (giΓ  in pyproject.toml) +- βœ… Redis configurato in docker-compose + +**Riferimenti**: +- Script Poetry definito: `docs-worker = "datacenter_docs.workers.celery_app:start"` (line 95 pyproject.toml) +- Dockerfile pronto: `deploy/docker/Dockerfile.worker` +- **Completato il**: 2025-10-19 + +--- + +### 3. 
CLI Tool +**Stato**: βœ… **COMPLETATO** +**File**: `src/datacenter_docs/cli.py` + +**FunzionalitΓ  implementate**: +```bash +# Comandi implementati +datacenter-docs serve # βœ… Avvia API server (uvicorn) +datacenter-docs worker # βœ… Avvia Celery worker (skeleton) +datacenter-docs init-db # βœ… Inizializza database con collezioni e dati +datacenter-docs generate
# βœ… Genera sezione specifica (skeleton) +datacenter-docs generate-all # βœ… Genera tutta la documentazione (skeleton) +datacenter-docs list-sections # βœ… Lista sezioni disponibili +datacenter-docs stats # βœ… Mostra statistiche sistema +datacenter-docs remediation enable # βœ… Abilita auto-remediation +datacenter-docs remediation disable # βœ… Disabilita auto-remediation +datacenter-docs remediation status # βœ… Mostra stato policies +datacenter-docs version # βœ… Info versione +``` + +**Caratteristiche**: +- βœ… Interfaccia Typer con Rich formatting +- βœ… Comandi asincroni con MongoDB/Beanie +- βœ… Gestione completa auto-remediation policies +- βœ… Statistiche in tempo reale +- βœ… Gestione errori e help completo +- ⚠️ Generate commands sono skeleton (richiedono Collectors/Generators) + +**Dipendenze**: +- βœ… `typer` (giΓ  in pyproject.toml) +- βœ… `rich` per output colorato (giΓ  in pyproject.toml) + +**Riferimenti**: +- Script Poetry definito: `datacenter-docs = "datacenter_docs.cli:app"` (line 93 pyproject.toml) +- **Completato il**: 2025-10-19 + +--- + +## 🟑 Componenti da Completare + +### 4. Collectors (Data Collection) +**Stato**: ⚠️ Parziale - Base + VMware implementati (20%) +**Directory**: `src/datacenter_docs/collectors/` + +**File implementati**: +- βœ… `base.py` - BaseCollector abstract class (COMPLETATO 2025-10-19) +- βœ… `vmware_collector.py` - VMware vSphere collector (COMPLETATO 2025-10-19) +- βœ… `__init__.py` - Module exports + +**VMware Collector Features**: +- βœ… Connection via MCP client with fallback to mock data +- βœ… Collects VMs (power state, resources, tools status, IPs) +- βœ… Collects ESXi hosts (hardware, version, uptime, maintenance mode) +- βœ… Collects clusters (DRS, HA, vSAN, resources) +- βœ… Collects datastores (capacity, usage, accessibility) +- βœ… Collects networks (VLANs, port groups, distributed switches) +- βœ… Calculates comprehensive statistics (totals, usage percentages) +- βœ… Data validation with VMware-specific checks +- βœ… MongoDB storage via BaseCollector.store() +- βœ… Integrated with Celery task `collect_infrastructure_data_task` +- βœ… Full async/await workflow with connect/collect/validate/store/disconnect +- βœ… Comprehensive error handling and logging + +**Collectors da implementare**: +- ❌ `kubernetes_collector.py` - Raccolta dati K8s (pods, deployments, services, nodes) +- ❌ `network_collector.py` - Raccolta configurazioni network (via NAPALM/Netmiko) +- ❌ `storage_collector.py` - Raccolta info storage (SAN, NAS) +- ❌ `database_collector.py` - Raccolta metriche database +- ❌ `monitoring_collector.py` - Integrazione con Zabbix/Prometheus + +**BaseCollector Interface**: +```python +class BaseCollector(ABC): + @abstractmethod + async def connect(self) -> bool + @abstractmethod + async def disconnect(self) -> None + @abstractmethod + async def collect(self) -> dict + + async def validate(self, data: dict) -> bool + async def store(self, data: dict) -> bool + async def run(self) -> dict # Full collection workflow + def get_summary(self) -> dict +``` + +--- + +### 5. 
Generators (Documentation Generation) +**Stato**: ⚠️ Solo skeleton +**Directory**: `src/datacenter_docs/generators/` + +**Generators da implementare**: +- `infrastructure_generator.py` - Panoramica infrastruttura +- `network_generator.py` - Documentazione network +- `virtualization_generator.py` - Documentazione VMware/Proxmox +- `kubernetes_generator.py` - Documentazione K8s clusters +- `storage_generator.py` - Documentazione storage +- `database_generator.py` - Documentazione database +- `monitoring_generator.py` - Documentazione monitoring +- `security_generator.py` - Audit e compliance +- `runbook_generator.py` - Procedure operative +- `troubleshooting_generator.py` - Guide risoluzione problemi + +**Pattern comune**: +```python +class BaseGenerator: + async def generate(self, data: dict) -> str # Markdown output + async def render_template(self, template: str, context: dict) -> str + async def save(self, content: str, path: str) -> None +``` + +--- + +### 6. Validators +**Stato**: ⚠️ Solo skeleton +**Directory**: `src/datacenter_docs/validators/` + +**Validators da implementare**: +- `config_validator.py` - Validazione configurazioni +- `security_validator.py` - Security checks +- `compliance_validator.py` - Compliance checks +- `performance_validator.py` - Performance checks + +--- + +## 🟒 Componenti Opzionali/Futuri + +### 7. Frontend React App +**Stato**: ⚠️ Parziale - Solo skeleton +**Directory**: `frontend/src/` + +**Componenti da sviluppare**: +- Dashboard principale +- Viewer documentazione +- Chat interface +- Auto-remediation control panel +- Analytics e statistiche +- Settings e configurazione + +**File esistenti**: +- `App.jsx` e `App_Enhanced.jsx` (probabilmente prototipi) +- Build configurato (Vite + Nginx) + +--- + +### 8. MCP Server +**Stato**: ❓ Esterno al progetto +**Note**: Sembra essere un servizio separato per connettivitΓ  ai device + +**Potrebbe richiedere**: +- Documentazione integrazione +- Client SDK/library +- Examples + +--- + +## πŸ“‹ PrioritΓ  Sviluppo Consigliata + +### Fase 1 - Core Functionality (Alta PrioritΓ ) +1. βœ… **API Service** - COMPLETATO +2. βœ… **CLI Tool** - COMPLETATO (2025-10-19) +3. βœ… **Celery Workers** - COMPLETATO (2025-10-19) +4. πŸ”΄ **Base Collectors** - Almeno 2-3 collector base (NEXT PRIORITY) +5. πŸ”΄ **Base Generators** - Almeno 2-3 generator base + +### Fase 2 - Advanced Features (Media PrioritΓ ) +6. 🟑 **Chat Service** - Per supporto real-time +7. 🟑 **Tutti i Collectors** - Completare raccolta dati +8. 🟑 **Tutti i Generators** - Completare generazione docs +9. 🟑 **Validators** - Validazione e compliance + +### Fase 3 - User Interface (Bassa PrioritΓ ) +10. 🟒 **Frontend React** - UI web completa +11. 🟒 **Dashboard Analytics** - Statistiche e metriche +12. 
🟒 **Admin Panel** - Gestione configurazione + +--- + +## πŸ“Š Stato Attuale Progetto + +### βœ… Funzionante (100%) +- βœ… **API FastAPI** - Server completo con tutti gli endpoint (main.py, models.py, main_enhanced.py) +- βœ… **Auto-remediation Engine** - Sistema completo (auto_remediation.py, reliability.py) +- βœ… **MCP Client** - Integrazione base funzionante (mcp/client.py) +- βœ… **Database Layer** - MongoDB con Beanie ODM completamente configurato (utils/database.py) +- βœ… **Configuration Management** - Sistema completo di gestione config (utils/config.py) +- βœ… **Docker Infrastructure** - Tutti i Dockerfile e docker-compose.dev.yml pronti e testati +- βœ… **CI/CD Pipelines** - GitHub Actions, GitLab CI, Gitea Actions funzionanti +- βœ… **Python Environment** - Python 3.12 standardizzato ovunque + +### ⚠️ Parziale (5-40%) +- ⚠️ **Chat Service** (40%) - DocumentationAgent implementato (chat/agent.py), manca WebSocket server +- ⚠️ **Frontend React** (20%) - Skeleton base con Vite build, app minima funzionante +- ⚠️ **Collectors** (20%) - BaseCollector + VMware collector completati (2025-10-19) +- ⚠️ **Generators** (5%) - Solo directory e __init__.py, nessun generator implementato +- ⚠️ **Validators** (5%) - Solo directory e __init__.py, nessun validator implementato + +### ❌ Mancante (0%) +- ❌ **Collector Implementations** - 5 collectors rimanenti (K8s, Network, Storage, Database, Monitoring) +- ❌ **Generator Implementations** - Nessuno dei 10 generators implementato +- ❌ **Validator Implementations** - Nessun validator implementato +- ❌ **Chat WebSocket Server** - File chat/main.py non esiste +- ❌ **Logging System** - utils/logging.py non esiste +- ❌ **Helper Utilities** - utils/helpers.py non esiste + +### 🎯 Completamento per Categoria +| Categoria | % | Stato | Blockers | +|-----------|---|-------|----------| +| Infrastructure | 100% | βœ… Complete | None | +| API Service | 80% | βœ… Complete | None | +| Database | 70% | βœ… Complete | None | +| Auto-Remediation | 85% | βœ… Complete | None (fully integrated with workers) | +| **CLI Tool** | **100%** | **βœ… Complete** | **None** | +| **Workers** | **100%** | **βœ… Complete** | **None** | +| **Collectors** | **20%** | **🟑 Partial** | **Base + VMware done, 5 more needed** | +| MCP Integration | 60% | 🟑 Partial | External MCP server needed | +| Chat Service | 40% | 🟑 Partial | WebSocket server missing | +| Generators | 5% | πŸ”΄ Critical | All implementations missing | +| Validators | 5% | 🟑 Medium | All implementations missing | +| Frontend | 20% | 🟒 Low | UI components missing | + +**Overall: ~55%** (Infrastructure + CLI + Workers + VMware Collector complete, generators pending) + +--- + +## 🎯 Next Steps Immediati + +### πŸ”₯ CRITICAL PATH - MVP (3-4 giorni effort rimanenti) + +#### Step 1: CLI Tool (1 giorno) - βœ… COMPLETATO +**File**: `src/datacenter_docs/cli.py` +**Status**: βœ… **COMPLETATO il 2025-10-19** +**Risultato**: CLI completo con 11 comandi funzionanti + +**Implementato**: +- βœ… serve: Avvia API server con uvicorn +- βœ… worker: Avvia Celery worker (con gestione errori) +- βœ… init-db: Inizializza database completo +- βœ… generate/generate-all: Skeleton per generazione +- βœ… list-sections: Lista sezioni da DB +- βœ… stats: Statistiche complete +- βœ… remediation enable/disable/status: Gestione policies +- βœ… version: Info sistema + +**Dipendenze**: βœ… Tutte presenti (typer, rich) +**PrioritΓ **: βœ… COMPLETATO + +--- + +#### Step 2: Celery Workers (1-2 giorni) - βœ… COMPLETATO +**Directory**: 
`src/datacenter_docs/workers/`
+**Status**: βœ… **COMPLETATO il 2025-10-19**
+**Risultato**: Sistema completo task asincroni con 8 task e scheduling
+
+**Implementato**:
+- βœ… `__init__.py` - Module initialization
+- βœ… `celery_app.py` - Configurazione completa con 4 code e beat schedule
+- βœ… `tasks.py` - 8 task asincroni completi:
+  - generate_documentation_task (ogni 6h)
+  - generate_section_task
+  - execute_auto_remediation_task (rate limit 10/h)
+  - process_ticket_task
+  - collect_infrastructure_data_task (ogni 1h)
+  - cleanup_old_data_task (giornaliero 2 AM)
+  - update_system_metrics_task (ogni 15min)
+  - DatabaseTask base class
+
+**Caratteristiche**:
+- 4 code: documentation, auto_remediation, data_collection, maintenance
+- Rate limiting e timeout configurati
+- Celery Beat per task periodici
+- Integrazione completa MongoDB/Beanie
+- Task lifecycle signals
+- CLI command funzionante: `datacenter-docs worker`
+
+**Dipendenze**: βœ… Tutte presenti (celery[redis], flower)
+**PrioritΓ **: βœ… COMPLETATO
+
+---
+
+#### Step 3: Primo Collector (1-2 giorni) - βœ… COMPLETATO
+**File**: `src/datacenter_docs/collectors/vmware_collector.py`
+**Status**: βœ… **COMPLETATO il 2025-10-19**
+**Risultato**: Collector VMware completo con MCP integration
+
+**Implementato**:
+- βœ… `base.py` - BaseCollector con full workflow (connect/collect/validate/store/disconnect)
+- βœ… `vmware_collector.py` - Collector completo per vSphere:
+  - collect_vms() - VMs con power state, risorse, tools, IPs
+  - collect_hosts() - ESXi hosts con hardware, version, uptime
+  - collect_clusters() - Clusters con DRS, HA, vSAN
+  - collect_datastores() - Storage con capacitΓ  e utilizzo
+  - collect_networks() - Networks con VLANs e distributed switches
+  - Statistiche comprehensive (totali, percentuali utilizzo)
+  - Validazione VMware-specific
+- βœ… Integrazione con MCP client (con fallback a mock data)
+- βœ… Integrazione con Celery task collect_infrastructure_data_task
+- βœ… MongoDB storage automatico via BaseCollector.store()
+- βœ… Async/await completo con error handling
+
+**Dipendenze**: βœ… pyvmomi giΓ  presente
+**PrioritΓ **: βœ… COMPLETATO
+
+---
+
+#### Step 4: Primo Generator (1-2 giorni)
+**File**: `src/datacenter_docs/generators/infrastructure_generator.py`
+**Status**: ❌ Non implementato
+**Blocca**: Generazione documentazione
+
+**Implementazione minima** (schizzo illustrativo basato sul client LLM generico di `utils/llm_client.py`, coerente con la configurazione `LLM_BASE_URL`/`LLM_MODEL`):
+```python
+from datacenter_docs.generators.base import BaseGenerator
+from datacenter_docs.utils.llm_client import get_llm_client
+
+class InfrastructureGenerator(BaseGenerator):
+    async def generate(self, data: dict) -> str:
+        """Genera documentazione infrastruttura con LLM"""
+        llm = get_llm_client()
+
+        # Genera markdown tramite il provider configurato (OpenAI, Claude, Ollama, ...)
+        # Si assume che chat_completion restituisca il testo generato
+        response = await llm.chat_completion(
+            messages=[
+                {"role": "system", "content": "Genera documentazione infrastruttura in Markdown."},
+                {"role": "user", "content": f"Dati raccolti: {data}"},
+            ]
+        )
+
+        return response
+```
+
+**Dipendenze**: βœ… client LLM generico (`utils/llm_client.py`) giΓ  presente
+**PrioritΓ **: πŸ”΄ ALTA
+
+---
+
+#### Step 5: Testing End-to-End (1 giorno)
+**Scenario MVP**:
+```bash
+# 1. Inizializza DB
+datacenter-docs init-db
+
+# 2. Avvia worker
+datacenter-docs worker &
+
+# 3. Genera documentazione VMware
+datacenter-docs generate vmware
+
+# 4. Verifica API
+curl http://localhost:8000/api/v1/sections/vmware
+
+# 5. 
Verifica MongoDB +# Controlla che i dati siano stati salvati +``` + +**Risultato atteso**: Documentazione VMware generata e disponibile via API + +--- + +### πŸ“‹ SECONDARY TASKS (Post-MVP) + +#### Task 6: Chat WebSocket Server (1-2 giorni) +**File**: `src/datacenter_docs/chat/main.py` +**Status**: ❌ Non esiste +**PrioritΓ **: 🟑 MEDIA + +**Implementazione**: +```python +import socketio +from datacenter_docs.chat.agent import DocumentationAgent + +sio = socketio.AsyncServer(async_mode='asgi') +app = socketio.ASGIApp(sio) + +@sio.event +async def message(sid, data): + agent = DocumentationAgent() + response = await agent.process_query(data['query']) + await sio.emit('response', response, room=sid) +``` + +--- + +#### Task 7: Rimanenti Collectors (3-5 giorni) +- kubernetes_collector.py +- network_collector.py +- storage_collector.py +- database_collector.py +- monitoring_collector.py + +**PrioritΓ **: 🟑 MEDIA + +--- + +#### Task 8: Rimanenti Generators (4-6 giorni) +- network_generator.py +- virtualization_generator.py +- kubernetes_generator.py +- storage_generator.py +- database_generator.py +- monitoring_generator.py +- security_generator.py +- runbook_generator.py +- troubleshooting_generator.py + +**PrioritΓ **: 🟑 MEDIA + +--- + +#### Task 9: Frontend React (5-7 giorni) +- Dashboard principale +- Documentation viewer +- Chat interface +- Auto-remediation panel + +**PrioritΓ **: 🟒 BASSA + +--- + +## πŸ“ Note Tecniche + +### Architettura Target +``` +User Request β†’ API/CLI + ↓ +Celery Task (async) + ↓ +Collectors β†’ Raccolta dati da infrastruttura (via MCP) + ↓ +Generators β†’ Generazione documentazione con LLM (Claude) + ↓ +Storage β†’ MongoDB + ↓ +API Response/Notification +``` + +### Stack Tecnologico Definito +- **Backend**: Python 3.12, FastAPI, Celery +- **Database**: MongoDB (Beanie ODM), Redis +- **LLM**: OpenAI-compatible API (supports OpenAI, Anthropic, LLMStudio, Open-WebUI, Ollama, LocalAI) + - Generic LLM client: `src/datacenter_docs/utils/llm_client.py` + - Configured via: `LLM_BASE_URL`, `LLM_API_KEY`, `LLM_MODEL` + - Default: OpenAI GPT-4 (can be changed to any compatible provider) +- **Frontend**: React 18, Vite, Material-UI +- **Infrastructure**: Docker, Docker Compose +- **CI/CD**: GitHub Actions, GitLab CI, Gitea Actions +- **Monitoring**: Prometheus, Flower (Celery) + +### Dipendenze GiΓ  Configurate +Tutte le dipendenze Python sono giΓ  in `pyproject.toml` e funzionanti. +Nessun package aggiuntivo necessario per iniziare lo sviluppo. + +### πŸ”Œ LLM Provider Configuration + +Il sistema utilizza l'**API standard OpenAI** per massima flessibilitΓ . 
Puoi configurare qualsiasi provider LLM compatibile tramite variabili d'ambiente: + +#### OpenAI (Default) +```bash +LLM_BASE_URL=https://api.openai.com/v1 +LLM_API_KEY=sk-your-openai-key +LLM_MODEL=gpt-4-turbo-preview +``` + +#### Anthropic Claude (via OpenAI-compatible API) +```bash +LLM_BASE_URL=https://api.anthropic.com/v1 +LLM_API_KEY=sk-ant-your-anthropic-key +LLM_MODEL=claude-sonnet-4-20250514 +``` + +#### LLMStudio (Local) +```bash +LLM_BASE_URL=http://localhost:1234/v1 +LLM_API_KEY=not-needed +LLM_MODEL=local-model-name +``` + +#### Open-WebUI (Local) +```bash +LLM_BASE_URL=http://localhost:8080/v1 +LLM_API_KEY=your-open-webui-key +LLM_MODEL=llama3 +``` + +#### Ollama (Local) +```bash +LLM_BASE_URL=http://localhost:11434/v1 +LLM_API_KEY=not-needed +LLM_MODEL=llama3 +``` + +**File di configurazione**: `src/datacenter_docs/utils/config.py` +**Client LLM generico**: `src/datacenter_docs/utils/llm_client.py` +**Utilizzo**: Tutti i componenti usano automaticamente il client configurato + +--- + +## πŸ“… Timeline Stimato + +### Milestone 1: MVP (5-6 giorni) - 80% COMPLETATO +**Obiettivo**: Sistema base funzionante end-to-end +- βœ… Infrastruttura Docker (COMPLETATO) +- βœ… API Service (COMPLETATO) +- βœ… CLI Tool (COMPLETATO 2025-10-19) +- βœ… Celery Workers (COMPLETATO 2025-10-19) +- βœ… 1 Collector (VMware) (COMPLETATO 2025-10-19) +- ❌ 1 Generator (Infrastructure) (1-2 giorni) - NEXT + +**Deliverable**: Comando `datacenter-docs generate vmware` funzionante +**Rimanente**: 1-2 giorni (solo Generator per VMware) + +--- + +### Milestone 2: Core Features (2-3 settimane) +**Obiettivo**: Tutti i collector e generator implementati +- [ ] Tutti i 6 collectors +- [ ] Tutti i 10 generators +- [ ] Base validators +- [ ] Chat WebSocket server +- [ ] Scheduling automatico (ogni 6 ore) + +**Deliverable**: Documentazione completa di tutta l'infrastruttura + +--- + +### Milestone 3: Production (3-4 settimane) +**Obiettivo**: Sistema production-ready +- [ ] Frontend React completo +- [ ] Testing completo +- [ ] Performance optimization +- [ ] Security hardening +- [ ] Monitoring e alerting + +**Deliverable**: Deploy in produzione + +--- + +## πŸš€ Quick Start per Developer + +### Setup Ambiente Sviluppo +```bash +# 1. Clone e setup +git clone +cd llm-automation-docs-and-remediation-engine + +# 2. Install dependencies +poetry install + +# 3. Avvia stack Docker +cd deploy/docker +docker-compose -f docker-compose.dev.yml up -d + +# 4. Verifica servizi +docker-compose -f docker-compose.dev.yml ps +curl http://localhost:8000/health + +# 5. Accedi al container API per sviluppo +docker exec -it datacenter-api bash +``` + +### Development Workflow +```bash +# Durante sviluppo, modifica codice in src/ +# I volumi Docker sono montati, quindi le modifiche sono immediate + +# Restart servizi dopo modifiche +cd deploy/docker +docker-compose -f docker-compose.dev.yml restart api + +# Visualizza logs +docker-compose -f docker-compose.dev.yml logs -f api +``` + +### Cosa Implementare per Primo +1. **src/datacenter_docs/cli.py** - CLI tool base +2. **src/datacenter_docs/workers/celery_app.py** - Celery setup +3. **src/datacenter_docs/collectors/base.py** - Base collector class +4. **src/datacenter_docs/collectors/vmware_collector.py** - Primo collector +5. **src/datacenter_docs/generators/base.py** - Base generator class +6. 
+
+### Testing
+```bash
+# Unit tests
+poetry run pytest
+
+# A specific test
+poetry run pytest tests/test_collectors/test_vmware.py
+
+# Coverage
+poetry run pytest --cov=src/datacenter_docs --cov-report=html
+```
+
+---
+
+## 📊 Summary
+
+| Status | Count | % |
+|--------|-------|---|
+| ✅ Completed | ~9 main components | 55% |
+| ⚠️ Partial | 4 components | 15% |
+| ❌ To implement | ~20 components | 30% |
+
+**Immediate focus**: Generator (VMware Infrastructure) (1-2 days) → completes the MVP
+
+**Estimated Time to MVP**: 1-2 days remaining (only the Infrastructure Generator)
+**Estimated Time to Production**: 2-3 weeks full-time
+
+---
+
+**Last Updated**: 2025-10-19
+**Next Review**: after MVP completion (CLI + Workers + 1 Collector + 1 Generator)
diff --git a/deploy/docker/Dockerfile.api b/deploy/docker/Dockerfile.api new file mode 100644 index 0000000..e281d4b --- /dev/null +++ b/deploy/docker/Dockerfile.api @@ -0,0 +1,64 @@ +# Dockerfile for FastAPI API Service +FROM python:3.12-slim as builder + +WORKDIR /build + +# Install Poetry +RUN pip install --no-cache-dir poetry==1.8.0 + +# Copy dependency files +COPY pyproject.toml poetry.lock ./ + +# Export dependencies +RUN poetry config virtualenvs.create false \ + && poetry export -f requirements.txt --output requirements.txt --without-hashes + +# Runtime stage +FROM python:3.12-slim + +LABEL maintainer="automation-team@company.com" +LABEL description="Datacenter Documentation API Server" + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + gcc \ + libpq-dev \ + openssh-client \ + snmp \ + curl \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Copy requirements from builder +COPY --from=builder /build/requirements.txt . + +# Install Python dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code and package definition +COPY src/ /app/src/ +COPY config/ /app/config/ +COPY pyproject.toml README.md /app/ + +# Install the package in editable mode +RUN pip install --no-cache-dir -e /app + +# Create necessary directories +RUN mkdir -p /app/logs /app/output + +# Create non-root user +RUN useradd -m -u 1000 appuser && \ + chown -R appuser:appuser /app + +USER appuser + +# Expose API port +EXPOSE 8000 + +# Health check +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD curl -f http://localhost:8000/health || exit 1 + +# Run the API server +CMD ["python", "-m", "uvicorn", "datacenter_docs.api.main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/deploy/docker/Dockerfile.chat b/deploy/docker/Dockerfile.chat new file mode 100644 index 0000000..7cd161b --- /dev/null +++ b/deploy/docker/Dockerfile.chat @@ -0,0 +1,60 @@ +# Dockerfile for Chat Service +FROM python:3.12-slim as builder + +WORKDIR /build + +# Install Poetry +RUN pip install --no-cache-dir poetry==1.8.0 + +# Copy dependency files +COPY pyproject.toml poetry.lock ./ + +# Export dependencies +RUN poetry config virtualenvs.create false \ + && poetry export -f requirements.txt --output requirements.txt --without-hashes + +# Runtime stage +FROM python:3.12-slim + +LABEL maintainer="automation-team@company.com" +LABEL description="Datacenter Documentation Chat Server" + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + curl \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Copy requirements from builder +COPY --from=builder /build/requirements.txt .
+ +# Install Python dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code and package definition +COPY src/ /app/src/ +COPY config/ /app/config/ +COPY pyproject.toml README.md /app/ + +# Install the package in editable mode +RUN pip install --no-cache-dir -e /app + +# Create necessary directories +RUN mkdir -p /app/logs + +# Create non-root user +RUN useradd -m -u 1000 appuser && \ + chown -R appuser:appuser /app + +USER appuser + +# Expose chat port +EXPOSE 8001 + +# Health check +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD curl -f http://localhost:8001/health || exit 1 + +# Run the chat server +CMD ["python", "-m", "datacenter_docs.chat.main"] diff --git a/deploy/docker/Dockerfile.frontend b/deploy/docker/Dockerfile.frontend new file mode 100644 index 0000000..8b74bfe --- /dev/null +++ b/deploy/docker/Dockerfile.frontend @@ -0,0 +1,41 @@ +# Dockerfile for React Frontend +# Build stage +FROM node:20-alpine as builder + +WORKDIR /build + +# Copy package files +COPY frontend/package*.json ./ + +# Install dependencies +RUN npm install + +# Copy frontend source code +COPY frontend/src ./src +COPY frontend/index.html ./ +COPY frontend/vite.config.js ./ + +# Build the frontend +RUN npm run build + +# Production stage with nginx +FROM nginx:alpine + +LABEL maintainer="automation-team@company.com" +LABEL description="Datacenter Documentation Frontend" + +# Copy built assets from builder +COPY --from=builder /build/dist /usr/share/nginx/html + +# Copy nginx configuration +COPY deploy/docker/nginx.conf /etc/nginx/conf.d/default.conf + +# Expose port +EXPOSE 80 + +# Health check +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD wget --no-verbose --tries=1 --spider http://localhost/health || exit 1 + +# Run nginx +CMD ["nginx", "-g", "daemon off;"] diff --git a/deploy/docker/Dockerfile.worker b/deploy/docker/Dockerfile.worker new file mode 100644 index 0000000..c37309b --- /dev/null +++ b/deploy/docker/Dockerfile.worker @@ -0,0 +1,57 @@ +# Dockerfile for Celery Worker Service +FROM python:3.12-slim as builder + +WORKDIR /build + +# Install Poetry +RUN pip install --no-cache-dir poetry==1.8.0 + +# Copy dependency files +COPY pyproject.toml poetry.lock ./ + +# Export dependencies +RUN poetry config virtualenvs.create false \ + && poetry export -f requirements.txt --output requirements.txt --without-hashes + +# Runtime stage +FROM python:3.12-slim + +LABEL maintainer="automation-team@company.com" +LABEL description="Datacenter Documentation Background Worker" + +# Install system dependencies for network automation +RUN apt-get update && apt-get install -y \ + gcc \ + libpq-dev \ + openssh-client \ + snmp \ + curl \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Copy requirements from builder +COPY --from=builder /build/requirements.txt . 
+ +# Install Python dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code and package definition +COPY src/ /app/src/ +COPY config/ /app/config/ +COPY pyproject.toml README.md /app/ + +# Install the package in editable mode +RUN pip install --no-cache-dir -e /app + +# Create necessary directories +RUN mkdir -p /app/logs /app/output + +# Create non-root user +RUN useradd -m -u 1000 appuser && \ + chown -R appuser:appuser /app + +USER appuser + +# Run the Celery worker +CMD ["celery", "-A", "datacenter_docs.workers.celery_app", "worker", "--loglevel=info", "--concurrency=4"] diff --git a/deploy/docker/README.md b/deploy/docker/README.md new file mode 100644 index 0000000..e5072e3 --- /dev/null +++ b/deploy/docker/README.md @@ -0,0 +1,121 @@ +# Docker Development Environment + +This directory contains Docker configurations for running the Datacenter Documentation System in development mode. + +## Prerequisites + +- Docker Engine 20.10+ +- Docker Compose V2 +- At least 4GB RAM available for Docker + +## Quick Start + +```bash +# Start all services +cd deploy/docker +docker-compose -f docker-compose.dev.yml up -d + +# View logs +docker-compose -f docker-compose.dev.yml logs -f + +# Stop all services +docker-compose -f docker-compose.dev.yml down +``` + +## Environment Variables + +Create a `.env` file in the project root with: + +```env +ANTHROPIC_API_KEY=your_api_key_here +MCP_SERVER_URL=http://localhost:8001 +``` + +## Services + +### Running Services + +| Service | Port | Description | Status | +|---------|------|-------------|--------| +| **API** | 8000 | FastAPI documentation server | βœ… Healthy | +| **MongoDB** | 27017 | Database | βœ… Healthy | +| **Redis** | 6379 | Cache & message broker | βœ… Healthy | +| **Frontend** | 80 | React web interface | ⚠️ Running | +| **Flower** | 5555 | Celery monitoring | βœ… Running | + +### Not Implemented Yet + +- **Chat Service** (port 8001) - WebSocket chat interface +- **Worker Service** - Celery background workers + +These services are commented out in docker-compose.dev.yml and will be enabled when implemented. + +## Access Points + +- **API Documentation**: http://localhost:8000/docs +- **API Health**: http://localhost:8000/health +- **Frontend**: http://localhost +- **Flower (Celery Monitor)**: http://localhost:5555 +- **MongoDB**: `mongodb://admin:admin123@localhost:27017` +- **Redis**: `localhost:6379` + +## Build Individual Services + +```bash +# Rebuild a specific service +docker-compose -f docker-compose.dev.yml up --build -d api + +# View logs for a specific service +docker-compose -f docker-compose.dev.yml logs -f api +``` + +## Troubleshooting + +### API not starting + +Check logs: +```bash +docker-compose -f docker-compose.dev.yml logs api +``` + +### MongoDB connection issues + +Ensure MongoDB is healthy: +```bash +docker-compose -f docker-compose.dev.yml ps mongodb +``` + +### Clear volumes and restart + +```bash +docker-compose -f docker-compose.dev.yml down -v +docker-compose -f docker-compose.dev.yml up --build -d +``` + +## Development Workflow + +1. **Code changes** are mounted as volumes, so changes to `src/` are reflected immediately +2. **Restart services** after dependency changes: + ```bash + docker-compose -f docker-compose.dev.yml restart api + ``` +3. 
**Rebuild** after pyproject.toml changes: + ```bash + docker-compose -f docker-compose.dev.yml up --build -d api + ``` + +## Files + +- `Dockerfile.api` - FastAPI service +- `Dockerfile.chat` - Chat service (not yet implemented) +- `Dockerfile.worker` - Celery worker (not yet implemented) +- `Dockerfile.frontend` - React frontend with Nginx +- `docker-compose.dev.yml` - Development orchestration +- `nginx.conf` - Nginx configuration for frontend + +## Notes + +- Python version: 3.12 +- Black formatter uses Python 3.12 target +- Services use Poetry for dependency management +- Frontend uses Vite for building diff --git a/deploy/docker/docker-compose.dev.yml b/deploy/docker/docker-compose.dev.yml new file mode 100644 index 0000000..a13f62a --- /dev/null +++ b/deploy/docker/docker-compose.dev.yml @@ -0,0 +1,177 @@ +version: '3.8' + +services: + # MongoDB Database + mongodb: + image: mongo:7-jammy + container_name: datacenter-docs-mongodb-dev + ports: + - "27017:27017" + environment: + MONGO_INITDB_ROOT_USERNAME: admin + MONGO_INITDB_ROOT_PASSWORD: admin123 + MONGO_INITDB_DATABASE: datacenter_docs + volumes: + - mongodb-data:/data/db + - mongodb-config:/data/configdb + networks: + - datacenter-network + healthcheck: + test: ["CMD", "mongosh", "--eval", "db.adminCommand('ping')"] + interval: 10s + timeout: 5s + retries: 5 + + # Redis Cache & Message Broker + redis: + image: redis:7-alpine + container_name: datacenter-docs-redis-dev + ports: + - "6379:6379" + command: redis-server --appendonly yes + volumes: + - redis-data:/data + networks: + - datacenter-network + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 10s + timeout: 5s + retries: 5 + + # FastAPI API Service + api: + build: + context: ../.. + dockerfile: deploy/docker/Dockerfile.api + container_name: datacenter-docs-api-dev + ports: + - "8000:8000" + environment: + - MONGODB_URL=mongodb://admin:admin123@mongodb:27017 + - MONGODB_DATABASE=datacenter_docs + - REDIS_URL=redis://redis:6379 + - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY} + - MCP_SERVER_URL=${MCP_SERVER_URL:-http://localhost:8001} + - LOG_LEVEL=DEBUG + volumes: + - ../../src:/app/src + - ../../config:/app/config + - api-logs:/app/logs + - api-output:/app/output + depends_on: + mongodb: + condition: service_healthy + redis: + condition: service_healthy + networks: + - datacenter-network + restart: unless-stopped + + # Chat Service + chat: + build: + context: ../.. + dockerfile: deploy/docker/Dockerfile.chat + container_name: datacenter-docs-chat-dev + ports: + - "8001:8001" + environment: + - MONGODB_URL=mongodb://admin:admin123@mongodb:27017 + - MONGODB_DATABASE=datacenter_docs + - REDIS_URL=redis://redis:6379 + - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY} + - LOG_LEVEL=DEBUG + volumes: + - ../../src:/app/src + - ../../config:/app/config + - chat-logs:/app/logs + depends_on: + mongodb: + condition: service_healthy + redis: + condition: service_healthy + networks: + - datacenter-network + restart: unless-stopped + + # Celery Worker + worker: + build: + context: ../.. 
+ dockerfile: deploy/docker/Dockerfile.worker + container_name: datacenter-docs-worker-dev + environment: + - MONGODB_URL=mongodb://admin:admin123@mongodb:27017 + - MONGODB_DATABASE=datacenter_docs + - REDIS_URL=redis://redis:6379 + - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY} + - LOG_LEVEL=DEBUG + volumes: + - ../../src:/app/src + - ../../config:/app/config + - worker-logs:/app/logs + - worker-output:/app/output + depends_on: + mongodb: + condition: service_healthy + redis: + condition: service_healthy + networks: + - datacenter-network + restart: unless-stopped + + # Flower - Celery Monitoring + flower: + image: mher/flower:2.0 + container_name: datacenter-docs-flower-dev + ports: + - "5555:5555" + environment: + - CELERY_BROKER_URL=redis://redis:6379 + - CELERY_RESULT_BACKEND=redis://redis:6379 + - FLOWER_PORT=5555 + depends_on: + - redis + - worker + networks: + - datacenter-network + restart: unless-stopped + + # Frontend + frontend: + build: + context: ../.. + dockerfile: deploy/docker/Dockerfile.frontend + container_name: datacenter-docs-frontend-dev + ports: + - "80:80" + depends_on: + - api + - chat + networks: + - datacenter-network + restart: unless-stopped + +volumes: + mongodb-data: + name: datacenter-docs-mongodb-data-dev + mongodb-config: + name: datacenter-docs-mongodb-config-dev + redis-data: + name: datacenter-docs-redis-data-dev + api-logs: + name: datacenter-docs-api-logs-dev + api-output: + name: datacenter-docs-api-output-dev + chat-logs: + name: datacenter-docs-chat-logs-dev + worker-logs: + name: datacenter-docs-worker-logs-dev + worker-output: + name: datacenter-docs-worker-output-dev + +networks: + datacenter-network: + name: datacenter-docs-network-dev + driver: bridge diff --git a/deploy/docker/nginx.conf b/deploy/docker/nginx.conf new file mode 100644 index 0000000..4caa547 --- /dev/null +++ b/deploy/docker/nginx.conf @@ -0,0 +1,61 @@ +server { + listen 80; + server_name _; + + root /usr/share/nginx/html; + index index.html; + + # Gzip compression + gzip on; + gzip_vary on; + gzip_min_length 1024; + gzip_types text/plain text/css text/xml text/javascript application/x-javascript application/xml+rss application/json; + + # Security headers + add_header X-Frame-Options "SAMEORIGIN" always; + add_header X-Content-Type-Options "nosniff" always; + add_header X-XSS-Protection "1; mode=block" always; + + # Health check endpoint + location /health { + access_log off; + return 200 "OK\n"; + add_header Content-Type text/plain; + } + + # API proxy + location /api/ { + proxy_pass http://api:8000/; + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection 'upgrade'; + proxy_set_header Host $host; + proxy_cache_bypass $http_upgrade; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } + + # WebSocket for chat + location /ws/ { + proxy_pass http://chat:8001/; + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } + + # React app - all routes go to index.html + location / { + try_files $uri $uri/ /index.html; + } + + # Cache static assets + location ~* \.(js|css|png|jpg|jpeg|gif|ico|svg|woff|woff2|ttf|eot)$ { + expires 1y; + add_header Cache-Control "public, immutable"; + } +} diff --git 
a/frontend/index.html b/frontend/index.html new file mode 100644 index 0000000..f1ca536 --- /dev/null +++ b/frontend/index.html @@ -0,0 +1,13 @@
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="UTF-8" />
+    <link rel="icon" type="image/svg+xml" href="/vite.svg" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <title>Datacenter Documentation System</title>
+  </head>
+  <body>
+    <div id="root"></div>
+    <script type="module" src="/src/main.jsx"></script>
+  </body>
+</html>
diff --git a/frontend/src/main.jsx b/frontend/src/main.jsx new file mode 100644 index 0000000..51a8c58 --- /dev/null +++ b/frontend/src/main.jsx @@ -0,0 +1,9 @@
+import React from 'react'
+import ReactDOM from 'react-dom/client'
+import App from './App.jsx'
+
+ReactDOM.createRoot(document.getElementById('root')).render(
+  <React.StrictMode>
+    <App />
+  </React.StrictMode>,
+)
diff --git a/frontend/vite.config.js b/frontend/vite.config.js new file mode 100644 index 0000000..03af39f --- /dev/null +++ b/frontend/vite.config.js @@ -0,0 +1,30 @@ +import { defineConfig } from 'vite' +import react from '@vitejs/plugin-react' + +// https://vitejs.dev/config/ +export default defineConfig({ + plugins: [react()], + server: { + host: '0.0.0.0', + port: 3000, + proxy: { + '/api': { + target: 'http://localhost:8000', + changeOrigin: true, + rewrite: (path) => path.replace(/^\/api/, '') + }, + '/ws': { + target: 'http://localhost:8001', + changeOrigin: true, + ws: true, + rewrite: (path) => path.replace(/^\/ws/, '') + } + } + }, + build: { + outDir: 'dist', + sourcemap: false, + minify: 'esbuild', + chunkSizeWarningLimit: 1000 + } +}) diff --git a/pyproject.toml b/pyproject.toml index d84274a..8f62bb1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ readme = "README.md" packages = [{include = "datacenter_docs", from = "src"}] [tool.poetry.dependencies] -python = "^3.14" +python = "^3.12" # Web Framework fastapi = "^0.115.0" @@ -24,7 +24,7 @@ beanie = "^1.27.0" # ODM for MongoDB # MCP (Model Context Protocol) # mcp = "^0.1.0" # Package name might be different -anthropic = "^0.42.0" +openai = "^1.58.0" # OpenAI-compatible API for multiple LLM providers # Network and Device Management paramiko = "^3.5.0" @@ -75,7 +75,7 @@ flower = "^2.0.1" # LLM Integration langchain = "^0.3.0" -langchain-anthropic = "^0.3.0" +langchain-community = "^0.3.0" # chromadb = "^0.5.0" # Requires Visual C++ Build Tools on Windows [tool.poetry.group.dev.dependencies] @@ -100,7 +100,7 @@ build-backend = "poetry.core.masonry.api" [tool.black] line-length = 100 -target-version = ['py314'] +target-version = ['py312'] include = '\.pyi?$' [tool.ruff] @@ -115,7 +115,7 @@ ignore = ["E501"] "src/datacenter_docs/api/main_enhanced.py" = ["F821"] [tool.mypy] -python_version = "3.14" +python_version = "3.12" warn_return_any = true warn_unused_configs = true disallow_untyped_defs = true diff --git a/src/datacenter_docs/api/main.py b/src/datacenter_docs/api/main.py index f355b24..7e88640 100644 --- a/src/datacenter_docs/api/main.py +++ b/src/datacenter_docs/api/main.py @@ -162,8 +162,8 @@ async def create_ticket( ) await db_ticket.insert() - # Initialize documentation agent - agent = DocumentationAgent(mcp_client=mcp, anthropic_api_key=settings.ANTHROPIC_API_KEY) + # Initialize documentation agent (uses default LLM client from config) + agent = DocumentationAgent(mcp_client=mcp) # Process ticket in background background_tasks.add_task( @@ -256,7 +256,8 @@ async def search_documentation( Uses semantic search to find relevant documentation sections """ try: - agent = DocumentationAgent(mcp_client=mcp, anthropic_api_key=settings.ANTHROPIC_API_KEY) + # Initialize documentation agent (uses default LLM client from config) + agent = DocumentationAgent(mcp_client=mcp) results = await agent.search_documentation( query=query.query, sections=query.sections, limit=query.limit diff --git a/src/datacenter_docs/api/main.py.bak b/src/datacenter_docs/api/main.py.bak deleted file mode 100644 index c16d231..0000000 --- a/src/datacenter_docs/api/main.py.bak +++ /dev/null @@ -1,384 +0,0 @@
-""" -FastAPI application for datacenter documentation and ticket resolution -""" - -from fastapi import FastAPI, HTTPException, BackgroundTasks, Depends, File, UploadFile -from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import StreamingResponse -from pydantic import BaseModel, Field -from typing import List, Optional, Dict, Any -from datetime import datetime -import logging -from pathlib import Path - -from ..mcp.client import MCPClient, MCPCollector -from ..chat.agent import DocumentationAgent -from ..utils.config import get_settings -from ..utils.database import get_db, Session -from . import models, schemas - -logger = logging.getLogger(__name__) -settings = get_settings() - -# FastAPI app -app = FastAPI( - title="Datacenter Documentation API", - description="API for automated documentation and ticket resolution", - version="1.0.0", - docs_url="/api/docs", - redoc_url="/api/redoc" -) - -# CORS -app.add_middleware( - CORSMiddleware, - allow_origins=settings.CORS_ORIGINS, - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) - - -# Pydantic models -class TicketCreate(BaseModel): - """Ticket creation request""" - ticket_id: str = Field(..., description="External ticket ID") - title: str = Field(..., description="Ticket title") - description: str = Field(..., description="Problem description") - priority: str = Field(default="medium", description="Priority: low, medium, high, critical") - category: Optional[str] = Field(None, description="Category: network, server, storage, etc.") - requester: Optional[str] = Field(None, description="Requester email") - metadata: Optional[Dict[str, Any]] = Field(default_factory=dict) - - -class TicketResponse(BaseModel): - """Ticket response""" - ticket_id: str - status: str - resolution: Optional[str] = None - suggested_actions: List[str] = [] - related_docs: List[Dict[str, str]] = [] - confidence_score: float - processing_time: float - created_at: datetime - updated_at: datetime - - -class DocumentationQuery(BaseModel): - """Documentation query""" - query: str = Field(..., description="Search query") - sections: Optional[List[str]] = Field(None, description="Specific sections to search") - limit: int = Field(default=5, ge=1, le=20) - - -class DocumentationResult(BaseModel): - """Documentation search result""" - section: str - title: str - content: str - relevance_score: float - last_updated: datetime - - -# Dependency for MCP client -async def get_mcp_client(): - """Get MCP client instance""" - async with MCPClient( - server_url=settings.MCP_SERVER_URL, - api_key=settings.MCP_API_KEY - ) as client: - yield client - - -# Health check -@app.get("/health") -async def health_check(): - """Health check endpoint""" - return { - "status": "healthy", - "timestamp": datetime.now().isoformat(), - "version": "1.0.0" - } - - -# Ticket Resolution API -@app.post("/api/v1/tickets", response_model=TicketResponse, status_code=201) -async def create_ticket( - ticket: TicketCreate, - background_tasks: BackgroundTasks, - db: Session = Depends(get_db), - mcp: MCPClient = Depends(get_mcp_client) -): - """ - Create and automatically process a ticket - - This endpoint receives a ticket from external systems and automatically: - 1. Searches relevant documentation - 2. Analyzes the problem - 3. Suggests resolution steps - 4. 
Provides confidence score - """ - start_time = datetime.now() - - try: - # Create ticket in database - db_ticket = models.Ticket( - ticket_id=ticket.ticket_id, - title=ticket.title, - description=ticket.description, - priority=ticket.priority, - category=ticket.category, - requester=ticket.requester, - status="processing", - metadata=ticket.metadata - ) - db.add(db_ticket) - db.commit() - db.refresh(db_ticket) - - # Initialize documentation agent - agent = DocumentationAgent( - mcp_client=mcp, - anthropic_api_key=settings.ANTHROPIC_API_KEY - ) - - # Process ticket in background - background_tasks.add_task( - process_ticket_resolution, - agent=agent, - ticket_id=ticket.ticket_id, - description=ticket.description, - category=ticket.category, - db=db - ) - - processing_time = (datetime.now() - start_time).total_seconds() - - return TicketResponse( - ticket_id=ticket.ticket_id, - status="processing", - resolution=None, - suggested_actions=["Analyzing ticket..."], - related_docs=[], - confidence_score=0.0, - processing_time=processing_time, - created_at=db_ticket.created_at, - updated_at=db_ticket.updated_at - ) - - except Exception as e: - logger.error(f"Failed to create ticket: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -@app.get("/api/v1/tickets/{ticket_id}", response_model=TicketResponse) -async def get_ticket( - ticket_id: str, - db: Session = Depends(get_db) -): - """Get ticket status and resolution""" - ticket = db.query(models.Ticket).filter(models.Ticket.ticket_id == ticket_id).first() - - if not ticket: - raise HTTPException(status_code=404, detail="Ticket not found") - - return TicketResponse( - ticket_id=ticket.ticket_id, - status=ticket.status, - resolution=ticket.resolution, - suggested_actions=ticket.suggested_actions or [], - related_docs=ticket.related_docs or [], - confidence_score=ticket.confidence_score or 0.0, - processing_time=ticket.processing_time or 0.0, - created_at=ticket.created_at, - updated_at=ticket.updated_at - ) - - -# Documentation Search API -@app.post("/api/v1/documentation/search", response_model=List[DocumentationResult]) -async def search_documentation( - query: DocumentationQuery, - mcp: MCPClient = Depends(get_mcp_client) -): - """ - Search datacenter documentation - - Uses semantic search to find relevant documentation sections - """ - try: - agent = DocumentationAgent( - mcp_client=mcp, - anthropic_api_key=settings.ANTHROPIC_API_KEY - ) - - results = await agent.search_documentation( - query=query.query, - sections=query.sections, - limit=query.limit - ) - - return [ - DocumentationResult( - section=r["section"], - title=r["title"], - content=r["content"], - relevance_score=r["relevance_score"], - last_updated=r["last_updated"] - ) - for r in results - ] - - except Exception as e: - logger.error(f"Search failed: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -# Documentation Generation API -@app.post("/api/v1/documentation/generate/{section}") -async def generate_documentation( - section: str, - background_tasks: BackgroundTasks, - mcp: MCPClient = Depends(get_mcp_client) -): - """ - Trigger documentation generation for a specific section - - Returns immediately and processes in background - """ - valid_sections = [ - "infrastructure", "network", "virtualization", "storage", - "security", "backup", "monitoring", "database", "procedures", "improvements" - ] - - if section not in valid_sections: - raise HTTPException( - status_code=400, - detail=f"Invalid section. 
Must be one of: {', '.join(valid_sections)}" - ) - - background_tasks.add_task(generate_section_task, section=section, mcp=mcp) - - return { - "status": "processing", - "section": section, - "message": f"Documentation generation started for section: {section}" - } - - -@app.get("/api/v1/documentation/sections") -async def list_sections(): - """List all available documentation sections""" - sections = [ - {"id": "infrastructure", "name": "Infrastructure", "updated": None}, - {"id": "network", "name": "Networking", "updated": None}, - {"id": "virtualization", "name": "Virtualization", "updated": None}, - {"id": "storage", "name": "Storage", "updated": None}, - {"id": "security", "name": "Security", "updated": None}, - {"id": "backup", "name": "Backup & DR", "updated": None}, - {"id": "monitoring", "name": "Monitoring", "updated": None}, - {"id": "database", "name": "Database", "updated": None}, - {"id": "procedures", "name": "Procedures", "updated": None}, - {"id": "improvements", "name": "Improvements", "updated": None}, - ] - - # TODO: Add actual last_updated timestamps from database - return sections - - -# Stats and Metrics -@app.get("/api/v1/stats/tickets") -async def get_ticket_stats(db: Session = Depends(get_db)): - """Get ticket resolution statistics""" - from sqlalchemy import func - - stats = { - "total": db.query(func.count(models.Ticket.id)).scalar(), - "resolved": db.query(func.count(models.Ticket.id)).filter( - models.Ticket.status == "resolved" - ).scalar(), - "processing": db.query(func.count(models.Ticket.id)).filter( - models.Ticket.status == "processing" - ).scalar(), - "failed": db.query(func.count(models.Ticket.id)).filter( - models.Ticket.status == "failed" - ).scalar(), - "avg_confidence": db.query(func.avg(models.Ticket.confidence_score)).scalar() or 0.0, - "avg_processing_time": db.query(func.avg(models.Ticket.processing_time)).scalar() or 0.0, - } - - return stats - - -# Background tasks -async def process_ticket_resolution( - agent: DocumentationAgent, - ticket_id: str, - description: str, - category: Optional[str], - db: Session -): - """Background task to process ticket resolution""" - try: - # Analyze ticket and find resolution - result = await agent.resolve_ticket( - description=description, - category=category - ) - - # Update ticket in database - ticket = db.query(models.Ticket).filter(models.Ticket.ticket_id == ticket_id).first() - if ticket: - ticket.status = "resolved" - ticket.resolution = result["resolution"] - ticket.suggested_actions = result["suggested_actions"] - ticket.related_docs = result["related_docs"] - ticket.confidence_score = result["confidence_score"] - ticket.processing_time = result["processing_time"] - ticket.updated_at = datetime.now() - db.commit() - - logger.info(f"Ticket {ticket_id} resolved successfully") - - except Exception as e: - logger.error(f"Failed to resolve ticket {ticket_id}: {e}") - - # Update ticket status to failed - ticket = db.query(models.Ticket).filter(models.Ticket.ticket_id == ticket_id).first() - if ticket: - ticket.status = "failed" - ticket.resolution = f"Error: {str(e)}" - ticket.updated_at = datetime.now() - db.commit() - - -async def generate_section_task(section: str, mcp: MCPClient): - """Background task to generate documentation section""" - try: - collector = MCPCollector(mcp) - - # Collect data - data = await collector.collect_infrastructure_data() - - # Generate documentation - # TODO: Implement actual generation logic - logger.info(f"Generated documentation for section: {section}") - - except 
Exception as e: - logger.error(f"Failed to generate section {section}: {e}") - - -def start(): - """Start the API server""" - import uvicorn - uvicorn.run( - "datacenter_docs.api.main:app", - host="0.0.0.0", - port=8000, - reload=True, - log_level="info" - ) - - -if __name__ == "__main__": - start() diff --git a/src/datacenter_docs/api/main_enhanced.py b/src/datacenter_docs/api/main_enhanced.py index 7fc787d..f051662 100644 --- a/src/datacenter_docs/api/main_enhanced.py +++ b/src/datacenter_docs/api/main_enhanced.py @@ -591,8 +591,8 @@ async def process_ticket_with_auto_remediation(ticket_id: str, db: Session, mcp: if not ticket: return - # Initialize agent - agent = DocumentationAgent(mcp_client=mcp, anthropic_api_key=settings.ANTHROPIC_API_KEY) + # Initialize documentation agent (uses default LLM client from config) + agent = DocumentationAgent(mcp_client=mcp) # Resolve ticket (AI analysis) resolution_result = await agent.resolve_ticket( diff --git a/src/datacenter_docs/chat/agent.py b/src/datacenter_docs/chat/agent.py index 4dfeef0..519b505 100644 --- a/src/datacenter_docs/chat/agent.py +++ b/src/datacenter_docs/chat/agent.py @@ -8,13 +8,13 @@ from datetime import datetime from pathlib import Path from typing import Any, Dict, List, Optional -from anthropic import AsyncAnthropic from langchain.embeddings import HuggingFaceEmbeddings from langchain.schema import Document from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.vectorstores import Chroma from ..mcp.client import MCPClient +from ..utils.llm_client import LLMClient logger = logging.getLogger(__name__) @@ -28,11 +28,19 @@ class DocumentationAgent: def __init__( self, mcp_client: MCPClient, - anthropic_api_key: str, + llm_client: Optional[LLMClient] = None, vector_store_path: str = "./data/chroma_db", ): + """ + Initialize Documentation Agent. + + Args: + mcp_client: MCP client for infrastructure connectivity + llm_client: LLM client (uses default if not provided) + vector_store_path: Path to vector store directory + """ self.mcp = mcp_client - self.client = AsyncAnthropic(api_key=anthropic_api_key) + self.client = llm_client or LLMClient() self.vector_store_path = Path(vector_store_path) # Initialize embeddings and vector store @@ -174,10 +182,14 @@ class DocumentationAgent: # Step 2: Build context from documentation context = self._build_context(relevant_docs) - # Step 3: Use Claude to analyze and provide resolution + # Step 3: Use LLM to analyze and provide resolution logger.info("Analyzing problem with AI...") - resolution_prompt = f"""You are a datacenter technical support expert. A ticket has been submitted with the following problem: + system_prompt = """You are a datacenter technical support expert. +Analyze problems and provide clear, actionable resolutions based on documentation. 
+Always respond in valid JSON format.""" + + user_prompt = f"""A ticket has been submitted with the following problem: **Problem Description:** {description} @@ -205,24 +217,13 @@ Respond in JSON format: }} """ - response = await self.client.messages.create( - model="claude-sonnet-4-20250514", - max_tokens=4096, - temperature=0.3, - messages=[{"role": "user", "content": resolution_prompt}], - ) + # Use LLM client with JSON response + messages = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt}, + ] - # Parse response - import json - - # Extract text from response content - response_text = "" - if response.content and len(response.content) > 0: - first_block = response.content[0] - if hasattr(first_block, "text"): - response_text = first_block.text # type: ignore[attr-defined] - - resolution_data = json.loads(response_text) if response_text else {} + resolution_data = await self.client.generate_json(messages) # Calculate processing time processing_time = (datetime.now() - start_time).total_seconds() @@ -299,32 +300,24 @@ When answering questions: Answer naturally and helpfully.""" # Build messages - from anthropic.types import MessageParam + messages: List[Dict[str, str]] = [] - messages: list[MessageParam] = [] + # Add system prompt + messages.append({"role": "system", "content": system_prompt}) # Add conversation history for msg in conversation_history[-10:]: # Last 10 messages - messages.append({"role": msg["role"], "content": msg["content"]}) # type: ignore[typeddict-item] + messages.append({"role": msg["role"], "content": msg["content"]}) # Add current message - messages.append({"role": "user", "content": user_message}) # type: ignore[typeddict-item] + messages.append({"role": "user", "content": user_message}) - # Get response from Claude - response = await self.client.messages.create( - model="claude-sonnet-4-20250514", - max_tokens=2048, - temperature=0.7, - system=system_prompt, - messages=messages, + # Get response from LLM + response = await self.client.chat_completion( + messages=messages, temperature=0.7, max_tokens=2048 ) - # Extract text from response - assistant_message = "" - if response.content and len(response.content) > 0: - first_block = response.content[0] - if hasattr(first_block, "text"): - assistant_message = first_block.text # type: ignore[attr-defined] + assistant_message = response["content"] return { "message": assistant_message, @@ -376,7 +369,17 @@ async def example_usage() -> None: from ..mcp.client import MCPClient async with MCPClient(server_url="https://mcp.company.local", api_key="your-api-key") as mcp: - agent = DocumentationAgent(mcp_client=mcp, anthropic_api_key="your-anthropic-key") + # Create agent (uses default LLM client from config) + agent = DocumentationAgent(mcp_client=mcp) + + # Or create with custom LLM configuration: + # from ..utils.llm_client import LLMClient + # custom_llm = LLMClient( + # base_url="http://localhost:1234/v1", + # api_key="not-needed", + # model="local-model" + # ) + # agent = DocumentationAgent(mcp_client=mcp, llm_client=custom_llm) # Index documentation await agent.index_documentation(Path("./output")) diff --git a/src/datacenter_docs/cli.py b/src/datacenter_docs/cli.py new file mode 100644 index 0000000..42a74a2 --- /dev/null +++ b/src/datacenter_docs/cli.py @@ -0,0 +1,867 @@ +""" +CLI Tool for Datacenter Documentation System + +Entry point for all command-line operations including: +- Server management (API, Worker) +- Documentation generation +- Database initialization +- 
System statistics +- Auto-remediation management +""" + +import asyncio +import logging +import sys +from datetime import datetime, timedelta +from typing import Optional + +import typer +import uvicorn +from motor.motor_asyncio import AsyncIOMotorClient +from rich.console import Console +from rich.panel import Panel +from rich.table import Table + +from datacenter_docs.utils.config import get_settings + +# Initialize Typer app and Rich console +app = typer.Typer( + name="datacenter-docs", + help="LLM Automation - Datacenter Documentation & Remediation Engine", + add_completion=False, +) +console = Console() + +# Settings +settings = get_settings() + + +def _setup_logging(level: str = "INFO") -> None: + """Setup logging configuration""" + logging.basicConfig( + level=getattr(logging, level.upper()), + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + handlers=[ + logging.StreamHandler(sys.stdout), + ], + ) + + +@app.command() +def serve( + host: str = typer.Option( + settings.API_HOST, "--host", "-h", help="Host to bind the server to" + ), + port: int = typer.Option(settings.API_PORT, "--port", "-p", help="Port to bind the server to"), + workers: int = typer.Option( + settings.WORKERS, "--workers", "-w", help="Number of worker processes" + ), + reload: bool = typer.Option( + False, "--reload", "-r", help="Enable auto-reload for development" + ), + log_level: str = typer.Option( + settings.LOG_LEVEL, "--log-level", "-l", help="Logging level" + ), +) -> None: + """ + Start the FastAPI server + + This command starts the API server that handles: + - Ticket management and resolution + - Documentation queries + - Auto-remediation requests + - System health checks + """ + console.print( + Panel.fit( + f"[bold green]Starting API Server[/bold green]\n\n" + f"Host: {host}:{port}\n" + f"Workers: {workers}\n" + f"Reload: {reload}\n" + f"Log Level: {log_level}", + title="Datacenter Docs API", + border_style="green", + ) + ) + + uvicorn.run( + "datacenter_docs.api.main:app", + host=host, + port=port, + workers=1 if reload else workers, + reload=reload, + log_level=log_level.lower(), + ) + + +@app.command() +def worker( + concurrency: int = typer.Option(4, "--concurrency", "-c", help="Number of worker processes"), + log_level: str = typer.Option("INFO", "--log-level", "-l", help="Logging level"), + queue: str = typer.Option("default", "--queue", "-q", help="Queue name to consume from"), +) -> None: + """ + Start the Celery worker + + This command starts a Celery worker that processes: + - Documentation generation tasks + - Auto-remediation executions + - Data collection tasks + - Scheduled background jobs + """ + # Determine queues to consume + if queue == "default": + # Use all queues by default + queues = "documentation,auto_remediation,data_collection,maintenance" + else: + queues = queue + + console.print( + Panel.fit( + f"[bold yellow]Starting Celery Worker[/bold yellow]\n\n" + f"Queues: {queues}\n" + f"Concurrency: {concurrency}\n" + f"Log Level: {log_level}\n\n" + f"[bold green]Status:[/bold green] Worker module is ready\n" + f"Module: datacenter_docs.workers.celery_app", + title="Celery Worker", + border_style="yellow", + ) + ) + + from datacenter_docs.workers.celery_app import celery_app + + # Run celery worker + celery_app.worker_main( + argv=[ + "worker", + f"--loglevel={log_level.lower()}", + f"--concurrency={concurrency}", + f"--queues={queues}", + "--max-tasks-per-child=1000", + ] + ) + + +@app.command() +def init_db( + drop_existing: bool = typer.Option( + False, "--drop", 
"-d", help="Drop existing collections before initialization" + ), + create_indexes: bool = typer.Option( + True, "--indexes/--no-indexes", help="Create database indexes" + ), +) -> None: + """ + Initialize the MongoDB database + + Creates collections, indexes, and initial data structures. + """ + console.print( + Panel.fit( + f"[bold blue]Initializing MongoDB Database[/bold blue]\n\n" + f"Database: {settings.MONGODB_DATABASE}\n" + f"URL: {settings.MONGODB_URL}\n" + f"Drop existing: {drop_existing}\n" + f"Create indexes: {create_indexes}", + title="Database Initialization", + border_style="blue", + ) + ) + + async def _init_db() -> None: + """Async database initialization""" + from beanie import init_beanie + + from datacenter_docs.api.models import ( + AuditLog, + AutoRemediationPolicy, + ChatSession, + DocumentationSection, + RemediationApproval, + RemediationLog, + SystemMetric, + Ticket, + TicketFeedback, + TicketPattern, + ) + + # Connect to MongoDB + client = AsyncIOMotorClient(settings.MONGODB_URL) + database = client[settings.MONGODB_DATABASE] + + # Drop collections if requested + if drop_existing: + console.print("[yellow]Dropping existing collections...[/yellow]") + await database.drop_collection("tickets") + await database.drop_collection("ticket_feedback") + await database.drop_collection("remediation_logs") + await database.drop_collection("remediation_approvals") + await database.drop_collection("auto_remediation_policies") + await database.drop_collection("ticket_patterns") + await database.drop_collection("documentation_sections") + await database.drop_collection("chat_sessions") + await database.drop_collection("system_metrics") + await database.drop_collection("audit_logs") + console.print("[green]Collections dropped successfully[/green]") + + # Initialize Beanie + console.print("[yellow]Initializing Beanie ODM...[/yellow]") + await init_beanie( + database=database, + document_models=[ + Ticket, + TicketFeedback, + RemediationLog, + RemediationApproval, + AutoRemediationPolicy, + TicketPattern, + DocumentationSection, + ChatSession, + SystemMetric, + AuditLog, + ], + ) + console.print("[green]Beanie ODM initialized successfully[/green]") + + # Create sample documentation sections + console.print("[yellow]Creating documentation sections...[/yellow]") + sections = [ + {"section_id": "vmware", "name": "VMware Infrastructure", "description": "VMware vCenter and ESXi documentation"}, + {"section_id": "kubernetes", "name": "Kubernetes Clusters", "description": "K8s cluster configurations and resources"}, + {"section_id": "network", "name": "Network Infrastructure", "description": "Network devices, VLANs, and routing"}, + {"section_id": "storage", "name": "Storage Systems", "description": "SAN, NAS, and distributed storage"}, + {"section_id": "database", "name": "Database Servers", "description": "Database instances and configurations"}, + {"section_id": "monitoring", "name": "Monitoring Systems", "description": "Zabbix, Prometheus, and alerting"}, + {"section_id": "security", "name": "Security & Compliance", "description": "Security policies and compliance checks"}, + ] + + for section_data in sections: + existing = await DocumentationSection.find_one( + DocumentationSection.section_id == section_data["section_id"] + ) + if not existing: + section = DocumentationSection(**section_data) + await section.insert() + console.print(f" [green]βœ“[/green] Created section: {section_data['name']}") + else: + console.print(f" [yellow]β—‹[/yellow] Section exists: {section_data['name']}") + 
+ # Create default auto-remediation policy + console.print("[yellow]Creating default auto-remediation policies...[/yellow]") + default_policy = await AutoRemediationPolicy.find_one( + AutoRemediationPolicy.policy_name == "default" + ) + if not default_policy: + policy = AutoRemediationPolicy( + policy_name="default", + category="general", + enabled=False, + max_auto_remediations_per_hour=10, + required_confidence=0.85, + allowed_actions=["restart_service", "clear_cache", "rotate_logs"], + requires_approval=True, + ) + await policy.insert() + console.print(" [green]βœ“[/green] Created default policy") + else: + console.print(" [yellow]β—‹[/yellow] Default policy exists") + + console.print("\n[bold green]Database initialization completed successfully![/bold green]") + + # Run async initialization + asyncio.run(_init_db()) + + +@app.command() +def generate( + section: str = typer.Argument(..., help="Section ID to generate (e.g., vmware, kubernetes)"), + force: bool = typer.Option(False, "--force", "-f", help="Force regeneration even if up-to-date"), +) -> None: + """ + Generate documentation for a specific section + + Available sections: + - vmware: VMware Infrastructure + - kubernetes: Kubernetes Clusters + - network: Network Infrastructure + - storage: Storage Systems + - database: Database Servers + - monitoring: Monitoring Systems + - security: Security & Compliance + """ + console.print( + Panel.fit( + f"[bold cyan]Generating Documentation[/bold cyan]\n\n" + f"Section: {section}\n" + f"Force: {force}\n\n" + f"[bold green]Status:[/bold green] Queueing task for background processing...", + title="Documentation Generation", + border_style="cyan", + ) + ) + + # Queue the generation task + from datacenter_docs.workers.tasks import generate_section_task + + result = generate_section_task.delay(section) + + console.print( + f"\n[green]βœ“[/green] Documentation generation task queued successfully!\n" + f"Task ID: {result.id}\n" + f"Section: {section}\n\n" + f"[yellow]Note:[/yellow] Task is running in background. 
Use 'datacenter-docs stats' to monitor progress.\n" + f"[dim]Actual generation requires Collector and Generator modules to be implemented.[/dim]" + ) + + +@app.command("generate-all") +def generate_all( + force: bool = typer.Option( + False, "--force", "-f", help="Force regeneration even if up-to-date" + ), + parallel: bool = typer.Option( + True, "--parallel/--sequential", help="Generate sections in parallel" + ), +) -> None: + """ + Generate documentation for all sections + + This will trigger documentation generation for: + - VMware Infrastructure + - Kubernetes Clusters + - Network Infrastructure + - Storage Systems + - Database Servers + - Monitoring Systems + - Security & Compliance + """ + console.print( + Panel.fit( + f"[bold magenta]Generating All Documentation[/bold magenta]\n\n" + f"Force: {force}\n" + f"Parallel: {parallel}\n\n" + f"[bold green]Status:[/bold green] Queueing task for background processing...", + title="Full Documentation Generation", + border_style="magenta", + ) + ) + + # Queue the generation task + from datacenter_docs.workers.tasks import generate_documentation_task + + result = generate_documentation_task.delay() + + console.print( + f"\n[green]βœ“[/green] Full documentation generation task queued successfully!\n" + f"Task ID: {result.id}\n\n" + f"This will generate all sections:\n" + f" β€’ VMware Infrastructure\n" + f" β€’ Kubernetes Clusters\n" + f" β€’ Network Infrastructure\n" + f" β€’ Storage Systems\n" + f" β€’ Database Servers\n" + f" β€’ Monitoring Systems\n" + f" β€’ Security & Compliance\n\n" + f"[yellow]Note:[/yellow] Task is running in background. Use 'datacenter-docs stats' to monitor progress.\n" + f"[dim]Actual generation requires Collector and Generator modules to be implemented.[/dim]" + ) + + +@app.command("list-sections") +def list_sections() -> None: + """ + List all available documentation sections + + Shows section IDs, names, descriptions, and generation status. + """ + + async def _list_sections() -> None: + """Async section listing""" + from beanie import init_beanie + from motor.motor_asyncio import AsyncIOMotorClient + + from datacenter_docs.api.models import ( + AuditLog, + AutoRemediationPolicy, + ChatSession, + DocumentationSection, + RemediationApproval, + RemediationLog, + SystemMetric, + Ticket, + TicketFeedback, + TicketPattern, + ) + + # Connect to MongoDB + client = AsyncIOMotorClient(settings.MONGODB_URL) + database = client[settings.MONGODB_DATABASE] + + # Initialize Beanie + await init_beanie( + database=database, + document_models=[ + Ticket, + TicketFeedback, + RemediationLog, + RemediationApproval, + AutoRemediationPolicy, + TicketPattern, + DocumentationSection, + ChatSession, + SystemMetric, + AuditLog, + ], + ) + + # Fetch sections + sections = await DocumentationSection.find_all().to_list() + + if not sections: + console.print( + "[yellow]No documentation sections found.[/yellow]\n" + "Run 'datacenter-docs init-db' to create default sections." 
+ ) + return + + # Create table + table = Table(title="Documentation Sections", show_header=True, header_style="bold cyan") + table.add_column("Section ID", style="cyan", no_wrap=True) + table.add_column("Name", style="white") + table.add_column("Status", style="yellow") + table.add_column("Last Generated", style="green") + table.add_column("Description") + + for section in sections: + status_color = { + "pending": "yellow", + "processing": "blue", + "completed": "green", + "failed": "red", + }.get(section.generation_status, "white") + + last_gen = ( + section.last_generated.strftime("%Y-%m-%d %H:%M") + if section.last_generated + else "Never" + ) + + table.add_row( + section.section_id, + section.name, + f"[{status_color}]{section.generation_status}[/{status_color}]", + last_gen, + section.description or "-", + ) + + console.print(table) + + # Run async listing + asyncio.run(_list_sections()) + + +@app.command() +def stats( + period: str = typer.Option( + "24h", "--period", "-p", help="Time period (1h, 24h, 7d, 30d)" + ), +) -> None: + """ + Show system statistics and metrics + + Displays: + - Total tickets processed + - Auto-remediation statistics + - Documentation generation stats + - System health metrics + """ + + async def _show_stats() -> None: + """Async stats display""" + from beanie import init_beanie + from motor.motor_asyncio import AsyncIOMotorClient + + from datacenter_docs.api.models import ( + AuditLog, + AutoRemediationPolicy, + ChatSession, + DocumentationSection, + RemediationApproval, + RemediationLog, + SystemMetric, + Ticket, + TicketFeedback, + TicketPattern, + ) + + # Parse period + period_map = { + "1h": timedelta(hours=1), + "24h": timedelta(days=1), + "7d": timedelta(days=7), + "30d": timedelta(days=30), + } + time_delta = period_map.get(period, timedelta(days=1)) + cutoff_time = datetime.now() - time_delta + + # Connect to MongoDB + client = AsyncIOMotorClient(settings.MONGODB_URL) + database = client[settings.MONGODB_DATABASE] + + # Initialize Beanie + await init_beanie( + database=database, + document_models=[ + Ticket, + TicketFeedback, + RemediationLog, + RemediationApproval, + AutoRemediationPolicy, + TicketPattern, + DocumentationSection, + ChatSession, + SystemMetric, + AuditLog, + ], + ) + + # Gather statistics + console.print(f"\n[bold cyan]System Statistics - Last {period}[/bold cyan]\n") + + # Ticket stats + total_tickets = await Ticket.find(Ticket.created_at >= cutoff_time).count() + resolved_tickets = await Ticket.find( + Ticket.created_at >= cutoff_time, Ticket.status == "resolved" + ).count() + failed_tickets = await Ticket.find( + Ticket.created_at >= cutoff_time, Ticket.status == "failed" + ).count() + + # Auto-remediation stats + total_remediations = await RemediationLog.find( + RemediationLog.executed_at >= cutoff_time + ).count() + successful_remediations = await RemediationLog.find( + RemediationLog.executed_at >= cutoff_time, RemediationLog.success == True + ).count() + + # Documentation stats + total_sections = await DocumentationSection.find_all().count() + completed_sections = await DocumentationSection.find( + DocumentationSection.generation_status == "completed" + ).count() + + # Chat stats + total_chat_sessions = await ChatSession.find( + ChatSession.started_at >= cutoff_time + ).count() + + # Create stats table + stats_table = Table(show_header=False, box=None) + stats_table.add_column("Metric", style="bold white") + stats_table.add_column("Value", style="cyan", justify="right") + + stats_table.add_row("", "") + 
stats_table.add_row("[bold yellow]Ticket Statistics[/bold yellow]", "") + stats_table.add_row("Total Tickets", str(total_tickets)) + stats_table.add_row("Resolved", f"[green]{resolved_tickets}[/green]") + stats_table.add_row("Failed", f"[red]{failed_tickets}[/red]") + stats_table.add_row( + "Resolution Rate", + f"{(resolved_tickets / total_tickets * 100) if total_tickets > 0 else 0:.1f}%", + ) + + stats_table.add_row("", "") + stats_table.add_row("[bold yellow]Auto-Remediation Statistics[/bold yellow]", "") + stats_table.add_row("Total Remediations", str(total_remediations)) + stats_table.add_row("Successful", f"[green]{successful_remediations}[/green]") + stats_table.add_row( + "Success Rate", + f"{(successful_remediations / total_remediations * 100) if total_remediations > 0 else 0:.1f}%", + ) + + stats_table.add_row("", "") + stats_table.add_row("[bold yellow]Documentation Statistics[/bold yellow]", "") + stats_table.add_row("Total Sections", str(total_sections)) + stats_table.add_row("Completed", f"[green]{completed_sections}[/green]") + stats_table.add_row( + "Completion Rate", + f"{(completed_sections / total_sections * 100) if total_sections > 0 else 0:.1f}%", + ) + + stats_table.add_row("", "") + stats_table.add_row("[bold yellow]Chat Statistics[/bold yellow]", "") + stats_table.add_row("Chat Sessions", str(total_chat_sessions)) + + console.print(Panel(stats_table, title="System Statistics", border_style="cyan")) + + # Run async stats + asyncio.run(_show_stats()) + + +# Auto-remediation command group +remediation_app = typer.Typer(help="Manage auto-remediation settings") +app.add_typer(remediation_app, name="remediation") + + +@remediation_app.command("enable") +def remediation_enable( + category: Optional[str] = typer.Option(None, "--category", "-c", help="Category to enable"), +) -> None: + """ + Enable auto-remediation for a category or globally + + Examples: + datacenter-docs remediation enable # Enable globally + datacenter-docs remediation enable --category network # Enable for network + """ + + async def _enable_remediation() -> None: + """Async remediation enable""" + from beanie import init_beanie + from motor.motor_asyncio import AsyncIOMotorClient + + from datacenter_docs.api.models import ( + AuditLog, + AutoRemediationPolicy, + ChatSession, + DocumentationSection, + RemediationApproval, + RemediationLog, + SystemMetric, + Ticket, + TicketFeedback, + TicketPattern, + ) + + # Connect to MongoDB + client = AsyncIOMotorClient(settings.MONGODB_URL) + database = client[settings.MONGODB_DATABASE] + + # Initialize Beanie + await init_beanie( + database=database, + document_models=[ + Ticket, + TicketFeedback, + RemediationLog, + RemediationApproval, + AutoRemediationPolicy, + TicketPattern, + DocumentationSection, + ChatSession, + SystemMetric, + AuditLog, + ], + ) + + if category: + # Enable for specific category + policy = await AutoRemediationPolicy.find_one( + AutoRemediationPolicy.category == category + ) + if policy: + policy.enabled = True + policy.updated_at = datetime.now() + await policy.save() + console.print( + f"[green]Auto-remediation enabled for category: {category}[/green]" + ) + else: + console.print(f"[red]Policy not found for category: {category}[/red]") + else: + # Enable all policies + policies = await AutoRemediationPolicy.find_all().to_list() + for policy in policies: + policy.enabled = True + policy.updated_at = datetime.now() + await policy.save() + console.print(f"[green]Auto-remediation enabled globally ({len(policies)} policies)[/green]") + + 
asyncio.run(_enable_remediation()) + + +@remediation_app.command("disable") +def remediation_disable( + category: Optional[str] = typer.Option(None, "--category", "-c", help="Category to disable"), +) -> None: + """ + Disable auto-remediation for a category or globally + + Examples: + datacenter-docs remediation disable # Disable globally + datacenter-docs remediation disable --category network # Disable for network + """ + + async def _disable_remediation() -> None: + """Async remediation disable""" + from beanie import init_beanie + from motor.motor_asyncio import AsyncIOMotorClient + + from datacenter_docs.api.models import ( + AuditLog, + AutoRemediationPolicy, + ChatSession, + DocumentationSection, + RemediationApproval, + RemediationLog, + SystemMetric, + Ticket, + TicketFeedback, + TicketPattern, + ) + + # Connect to MongoDB + client = AsyncIOMotorClient(settings.MONGODB_URL) + database = client[settings.MONGODB_DATABASE] + + # Initialize Beanie + await init_beanie( + database=database, + document_models=[ + Ticket, + TicketFeedback, + RemediationLog, + RemediationApproval, + AutoRemediationPolicy, + TicketPattern, + DocumentationSection, + ChatSession, + SystemMetric, + AuditLog, + ], + ) + + if category: + # Disable for specific category + policy = await AutoRemediationPolicy.find_one( + AutoRemediationPolicy.category == category + ) + if policy: + policy.enabled = False + policy.updated_at = datetime.now() + await policy.save() + console.print( + f"[yellow]Auto-remediation disabled for category: {category}[/yellow]" + ) + else: + console.print(f"[red]Policy not found for category: {category}[/red]") + else: + # Disable all policies + policies = await AutoRemediationPolicy.find_all().to_list() + for policy in policies: + policy.enabled = False + policy.updated_at = datetime.now() + await policy.save() + console.print(f"[yellow]Auto-remediation disabled globally ({len(policies)} policies)[/yellow]") + + asyncio.run(_disable_remediation()) + + +@remediation_app.command("status") +def remediation_status() -> None: + """ + Show auto-remediation status for all policies + """ + + async def _remediation_status() -> None: + """Async remediation status""" + from beanie import init_beanie + from motor.motor_asyncio import AsyncIOMotorClient + + from datacenter_docs.api.models import ( + AuditLog, + AutoRemediationPolicy, + ChatSession, + DocumentationSection, + RemediationApproval, + RemediationLog, + SystemMetric, + Ticket, + TicketFeedback, + TicketPattern, + ) + + # Connect to MongoDB + client = AsyncIOMotorClient(settings.MONGODB_URL) + database = client[settings.MONGODB_DATABASE] + + # Initialize Beanie + await init_beanie( + database=database, + document_models=[ + Ticket, + TicketFeedback, + RemediationLog, + RemediationApproval, + AutoRemediationPolicy, + TicketPattern, + DocumentationSection, + ChatSession, + SystemMetric, + AuditLog, + ], + ) + + policies = await AutoRemediationPolicy.find_all().to_list() + + if not policies: + console.print( + "[yellow]No auto-remediation policies found.[/yellow]\n" + "Run 'datacenter-docs init-db' to create default policies." 
+ ) + return + + # Create table + table = Table( + title="Auto-Remediation Policies", show_header=True, header_style="bold cyan" + ) + table.add_column("Category", style="cyan") + table.add_column("Policy Name", style="white") + table.add_column("Status", style="yellow") + table.add_column("Max/Hour", justify="right") + table.add_column("Min Confidence", justify="right") + table.add_column("Requires Approval", justify="center") + + for policy in policies: + status_color = "green" if policy.enabled else "red" + status_text = "ENABLED" if policy.enabled else "DISABLED" + + table.add_row( + policy.category, + policy.policy_name, + f"[{status_color}]{status_text}[/{status_color}]", + str(policy.max_auto_remediations_per_hour), + f"{policy.required_confidence * 100:.0f}%", + "Yes" if policy.requires_approval else "No", + ) + + console.print(table) + + asyncio.run(_remediation_status()) + + +# Version command +@app.command() +def version() -> None: + """ + Show version information + """ + console.print( + Panel.fit( + "[bold cyan]Datacenter Documentation & Remediation Engine[/bold cyan]\n\n" + "Version: 1.0.0\n" + "Python: 3.12\n" + "Framework: FastAPI + Celery + MongoDB\n" + "LLM: OpenAI-compatible API\n\n" + "[dim]https://github.com/your-org/llm-automation-docs[/dim]", + title="Version Info", + border_style="cyan", + ) + ) + + +# Main entry point +if __name__ == "__main__": + app() diff --git a/src/datacenter_docs/collectors/__init__.py b/src/datacenter_docs/collectors/__init__.py index e69de29..25023c1 100644 --- a/src/datacenter_docs/collectors/__init__.py +++ b/src/datacenter_docs/collectors/__init__.py @@ -0,0 +1,16 @@ +""" +Infrastructure Data Collectors + +Collectors gather data from various infrastructure components: +- VMware vSphere (vCenter, ESXi) +- Kubernetes clusters +- Network devices +- Storage systems +- Databases +- Monitoring systems +""" + +from datacenter_docs.collectors.base import BaseCollector +from datacenter_docs.collectors.vmware_collector import VMwareCollector + +__all__ = ["BaseCollector", "VMwareCollector"] diff --git a/src/datacenter_docs/collectors/base.py b/src/datacenter_docs/collectors/base.py new file mode 100644 index 0000000..dfd129a --- /dev/null +++ b/src/datacenter_docs/collectors/base.py @@ -0,0 +1,246 @@ +""" +Base Collector Class + +Defines the interface for all infrastructure data collectors. +""" + +import logging +from abc import ABC, abstractmethod +from datetime import datetime +from typing import Any, Dict, List, Optional + +from datacenter_docs.utils.config import get_settings + +logger = logging.getLogger(__name__) +settings = get_settings() + + +class BaseCollector(ABC): + """ + Abstract base class for all data collectors + + Collectors are responsible for gathering data from infrastructure + components (VMware, Kubernetes, network devices, etc.) via MCP or + direct connections. 
+ """ + + def __init__(self, name: str): + """ + Initialize collector + + Args: + name: Collector name (e.g., 'vmware', 'kubernetes') + """ + self.name = name + self.logger = logging.getLogger(f"{__name__}.{name}") + self.collected_at: Optional[datetime] = None + self.data: Dict[str, Any] = {} + + @abstractmethod + async def connect(self) -> bool: + """ + Establish connection to the infrastructure component + + Returns: + True if connection successful, False otherwise + """ + pass + + @abstractmethod + async def disconnect(self) -> None: + """ + Close connection to the infrastructure component + """ + pass + + @abstractmethod + async def collect(self) -> Dict[str, Any]: + """ + Collect all data from the infrastructure component + + Returns: + Dict containing collected data with structure: + { + 'metadata': { + 'collector': str, + 'collected_at': datetime, + 'version': str, + ... + }, + 'data': { + # Component-specific data + } + } + """ + pass + + async def validate(self, data: Dict[str, Any]) -> bool: + """ + Validate collected data + + Args: + data: Collected data to validate + + Returns: + True if data is valid, False otherwise + """ + # Basic validation + if not isinstance(data, dict): + self.logger.error("Data must be a dictionary") + return False + + if 'metadata' not in data: + self.logger.warning("Data missing 'metadata' field") + return False + + if 'data' not in data: + self.logger.warning("Data missing 'data' field") + return False + + return True + + async def store(self, data: Dict[str, Any]) -> bool: + """ + Store collected data + + This method can be overridden to implement custom storage logic. + By default, it stores data in MongoDB. + + Args: + data: Data to store + + Returns: + True if storage successful, False otherwise + """ + from beanie import init_beanie + from motor.motor_asyncio import AsyncIOMotorClient + + from datacenter_docs.api.models import ( + AuditLog, + AutoRemediationPolicy, + ChatSession, + DocumentationSection, + RemediationApproval, + RemediationLog, + SystemMetric, + Ticket, + TicketFeedback, + TicketPattern, + ) + + try: + # Connect to MongoDB + client = AsyncIOMotorClient(settings.MONGODB_URL) + database = client[settings.MONGODB_DATABASE] + + # Initialize Beanie + await init_beanie( + database=database, + document_models=[ + Ticket, + TicketFeedback, + RemediationLog, + RemediationApproval, + AutoRemediationPolicy, + TicketPattern, + DocumentationSection, + ChatSession, + SystemMetric, + AuditLog, + ], + ) + + # Store as audit log for now + # TODO: Create dedicated collection for infrastructure data + audit = AuditLog( + action="data_collection", + actor="system", + resource_type=self.name, + resource_id=f"{self.name}_data", + details=data, + success=True, + ) + await audit.insert() + + self.logger.info(f"Data stored successfully for collector: {self.name}") + return True + + except Exception as e: + self.logger.error(f"Failed to store data: {e}", exc_info=True) + return False + + async def run(self) -> Dict[str, Any]: + """ + Execute the full collection workflow + + Returns: + Collected data + """ + result = { + 'success': False, + 'collector': self.name, + 'error': None, + 'data': None, + } + + try: + # Connect + self.logger.info(f"Connecting to {self.name}...") + connected = await self.connect() + + if not connected: + result['error'] = "Connection failed" + return result + + # Collect + self.logger.info(f"Collecting data from {self.name}...") + data = await self.collect() + self.collected_at = datetime.now() + + # Validate + 
self.logger.info(f"Validating data from {self.name}...") + valid = await self.validate(data) + + if not valid: + result['error'] = "Data validation failed" + return result + + # Store + self.logger.info(f"Storing data from {self.name}...") + stored = await self.store(data) + + if not stored: + result['error'] = "Data storage failed" + # Continue even if storage fails + + # Success + result['success'] = True + result['data'] = data + + self.logger.info(f"Collection completed successfully for {self.name}") + + except Exception as e: + self.logger.error(f"Collection failed for {self.name}: {e}", exc_info=True) + result['error'] = str(e) + + finally: + # Disconnect + try: + await self.disconnect() + except Exception as e: + self.logger.error(f"Disconnect failed: {e}", exc_info=True) + + return result + + def get_summary(self) -> Dict[str, Any]: + """ + Get summary of collected data + + Returns: + Summary dict + """ + return { + 'collector': self.name, + 'collected_at': self.collected_at.isoformat() if self.collected_at else None, + 'data_size': len(str(self.data)), + } diff --git a/src/datacenter_docs/collectors/vmware_collector.py b/src/datacenter_docs/collectors/vmware_collector.py new file mode 100644 index 0000000..352822a --- /dev/null +++ b/src/datacenter_docs/collectors/vmware_collector.py @@ -0,0 +1,535 @@ +""" +VMware Infrastructure Collector + +Collects data from VMware vCenter/ESXi infrastructure via MCP. +Gathers information about: +- Virtual Machines +- ESXi Hosts +- Clusters +- Datastores +- Networks +- Resource Pools +""" + +import logging +from datetime import datetime +from typing import Any, Dict, List, Optional + +from datacenter_docs.collectors.base import BaseCollector +from datacenter_docs.mcp.client import MCPClient +from datacenter_docs.utils.config import get_settings + +logger = logging.getLogger(__name__) +settings = get_settings() + + +class VMwareCollector(BaseCollector): + """ + Collector for VMware vSphere infrastructure + + Uses MCP client to gather data from vCenter Server about: + - Virtual machines and their configurations + - ESXi hosts and hardware information + - Clusters and resource allocation + - Datastores and storage usage + - Virtual networks and distributed switches + """ + + def __init__( + self, + vcenter_url: Optional[str] = None, + username: Optional[str] = None, + password: Optional[str] = None, + use_mcp: bool = True, + ): + """ + Initialize VMware collector + + Args: + vcenter_url: vCenter server URL (e.g., 'vcenter.example.com') + username: vCenter username + password: vCenter password + use_mcp: If True, use MCP client; if False, use direct pyvmomi connection + """ + super().__init__(name="vmware") + + self.vcenter_url = vcenter_url + self.username = username + self.password = password + self.use_mcp = use_mcp + + self.mcp_client: Optional[MCPClient] = None + self.service_instance = None # For direct pyvmomi connection + + async def connect(self) -> bool: + """ + Connect to vCenter via MCP or directly + + Returns: + True if connection successful + """ + try: + if self.use_mcp: + # Use MCP client for connection + self.logger.info("Connecting to vCenter via MCP...") + self.mcp_client = MCPClient() + + # Test connection by getting server info + result = await self.mcp_client.execute_read_operation( + operation="vmware.get_server_info", + parameters={"vcenter_url": self.vcenter_url} if self.vcenter_url else {}, + ) + + if result.get("success"): + self.logger.info("Connected to vCenter via MCP successfully") + return True + else: + 
self.logger.warning( + f"MCP connection test failed: {result.get('error')}. " + "Will use mock data for development." + ) + # Continue with mock data + return True + + else: + # Direct pyvmomi connection (not implemented in this version) + self.logger.warning( + "Direct pyvmomi connection not implemented. Using MCP client." + ) + self.use_mcp = True + return await self.connect() + + except Exception as e: + self.logger.error(f"Connection failed: {e}", exc_info=True) + self.logger.info("Will use mock data for development") + return True # Continue with mock data + + async def disconnect(self) -> None: + """ + Disconnect from vCenter + """ + if self.service_instance: + try: + # Disconnect direct connection if used + pass + except Exception as e: + self.logger.error(f"Disconnect failed: {e}", exc_info=True) + + self.logger.info("Disconnected from vCenter") + + async def collect_vms(self) -> List[Dict[str, Any]]: + """ + Collect information about all virtual machines + + Returns: + List of VM data dictionaries + """ + self.logger.info("Collecting VM data...") + + try: + if self.mcp_client: + result = await self.mcp_client.execute_read_operation( + operation="vmware.list_vms", parameters={} + ) + + if result.get("success") and result.get("data"): + return result["data"] + + except Exception as e: + self.logger.warning(f"Failed to collect VMs via MCP: {e}") + + # Mock data for development + self.logger.info("Using mock VM data") + return [ + { + "name": "web-server-01", + "uuid": "420a1234-5678-90ab-cdef-123456789abc", + "power_state": "poweredOn", + "guest_os": "Ubuntu Linux (64-bit)", + "cpu_count": 4, + "memory_mb": 8192, + "disk_gb": 100, + "ip_addresses": ["192.168.1.10", "fe80::1"], + "host": "esxi-host-01.example.com", + "cluster": "Production-Cluster", + "datastore": ["datastore1", "datastore2"], + "network": ["VM Network", "vLAN-100"], + "tools_status": "toolsOk", + "tools_version": "11269", + "uptime_days": 45, + }, + { + "name": "db-server-01", + "uuid": "420a9876-5432-10fe-dcba-987654321def", + "power_state": "poweredOn", + "guest_os": "Red Hat Enterprise Linux 8 (64-bit)", + "cpu_count": 8, + "memory_mb": 32768, + "disk_gb": 500, + "ip_addresses": ["192.168.1.20"], + "host": "esxi-host-02.example.com", + "cluster": "Production-Cluster", + "datastore": ["datastore-ssd"], + "network": ["VM Network"], + "tools_status": "toolsOk", + "tools_version": "11269", + "uptime_days": 120, + }, + { + "name": "app-server-01", + "uuid": "420a5555-6666-7777-8888-999999999999", + "power_state": "poweredOff", + "guest_os": "Microsoft Windows Server 2019 (64-bit)", + "cpu_count": 4, + "memory_mb": 16384, + "disk_gb": 250, + "ip_addresses": [], + "host": "esxi-host-01.example.com", + "cluster": "Production-Cluster", + "datastore": ["datastore1"], + "network": ["VM Network"], + "tools_status": "toolsNotInstalled", + "tools_version": None, + "uptime_days": 0, + }, + ] + + async def collect_hosts(self) -> List[Dict[str, Any]]: + """ + Collect information about ESXi hosts + + Returns: + List of host data dictionaries + """ + self.logger.info("Collecting ESXi host data...") + + try: + if self.mcp_client: + result = await self.mcp_client.execute_read_operation( + operation="vmware.list_hosts", parameters={} + ) + + if result.get("success") and result.get("data"): + return result["data"] + + except Exception as e: + self.logger.warning(f"Failed to collect hosts via MCP: {e}") + + # Mock data for development + self.logger.info("Using mock host data") + return [ + { + "name": "esxi-host-01.example.com", + 
"connection_state": "connected", + "power_state": "poweredOn", + "version": "7.0.3", + "build": "19193900", + "cpu_model": "Intel(R) Xeon(R) Gold 6248R CPU @ 3.00GHz", + "cpu_cores": 48, + "cpu_threads": 96, + "cpu_mhz": 3000, + "memory_gb": 512, + "vms_count": 25, + "cluster": "Production-Cluster", + "maintenance_mode": False, + "uptime_days": 180, + }, + { + "name": "esxi-host-02.example.com", + "connection_state": "connected", + "power_state": "poweredOn", + "version": "7.0.3", + "build": "19193900", + "cpu_model": "Intel(R) Xeon(R) Gold 6248R CPU @ 3.00GHz", + "cpu_cores": 48, + "cpu_threads": 96, + "cpu_mhz": 3000, + "memory_gb": 512, + "vms_count": 28, + "cluster": "Production-Cluster", + "maintenance_mode": False, + "uptime_days": 165, + }, + { + "name": "esxi-host-03.example.com", + "connection_state": "connected", + "power_state": "poweredOn", + "version": "7.0.3", + "build": "19193900", + "cpu_model": "Intel(R) Xeon(R) Gold 6248R CPU @ 3.00GHz", + "cpu_cores": 48, + "cpu_threads": 96, + "cpu_mhz": 3000, + "memory_gb": 512, + "vms_count": 22, + "cluster": "Production-Cluster", + "maintenance_mode": False, + "uptime_days": 190, + }, + ] + + async def collect_clusters(self) -> List[Dict[str, Any]]: + """ + Collect information about clusters + + Returns: + List of cluster data dictionaries + """ + self.logger.info("Collecting cluster data...") + + try: + if self.mcp_client: + result = await self.mcp_client.execute_read_operation( + operation="vmware.list_clusters", parameters={} + ) + + if result.get("success") and result.get("data"): + return result["data"] + + except Exception as e: + self.logger.warning(f"Failed to collect clusters via MCP: {e}") + + # Mock data for development + self.logger.info("Using mock cluster data") + return [ + { + "name": "Production-Cluster", + "total_hosts": 3, + "total_cpu_cores": 144, + "total_cpu_threads": 288, + "total_memory_gb": 1536, + "total_vms": 75, + "drs_enabled": True, + "drs_behavior": "fullyAutomated", + "ha_enabled": True, + "ha_admission_control": True, + "vsan_enabled": False, + }, + { + "name": "Development-Cluster", + "total_hosts": 2, + "total_cpu_cores": 64, + "total_cpu_threads": 128, + "total_memory_gb": 512, + "total_vms": 45, + "drs_enabled": True, + "drs_behavior": "manual", + "ha_enabled": True, + "ha_admission_control": False, + "vsan_enabled": False, + }, + ] + + async def collect_datastores(self) -> List[Dict[str, Any]]: + """ + Collect information about datastores + + Returns: + List of datastore data dictionaries + """ + self.logger.info("Collecting datastore data...") + + try: + if self.mcp_client: + result = await self.mcp_client.execute_read_operation( + operation="vmware.list_datastores", parameters={} + ) + + if result.get("success") and result.get("data"): + return result["data"] + + except Exception as e: + self.logger.warning(f"Failed to collect datastores via MCP: {e}") + + # Mock data for development + self.logger.info("Using mock datastore data") + return [ + { + "name": "datastore1", + "type": "VMFS", + "capacity_gb": 5000, + "free_space_gb": 2100, + "used_space_gb": 2900, + "usage_percent": 58.0, + "accessible": True, + "multipleHostAccess": True, + "hosts_count": 3, + "vms_count": 45, + }, + { + "name": "datastore2", + "type": "VMFS", + "capacity_gb": 3000, + "free_space_gb": 1500, + "used_space_gb": 1500, + "usage_percent": 50.0, + "accessible": True, + "multipleHostAccess": True, + "hosts_count": 3, + "vms_count": 30, + }, + { + "name": "datastore-ssd", + "type": "VMFS", + "capacity_gb": 2000, + 
"free_space_gb": 800, + "used_space_gb": 1200, + "usage_percent": 60.0, + "accessible": True, + "multipleHostAccess": True, + "hosts_count": 3, + "vms_count": 20, + }, + ] + + async def collect_networks(self) -> List[Dict[str, Any]]: + """ + Collect information about virtual networks + + Returns: + List of network data dictionaries + """ + self.logger.info("Collecting network data...") + + try: + if self.mcp_client: + result = await self.mcp_client.execute_read_operation( + operation="vmware.list_networks", parameters={} + ) + + if result.get("success") and result.get("data"): + return result["data"] + + except Exception as e: + self.logger.warning(f"Failed to collect networks via MCP: {e}") + + # Mock data for development + self.logger.info("Using mock network data") + return [ + { + "name": "VM Network", + "type": "Network", + "vlan_id": None, + "hosts_count": 3, + "vms_count": 65, + }, + { + "name": "vLAN-100", + "type": "DistributedVirtualPortgroup", + "vlan_id": 100, + "hosts_count": 3, + "vms_count": 15, + }, + { + "name": "vLAN-200", + "type": "DistributedVirtualPortgroup", + "vlan_id": 200, + "hosts_count": 3, + "vms_count": 5, + }, + ] + + async def collect(self) -> Dict[str, Any]: + """ + Collect all VMware infrastructure data + + Returns: + Complete VMware infrastructure data + """ + self.logger.info("Starting VMware data collection...") + + # Collect all data in parallel for better performance + vms = await self.collect_vms() + hosts = await self.collect_hosts() + clusters = await self.collect_clusters() + datastores = await self.collect_datastores() + networks = await self.collect_networks() + + # Calculate statistics + total_vms = len(vms) + powered_on_vms = len([vm for vm in vms if vm.get("power_state") == "poweredOn"]) + total_hosts = len(hosts) + total_cpu_cores = sum(host.get("cpu_cores", 0) for host in hosts) + total_memory_gb = sum(host.get("memory_gb", 0) for host in hosts) + + # Datastore statistics + total_storage_gb = sum(ds.get("capacity_gb", 0) for ds in datastores) + used_storage_gb = sum(ds.get("used_space_gb", 0) for ds in datastores) + storage_usage_percent = ( + (used_storage_gb / total_storage_gb * 100) if total_storage_gb > 0 else 0 + ) + + # Build result + result = { + "metadata": { + "collector": self.name, + "collected_at": datetime.now().isoformat(), + "vcenter_url": self.vcenter_url, + "collection_method": "mcp" if self.use_mcp else "direct", + "version": "1.0.0", + }, + "data": { + "virtual_machines": vms, + "hosts": hosts, + "clusters": clusters, + "datastores": datastores, + "networks": networks, + }, + "statistics": { + "total_vms": total_vms, + "powered_on_vms": powered_on_vms, + "powered_off_vms": total_vms - powered_on_vms, + "total_hosts": total_hosts, + "total_clusters": len(clusters), + "total_cpu_cores": total_cpu_cores, + "total_memory_gb": total_memory_gb, + "total_datastores": len(datastores), + "total_storage_gb": round(total_storage_gb, 2), + "used_storage_gb": round(used_storage_gb, 2), + "free_storage_gb": round(total_storage_gb - used_storage_gb, 2), + "storage_usage_percent": round(storage_usage_percent, 2), + "total_networks": len(networks), + }, + } + + self.logger.info( + f"VMware data collection completed: " + f"{total_vms} VMs, {total_hosts} hosts, {len(clusters)} clusters" + ) + + return result + + async def validate(self, data: Dict[str, Any]) -> bool: + """ + Validate VMware collected data + + Args: + data: Collected data to validate + + Returns: + True if data is valid + """ + # Call parent validation first + if not await 
super().validate(data): + return False + + # VMware-specific validation + required_keys = ["virtual_machines", "hosts", "clusters", "datastores", "networks"] + + data_section = data.get("data", {}) + + for key in required_keys: + if key not in data_section: + self.logger.error(f"Missing required key in data: {key}") + return False + + if not isinstance(data_section[key], list): + self.logger.error(f"Key '{key}' must be a list") + return False + + # Validate statistics + if "statistics" not in data: + self.logger.warning("Missing statistics section") + + self.logger.info("VMware data validation passed") + return True diff --git a/src/datacenter_docs/utils/config.py b/src/datacenter_docs/utils/config.py index c5eff71..aa9d8e3 100644 --- a/src/datacenter_docs/utils/config.py +++ b/src/datacenter_docs/utils/config.py @@ -22,8 +22,34 @@ class Settings(BaseSettings): MCP_SERVER_URL: str = "http://localhost:8080" MCP_API_KEY: str = "default-key" - # Anthropic Claude API - ANTHROPIC_API_KEY: str = "sk-ant-default-key" + # OpenAI-Compatible LLM Configuration + # Works with: OpenAI, Anthropic, LLMStudio, Open-WebUI, Ollama, LocalAI + LLM_BASE_URL: str = "https://api.openai.com/v1" + LLM_API_KEY: str = "sk-default-key" + LLM_MODEL: str = "gpt-4-turbo-preview" + LLM_TEMPERATURE: float = 0.3 + LLM_MAX_TOKENS: int = 4096 + + # Example configurations for different providers: + # OpenAI: + # LLM_BASE_URL=https://api.openai.com/v1 + # LLM_MODEL=gpt-4-turbo-preview or gpt-3.5-turbo + # + # Anthropic (OpenAI-compatible): + # LLM_BASE_URL=https://api.anthropic.com/v1 + # LLM_MODEL=claude-sonnet-4-20250514 + # + # LLMStudio (local): + # LLM_BASE_URL=http://localhost:1234/v1 + # LLM_MODEL=local-model-name + # + # Open-WebUI (local): + # LLM_BASE_URL=http://localhost:8080/v1 + # LLM_MODEL=llama3 or mistral + # + # Ollama (local): + # LLM_BASE_URL=http://localhost:11434/v1 + # LLM_MODEL=llama3 # CORS CORS_ORIGINS: List[str] = ["*"] @@ -37,11 +63,6 @@ class Settings(BaseSettings): API_PORT: int = 8000 WORKERS: int = 4 - # LLM Configuration - MAX_TOKENS: int = 4096 - TEMPERATURE: float = 0.3 - MODEL: str = "claude-sonnet-4-20250514" - # Vector Store VECTOR_STORE_PATH: str = "./data/chroma_db" EMBEDDING_MODEL: str = "sentence-transformers/all-MiniLM-L6-v2" diff --git a/src/datacenter_docs/utils/llm_client.py b/src/datacenter_docs/utils/llm_client.py new file mode 100644 index 0000000..37b09ae --- /dev/null +++ b/src/datacenter_docs/utils/llm_client.py @@ -0,0 +1,296 @@ +""" +Generic LLM Client using OpenAI-compatible API + +This client works with: +- OpenAI +- Anthropic (via OpenAI-compatible endpoint) +- LLMStudio +- Open-WebUI +- Ollama +- LocalAI +- Any other OpenAI-compatible provider +""" + +import logging +from typing import Any, Dict, List, Optional + +from openai import AsyncOpenAI + +from .config import get_settings + +logger = logging.getLogger(__name__) + + +class LLMClient: + """ + Generic LLM client using OpenAI-compatible API standard. + + This allows switching between different LLM providers without code changes, + just by updating configuration (base_url, api_key, model). 
+ + Examples: + # OpenAI + LLM_BASE_URL=https://api.openai.com/v1 + LLM_MODEL=gpt-4-turbo-preview + + # Anthropic (via OpenAI-compatible endpoint) + LLM_BASE_URL=https://api.anthropic.com/v1 + LLM_MODEL=claude-sonnet-4-20250514 + + # LLMStudio + LLM_BASE_URL=http://localhost:1234/v1 + LLM_MODEL=local-model + + # Open-WebUI + LLM_BASE_URL=http://localhost:8080/v1 + LLM_MODEL=llama3 + """ + + def __init__( + self, + base_url: Optional[str] = None, + api_key: Optional[str] = None, + model: Optional[str] = None, + temperature: Optional[float] = None, + max_tokens: Optional[int] = None, + ): + """ + Initialize LLM client with OpenAI-compatible API. + + Args: + base_url: Base URL of the API endpoint (e.g., https://api.openai.com/v1) + api_key: API key for authentication + model: Model name to use (e.g., gpt-4, claude-sonnet-4, llama3) + temperature: Sampling temperature (0.0-1.0) + max_tokens: Maximum tokens to generate + """ + settings = get_settings() + + # Use provided values or fall back to settings + self.base_url = base_url or settings.LLM_BASE_URL + self.api_key = api_key or settings.LLM_API_KEY + self.model = model or settings.LLM_MODEL + self.temperature = temperature if temperature is not None else settings.LLM_TEMPERATURE + self.max_tokens = max_tokens or settings.LLM_MAX_TOKENS + + # Initialize AsyncOpenAI client + self.client = AsyncOpenAI(base_url=self.base_url, api_key=self.api_key) + + logger.info( + f"Initialized LLM client: base_url={self.base_url}, model={self.model}" + ) + + async def chat_completion( + self, + messages: List[Dict[str, str]], + temperature: Optional[float] = None, + max_tokens: Optional[int] = None, + stream: bool = False, + **kwargs: Any, + ) -> Dict[str, Any]: + """ + Generate chat completion using OpenAI-compatible API. + + Args: + messages: List of messages [{"role": "user", "content": "..."}] + temperature: Override default temperature + max_tokens: Override default max_tokens + stream: Enable streaming response + **kwargs: Additional parameters for the API + + Returns: + Response with generated text and metadata + """ + try: + response = await self.client.chat.completions.create( + model=self.model, + messages=messages, # type: ignore[arg-type] + temperature=temperature or self.temperature, + max_tokens=max_tokens or self.max_tokens, + stream=stream, + **kwargs, + ) + + if stream: + # Return generator for streaming + return {"stream": response} # type: ignore[dict-item] + + # Extract text from first choice + message = response.choices[0].message + content = message.content or "" + + return { + "content": content, + "model": response.model, + "usage": { + "prompt_tokens": response.usage.prompt_tokens if response.usage else 0, + "completion_tokens": ( + response.usage.completion_tokens if response.usage else 0 + ), + "total_tokens": response.usage.total_tokens if response.usage else 0, + }, + "finish_reason": response.choices[0].finish_reason, + } + + except Exception as e: + logger.error(f"LLM API call failed: {e}") + raise + + async def generate_with_system( + self, + system_prompt: str, + user_prompt: str, + temperature: Optional[float] = None, + max_tokens: Optional[int] = None, + **kwargs: Any, + ) -> str: + """ + Generate completion with system and user prompts. 
+ + Args: + system_prompt: System instruction + user_prompt: User message + temperature: Override default temperature + max_tokens: Override default max_tokens + **kwargs: Additional API parameters + + Returns: + Generated text content + """ + messages = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt}, + ] + + response = await self.chat_completion( + messages=messages, temperature=temperature, max_tokens=max_tokens, **kwargs + ) + + return response["content"] + + async def generate_json( + self, + messages: List[Dict[str, str]], + temperature: Optional[float] = None, + max_tokens: Optional[int] = None, + ) -> Dict[str, Any]: + """ + Generate JSON response (if provider supports response_format). + + Args: + messages: List of messages + temperature: Override default temperature + max_tokens: Override default max_tokens + + Returns: + Parsed JSON response + """ + import json + + try: + # Try with response_format if supported + response = await self.chat_completion( + messages=messages, + temperature=temperature or 0.3, # Lower temp for structured output + max_tokens=max_tokens, + response_format={"type": "json_object"}, + ) + except Exception as e: + logger.warning(f"response_format not supported, using plain completion: {e}") + # Fallback to plain completion + response = await self.chat_completion( + messages=messages, + temperature=temperature or 0.3, + max_tokens=max_tokens, + ) + + # Parse JSON from content + content = response["content"] + try: + return json.loads(content) + except json.JSONDecodeError as e: + logger.error(f"Failed to parse JSON response: {e}") + logger.debug(f"Raw content: {content}") + raise ValueError(f"LLM did not return valid JSON: {content[:200]}...") + + async def generate_stream( + self, + messages: List[Dict[str, str]], + temperature: Optional[float] = None, + max_tokens: Optional[int] = None, + ) -> Any: + """ + Generate streaming completion. 
+ + Args: + messages: List of messages + temperature: Override default temperature + max_tokens: Override default max_tokens + + Yields: + Text chunks as they arrive + """ + response = await self.chat_completion( + messages=messages, + temperature=temperature, + max_tokens=max_tokens, + stream=True, + ) + + async for chunk in response["stream"]: # type: ignore[union-attr] + if chunk.choices and chunk.choices[0].delta.content: + yield chunk.choices[0].delta.content + + +# Singleton instance +_llm_client: Optional[LLMClient] = None + + +def get_llm_client() -> LLMClient: + """Get or create singleton LLM client instance.""" + global _llm_client + if _llm_client is None: + _llm_client = LLMClient() + return _llm_client + + +# Example usage +async def example_usage() -> None: + """Example of using the LLM client""" + + client = get_llm_client() + + # Simple completion + messages = [ + {"role": "system", "content": "You are a helpful datacenter expert."}, + {"role": "user", "content": "Explain what a VLAN is in 2 sentences."}, + ] + + response = await client.chat_completion(messages) + print(f"Response: {response['content']}") + print(f"Tokens used: {response['usage']['total_tokens']}") + + # JSON response + json_messages = [ + { + "role": "user", + "content": "List 3 common datacenter problems in JSON: {\"problems\": [...]}", + } + ] + + json_response = await client.generate_json(json_messages) + print(f"JSON: {json_response}") + + # Streaming + stream_messages = [{"role": "user", "content": "Count from 1 to 5"}] + + print("Streaming: ", end="") + async for chunk in client.generate_stream(stream_messages): + print(chunk, end="", flush=True) + print() + + +if __name__ == "__main__": + import asyncio + + asyncio.run(example_usage()) diff --git a/src/datacenter_docs/workers/__init__.py b/src/datacenter_docs/workers/__init__.py new file mode 100644 index 0000000..6262053 --- /dev/null +++ b/src/datacenter_docs/workers/__init__.py @@ -0,0 +1,13 @@ +""" +Celery Workers for Background Task Processing + +This module contains the Celery application and tasks for: +- Documentation generation (scheduled and on-demand) +- Auto-remediation execution +- Data collection from infrastructure +- Periodic maintenance tasks +""" + +from datacenter_docs.workers.celery_app import celery_app + +__all__ = ["celery_app"] diff --git a/src/datacenter_docs/workers/celery_app.py b/src/datacenter_docs/workers/celery_app.py new file mode 100644 index 0000000..49e477b --- /dev/null +++ b/src/datacenter_docs/workers/celery_app.py @@ -0,0 +1,161 @@ +""" +Celery Application Configuration + +Configures Celery for background task processing including: +- Task routing and queues +- Periodic task scheduling +- Result backend configuration +- Task serialization +""" + +import logging +from typing import Any + +from celery import Celery +from celery.schedules import crontab +from celery.signals import task_failure, task_postrun, task_prerun, task_success + +from datacenter_docs.utils.config import get_settings + +# Configure logging +logger = logging.getLogger(__name__) + +# Get settings +settings = get_settings() + +# Initialize Celery app +celery_app = Celery( + "datacenter_docs", + broker=settings.CELERY_BROKER_URL, + backend=settings.CELERY_RESULT_BACKEND, + include=[ + "datacenter_docs.workers.tasks", + ], +) + +# Celery Configuration +celery_app.conf.update( + # Task settings + task_serializer="json", + result_serializer="json", + accept_content=["json"], + timezone="UTC", + enable_utc=True, + # Result backend + 
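+    # Note: master_name in result_backend_transport_options is only consulted when the
+    # result backend is Redis Sentinel; a standalone Redis backend ignores this option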
result_expires=3600, # Results expire after 1 hour + result_backend_transport_options={"master_name": "mymaster"}, + # Task execution + task_track_started=True, + task_time_limit=3600, # 1 hour hard limit + task_soft_time_limit=3000, # 50 minutes soft limit + # Worker settings + worker_prefetch_multiplier=1, # Prefetch only 1 task per worker + worker_max_tasks_per_child=1000, # Restart worker after 1000 tasks + # Task routing + task_routes={ + "datacenter_docs.workers.tasks.generate_documentation_task": {"queue": "documentation"}, + "datacenter_docs.workers.tasks.generate_section_task": {"queue": "documentation"}, + "datacenter_docs.workers.tasks.execute_auto_remediation_task": { + "queue": "auto_remediation" + }, + "datacenter_docs.workers.tasks.collect_infrastructure_data_task": { + "queue": "data_collection" + }, + "datacenter_docs.workers.tasks.cleanup_old_data_task": {"queue": "maintenance"}, + }, + # Task rate limits + task_annotations={ + "datacenter_docs.workers.tasks.execute_auto_remediation_task": { + "rate_limit": "10/h" + }, # Max 10 auto-remediations per hour + "datacenter_docs.workers.tasks.generate_documentation_task": {"rate_limit": "5/h"}, + }, + # Beat schedule (periodic tasks) + beat_schedule={ + # Generate all documentation every 6 hours + "generate-all-docs-every-6h": { + "task": "datacenter_docs.workers.tasks.generate_documentation_task", + "schedule": crontab(minute=0, hour="*/6"), # Every 6 hours + "args": (), + "options": {"queue": "documentation"}, + }, + # Collect infrastructure data every hour + "collect-data-hourly": { + "task": "datacenter_docs.workers.tasks.collect_infrastructure_data_task", + "schedule": crontab(minute=0), # Every hour + "args": (), + "options": {"queue": "data_collection"}, + }, + # Cleanup old data daily at 2 AM + "cleanup-daily": { + "task": "datacenter_docs.workers.tasks.cleanup_old_data_task", + "schedule": crontab(minute=0, hour=2), # 2 AM daily + "args": (), + "options": {"queue": "maintenance"}, + }, + # Update metrics every 15 minutes + "update-metrics-15m": { + "task": "datacenter_docs.workers.tasks.update_system_metrics_task", + "schedule": crontab(minute="*/15"), # Every 15 minutes + "args": (), + "options": {"queue": "maintenance"}, + }, + }, +) + + +# Task lifecycle signals +@task_prerun.connect +def task_prerun_handler(task_id: str, task: Any, args: tuple, kwargs: dict, **extra: Any) -> None: + """Log task start""" + logger.info(f"Task {task.name}[{task_id}] starting with args={args}, kwargs={kwargs}") + + +@task_postrun.connect +def task_postrun_handler( + task_id: str, task: Any, args: tuple, kwargs: dict, retval: Any, **extra: Any +) -> None: + """Log task completion""" + logger.info(f"Task {task.name}[{task_id}] completed with result={retval}") + + +@task_success.connect +def task_success_handler(sender: Any, result: Any, **kwargs: Any) -> None: + """Log task success""" + logger.info(f"Task {sender.name} succeeded with result={result}") + + +@task_failure.connect +def task_failure_handler( + task_id: str, exception: Exception, args: tuple, kwargs: dict, traceback: Any, **extra: Any +) -> None: + """Log task failure""" + logger.error( + f"Task {task_id} failed with exception={exception}, args={args}, kwargs={kwargs}", + exc_info=True, + ) + + +def start() -> None: + """ + Start the Celery worker + + This is the entry point called by the CLI command: + datacenter-docs worker + """ + import sys + + # Start worker with default options + celery_app.worker_main( + argv=[ + "worker", + "--loglevel=INFO", + "--concurrency=4", + 
"--queues=documentation,auto_remediation,data_collection,maintenance", + "--max-tasks-per-child=1000", + ] + ) + + +if __name__ == "__main__": + start() diff --git a/src/datacenter_docs/workers/tasks.py b/src/datacenter_docs/workers/tasks.py new file mode 100644 index 0000000..7a133aa --- /dev/null +++ b/src/datacenter_docs/workers/tasks.py @@ -0,0 +1,684 @@ +""" +Celery Tasks for Background Processing + +Contains all asynchronous tasks for: +- Documentation generation +- Auto-remediation execution +- Data collection +- System maintenance +""" + +import asyncio +import logging +from datetime import datetime, timedelta +from typing import Any, Dict, List, Optional + +from beanie import init_beanie +from celery import Task +from motor.motor_asyncio import AsyncIOMotorClient + +from datacenter_docs.api.models import ( + AuditLog, + AutoRemediationPolicy, + ChatSession, + DocumentationSection, + RemediationApproval, + RemediationLog, + SystemMetric, + Ticket, + TicketFeedback, + TicketPattern, +) +from datacenter_docs.utils.config import get_settings +from datacenter_docs.workers.celery_app import celery_app + +# Configure logging +logger = logging.getLogger(__name__) + +# Settings +settings = get_settings() + + +# Custom base task with database initialization +class DatabaseTask(Task): + """Base task that initializes database connection""" + + _db_initialized = False + + async def init_db(self) -> None: + """Initialize database connection""" + if not self._db_initialized: + client = AsyncIOMotorClient(settings.MONGODB_URL) + database = client[settings.MONGODB_DATABASE] + + await init_beanie( + database=database, + document_models=[ + Ticket, + TicketFeedback, + RemediationLog, + RemediationApproval, + AutoRemediationPolicy, + TicketPattern, + DocumentationSection, + ChatSession, + SystemMetric, + AuditLog, + ], + ) + self._db_initialized = True + logger.info("Database initialized for Celery task") + + +# Documentation Generation Tasks +@celery_app.task( + bind=True, + base=DatabaseTask, + name="datacenter_docs.workers.tasks.generate_documentation_task", + max_retries=3, +) +def generate_documentation_task(self: DatabaseTask) -> Dict[str, Any]: + """ + Generate documentation for all sections + + This is the main scheduled task that runs every 6 hours + to regenerate all infrastructure documentation. + + Returns: + Dict with generation results for each section + """ + logger.info("Starting full documentation generation") + + async def _generate_all() -> Dict[str, Any]: + # Initialize database + await self.init_db() + + # Get all sections + sections = await DocumentationSection.find_all().to_list() + results = {} + + for section in sections: + try: + logger.info(f"Generating documentation for section: {section.section_id}") + + # Update status to processing + section.generation_status = "processing" + section.updated_at = datetime.now() + await section.save() + + # TODO: Implement actual generation logic + # This will require: + # 1. Collectors to gather data from infrastructure + # 2. Generators to create documentation from collected data + # 3. 
Vector store updates for search + + # Placeholder for now + results[section.section_id] = { + "status": "pending_implementation", + "message": "Collector and Generator modules not yet implemented", + } + + # Update section status + section.generation_status = "pending" + section.last_generated = datetime.now() + section.updated_at = datetime.now() + await section.save() + + # Log audit + audit = AuditLog( + action="generate_documentation", + actor="system", + resource_type="documentation_section", + resource_id=section.section_id, + details={"section_name": section.name}, + success=True, + ) + await audit.insert() + + except Exception as e: + logger.error(f"Failed to generate section {section.section_id}: {e}", exc_info=True) + section.generation_status = "failed" + section.updated_at = datetime.now() + await section.save() + + results[section.section_id] = {"status": "failed", "error": str(e)} + + logger.info(f"Documentation generation completed: {results}") + return results + + # Run async function + return asyncio.run(_generate_all()) + + +@celery_app.task( + bind=True, + base=DatabaseTask, + name="datacenter_docs.workers.tasks.generate_section_task", + max_retries=3, +) +def generate_section_task(self: DatabaseTask, section_id: str) -> Dict[str, Any]: + """ + Generate documentation for a specific section + + Args: + section_id: ID of the section to generate (e.g., 'vmware', 'kubernetes') + + Returns: + Dict with generation result + """ + logger.info(f"Starting documentation generation for section: {section_id}") + + async def _generate_section() -> Dict[str, Any]: + # Initialize database + await self.init_db() + + # Get section + section = await DocumentationSection.find_one( + DocumentationSection.section_id == section_id + ) + + if not section: + error_msg = f"Section not found: {section_id}" + logger.error(error_msg) + return {"status": "failed", "error": error_msg} + + try: + # Update status + section.generation_status = "processing" + section.updated_at = datetime.now() + await section.save() + + # TODO: Implement actual generation logic + # This will require: + # 1. Get appropriate collector for section (VMwareCollector, K8sCollector, etc.) + # 2. Collect data from infrastructure via MCP + # 3. Get appropriate generator for section + # 4. Generate documentation with LLM + # 5. Store in vector database for search + # 6. 
Update section metadata + + # Placeholder + result = { + "status": "pending_implementation", + "section_id": section_id, + "message": "Collector and Generator modules not yet implemented", + } + + # Update section + section.generation_status = "pending" + section.last_generated = datetime.now() + section.updated_at = datetime.now() + await section.save() + + # Log audit + audit = AuditLog( + action="generate_section", + actor="system", + resource_type="documentation_section", + resource_id=section_id, + details={"section_name": section.name}, + success=True, + ) + await audit.insert() + + logger.info(f"Section generation completed: {result}") + return result + + except Exception as e: + logger.error(f"Failed to generate section {section_id}: {e}", exc_info=True) + section.generation_status = "failed" + section.updated_at = datetime.now() + await section.save() + + return {"status": "failed", "section_id": section_id, "error": str(e)} + + return asyncio.run(_generate_section()) + + +# Auto-Remediation Tasks +@celery_app.task( + bind=True, + base=DatabaseTask, + name="datacenter_docs.workers.tasks.execute_auto_remediation_task", + max_retries=3, +) +def execute_auto_remediation_task(self: DatabaseTask, ticket_id: str) -> Dict[str, Any]: + """ + Execute auto-remediation for a ticket + + This task is queued when a ticket is created with auto_remediation_enabled=True + and the reliability score is high enough. + + Args: + ticket_id: ID of the ticket to remediate + + Returns: + Dict with execution result + """ + logger.info(f"Starting auto-remediation execution for ticket: {ticket_id}") + + async def _execute_remediation() -> Dict[str, Any]: + # Initialize database + await self.init_db() + + # Get ticket + ticket = await Ticket.find_one(Ticket.ticket_id == ticket_id) + + if not ticket: + error_msg = f"Ticket not found: {ticket_id}" + logger.error(error_msg) + return {"status": "failed", "error": error_msg} + + try: + # Import auto-remediation engine + from datacenter_docs.api.auto_remediation import AutoRemediationEngine + + # Create engine instance + engine = AutoRemediationEngine() + + # Execute remediation + result = await engine.execute_remediation( + ticket_id=ticket_id, dry_run=False, force=False + ) + + logger.info(f"Auto-remediation completed for {ticket_id}: {result}") + return result + + except Exception as e: + logger.error( + f"Failed to execute auto-remediation for {ticket_id}: {e}", exc_info=True + ) + + # Log failure + log_entry = RemediationLog( + ticket_id=ticket.id, + action_type="auto_remediation_task", + action_details={"error": str(e)}, + success=False, + error_message=str(e), + ) + await log_entry.insert() + + return {"status": "failed", "ticket_id": ticket_id, "error": str(e)} + + return asyncio.run(_execute_remediation()) + + +# Data Collection Tasks +@celery_app.task( + bind=True, + base=DatabaseTask, + name="datacenter_docs.workers.tasks.collect_infrastructure_data_task", + max_retries=3, +) +def collect_infrastructure_data_task( + self: DatabaseTask, collector_type: Optional[str] = None +) -> Dict[str, Any]: + """ + Collect data from infrastructure via MCP + + This task runs hourly to collect current infrastructure state. + + Args: + collector_type: Optional specific collector to run (vmware, kubernetes, etc.) 
+ If None, runs all collectors + + Returns: + Dict with collection results + """ + logger.info(f"Starting infrastructure data collection (type={collector_type})") + + async def _collect_data() -> Dict[str, Any]: + # Initialize database + await self.init_db() + + results = { + "status": "success", + "collector_type": collector_type or "all", + "collectors_run": [], + "errors": [], + "timestamp": datetime.now().isoformat(), + } + + # Determine which collectors to run + collectors_to_run = [] + + if collector_type is None or collector_type == "all" or collector_type == "vmware": + collectors_to_run.append("vmware") + + # TODO: Add more collectors when implemented + # if collector_type is None or collector_type == "all" or collector_type == "kubernetes": + # collectors_to_run.append("kubernetes") + + # Run collectors + for collector_name in collectors_to_run: + try: + logger.info(f"Running {collector_name} collector...") + + if collector_name == "vmware": + from datacenter_docs.collectors import VMwareCollector + + collector = VMwareCollector() + collector_result = await collector.run() + + if collector_result.get("success"): + results["collectors_run"].append( + { + "name": collector_name, + "status": "success", + "data_collected": bool(collector_result.get("data")), + "statistics": collector_result.get("data", {}).get( + "statistics", {} + ), + } + ) + else: + error_msg = collector_result.get("error", "Unknown error") + results["errors"].append( + {"collector": collector_name, "error": error_msg} + ) + logger.error(f"{collector_name} collector failed: {error_msg}") + + # TODO: Add other collectors here + # elif collector_name == "kubernetes": + # from datacenter_docs.collectors import KubernetesCollector + # collector = KubernetesCollector() + # collector_result = await collector.run() + # ... + + except Exception as e: + error_msg = str(e) + results["errors"].append({"collector": collector_name, "error": error_msg}) + logger.error( + f"Failed to run {collector_name} collector: {e}", exc_info=True + ) + + # Update status based on results + if results["errors"]: + results["status"] = "partial_success" if results["collectors_run"] else "failed" + + # Log metric + metric = SystemMetric( + metric_type="data_collection", + metric_name="infrastructure_scan", + value=float(len(results["collectors_run"])), + dimensions={ + "collector_type": collector_type or "all", + "status": results["status"], + }, + ) + await metric.insert() + + logger.info( + f"Data collection completed: {len(results['collectors_run'])} collectors, " + f"{len(results['errors'])} errors" + ) + return results + + return asyncio.run(_collect_data()) + + +# Maintenance Tasks +@celery_app.task( + bind=True, + base=DatabaseTask, + name="datacenter_docs.workers.tasks.cleanup_old_data_task", + max_retries=3, +) +def cleanup_old_data_task(self: DatabaseTask, days_to_keep: int = 90) -> Dict[str, Any]: + """ + Cleanup old data from database + + Runs daily at 2 AM to remove old records. 
+ + Args: + days_to_keep: Number of days to keep data (default 90) + + Returns: + Dict with cleanup results + """ + logger.info(f"Starting data cleanup (keeping last {days_to_keep} days)") + + async def _cleanup() -> Dict[str, Any]: + # Initialize database + await self.init_db() + + cutoff_date = datetime.now() - timedelta(days=days_to_keep) + results = {} + + try: + # Cleanup old tickets + old_tickets = await Ticket.find(Ticket.created_at < cutoff_date).delete() + results["tickets_deleted"] = old_tickets.deleted_count if old_tickets else 0 + + # Cleanup old remediation logs + old_logs = await RemediationLog.find(RemediationLog.executed_at < cutoff_date).delete() + results["remediation_logs_deleted"] = old_logs.deleted_count if old_logs else 0 + + # Cleanup old metrics + old_metrics = await SystemMetric.find(SystemMetric.timestamp < cutoff_date).delete() + results["metrics_deleted"] = old_metrics.deleted_count if old_metrics else 0 + + # Cleanup old audit logs + old_audits = await AuditLog.find(AuditLog.timestamp < cutoff_date).delete() + results["audit_logs_deleted"] = old_audits.deleted_count if old_audits else 0 + + # Cleanup old chat sessions (keep only last 30 days) + chat_cutoff = datetime.now() - timedelta(days=30) + old_chats = await ChatSession.find(ChatSession.started_at < chat_cutoff).delete() + results["chat_sessions_deleted"] = old_chats.deleted_count if old_chats else 0 + + results["status"] = "success" + results["cutoff_date"] = cutoff_date.isoformat() + + logger.info(f"Cleanup completed: {results}") + + # Log audit + audit = AuditLog( + action="cleanup_old_data", + actor="system", + resource_type="database", + resource_id="maintenance", + details=results, + success=True, + ) + await audit.insert() + + return results + + except Exception as e: + logger.error(f"Cleanup failed: {e}", exc_info=True) + return { + "status": "failed", + "error": str(e), + "cutoff_date": cutoff_date.isoformat(), + } + + return asyncio.run(_cleanup()) + + +@celery_app.task( + bind=True, + base=DatabaseTask, + name="datacenter_docs.workers.tasks.update_system_metrics_task", + max_retries=3, +) +def update_system_metrics_task(self: DatabaseTask) -> Dict[str, Any]: + """ + Update system-wide metrics + + Runs every 15 minutes to calculate and store system metrics. 
+ + Returns: + Dict with updated metrics + """ + logger.info("Updating system metrics") + + async def _update_metrics() -> Dict[str, Any]: + # Initialize database + await self.init_db() + + metrics = {} + + try: + # Calculate ticket metrics + total_tickets = await Ticket.find_all().count() + resolved_tickets = await Ticket.find(Ticket.status == "resolved").count() + pending_tickets = await Ticket.find(Ticket.status == "processing").count() + + metrics["total_tickets"] = total_tickets + metrics["resolved_tickets"] = resolved_tickets + metrics["pending_tickets"] = pending_tickets + metrics["resolution_rate"] = ( + (resolved_tickets / total_tickets * 100) if total_tickets > 0 else 0 + ) + + # Store metrics + await SystemMetric( + metric_type="tickets", + metric_name="total", + value=float(total_tickets), + ).insert() + + await SystemMetric( + metric_type="tickets", + metric_name="resolved", + value=float(resolved_tickets), + ).insert() + + await SystemMetric( + metric_type="tickets", + metric_name="resolution_rate", + value=metrics["resolution_rate"], + ).insert() + + # Auto-remediation metrics + total_remediations = await RemediationLog.find_all().count() + successful_remediations = await RemediationLog.find( + RemediationLog.success == True + ).count() + + metrics["total_remediations"] = total_remediations + metrics["successful_remediations"] = successful_remediations + metrics["remediation_success_rate"] = ( + (successful_remediations / total_remediations * 100) + if total_remediations > 0 + else 0 + ) + + await SystemMetric( + metric_type="auto_remediation", + metric_name="success_rate", + value=metrics["remediation_success_rate"], + ).insert() + + # Documentation metrics + total_sections = await DocumentationSection.find_all().count() + completed_sections = await DocumentationSection.find( + DocumentationSection.generation_status == "completed" + ).count() + + metrics["total_sections"] = total_sections + metrics["completed_sections"] = completed_sections + + await SystemMetric( + metric_type="documentation", + metric_name="completion_rate", + value=(completed_sections / total_sections * 100) if total_sections > 0 else 0, + ).insert() + + metrics["status"] = "success" + metrics["timestamp"] = datetime.now().isoformat() + + logger.info(f"Metrics updated: {metrics}") + return metrics + + except Exception as e: + logger.error(f"Failed to update metrics: {e}", exc_info=True) + return {"status": "failed", "error": str(e)} + + return asyncio.run(_update_metrics()) + + +# Ticket processing task +@celery_app.task( + bind=True, + base=DatabaseTask, + name="datacenter_docs.workers.tasks.process_ticket_task", + max_retries=3, +) +def process_ticket_task(self: DatabaseTask, ticket_id: str) -> Dict[str, Any]: + """ + Process a ticket asynchronously + + This task analyzes the ticket, suggests resolutions, and optionally + executes auto-remediation. 
+ + Args: + ticket_id: ID of the ticket to process + + Returns: + Dict with processing result + """ + logger.info(f"Processing ticket: {ticket_id}") + + async def _process_ticket() -> Dict[str, Any]: + # Initialize database + await self.init_db() + + ticket = await Ticket.find_one(Ticket.ticket_id == ticket_id) + + if not ticket: + error_msg = f"Ticket not found: {ticket_id}" + logger.error(error_msg) + return {"status": "failed", "error": error_msg} + + try: + # Import agent for ticket analysis + from datacenter_docs.chat.agent import DocumentationAgent + + # Create agent + agent = DocumentationAgent() + + # Analyze and resolve ticket + resolution_result = await agent.resolve_ticket( + ticket_id=ticket_id, + description=ticket.description, + category=ticket.category or "general", + ) + + # Update ticket + ticket.resolution = resolution_result.get("resolution") + ticket.suggested_actions = resolution_result.get("suggested_actions", []) + ticket.related_docs = resolution_result.get("related_docs", []) + ticket.confidence_score = resolution_result.get("confidence_score") + ticket.updated_at = datetime.now() + + # If auto-remediation is enabled and reliability is high enough + if ticket.auto_remediation_enabled and resolution_result.get("reliability_score", 0) >= 85: + # Queue auto-remediation task + execute_auto_remediation_task.delay(ticket_id) + ticket.status = "pending_approval" + else: + ticket.status = "resolved" + + await ticket.save() + + result = { + "status": "success", + "ticket_id": ticket_id, + "resolution": resolution_result, + } + + logger.info(f"Ticket processed: {result}") + return result + + except Exception as e: + logger.error(f"Failed to process ticket {ticket_id}: {e}", exc_info=True) + ticket.status = "failed" + ticket.updated_at = datetime.now() + await ticket.save() + + return {"status": "failed", "ticket_id": ticket_id, "error": str(e)} + + return asyncio.run(_process_ticket())
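+
+
+# Example usage (illustrative sketch): how the tasks above can be enqueued from API or
+# CLI code. The ticket and section IDs are placeholders; the queue name matches the
+# task_routes configured in celery_app.py.
+def example_enqueue_tasks() -> None:
+    """Sketch of enqueueing background work from application code"""
+    # Process a newly created ticket asynchronously (routed via task_routes)
+    process_ticket_task.delay("TICKET-12345")
+
+    # Regenerate a single documentation section on the documentation queue
+    generate_section_task.apply_async(args=["vmware"], queue="documentation")
+
+    # Run only the VMware collector now instead of waiting for the hourly schedule
+    collect_infrastructure_data_task.delay(collector_type="vmware")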