feat: implement automatic ChromaDB indexing and loop-based documentation generation

- Add automatic ChromaDB indexing after documentation generation
- Implement loop-based section generation for individual VM and container documentation
- Fix Celery anti-pattern in generate_proxmox_docs task (removed blocking .get() call)
- Share ChromaDB vector store volume between worker and chat services
- Add documentation management UI to frontend with manual job triggering and log viewing
- Fix frontend Socket.IO connection URL to point to correct chat service port
- Enhance DocumentationAgent.index_documentation() with automatic cleanup of old documents
- Update Proxmox template to generate individual files for each VM and container

This enables the RAG system to properly respond with infrastructure-specific information from the generated documentation.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-21 03:00:50 +02:00
parent 5dcd7bd2be
commit c8aebaaee3
9 changed files with 994 additions and 104 deletions

View File

@@ -339,6 +339,220 @@ async def list_sections() -> Dict[str, Any]:
}
# Documentation Jobs API
@app.post("/api/v1/documentation/jobs/proxmox")
async def trigger_proxmox_documentation() -> Dict[str, Any]:
"""
Trigger Proxmox documentation generation job
Returns job_id for status monitoring
"""
from celery import Celery
from datacenter_docs.utils.config import get_settings
try:
settings = get_settings()
# Create minimal Celery client for sending tasks
celery_client = Celery(
"datacenter_docs",
broker=settings.CELERY_BROKER_URL,
backend=settings.CELERY_RESULT_BACKEND,
)
# Send task by name to the documentation queue
task = celery_client.send_task(
"generate_proxmox_docs",
kwargs={},
queue="documentation",
)
logger.info(f"Triggered Proxmox docs generation job: {task.id}")
return {
"success": True,
"job_id": task.id,
"status": "queued",
"message": "Proxmox documentation generation job started",
}
except Exception as e:
logger.error(f"Failed to trigger Proxmox docs job: {e}", exc_info=True)
raise HTTPException(status_code=500, detail=f"Failed to start job: {str(e)}")
@app.get("/api/v1/documentation/jobs/{job_id}/status")
async def get_job_status(job_id: str) -> Dict[str, Any]:
"""
Get status of documentation generation job
Returns current status and progress information
"""
from celery.result import AsyncResult
from datacenter_docs.workers.celery_app import celery_app
try:
result = AsyncResult(job_id, app=celery_app)
response = {
"job_id": job_id,
"status": result.state, # PENDING, STARTED, SUCCESS, FAILURE, RETRY
}
if result.state == "SUCCESS":
response["result"] = result.result
response["completed"] = True
elif result.state == "FAILURE":
response["error"] = str(result.info)
response["completed"] = True
elif result.state == "PROGRESS":
response["progress"] = result.info
response["completed"] = False
else:
response["completed"] = False
return response
except Exception as e:
logger.error(f"Failed to get job status: {e}")
raise HTTPException(status_code=500, detail=f"Failed to get status: {str(e)}")
@app.get("/api/v1/documentation/files")
async def list_documentation_files() -> Dict[str, Any]:
"""
List all generated documentation files
Returns available markdown files organized by category
"""
from pathlib import Path
output_dir = Path("/app/output")
if not output_dir.exists():
return {"categories": [], "total": 0}
files_by_category = {}
# Scan output directory
for category_dir in output_dir.iterdir():
if category_dir.is_dir():
category = category_dir.name
files = []
for file_path in category_dir.glob("*.md"):
stat = file_path.stat()
files.append({
"filename": file_path.name,
"size": stat.st_size,
"modified": stat.st_mtime,
"path": f"{category}/{file_path.name}"
})
if files:
files_by_category[category] = sorted(files, key=lambda x: x["modified"], reverse=True)
total_files = sum(len(files) for files in files_by_category.values())
return {
"categories": [
{"name": cat, "files": files}
for cat, files in files_by_category.items()
],
"total": total_files
}
@app.get("/api/v1/documentation/files/{category}/{filename}")
async def get_documentation_content(category: str, filename: str) -> Dict[str, Any]:
"""
Retrieve content of a specific documentation file
Returns markdown content
"""
from pathlib import Path
# Validate filename to prevent directory traversal
if ".." in filename or "/" in filename:
raise HTTPException(status_code=400, detail="Invalid filename")
if ".." in category or "/" in category:
raise HTTPException(status_code=400, detail="Invalid category")
file_path = Path(f"/app/output/{category}/{filename}")
if not file_path.exists():
raise HTTPException(status_code=404, detail="Documentation file not found")
if not file_path.is_file() or file_path.suffix != ".md":
raise HTTPException(status_code=400, detail="Invalid file type")
try:
content = file_path.read_text(encoding="utf-8")
stat = file_path.stat()
return {
"filename": filename,
"category": category,
"content": content,
"size": stat.st_size,
"modified": stat.st_mtime
}
except Exception as e:
logger.error(f"Failed to read file {file_path}: {e}")
raise HTTPException(status_code=500, detail=f"Failed to read file: {str(e)}")
@app.get("/api/v1/documentation/jobs/{job_id}/logs")
async def get_job_logs(job_id: str, tail: int = 100) -> Dict[str, Any]:
"""
Get logs for a specific documentation generation job
Returns recent log lines related to the job
"""
import subprocess
try:
# Get worker logs and filter for job_id
cmd = [
"docker",
"logs",
"datacenter-docs-worker-dev",
"--tail",
str(tail)
]
result = subprocess.run(
cmd,
capture_output=True,
text=True,
timeout=5
)
all_logs = result.stdout + result.stderr
# Filter logs containing the job_id
job_logs = [
line
for line in all_logs.split("\n")
if job_id in line or "ForkPoolWorker" in line
]
# Take last 50 relevant lines
relevant_logs = job_logs[-50:] if len(job_logs) > 50 else job_logs
return {
"job_id": job_id,
"logs": relevant_logs,
"total_lines": len(relevant_logs)
}
except subprocess.TimeoutExpired:
raise HTTPException(status_code=504, detail="Timeout getting logs")
except Exception as e:
logger.error(f"Failed to get job logs: {e}")
raise HTTPException(status_code=500, detail=f"Failed to get logs: {str(e)}")
# Stats and Metrics
@app.get("/api/v1/stats/tickets")
async def get_ticket_stats() -> Dict[str, Any]:

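For orientation, a minimal client-side sketch of the workflow these endpoints enable: queue a job, then poll its status until it completes. The base URL is an assumption (adjust to wherever the API service is exposed); the endpoint paths and response fields match the handlers above.

```python
# Sketch only: trigger a Proxmox documentation job and poll until it finishes.
# API_BASE is hypothetical; the endpoints and response keys mirror the API above.
import time

import httpx

API_BASE = "http://localhost:8000"  # assumed API address


def run_proxmox_docs_job(poll_interval: float = 5.0, timeout: float = 1800.0) -> dict:
    with httpx.Client(base_url=API_BASE, timeout=30.0) as client:
        # Queue the Celery task; the response carries its task id as job_id
        resp = client.post("/api/v1/documentation/jobs/proxmox")
        resp.raise_for_status()
        job_id = resp.json()["job_id"]

        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            status = client.get(f"/api/v1/documentation/jobs/{job_id}/status")
            status.raise_for_status()
            payload = status.json()
            if payload.get("completed"):
                return payload
            time.sleep(poll_interval)
        raise TimeoutError(f"Job {job_id} did not finish within {timeout}s")
```

Once the job reports SUCCESS, the /api/v1/documentation/files endpoints can be used to list and fetch the generated Markdown.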
View File

@@ -76,14 +76,35 @@ class DocumentationAgent:
async def index_documentation(self, docs_path: Path) -> None:
"""Index all documentation files into vector store"""
logger.info("Indexing documentation...")
logger.info("📚 Indexing documentation...")
# Clear existing documents to avoid duplicates
if self.vector_store is not None:
try:
# Delete the collection and recreate it
logger.info("🗑️ Clearing old documentation from vector store...")
self.vector_store.delete_collection()
self.vector_store = Chroma(
persist_directory=str(self.vector_store_path),
embedding_function=self.embeddings,
)
logger.info("✅ Vector store cleared and recreated")
except Exception as e:
logger.warning(f"⚠️ Could not clear vector store (might be first run): {e}")
documents = []
files_processed = 0
# Read all markdown files
for md_file in docs_path.glob("**/*.md"):
with open(md_file, "r", encoding="utf-8") as f:
content = f.read()
try:
with open(md_file, "r", encoding="utf-8") as f:
content = f.read()
# Skip empty files
if not content.strip():
logger.warning(f"⚠️ Skipping empty file: {md_file}")
continue
# Split into chunks
splitter = RecursiveCharacterTextSplitter(
@@ -91,6 +112,7 @@ class DocumentationAgent:
)
chunks = splitter.split_text(content)
files_processed += 1
for i, chunk in enumerate(chunks):
doc = Document(
@@ -104,12 +126,19 @@ class DocumentationAgent:
)
documents.append(doc)
logger.info(f" ✓ Indexed {md_file.name} ({len(chunks)} chunks)")
except Exception as e:
logger.error(f"❌ Failed to index {md_file}: {e}")
# Add to vector store
if self.vector_store is not None:
if self.vector_store is not None and documents:
logger.info(f"💾 Adding {len(documents)} chunks to vector store...")
self.vector_store.add_documents(documents)
self.vector_store.persist()
logger.info(f"Indexed {len(documents)} chunks from documentation")
logger.info(f"✅ Indexed {files_processed} files ({len(documents)} chunks) from documentation")
else:
logger.warning("⚠️ No documents to index")
async def search_documentation(
self, query: str, sections: Optional[List[str]] = None, limit: int = 5

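Because the worker rebuilds the collection and the chat service mounts the same ChromaDB volume, the chat side only has to reopen the persisted store to see the fresh chunks. A minimal sketch of that read path, assuming the langchain Chroma wrapper and an embeddings object compatible with the one used at indexing time (the embedding class shown here is illustrative, and the import paths depend on the installed langchain version):

```python
# Sketch: reopen the shared, persisted Chroma store and run a similarity search.
# The persist_directory must point at the shared volume, and the embeddings must
# match whatever DocumentationAgent used when indexing; both are assumptions here.
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings  # illustrative choice

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

vector_store = Chroma(
    persist_directory="./data/chroma_db",  # shared volume path used by the worker
    embedding_function=embeddings,
)

# Retrieve the most relevant documentation chunks for a user question
docs = vector_store.similarity_search("Which VMs run on the Proxmox cluster?", k=5)
for doc in docs:
    print(doc.metadata.get("source"), doc.page_content[:120])
```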
View File

@@ -240,12 +240,15 @@ Guidelines:
return prompt
def _format_data_for_prompt(self, data: Any) -> str:
def _format_data_for_prompt(self, data: Any, max_items: int = 10) -> str:
"""
Format data for inclusion in LLM prompt
For large datasets, limits the number of items to prevent overwhelming small LLMs.
Args:
data: Data to format (dict, list, str, etc.)
max_items: Maximum number of items to include for lists (default: 10)
Returns:
Formatted string representation
@@ -253,7 +256,24 @@ Guidelines:
if data is None:
return "No data available"
if isinstance(data, (dict, list)):
if isinstance(data, list):
# Limit list size for small LLMs
total_count = len(data)
if total_count > max_items:
limited_data = data[:max_items]
summary = f"\n\n**Note: Showing {max_items} of {total_count} items.**\n"
try:
formatted = json.dumps(limited_data, indent=2, default=str)
return formatted + summary
except Exception:
return str(limited_data) + summary
# Small list, show all
try:
return json.dumps(data, indent=2, default=str)
except Exception:
return str(data)
if isinstance(data, dict):
# Pretty print JSON for structured data
try:
return json.dumps(data, indent=2, default=str)
@@ -305,12 +325,15 @@ Guidelines:
This is useful for very large documentation where you want each
section as a separate file.
Supports looped sections where one section definition generates
multiple output files (e.g., one file per VM).
Args:
data: Collected infrastructure data
save_individually: Save each section as separate file
Returns:
List of results for each section
List of results for each section (may be multiple for looped sections)
"""
results = []
output_config = self.template.output_config
@@ -322,57 +345,251 @@ Guidelines:
section_id = section_def.get("id")
section_title = section_def.get("title")
# Generate section
content = await self.generate_section(section_def, data)
# Check if this section should loop over items
loop_over = section_def.get("loop_over")
if not content:
results.append(
{
"section_id": section_id,
"success": False,
"error": "Generation failed",
}
if loop_over:
# Generate one section per item in the loop
loop_results = await self._generate_looped_sections(
section_def, data, output_dir, save_to_db, save_to_file, save_individually
)
continue
results.extend(loop_results)
else:
# Generate single section (original behavior)
# Generate section
content = await self.generate_section(section_def, data)
result = {
"section_id": section_id,
"title": section_title,
"success": True,
"content": content,
}
if not content:
results.append(
{
"section_id": section_id,
"success": False,
"error": "Generation failed",
}
)
continue
# Save section if requested
if save_individually:
if save_to_file:
# Save to file
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)
filename = f"{section_id}.md"
file_path = output_path / filename
file_path.write_text(content, encoding="utf-8")
result["file_path"] = str(file_path)
self.logger.info(f"Saved section to: {file_path}")
result = {
"section_id": section_id,
"title": section_title,
"success": True,
"content": content,
}
if save_to_db:
# Save to database
metadata = {
"section_id": section_id,
"template": str(self.template.path),
"category": section_def.get("category", ""),
}
# Create temporary generator for this section
temp_gen = BaseGenerator.__new__(BaseGenerator)
temp_gen.name = self.name
temp_gen.section = section_id
temp_gen.logger = self.logger
temp_gen.llm = self.llm
await temp_gen.save_to_database(content, metadata)
# Save section if requested
if save_individually:
if save_to_file:
# Save to file
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)
filename = f"{section_id}.md"
file_path = output_path / filename
file_path.write_text(content, encoding="utf-8")
result["file_path"] = str(file_path)
self.logger.info(f"Saved section to: {file_path}")
results.append(result)
if save_to_db:
# Save to database using our own instance
metadata = {
"section_id": section_id,
"template": str(self.template.path),
"category": section_def.get("category", ""),
}
# Use current instance with modified section name
original_section = self.section
self.section = section_id
try:
await self.save_to_database(content, metadata)
finally:
# Restore original section name
self.section = original_section
results.append(result)
return results
async def _generate_looped_sections(
self,
section_def: Dict[str, Any],
full_data: Dict[str, Any],
output_dir: str,
save_to_db: bool,
save_to_file: bool,
save_individually: bool,
) -> List[Dict[str, Any]]:
"""
Generate multiple sections by looping over items
Args:
section_def: Section definition with loop_over key
full_data: Complete collected data
output_dir: Output directory
save_to_db: Whether to save to database
save_to_file: Whether to save to file
save_individually: Whether to save each item individually
Returns:
List of results, one per looped item
"""
results = []
section_id = section_def.get("id")
loop_over = section_def.get("loop_over")
loop_item_name = section_def.get("loop_item_name", "item")
# Get the data to loop over
data_section = full_data.get("data", {})
items = data_section.get(loop_over, [])
if not isinstance(items, list):
self.logger.warning(f"loop_over '{loop_over}' is not a list, skipping")
return []
total_items = len(items)
self.logger.info(f"Looping over {total_items} {loop_over} to generate individual sections")
# Generate one section per item
for idx, item in enumerate(items, 1):
try:
# Create item-specific section definition
item_section_def = section_def.copy()
# Get item identifier for filename
item_id = item.get("vmid") or item.get("id") or item.get("name") or f"item_{idx}"
item_name = item.get("name", f"{loop_over}_{item_id}")
# Build item-specific data context
item_data = self._build_item_context(section_def, full_data, item, loop_item_name)
# Build prompt with item context
prompt_template = item_section_def.get("prompt_template", "")
prompt = self._build_prompt(prompt_template, item_data)
# Get generation config
gen_config = self.template.generation_config
temperature = gen_config.get("temperature", 0.7)
max_tokens = gen_config.get("max_tokens", 4000)
# System prompt
system_prompt = """You are a technical documentation expert specializing in datacenter infrastructure.
Generate clear, accurate, and well-structured documentation in Markdown format.
Guidelines:
- Use proper Markdown formatting (headers, tables, lists, code blocks)
- Be precise and factual based on provided data
- Include practical examples and recommendations
- Use tables for structured data
- Use bullet points for lists
- Use code blocks for commands/configurations
- Organize content with clear sections
- Write in a professional but accessible tone
"""
# Generate content
content = await self.generate_with_llm(
system_prompt=system_prompt,
user_prompt=prompt,
temperature=temperature,
max_tokens=max_tokens,
)
# Create title with item name
# Support both {item_name} and {loop_item_name}_{name} patterns
title_template = item_section_def.get("title", "")
title_context = {
loop_item_name: item_name,
f"{loop_item_name}_name": item_name,
f"{loop_item_name}_id": str(item_id),
}
section_title = title_template.format(**title_context)
section_content = f"# {section_title}\n\n{content}\n\n"
result = {
"section_id": f"{section_id}_{item_id}",
"title": section_title,
"success": True,
"content": section_content,
"item_index": idx,
"total_items": total_items,
}
# Save if requested
if save_individually:
if save_to_file:
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)
# Use category subdirectory if specified
category = section_def.get("category", "")
if category:
output_path = output_path / category
output_path.mkdir(parents=True, exist_ok=True)
filename = f"{section_id}_{item_id}.md"
file_path = output_path / filename
file_path.write_text(section_content, encoding="utf-8")
result["file_path"] = str(file_path)
self.logger.info(f"[{idx}/{total_items}] Saved: {file_path}")
if save_to_db:
metadata = {
"section_id": f"{section_id}_{item_id}",
"template": str(self.template.path),
"category": section_def.get("category", ""),
"item_id": str(item_id),
"item_name": item_name,
}
original_section = self.section
self.section = f"{section_id}_{item_id}"
try:
await self.save_to_database(section_content, metadata)
finally:
self.section = original_section
results.append(result)
self.logger.info(f"✓ [{idx}/{total_items}] Generated: {section_title}")
except Exception as e:
self.logger.error(f"Failed to generate section for {loop_over} item {idx}: {e}")
results.append({
"section_id": f"{section_id}_item_{idx}",
"success": False,
"error": str(e),
"item_index": idx,
"total_items": total_items,
})
return results
def _build_item_context(
self,
section_def: Dict[str, Any],
full_data: Dict[str, Any],
item: Dict[str, Any],
loop_item_name: str,
) -> Dict[str, Any]:
"""
Build data context for a looped item
Args:
section_def: Section definition
full_data: Complete collected data
item: Current loop item
loop_item_name: Variable name for the item in prompt
Returns:
Dictionary with item and any additional required data
"""
context = {loop_item_name: item}
# Add any additional required data from section definition
data_requirements = section_def.get("data_requirements", [])
data_section = full_data.get("data", {})
for req in data_requirements:
if req in data_section:
context[req] = data_section[req]
return context
async def example_usage() -> None:
"""Example of using template-based generator"""

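For reference, a section definition that takes the looped path above might look like the following, shown as the dict the generator sees after the YAML template is parsed. The key names mirror what _generate_looped_sections and _build_item_context read; the concrete values are hypothetical, since the updated proxmox.yaml is not part of this hunk.

```python
# Hypothetical looped section definition (values are illustrative only).
vm_section_def = {
    "id": "vm_detail",
    "title": "VM {vm_name} (ID {vm_id})",   # formatted with {loop_item_name}_name / _id
    "category": "virtual_machines",          # becomes a subdirectory under the output dir
    "loop_over": "vms",                      # key in data["data"] holding the list of items
    "loop_item_name": "vm",                  # variable name the item is exposed under
    "data_requirements": ["nodes"],          # extra keys copied into each item's context
    "prompt_template": "Document this virtual machine, including resources and network.",
}
```

With this definition, a cluster with 30 VMs would yield 30 results and, with save_individually enabled, 30 files such as output/virtual_machines/vm_detail_101.md.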
View File

@@ -81,7 +81,8 @@ class LLMClient:
self.max_tokens = max_tokens or settings.LLM_MAX_TOKENS
# Initialize AsyncOpenAI client with custom HTTP client (disable SSL verification for self-signed certs)
http_client = httpx.AsyncClient(verify=False, timeout=30.0)
# Increased timeout to 120s for documentation generation (large prompts)
http_client = httpx.AsyncClient(verify=False, timeout=120.0)
self.client = AsyncOpenAI(
base_url=self.base_url,
api_key=self.api_key,
@@ -129,6 +130,13 @@ class LLMClient:
# Type guard: we know it's ChatCompletion when stream=False
response = cast(ChatCompletion, response)
# Check for None response or empty choices
if response is None:
raise ValueError("LLM returned None response")
if not response.choices or len(response.choices) == 0:
raise ValueError("LLM returned empty choices")
# Extract text from first choice
message = response.choices[0].message
content = message.content or ""

View File

@@ -94,6 +94,7 @@ async def _async_collect_and_generate(
Generation result
"""
from datacenter_docs.generators.template_generator import TemplateBasedGenerator
from datacenter_docs.chat.agent import DocumentationAgent
# Import appropriate collector
collector = await _get_collector(collector_name)
@@ -119,6 +120,23 @@ async def _async_collect_and_generate(
sections_generated = sum(1 for r in sections_results if r.get("success"))
sections_failed = sum(1 for r in sections_results if not r.get("success"))
# Index documentation into ChromaDB for RAG
logger.info("📚 Indexing generated documentation into ChromaDB...")
try:
# Initialize agent with vector store
agent = DocumentationAgent(vector_store_path="./data/chroma_db")
# Index all documentation from output directory
output_dir = Path("output")
if output_dir.exists():
await agent.index_documentation(output_dir)
logger.info("✅ Documentation indexed successfully into ChromaDB")
else:
logger.warning("⚠️ Output directory not found, skipping indexing")
except Exception as e:
logger.error(f"❌ Failed to index documentation: {e}", exc_info=True)
# Don't fail the whole task if indexing fails
return {
"sections_generated": sections_generated,
"sections_failed": sections_failed,
@@ -155,8 +173,8 @@ async def _get_collector(collector_name: str) -> Any:
return collectors[collector_name]()
@celery_app.task(name="generate_proxmox_docs")
def generate_proxmox_docs() -> Dict[str, Any]:
@celery_app.task(name="generate_proxmox_docs", bind=True)
def generate_proxmox_docs(self) -> Dict[str, Any]:
"""
Scheduled task to generate Proxmox documentation
@@ -165,11 +183,66 @@ def generate_proxmox_docs() -> Dict[str, Any]:
Returns:
Task result
"""
logger.info("Scheduled Proxmox documentation generation started")
import asyncio
task_id = self.request.id
logger.info(f"[{task_id}] Proxmox documentation generation started")
# Update task state
self.update_state(
state='PROGRESS',
meta={'current': 0, 'total': 6, 'status': 'Starting Proxmox documentation generation...'}
)
template_path = "templates/documentation/proxmox.yaml"
return collect_and_generate_docs(collector_name="proxmox", template_path=template_path)
result = {
"task_id": task_id,
"collector": "proxmox",
"template": template_path,
"success": False,
"started_at": datetime.now().isoformat(),
"completed_at": None,
"error": None,
"sections_generated": 0,
"sections_failed": 0,
}
try:
# Run async collection and generation directly (don't call another Celery task)
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
generation_result = loop.run_until_complete(
_async_collect_and_generate("proxmox", template_path)
)
loop.close()
# Update result
result.update(generation_result)
result["success"] = True
result["completed_at"] = datetime.now().isoformat()
logger.info(
f"[{task_id}] Proxmox documentation generation completed: "
f"{result['sections_generated']} sections generated, "
f"{result['sections_failed']} failed"
)
return result
except Exception as e:
result["error"] = str(e)
result["completed_at"] = datetime.now().isoformat()
logger.error(f"[{task_id}] Proxmox documentation generation failed: {e}", exc_info=True)
self.update_state(
state='FAILURE',
meta={'error': str(e), 'status': f'Failed: {str(e)}'}
)
raise
@celery_app.task(name="generate_all_docs")
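The commit bullet about the Celery anti-pattern refers to blocking on a subtask's result from inside another task, which the Celery documentation warns against because it can exhaust the worker pool and deadlock. A generic before/after sketch of that pattern next to the direct-call style the rewritten task uses (all names and the broker/backend URLs are illustrative placeholders, not the literal old code):

```python
# Generic illustration of the anti-pattern vs. the direct-call approach.
import asyncio

from celery import Celery

app = Celery("example", broker="memory://", backend="cache+memory://")


@app.task
def collect_subtask(collector: str) -> dict:
    # stand-in for a collection/generation subtask
    return {"collector": collector, "sections_generated": 0}


async def _collect_and_generate(collector: str) -> dict:
    # stand-in for the real async collection/generation helper
    return {"collector": collector, "sections_generated": 0}


@app.task(bind=True)
def generate_docs_antipattern(self):
    # Anti-pattern: dispatch a subtask and block on its result inside a task.
    # The outer task holds a worker slot while waiting, which can deadlock
    # once the pool has no free workers left to run the subtask.
    async_result = collect_subtask.delay("proxmox")
    return async_result.get(timeout=3600)  # blocking .get() inside a task


@app.task(bind=True)
def generate_docs_direct(self):
    # Preferred here: run the shared async helper in this task's own process
    # and report progress via update_state, as generate_proxmox_docs now does.
    self.update_state(state="PROGRESS", meta={"status": "collecting"})
    return asyncio.run(_collect_and_generate("proxmox"))
```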