This commit introduces major performance improvements and migrates from custom job scheduling to Celery Beat for better reliability and scalability.

### 🚀 Performance Optimizations

**CVE2CAPEC Client Performance (fixed startup blocking)**
- Implement lazy loading with a 24-hour cache for CVE2CAPEC mappings
- Add background task for CVE2CAPEC sync (data_sync_tasks.sync_cve2capec)
- Remove blocking data fetch during client initialization
- API endpoint: POST /api/sync-cve2capec

**ExploitDB Client Performance (fixed webapp request blocking)**
- Implement a global file index cache to prevent rebuilding it on every request
- Add lazy loading with 24-hour cache expiry for the 46K+ exploit index
- Background task for index building (data_sync_tasks.build_exploitdb_index)
- API endpoint: POST /api/build-exploitdb-index

(Both clients share the same lazy-load cache pattern; a sketch follows this message.)

### 🔄 Celery Migration & Scheduling

**Celery Beat Integration**
- Migrate from the custom job scheduler to Celery Beat for reliability
- Remove the 'finetuned' LLM provider (logic moved to the ollama container)
- Optimize the daily workflow with proper timing and dependencies

**New Celery Task Structure**
- tasks/bulk_tasks.py - NVD bulk processing and SIGMA generation
- tasks/data_sync_tasks.py - all data synchronization tasks
- tasks/maintenance_tasks.py - system maintenance and cleanup
- tasks/sigma_tasks.py - SIGMA rule generation tasks

**Daily Schedule (optimized; see the beat_schedule sketch below)**
```
1:00 AM      → Weekly cleanup (Sundays)
1:30 AM      → Daily result cleanup
2:00 AM      → NVD incremental update
3:00 AM      → CISA KEV sync
3:15 AM      → Nomi-sec PoC sync
3:30 AM      → GitHub PoC sync
3:45 AM      → ExploitDB sync
4:00 AM      → CVE2CAPEC MITRE ATT&CK sync
4:15 AM      → ExploitDB index rebuild
5:00 AM      → Reference content sync
8:00 AM      → SIGMA rule generation
9:00 AM      → LLM-enhanced SIGMA generation
Every 15 min → Health checks
```

### 🐳 Docker & Infrastructure

**Enhanced Docker Setup**
- Ollama setup with integrated SIGMA model creation (setup_ollama_with_sigma.py)
- Initial database population check and trigger (initial_setup.py; sketched below)
- Proper service dependencies and health checks
- Remove the manual post-rebuild script requirement

**Service Architecture**
- Celery worker with a 4-queue system (default, bulk_processing, sigma_generation, data_sync)
- Flower monitoring dashboard (localhost:5555)
- Redis as message broker and result backend

### 🎯 API Improvements

**Background Task Endpoints**
- GitHub PoC sync now uses Celery (it previously blocked the backend)
- All sync operations return task IDs and monitoring URLs (see the endpoint sketch below)
- Consistent error handling and progress tracking

**New Endpoints**
- POST /api/sync-cve2capec - CVE2CAPEC mapping sync
- POST /api/build-exploitdb-index - ExploitDB index rebuild

### 📁 Cleanup

**Removed Files**
- fix_sigma_model.sh (replaced by setup_ollama_with_sigma.py)
- Various test_* and debug_* files that are no longer needed
- Old training scripts related to the removed 'finetuned' provider
- Utility scripts replaced by Docker services

### 🔧 Configuration

**Key Files Added/Modified**
- backend/celery_config.py - complete Celery configuration
- backend/initial_setup.py - first-boot database population
- backend/setup_ollama_with_sigma.py - integrated Ollama setup
- CLAUDE.md - project documentation and development guide

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
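Both performance fixes use the same shape: a module-level cache populated lazily and refreshed at most once every 24 hours. A minimal sketch of that pattern — the names `_cache`, `_cache_time`, and `fetch_cve2capec_mappings` are illustrative, not the actual client code:

```
import time
from typing import Optional

CACHE_TTL_SECONDS = 24 * 60 * 60  # 24-hour cache expiry

_cache: Optional[dict] = None
_cache_time: float = 0.0


def fetch_cve2capec_mappings() -> dict:
    """Placeholder for the real download/parse step."""
    return {}


def get_mappings() -> dict:
    """Return cached CVE2CAPEC mappings, refetching at most once per 24h."""
    global _cache, _cache_time
    if _cache is None or (time.time() - _cache_time) > CACHE_TTL_SECONDS:
        _cache = fetch_cve2capec_mappings()  # hypothetical fetch helper
        _cache_time = time.time()
    return _cache
```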
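For reference, an excerpt of what the `beat_schedule` in backend/celery_config.py can look like for a few of the entries above, together with routing for the four queues. Task names other than `sync_cve2capec` and `build_exploitdb_index` are assumptions:

```
from celery.schedules import crontab

beat_schedule = {
    "weekly-cleanup": {
        "task": "tasks.maintenance_tasks.weekly_cleanup",  # assumed name
        "schedule": crontab(hour=1, minute=0, day_of_week=0),  # Sundays, 1:00 AM
    },
    "nvd-incremental-update": {
        "task": "tasks.data_sync_tasks.nvd_incremental_update",  # assumed name
        "schedule": crontab(hour=2, minute=0),
    },
    "sync-cve2capec": {
        "task": "tasks.data_sync_tasks.sync_cve2capec",
        "schedule": crontab(hour=4, minute=0),
    },
    "build-exploitdb-index": {
        "task": "tasks.data_sync_tasks.build_exploitdb_index",
        "schedule": crontab(hour=4, minute=15),
    },
    "health-check": {
        "task": "tasks.maintenance_tasks.health_check",  # assumed name
        "schedule": crontab(minute="*/15"),
    },
}

# Routing for the 4-queue worker (default, bulk_processing, sigma_generation, data_sync)
task_routes = {
    "tasks.bulk_tasks.*": {"queue": "bulk_processing"},
    "tasks.sigma_tasks.*": {"queue": "sigma_generation"},
    "tasks.data_sync_tasks.*": {"queue": "data_sync"},
}
```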
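The first-boot check in backend/initial_setup.py could look roughly like the sketch below. The `DATABASE_URL` variable, the `cves` table, and the seed task name are assumptions; only the file's role (detect an empty database and trigger initial population) comes from this message:

```
import os

from sqlalchemy import create_engine, text


def database_needs_seeding() -> bool:
    """Return True when the CVE table is empty (i.e., first boot)."""
    engine = create_engine(os.environ["DATABASE_URL"])  # assumed env var
    with engine.connect() as conn:
        count = conn.execute(text("SELECT COUNT(*) FROM cves")).scalar()  # assumed table
    return count == 0


if database_needs_seeding():
    from tasks.bulk_tasks import nvd_bulk_processing  # assumed task name
    nvd_bulk_processing.delay()  # populate in the background via Celery
```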
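The task-ID contract described under API Improvements can be satisfied by a thin endpoint that enqueues the Celery task and reports where to watch it. A hedged sketch (FastAPI is an assumption; only the endpoint path and task module come from this message):

```
from fastapi import FastAPI

from tasks.data_sync_tasks import sync_cve2capec  # task named in this commit

app = FastAPI()


@app.post("/api/sync-cve2capec")
def trigger_cve2capec_sync():
    """Enqueue the sync instead of blocking the request."""
    result = sync_cve2capec.delay()
    return {
        "task_id": result.id,
        "monitor_url": f"http://localhost:5555/task/{result.id}",  # Flower dashboard
    }
```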
**backend/setup_ollama_with_sigma.py**
#!/usr/bin/env python3
"""
Enhanced Ollama setup script that includes SIGMA model creation.

This integrates the functionality from fix_sigma_model.sh into the Docker container.
"""
import json
import os
import sys
import time
from typing import List

import requests

OLLAMA_BASE_URL = os.getenv('OLLAMA_BASE_URL', 'http://ollama:11434')
DEFAULT_MODEL = os.getenv('LLM_MODEL', 'llama3.2')
SIGMA_MODEL_NAME = 'sigma-llama'


def log(message: str, level: str = "INFO"):
    """Log a message with a timestamp."""
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{timestamp}] {level}: {message}")

def wait_for_ollama(max_retries: int = 30, delay: int = 5) -> bool:
    """Wait for the Ollama service to become ready."""
    log("Waiting for Ollama service to be ready...")

    for attempt in range(max_retries):
        try:
            response = requests.get(f"{OLLAMA_BASE_URL}/api/tags", timeout=10)
            if response.status_code == 200:
                log("✅ Ollama service is ready!")
                return True
        except requests.exceptions.RequestException as e:
            log(f"Attempt {attempt + 1}/{max_retries}: Ollama not ready yet ({e})", "DEBUG")

        if attempt < max_retries - 1:
            time.sleep(delay)

    log("❌ Ollama service failed to become ready", "ERROR")
    return False

def get_available_models() -> List[str]:
    """Get the list of available models."""
    try:
        response = requests.get(f"{OLLAMA_BASE_URL}/api/tags", timeout=10)
        if response.status_code == 200:
            data = response.json()
            models = [model.get('name', '') for model in data.get('models', [])]
            log(f"Available models: {models}")
            return models
        else:
            log(f"Failed to get models: HTTP {response.status_code}", "ERROR")
            return []
    except Exception as e:
        log(f"Error getting models: {e}", "ERROR")
        return []

def pull_model(model_name: str) -> bool:
    """Pull a model if it is not already available."""
    log(f"Pulling model: {model_name}")

    try:
        payload = {"name": model_name}
        response = requests.post(
            f"{OLLAMA_BASE_URL}/api/pull",
            json=payload,
            timeout=600,  # 10-minute timeout
            stream=True
        )

        if response.status_code == 200:
            # Stream and log progress
            for line in response.iter_lines():
                if line:
                    try:
                        data = json.loads(line.decode('utf-8'))
                        status = data.get('status', '')
                        if status:
                            log(f"Pull progress: {status}", "DEBUG")
                        if data.get('error'):
                            log(f"Pull error: {data.get('error')}", "ERROR")
                            return False
                    except json.JSONDecodeError:
                        continue

            log(f"✅ Successfully pulled model: {model_name}")
            return True
        else:
            log(f"❌ Failed to pull model {model_name}: HTTP {response.status_code}", "ERROR")
            return False

    except Exception as e:
        log(f"❌ Error pulling model {model_name}: {e}", "ERROR")
        return False

def create_sigma_model() -> bool:
    """Create the sigma-llama model with a specialized SIGMA generation configuration."""
    log("🔄 Creating sigma-llama model...")

    # First, remove any existing sigma-llama model
    try:
        response = requests.delete(f"{OLLAMA_BASE_URL}/api/delete",
                                   json={"name": SIGMA_MODEL_NAME},
                                   timeout=30)
        if response.status_code == 200:
            log("Removed existing sigma-llama model")
    except Exception:
        pass  # Model might not exist; that's fine

    # Create Modelfile content without the FROM line (the base model is
    # supplied via the 'from' parameter in the create request below)
    modelfile_content = """TEMPLATE \"\"\"### Instruction:
Generate SIGMA rule logsource and detection sections based on the provided context.

### Input:
{{ .Prompt }}

### Response:
\"\"\"

PARAMETER temperature 0.1
PARAMETER top_p 0.9
PARAMETER stop "### Instruction:"
PARAMETER stop "### Response:"
PARAMETER num_ctx 4096

SYSTEM \"\"\"You are a cybersecurity expert specializing in SIGMA rule creation. Generate valid SIGMA rules in YAML format based on the provided CVE and exploit information. Output ONLY valid YAML starting with 'title:' and ending with the last YAML line.\"\"\"
"""

    try:
        # Create the model using the API with the 'from' parameter
        payload = {
            "name": SIGMA_MODEL_NAME,
            "from": f"{DEFAULT_MODEL}:latest",
            "modelfile": modelfile_content,
            "stream": False
        }

        response = requests.post(
            f"{OLLAMA_BASE_URL}/api/create",
            json=payload,
            timeout=300,  # 5-minute timeout
            stream=True
        )

        if response.status_code == 200:
            # Stream and log progress
            for line in response.iter_lines():
                if line:
                    try:
                        data = json.loads(line.decode('utf-8'))
                        status = data.get('status', '')
                        if status:
                            log(f"Model creation: {status}", "DEBUG")
                        if data.get('error'):
                            log(f"Model creation error: {data.get('error')}", "ERROR")
                            return False
                    except json.JSONDecodeError:
                        continue

            # Verify the model was created
            models = get_available_models()
            if any(SIGMA_MODEL_NAME in model for model in models):
                log("✅ sigma-llama model created successfully!")
                return True
            else:
                log("❌ sigma-llama model not found after creation", "ERROR")
                return False
        else:
            log(f"❌ Failed to create sigma-llama model: HTTP {response.status_code}", "ERROR")
            try:
                error_data = response.json()
                log(f"Error details: {error_data}", "ERROR")
            except ValueError:
                log(f"Error response: {response.text}", "ERROR")
            return False

    except Exception as e:
        log(f"❌ Error creating sigma-llama model: {e}", "ERROR")
        return False

def test_sigma_model() -> bool:
    """Test the sigma-llama model with a simple prompt."""
    log("🔄 Testing sigma-llama model...")

    try:
        test_payload = {
            "model": SIGMA_MODEL_NAME,
            "prompt": "Title: Test PowerShell Rule",
            "stream": False
        }

        response = requests.post(
            f"{OLLAMA_BASE_URL}/api/generate",
            json=test_payload,
            timeout=60
        )

        if response.status_code == 200:
            data = response.json()
            test_response = data.get('response', '')[:100]  # First 100 chars
            log(f"✅ Model test successful! Response: {test_response}...")
            return True
        else:
            log(f"❌ Model test failed: HTTP {response.status_code}", "ERROR")
            return False

    except Exception as e:
        log(f"❌ Error testing model: {e}", "ERROR")
        return False

def main():
    """Main setup function."""
    log("🚀 Starting enhanced Ollama setup with SIGMA model creation...")

    # Step 1: Wait for Ollama to be ready
    if not wait_for_ollama():
        log("❌ Setup failed: Ollama service not available", "ERROR")
        sys.exit(1)

    # Step 2: Check current models
    models = get_available_models()
    log(f"Current models: {models}")

    # Step 3: Pull the default model if needed
    if not any(DEFAULT_MODEL in model for model in models):
        log(f"Default model {DEFAULT_MODEL} not found, pulling...")
        if not pull_model(DEFAULT_MODEL):
            log(f"❌ Setup failed: Could not pull {DEFAULT_MODEL}", "ERROR")
            sys.exit(1)
    else:
        log(f"✅ Default model {DEFAULT_MODEL} already available")

    # Step 4: Create the SIGMA model
    if not create_sigma_model():
        log("❌ Setup failed: Could not create sigma-llama model", "ERROR")
        sys.exit(1)

    # Step 5: Test the SIGMA model
    if not test_sigma_model():
        log("⚠️ Setup warning: sigma-llama model test failed", "WARN")
        # Don't exit here; the model might still work

    # Step 6: Final verification
    final_models = get_available_models()
    log(f"Final models available: {final_models}")

    if any(SIGMA_MODEL_NAME in model for model in final_models):
        log("🎉 Setup complete! sigma-llama model is ready for use.")
        sys.exit(0)
    else:
        log("❌ Setup failed: sigma-llama model not available after setup", "ERROR")
        sys.exit(1)


if __name__ == "__main__":
    main()