This commit introduces major performance improvements and migrates from custom job scheduling to Celery Beat for better reliability and scalability.

### 🚀 Performance Optimizations

**CVE2CAPEC Client Performance (fixed startup blocking)**
- Implement lazy loading with a 24-hour cache for CVE2CAPEC mappings
- Add background task for CVE2CAPEC sync (data_sync_tasks.sync_cve2capec)
- Remove blocking data fetch during client initialization
- API endpoint: POST /api/sync-cve2capec

**ExploitDB Client Performance (fixed webapp request blocking)**
- Implement a global file index cache to prevent rebuilding the index on every request
- Add lazy loading with 24-hour cache expiry for the 46K+ exploit index
- Background task for index building (data_sync_tasks.build_exploitdb_index)
- API endpoint: POST /api/build-exploitdb-index

### 🔄 Celery Migration & Scheduling

**Celery Beat Integration**
- Migrate from the custom job scheduler to Celery Beat for reliability
- Remove the 'finetuned' LLM provider (logic moved to the ollama container)
- Optimize the daily workflow with proper timing and dependencies

**New Celery Task Structure**
- tasks/bulk_tasks.py - NVD bulk processing and SIGMA generation
- tasks/data_sync_tasks.py - All data synchronization tasks
- tasks/maintenance_tasks.py - System maintenance and cleanup
- tasks/sigma_tasks.py - SIGMA rule generation tasks

**Daily Schedule (Optimized)**
```
1:00 AM      → Weekly cleanup (Sundays)
1:30 AM      → Daily result cleanup
2:00 AM      → NVD incremental update
3:00 AM      → CISA KEV sync
3:15 AM      → Nomi-sec PoC sync
3:30 AM      → GitHub PoC sync
3:45 AM      → ExploitDB sync
4:00 AM      → CVE2CAPEC MITRE ATT&CK sync
4:15 AM      → ExploitDB index rebuild
5:00 AM      → Reference content sync
8:00 AM      → SIGMA rule generation
9:00 AM      → LLM-enhanced SIGMA generation
Every 15 min → Health checks
```

### 🐳 Docker & Infrastructure

**Enhanced Docker Setup**
- Ollama setup with integrated SIGMA model creation (setup_ollama_with_sigma.py)
- Initial database population check and trigger (initial_setup.py)
- Proper service dependencies and health checks
- Remove manual post-rebuild script requirements

**Service Architecture**
- Celery worker with a 4-queue system (default, bulk_processing, sigma_generation, data_sync)
- Flower monitoring dashboard (localhost:5555)
- Redis as message broker and result backend

### 🎯 API Improvements

**Background Task Endpoints**
- GitHub PoC sync now uses Celery (previously blocked the backend)
- All sync operations return task IDs and monitoring URLs
- Consistent error handling and progress tracking

**New Endpoints**
- POST /api/sync-cve2capec - CVE2CAPEC mapping sync
- POST /api/build-exploitdb-index - ExploitDB index rebuild

### 📁 Cleanup

**Removed Files**
- fix_sigma_model.sh (replaced by setup_ollama_with_sigma.py)
- Various test_* and debug_* files no longer needed
- Old training scripts related to the removed 'finetuned' provider
- Utility scripts replaced by Docker services

### 🔧 Configuration

**Key Files Added/Modified**
- backend/celery_config.py - Complete Celery configuration
- backend/initial_setup.py - First-boot database population
- backend/setup_ollama_with_sigma.py - Integrated Ollama setup
- CLAUDE.md - Project documentation and development guide

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
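The lazy-loading behaviour described under Performance Optimizations amounts to caching an expensive dataset and rebuilding it only when the cache is older than 24 hours. Below is a minimal sketch of that pattern with hypothetical names; the actual CVE2CAPEC/ExploitDB clients live in their own modules and may differ:

```python
import time
from threading import Lock

CACHE_TTL_SECONDS = 24 * 60 * 60  # 24-hour cache expiry


class LazyIndexCache:
    """Illustrative lazy-loading cache: build on first use, refresh when stale."""

    def __init__(self, loader):
        self._loader = loader      # callable that fetches/builds the dataset
        self._data = None
        self._loaded_at = 0.0
        self._lock = Lock()

    def get(self, force_refresh=False):
        with self._lock:
            stale = (time.time() - self._loaded_at) > CACHE_TTL_SECONDS
            if force_refresh or self._data is None or stale:
                self._data = self._loader()   # expensive fetch/build happens here
                self._loaded_at = time.time()
            return self._data
```

The new sync endpoints dispatch a Celery task and return its ID plus a monitoring URL instead of doing the work in the request. A sketch of what `POST /api/sync-cve2capec` could look like, assuming a FastAPI backend and the Flower dashboard on localhost:5555 (the repository's actual route handler may differ):

```python
from fastapi import FastAPI

from celery_config import celery_app

app = FastAPI()


@app.post("/api/sync-cve2capec")
def sync_cve2capec(force_refresh: bool = False):
    # send_task dispatches by name, so the web process never imports the task module
    task = celery_app.send_task(
        "data_sync_tasks.sync_cve2capec",
        kwargs={"force_refresh": force_refresh},
        queue="data_sync",
    )
    return {
        "task_id": task.id,
        "monitor_url": f"http://localhost:5555/task/{task.id}",  # Flower (URL format assumed)
    }
```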
backend/celery_config.py (222 lines, 8 KiB, Python):
"""
|
|
Celery configuration for the Auto SIGMA Rule Generator
|
|
"""
|
|
import os
|
|
from celery import Celery
|
|
from celery.schedules import crontab
|
|
from kombu import Queue
|
|
|
|
# Celery configuration
|
|
broker_url = os.getenv('CELERY_BROKER_URL', 'redis://redis:6379/0')
|
|
result_backend = os.getenv('CELERY_RESULT_BACKEND', 'redis://redis:6379/0')
|
|
|
|
# Create Celery app
|
|
celery_app = Celery(
|
|
'sigma_generator',
|
|
broker=broker_url,
|
|
backend=result_backend,
|
|
include=[
|
|
'tasks.bulk_tasks',
|
|
'tasks.sigma_tasks',
|
|
'tasks.data_sync_tasks',
|
|
'tasks.maintenance_tasks'
|
|
]
|
|
)

# Celery configuration
celery_app.conf.update(
    # Serialization
    task_serializer='json',
    accept_content=['json'],
    result_serializer='json',

    # Timezone
    timezone='UTC',
    enable_utc=True,

    # Task tracking
    task_track_started=True,
    task_send_sent_event=True,

    # Result backend settings
    result_expires=3600,  # Results expire after 1 hour
    result_backend_transport_options={
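        # NOTE: 'master_name' applies to Redis Sentinel deployments; a plain
        # redis:// result backend does not use it.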
        'master_name': 'mymaster',
        'visibility_timeout': 3600,
    },

    # Worker settings
    worker_prefetch_multiplier=1,
    task_acks_late=True,
    worker_max_tasks_per_child=1000,

    # Task routes - different queues for different types of tasks
    task_routes={
        'tasks.bulk_tasks.*': {'queue': 'bulk_processing'},
        'tasks.sigma_tasks.*': {'queue': 'sigma_generation'},
        'tasks.data_sync_tasks.*': {'queue': 'data_sync'},
    },

    # Queue definitions
    task_default_queue='default',
    task_queues=(
        Queue('default', routing_key='default'),
        Queue('bulk_processing', routing_key='bulk_processing'),
        Queue('sigma_generation', routing_key='sigma_generation'),
        Queue('data_sync', routing_key='data_sync'),
    ),

    # Retry settings
    task_default_retry_delay=60,  # 1 minute
    task_max_retries=3,

    # Monitoring
    worker_send_task_events=True,

    # Optimized Beat schedule for daily workflow
    # WORKFLOW: NVD incremental -> Exploit syncs -> Reference sync -> SIGMA rules
    beat_schedule={
        # STEP 1: NVD Incremental Update - Daily at 2:00 AM
        # This runs first to get the latest CVE data from NVD
        'daily-nvd-incremental-update': {
            'task': 'bulk_tasks.incremental_update_task',
            'schedule': crontab(minute=0, hour=2),  # Daily at 2:00 AM
            'options': {'queue': 'bulk_processing'},
            'kwargs': {'batch_size': 100, 'skip_nvd': False, 'skip_nomi_sec': True}
        },

        # STEP 2: Exploit Data Syncing - Daily starting at 3:00 AM
        # These run in parallel but start at different times to avoid conflicts

        # CISA KEV Sync - Daily at 3:00 AM (one hour after the NVD update starts)
        'daily-cisa-kev-sync': {
            'task': 'data_sync_tasks.sync_cisa_kev',
            'schedule': crontab(minute=0, hour=3),  # Daily at 3:00 AM
            'options': {'queue': 'data_sync'},
            'kwargs': {'batch_size': 100}
        },

        # Nomi-sec PoC Sync - Daily at 3:15 AM
        'daily-nomi-sec-sync': {
            'task': 'data_sync_tasks.sync_nomi_sec',
            'schedule': crontab(minute=15, hour=3),  # Daily at 3:15 AM
            'options': {'queue': 'data_sync'},
            'kwargs': {'batch_size': 100}
        },

        # GitHub PoC Sync - Daily at 3:30 AM
        'daily-github-poc-sync': {
            'task': 'data_sync_tasks.sync_github_poc',
            'schedule': crontab(minute=30, hour=3),  # Daily at 3:30 AM
            'options': {'queue': 'data_sync'},
            'kwargs': {'batch_size': 50}
        },

        # ExploitDB Sync - Daily at 3:45 AM
        'daily-exploitdb-sync': {
            'task': 'data_sync_tasks.sync_exploitdb',
            'schedule': crontab(minute=45, hour=3),  # Daily at 3:45 AM
            'options': {'queue': 'data_sync'},
            'kwargs': {'batch_size': 30}
        },

        # CVE2CAPEC MITRE ATT&CK Mapping Sync - Daily at 4:00 AM
        'daily-cve2capec-sync': {
            'task': 'data_sync_tasks.sync_cve2capec',
            'schedule': crontab(minute=0, hour=4),  # Daily at 4:00 AM
            'options': {'queue': 'data_sync'},
            'kwargs': {'force_refresh': False}  # Only refresh if cache is stale
        },

        # ExploitDB Index Rebuild - Daily at 4:15 AM
        'daily-exploitdb-index-build': {
            'task': 'data_sync_tasks.build_exploitdb_index',
            'schedule': crontab(minute=15, hour=4),  # Daily at 4:15 AM
            'options': {'queue': 'data_sync'}
        },

        # STEP 3: Reference Content Sync - Daily at 5:00 AM
        # This is the longest-running task; it starts after the exploit syncs have had time to complete
        'daily-reference-content-sync': {
            'task': 'data_sync_tasks.sync_reference_content',
            'schedule': crontab(minute=0, hour=5),  # Daily at 5:00 AM
            'options': {'queue': 'data_sync'},
            'kwargs': {'batch_size': 30, 'max_cves': 200, 'force_resync': False}
        },

        # STEP 4: SIGMA Rule Generation - Daily at 8:00 AM
        # This runs LAST, after all other daily data sync jobs
        'daily-sigma-rule-generation': {
            'task': 'bulk_tasks.generate_enhanced_sigma_rules',
            'schedule': crontab(minute=0, hour=8),  # Daily at 8:00 AM
            'options': {'queue': 'sigma_generation'}
        },

        # LLM-Enhanced SIGMA Rule Generation - Daily at 9:00 AM
        # Additional LLM-based rule generation after the standard rules
        'daily-llm-sigma-generation': {
            'task': 'sigma_tasks.generate_enhanced_rules',
            'schedule': crontab(minute=0, hour=9),  # Daily at 9:00 AM
            'options': {'queue': 'sigma_generation'},
            'kwargs': {'cve_ids': None}  # Process all CVEs with PoCs
        },

        # MAINTENANCE TASKS

        # Database Cleanup - Weekly on Sunday at 1:00 AM (before the daily workflow)
        'weekly-database-cleanup': {
            'task': 'tasks.maintenance_tasks.database_cleanup_comprehensive',
            'schedule': crontab(minute=0, hour=1, day_of_week=0),  # Sunday at 1:00 AM
            'options': {'queue': 'default'},
            'kwargs': {'days_to_keep': 30, 'cleanup_failed_jobs': True, 'cleanup_logs': True}
        },

        # Health Check - Every 15 minutes
        'health-check-detailed': {
            'task': 'tasks.maintenance_tasks.health_check_detailed',
            'schedule': crontab(minute='*/15'),  # Every 15 minutes
            'options': {'queue': 'default'}
        },

        # Celery result cleanup - Daily at 1:30 AM
        'daily-cleanup-old-results': {
            'task': 'tasks.maintenance_tasks.cleanup_old_results',
            'schedule': crontab(minute=30, hour=1),  # Daily at 1:30 AM
            'options': {'queue': 'default'}
        },
    },
)

# Configure logging
celery_app.conf.update(
    worker_log_format='[%(asctime)s: %(levelname)s/%(processName)s] %(message)s',
    worker_task_log_format='[%(asctime)s: %(levelname)s/%(processName)s][%(task_name)s(%(task_id)s)] %(message)s',
)

# Database session configuration for tasks
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

# Database configuration
DATABASE_URL = os.getenv('DATABASE_URL', 'postgresql://cve_user:cve_password@db:5432/cve_sigma_db')

# Create engine and session factory
engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)


def get_db_session():
    """Get database session for tasks"""
    return SessionLocal()


# Import all task modules to register them
def register_tasks():
    """Register all task modules"""
    try:
        from tasks import bulk_tasks, sigma_tasks, data_sync_tasks, maintenance_tasks
        print("All task modules registered successfully")
    except ImportError as e:
        print(f"Warning: Could not import some task modules: {e}")


# Auto-register tasks when module is imported
if __name__ != "__main__":
    register_tasks()
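For reference, here is a sketch of how a module from the `include` list above might define one of the scheduled tasks. Only the task name (`data_sync_tasks.sync_cisa_kev`) and `get_db_session()` come from this configuration; the decorator options and function body are illustrative assumptions, not the repository's actual implementation:

```python
# tasks/data_sync_tasks.py (illustrative sketch only)
from celery_config import celery_app, get_db_session


@celery_app.task(name='data_sync_tasks.sync_cisa_kev', bind=True, max_retries=3)
def sync_cisa_kev(self, batch_size=100):
    """Sync the CISA KEV catalog; scheduled daily at 3:00 AM on the data_sync queue."""
    db = get_db_session()
    try:
        # ... fetch the KEV catalog and upsert records in batches of batch_size ...
        return {'status': 'completed', 'batch_size': batch_size}
    except Exception as exc:
        raise self.retry(exc=exc, countdown=60)  # retry after 1 minute
    finally:
        db.close()
```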