This commit introduces major performance improvements and migrates from custom job scheduling to Celery Beat for better reliability and scalability.

### 🚀 Performance Optimizations

**CVE2CAPEC Client Performance (Fixed startup blocking)**
- Implement lazy loading with a 24-hour cache for CVE2CAPEC mappings
- Add background task for CVE2CAPEC sync (data_sync_tasks.sync_cve2capec)
- Remove blocking data fetch during client initialization
- API endpoint: POST /api/sync-cve2capec

**ExploitDB Client Performance (Fixed webapp request blocking)**
- Implement global file index cache to prevent rebuilding on every request
- Add lazy loading with 24-hour cache expiry for the 46K+ exploit index
- Background task for index building (data_sync_tasks.build_exploitdb_index)
- API endpoint: POST /api/build-exploitdb-index

### 🔄 Celery Migration & Scheduling

**Celery Beat Integration**
- Migrate from the custom job scheduler to Celery Beat for reliability
- Remove the 'finetuned' LLM provider (logic moved to the ollama container)
- Optimize the daily workflow with proper timing and dependencies

**New Celery Task Structure**
- tasks/bulk_tasks.py - NVD bulk processing and SIGMA generation
- tasks/data_sync_tasks.py - All data synchronization tasks
- tasks/maintenance_tasks.py - System maintenance and cleanup
- tasks/sigma_tasks.py - SIGMA rule generation tasks

**Daily Schedule (Optimized)**
```
1:00 AM      → Weekly cleanup (Sundays)
1:30 AM      → Daily result cleanup
2:00 AM      → NVD incremental update
3:00 AM      → CISA KEV sync
3:15 AM      → Nomi-sec PoC sync
3:30 AM      → GitHub PoC sync
3:45 AM      → ExploitDB sync
4:00 AM      → CVE2CAPEC MITRE ATT&CK sync
4:15 AM      → ExploitDB index rebuild
5:00 AM      → Reference content sync
8:00 AM      → SIGMA rule generation
9:00 AM      → LLM-enhanced SIGMA generation
Every 15 min → Health checks
```

### 🐳 Docker & Infrastructure

**Enhanced Docker Setup**
- Ollama setup with integrated SIGMA model creation (setup_ollama_with_sigma.py)
- Initial database population check and trigger (initial_setup.py)
- Proper service dependencies and health checks
- Remove manual post-rebuild script requirements

**Service Architecture**
- Celery worker with a 4-queue system (default, bulk_processing, sigma_generation, data_sync)
- Flower monitoring dashboard (localhost:5555)
- Redis as message broker and result backend

### 🎯 API Improvements

**Background Task Endpoints**
- GitHub PoC sync now uses Celery (it previously blocked the backend)
- All sync operations return task IDs and monitoring URLs
- Consistent error handling and progress tracking

**New Endpoints**
- POST /api/sync-cve2capec - CVE2CAPEC mapping sync
- POST /api/build-exploitdb-index - ExploitDB index rebuild

### 📁 Cleanup

**Removed Files**
- fix_sigma_model.sh (replaced by setup_ollama_with_sigma.py)
- Various test_* and debug_* files that are no longer needed
- Old training scripts related to the removed 'finetuned' provider
- Utility scripts replaced by Docker services

### 🔧 Configuration

**Key Files Added/Modified**
- backend/celery_config.py - Complete Celery configuration
- backend/initial_setup.py - First-boot database population
- backend/setup_ollama_with_sigma.py - Integrated Ollama setup
- CLAUDE.md - Project documentation and development guide

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
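The two client fixes above reduce to one pattern: keep a module-level cache and refresh it only once it is older than 24 hours, so neither client initialization nor a web request ever blocks on a rebuild. A minimal sketch of that pattern; the class and method names here are illustrative, not the actual client code:

```python
import time

CACHE_TTL_SECONDS = 24 * 60 * 60  # 24-hour cache expiry, per this commit


class Cve2CapecClient:
    """Illustrative lazy-loading client; the real implementation lives in the backend."""

    def __init__(self):
        # No blocking fetch at init time anymore -- data loads on first use.
        self._mappings = None
        self._loaded_at = 0.0

    def _ensure_loaded(self):
        stale = (time.time() - self._loaded_at) > CACHE_TTL_SECONDS
        if self._mappings is None or stale:
            self._mappings = self._fetch_mappings()
            self._loaded_at = time.time()

    def get_capec_ids(self, cve_id: str) -> list:
        self._ensure_loaded()
        return self._mappings.get(cve_id, [])

    def _fetch_mappings(self) -> dict:
        # Placeholder: in production the download/parse work is done by the
        # data_sync_tasks.sync_cve2capec Celery task, not in the request path.
        return {}
```

The daily schedule translates directly into a Celery Beat table. A hedged sketch using task names visible in this commit (the health-check task name and the 8:00 AM task mapping are assumptions):

```python
from celery.schedules import crontab

from celery_config import celery_app

celery_app.conf.beat_schedule = {
    'nvd-incremental-update': {
        'task': 'bulk_tasks.incremental_update_task',
        'schedule': crontab(hour=2, minute=0),    # 2:00 AM
        'options': {'queue': 'bulk_processing'},
    },
    'cve2capec-sync': {
        'task': 'data_sync_tasks.sync_cve2capec',
        'schedule': crontab(hour=4, minute=0),    # 4:00 AM
        'options': {'queue': 'data_sync'},
    },
    'exploitdb-index-rebuild': {
        'task': 'data_sync_tasks.build_exploitdb_index',
        'schedule': crontab(hour=4, minute=15),   # 4:15 AM
        'options': {'queue': 'data_sync'},
    },
    'sigma-rule-generation': {
        'task': 'bulk_tasks.generate_enhanced_sigma_rules',
        'schedule': crontab(hour=8, minute=0),    # 8:00 AM (mapping assumed)
        'options': {'queue': 'sigma_generation'},
    },
    'health-check': {
        'task': 'maintenance_tasks.health_check',  # assumed task name
        'schedule': crontab(minute='*/15'),        # every 15 minutes
    },
}
```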
### tasks/bulk_tasks.py

*235 lines · 6.9 KiB · Python*
"""
|
|
Bulk processing tasks for Celery
|
|
"""
|
|
import asyncio
|
|
import logging
|
|
from typing import Optional, Dict, Any
|
|
from celery import current_task
|
|
from celery_config import celery_app, get_db_session
|
|
import sys
|
|
import os
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
from bulk_seeder import BulkSeeder
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
```python
@celery_app.task(bind=True, name='bulk_tasks.full_bulk_seed')
def full_bulk_seed_task(self, start_year: int = 2002, end_year: Optional[int] = None,
                        skip_nvd: bool = False, skip_nomi_sec: bool = False,
                        skip_exploitdb: bool = False, skip_cisa_kev: bool = False) -> Dict[str, Any]:
    """
    Celery task for full bulk seeding operation

    Args:
        start_year: Starting year for NVD data
        end_year: Ending year for NVD data
        skip_nvd: Skip NVD bulk processing
        skip_nomi_sec: Skip nomi-sec PoC synchronization
        skip_exploitdb: Skip ExploitDB synchronization
        skip_cisa_kev: Skip CISA KEV synchronization

    Returns:
        Dictionary containing operation results
    """
    db_session = get_db_session()

    try:
        # Update task progress
        self.update_state(
            state='PROGRESS',
            meta={
                'stage': 'initializing',
                'progress': 0,
                'message': 'Starting bulk seeding operation'
            }
        )

        logger.info(f"Starting full bulk seed task: {start_year}-{end_year}")

        # Create seeder instance
        seeder = BulkSeeder(db_session)

        # Create progress callback
        def update_progress(stage: str, progress: int, message: str = None):
            self.update_state(
                state='PROGRESS',
                meta={
                    'stage': stage,
                    'progress': progress,
                    'message': message or f'Processing {stage}'
                }
            )

        # Run the bulk seeding operation
        # Note: We need to handle the async nature of bulk_seeder
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

        try:
            result = loop.run_until_complete(
                seeder.full_bulk_seed(
                    start_year=start_year,
                    end_year=end_year,
                    skip_nvd=skip_nvd,
                    skip_nomi_sec=skip_nomi_sec,
                    skip_exploitdb=skip_exploitdb,
                    skip_cisa_kev=skip_cisa_kev,
                    progress_callback=update_progress
                )
            )
        finally:
            loop.close()

        # Update final progress
        self.update_state(
            state='SUCCESS',
            meta={
                'stage': 'completed',
                'progress': 100,
                'message': 'Bulk seeding completed successfully'
            }
        )

        logger.info(f"Full bulk seed task completed: {result}")
        return result

    except Exception as e:
        logger.error(f"Full bulk seed task failed: {e}")
        self.update_state(
            state='FAILURE',
            meta={
                'stage': 'error',
                'progress': 0,
                'message': f'Task failed: {str(e)}',
                'error': str(e)
            }
        )
        raise
    finally:
        db_session.close()
```
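A caller enqueues this task and then polls the PROGRESS metadata it publishes. A short usage sketch with standard Celery calls; the queue name follows the 4-queue layout described in the commit message:

```python
from tasks.bulk_tasks import full_bulk_seed_task

# Enqueue onto the bulk_processing queue and keep the result handle.
async_result = full_bulk_seed_task.apply_async(
    kwargs={'start_year': 2020, 'skip_exploitdb': True},
    queue='bulk_processing',
)

print(async_result.id)        # the task ID that API clients receive
if async_result.state == 'PROGRESS':
    # Matches the meta dict set via self.update_state() above.
    print(async_result.info)  # {'stage': ..., 'progress': ..., 'message': ...}
```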
```python
@celery_app.task(bind=True, name='bulk_tasks.incremental_update_task')
def incremental_update_task(self) -> Dict[str, Any]:
    """
    Celery task for incremental updates

    Returns:
        Dictionary containing update results
    """
    db_session = get_db_session()

    try:
        # Update task progress
        self.update_state(
            state='PROGRESS',
            meta={
                'stage': 'incremental_update',
                'progress': 0,
                'message': 'Starting incremental update'
            }
        )

        logger.info("Starting incremental update task")

        # Create seeder instance
        seeder = BulkSeeder(db_session)

        # Run the incremental update
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

        try:
            result = loop.run_until_complete(seeder.incremental_update())
        finally:
            loop.close()

        # Update final progress
        self.update_state(
            state='SUCCESS',
            meta={
                'stage': 'completed',
                'progress': 100,
                'message': 'Incremental update completed successfully'
            }
        )

        logger.info(f"Incremental update task completed: {result}")
        return result

    except Exception as e:
        logger.error(f"Incremental update task failed: {e}")
        self.update_state(
            state='FAILURE',
            meta={
                'stage': 'error',
                'progress': 0,
                'message': f'Task failed: {str(e)}',
                'error': str(e)
            }
        )
        raise
    finally:
        db_session.close()
```
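This task drives its coroutine with the same new-loop/run/close boilerplate as full_bulk_seed_task, and the generation task below repeats it a third time. A small helper, not part of this module but a plausible cleanup, would factor the pattern out:

```python
def run_async(coro):
    """Run a coroutine to completion on a fresh event loop.

    Celery prefork workers are synchronous, so each task drives its own
    short-lived loop; isolating the boilerplate keeps task bodies focused.
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        return loop.run_until_complete(coro)
    finally:
        loop.close()

# Usage inside a task body:
#   result = run_async(seeder.incremental_update())
```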
```python
@celery_app.task(bind=True, name='bulk_tasks.generate_enhanced_sigma_rules')
def generate_enhanced_sigma_rules_task(self) -> Dict[str, Any]:
    """
    Celery task for generating enhanced SIGMA rules

    Returns:
        Dictionary containing generation results
    """
    db_session = get_db_session()

    try:
        # Update task progress
        self.update_state(
            state='PROGRESS',
            meta={
                'stage': 'generating_rules',
                'progress': 0,
                'message': 'Starting enhanced SIGMA rule generation'
            }
        )

        logger.info("Starting enhanced SIGMA rule generation task")

        # Create seeder instance
        seeder = BulkSeeder(db_session)

        # Run the rule generation
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

        try:
            result = loop.run_until_complete(seeder.generate_enhanced_sigma_rules())
        finally:
            loop.close()

        # Update final progress
        self.update_state(
            state='SUCCESS',
            meta={
                'stage': 'completed',
                'progress': 100,
                'message': 'Enhanced SIGMA rule generation completed successfully'
            }
        )

        logger.info(f"Enhanced SIGMA rule generation task completed: {result}")
        return result

    except Exception as e:
        logger.error(f"Enhanced SIGMA rule generation task failed: {e}")
        self.update_state(
            state='FAILURE',
            meta={
                'stage': 'error',
                'progress': 0,
                'message': f'Task failed: {str(e)}',
                'error': str(e)
            }
        )
        raise
    finally:
        db_session.close()
```
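These tasks back the new background-task endpoints, which return a task ID plus a monitoring URL. A hedged FastAPI-style sketch (the framework choice and the Flower URL format are assumptions based on the commit notes):

```python
from fastapi import FastAPI

from celery_config import celery_app

app = FastAPI()


@app.post("/api/sync-cve2capec")
def sync_cve2capec():
    # send_task enqueues by registered name, so the API process does not
    # need to import the task module itself.
    result = celery_app.send_task('data_sync_tasks.sync_cve2capec',
                                  queue='data_sync')
    return {
        'task_id': result.id,
        'monitor_url': f"http://localhost:5555/task/{result.id}",  # Flower dashboard
    }
```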