This commit introduces major performance improvements and migrates from custom job scheduling to Celery Beat for better reliability and scalability.

### 🚀 Performance Optimizations

**CVE2CAPEC Client Performance (Fixed startup blocking)**
- Implement lazy loading with a 24-hour cache for CVE2CAPEC mappings (pattern sketched below)
- Add background task for CVE2CAPEC sync (data_sync_tasks.sync_cve2capec)
- Remove blocking data fetch during client initialization
- API endpoint: POST /api/sync-cve2capec

**ExploitDB Client Performance (Fixed webapp request blocking)**
- Implement global file index cache to prevent rebuilding on every request
- Add lazy loading with 24-hour cache expiry for the 46K+ exploit index
- Background task for index building (data_sync_tasks.build_exploitdb_index)
- API endpoint: POST /api/build-exploitdb-index

### 🔄 Celery Migration & Scheduling

**Celery Beat Integration**
- Migrate from the custom job scheduler to Celery Beat for reliability (beat schedule sketched below)
- Remove 'finetuned' LLM provider (logic moved to the ollama container)
- Optimize the daily workflow with proper timing and dependencies

**New Celery Tasks Structure**
- tasks/bulk_tasks.py - NVD bulk processing and SIGMA generation
- tasks/data_sync_tasks.py - All data synchronization tasks
- tasks/maintenance_tasks.py - System maintenance and cleanup
- tasks/sigma_tasks.py - SIGMA rule generation tasks

**Daily Schedule (Optimized)**
```
1:00 AM     → Weekly cleanup (Sundays)
1:30 AM     → Daily result cleanup
2:00 AM     → NVD incremental update
3:00 AM     → CISA KEV sync
3:15 AM     → Nomi-sec PoC sync
3:30 AM     → GitHub PoC sync
3:45 AM     → ExploitDB sync
4:00 AM     → CVE2CAPEC MITRE ATT&CK sync
4:15 AM     → ExploitDB index rebuild
5:00 AM     → Reference content sync
8:00 AM     → SIGMA rule generation
9:00 AM     → LLM-enhanced SIGMA generation
Every 15min → Health checks
```

### 🐳 Docker & Infrastructure

**Enhanced Docker Setup**
- Ollama setup with integrated SIGMA model creation (setup_ollama_with_sigma.py)
- Initial database population check and trigger (initial_setup.py)
- Proper service dependencies and health checks
- Remove manual post-rebuild script requirements

**Service Architecture**
- Celery worker with a 4-queue system (default, bulk_processing, sigma_generation, data_sync); see the routing sketch below
- Flower monitoring dashboard (localhost:5555)
- Redis as message broker and result backend

### 🎯 API Improvements

**Background Task Endpoints**
- GitHub PoC sync now uses Celery (previously blocked the backend)
- All sync operations return task IDs and monitoring URLs (endpoint shape sketched below)
- Consistent error handling and progress tracking

**New Endpoints**
- POST /api/sync-cve2capec - CVE2CAPEC mapping sync
- POST /api/build-exploitdb-index - ExploitDB index rebuild

### 📁 Cleanup

**Removed Files**
- fix_sigma_model.sh (replaced by setup_ollama_with_sigma.py)
- Various test_* and debug_* files no longer needed
- Old training scripts related to the removed 'finetuned' provider
- Utility scripts replaced by Docker services

### 🔧 Configuration

**Key Files Added/Modified**
- backend/celery_config.py - Complete Celery configuration
- backend/initial_setup.py - First-boot database population
- backend/setup_ollama_with_sigma.py - Integrated Ollama setup
- CLAUDE.md - Project documentation and development guide

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
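### 📝 Illustrative Sketches

The lazy-loading change above follows a standard TTL-cache shape: construct cheaply, fetch on first use, refresh only when the cache is older than 24 hours. A minimal sketch, assuming hypothetical names (`Cve2CapecClient`, `_fetch_mappings`, `_CACHE_TTL_SECONDS`); the real client is not reproduced here:

```python
import time
from typing import Optional

_CACHE_TTL_SECONDS = 24 * 60 * 60  # 24-hour expiry, per this commit


class Cve2CapecClient:
    """Hypothetical client showing the lazy-load + TTL cache shape."""

    def __init__(self) -> None:
        # No blocking fetch here: construction is cheap, data loads on first use.
        self._mappings: Optional[dict] = None
        self._loaded_at: float = 0.0

    def _cache_stale(self) -> bool:
        return (
            self._mappings is None
            or time.time() - self._loaded_at > _CACHE_TTL_SECONDS
        )

    def get_capec_ids(self, cve_id: str) -> list:
        # Refresh at most once per 24 hours; callers never trigger
        # a rebuild on every request.
        if self._cache_stale():
            self._mappings = self._fetch_mappings()
            self._loaded_at = time.time()
        return self._mappings.get(cve_id, [])

    def _fetch_mappings(self) -> dict:
        # Stand-in for the real download/parse step (handled by the
        # data_sync_tasks.sync_cve2capec background task in this commit).
        return {}
```

The ExploitDB index cache follows the same pattern, with the index held at module level so it is shared globally and never rebuilt per request.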
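For context, here is a hedged sketch of how the beat schedule and queue routing in backend/celery_config.py might be wired. Only the times, queue names, and the two task names called out above (sync_cve2capec, build_exploitdb_index) come from this commit; everything else, including module paths like tasks.maintenance_tasks.health_check, is an assumption:

```python
# Minimal sketch, not the actual backend/celery_config.py.
from celery import Celery
from celery.schedules import crontab

celery_app = Celery(
    "cve_sigma",                     # hypothetical app name
    broker="redis://redis:6379/0",   # Redis as the message broker
    backend="redis://redis:6379/0",  # ...and as the result backend
)

# A few representative beat entries matching the daily schedule above.
celery_app.conf.beat_schedule = {
    "weekly-cleanup": {
        "task": "tasks.maintenance_tasks.weekly_cleanup",      # assumed name
        "schedule": crontab(hour=1, minute=0, day_of_week=0),  # 1:00 AM Sundays
    },
    "cve2capec-sync": {
        "task": "tasks.data_sync_tasks.sync_cve2capec",        # named in this commit
        "schedule": crontab(hour=4, minute=0),                 # 4:00 AM daily
    },
    "exploitdb-index-rebuild": {
        "task": "tasks.data_sync_tasks.build_exploitdb_index", # named in this commit
        "schedule": crontab(hour=4, minute=15),                # 4:15 AM daily
    },
    "health-check": {
        "task": "tasks.maintenance_tasks.health_check",        # assumed name
        "schedule": crontab(minute="*/15"),                    # every 15 minutes
    },
}

# Route each task module onto one of the four worker queues.
celery_app.conf.task_routes = {
    "tasks.bulk_tasks.*": {"queue": "bulk_processing"},
    "tasks.sigma_tasks.*": {"queue": "sigma_generation"},
    "tasks.data_sync_tasks.*": {"queue": "data_sync"},
    # anything unrouted falls back to the "default" queue
}
```

A worker consuming all four queues would then be started with something like `celery -A celery_config worker -Q default,bulk_processing,sigma_generation,data_sync`.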
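And the endpoint shape for the background-task APIs, assuming a FastAPI backend (the framework is not stated in this commit message, so treat the wiring as illustrative). The point is that the handler only enqueues work and returns a task ID plus a Flower monitoring URL, instead of doing the sync inside the request:

```python
# Hedged sketch of the background-task endpoint shape described above.
from fastapi import FastAPI

from tasks.data_sync_tasks import sync_cve2capec  # task named in this commit

app = FastAPI()


@app.post("/api/sync-cve2capec")
def trigger_cve2capec_sync():
    # Enqueue instead of doing the work in the request handler,
    # so the webapp request no longer blocks.
    task = sync_cve2capec.delay()
    return {
        "task_id": task.id,
        "status": "queued",
        # Flower dashboard link, matching the monitoring URL used elsewhere.
        "monitor_url": f"http://localhost:5555/task/{task.id}",
    }
```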
backend/initial_setup.py
#!/usr/bin/env python3
"""
Initial setup script that runs once on first boot to populate the database.
This script checks if initial data seeding is needed and triggers it via Celery.
"""

import os
import sys
import time
import logging

from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker
from sqlalchemy.exc import OperationalError

# Add the current directory to path so we can import our modules
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Database configuration
DATABASE_URL = os.getenv('DATABASE_URL', 'postgresql://cve_user:cve_password@db:5432/cve_sigma_db')


def wait_for_database(max_retries: int = 30, delay: int = 5) -> bool:
    """Wait for the database to be ready."""
    logger.info("Waiting for database to be ready...")

    for attempt in range(max_retries):
        try:
            engine = create_engine(DATABASE_URL)
            with engine.connect() as conn:
                conn.execute(text("SELECT 1"))
            logger.info("✅ Database is ready!")
            return True
        except OperationalError as e:
            logger.info(f"Attempt {attempt + 1}/{max_retries}: Database not ready yet ({e})")
        except Exception as e:
            logger.error(f"Unexpected error connecting to database: {e}")

        if attempt < max_retries - 1:
            time.sleep(delay)

    logger.error("❌ Database failed to become ready")
    return False


def check_initial_setup_needed() -> bool:
    """Check if initial setup is needed by examining the database state."""
    try:
        engine = create_engine(DATABASE_URL)
        SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

        with SessionLocal() as session:
            # Check if we have any CVEs in the database
            result = session.execute(text("SELECT COUNT(*) FROM cves")).fetchone()
            cve_count = result[0] if result else 0

            logger.info(f"Current CVE count in database: {cve_count}")

            # Check if we have any bulk processing jobs that completed successfully
            bulk_jobs_result = session.execute(text("""
                SELECT COUNT(*) FROM bulk_processing_jobs
                WHERE job_type = 'nvd_bulk_seed'
                AND status = 'completed'
                AND created_at > NOW() - INTERVAL '30 days'
            """)).fetchone()

            recent_bulk_jobs = bulk_jobs_result[0] if bulk_jobs_result else 0

            logger.info(f"Recent successful bulk seed jobs: {recent_bulk_jobs}")

            # Initial setup is needed if:
            # 1. Very few CVEs (fewer than 1000) AND
            # 2. No recent successful bulk seed jobs
            initial_setup_needed = cve_count < 1000 and recent_bulk_jobs == 0

            if initial_setup_needed:
                logger.info("🔄 Initial setup is needed - will trigger full NVD sync")
            else:
                logger.info("✅ Initial setup already completed - database has sufficient data")

            return initial_setup_needed

    except Exception as e:
        logger.error(f"Error checking initial setup status: {e}")
        # If we can't check, assume setup is needed
        return True


def trigger_initial_bulk_seed():
    """Trigger the initial bulk seed via Celery."""
    try:
        # Import here to avoid circular dependencies
        from celery_config import celery_app
        from tasks.bulk_tasks import full_bulk_seed_task

        logger.info("🚀 Triggering initial full NVD bulk seed...")

        # Start a comprehensive bulk seed job.
        # Start from 2020 for faster initial setup; can be adjusted.
        task_result = full_bulk_seed_task.delay(
            start_year=2020,      # Start from 2020 for faster initial setup
            end_year=None,        # Current year
            skip_nvd=False,
            skip_nomi_sec=True,   # Skip nomi-sec initially, will be done daily
            skip_exploitdb=True,  # Skip exploitdb initially, will be done daily
            skip_cisa_kev=True    # Skip CISA KEV initially, will be done daily
        )

        logger.info(f"✅ Initial bulk seed task started with ID: {task_result.id}")
        logger.info(f"Monitor progress at: http://localhost:5555/task/{task_result.id}")

        return task_result.id

    except Exception as e:
        logger.error(f"❌ Failed to trigger initial bulk seed: {e}")
        return None


def create_initial_setup_marker():
    """Create a marker to indicate initial setup was attempted."""
    try:
        engine = create_engine(DATABASE_URL)
        SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

        with SessionLocal() as session:
            # Insert a marker record
            session.execute(text("""
                INSERT INTO bulk_processing_jobs (job_type, status, job_metadata, created_at, started_at)
                VALUES ('initial_setup_marker', 'completed', '{"purpose": "initial_setup_marker"}', NOW(), NOW())
                ON CONFLICT DO NOTHING
            """))
            session.commit()

        logger.info("✅ Created initial setup marker")

    except Exception as e:
        logger.error(f"Error creating initial setup marker: {e}")


def main():
    """Main initial setup function."""
    logger.info("🚀 Starting initial setup check...")

    # Step 1: Wait for the database
    if not wait_for_database():
        logger.error("❌ Initial setup failed: Database not available")
        sys.exit(1)

    # Step 2: Check if initial setup is needed
    if not check_initial_setup_needed():
        logger.info("🎉 Initial setup not needed - database already populated")
        sys.exit(0)

    # Step 3: Wait for Celery to be ready
    logger.info("Waiting for Celery workers to be ready...")
    time.sleep(10)  # Give Celery workers time to start

    # Step 4: Trigger the initial bulk seed
    task_id = trigger_initial_bulk_seed()

    if task_id:
        # Step 5: Create the marker
        create_initial_setup_marker()

        logger.info("🎉 Initial setup triggered successfully!")
        logger.info(f"Task ID: {task_id}")
        logger.info("The system will begin daily scheduled tasks once initial setup completes.")
        sys.exit(0)
    else:
        logger.error("❌ Initial setup failed")
        sys.exit(1)


if __name__ == "__main__":
    main()
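A usage note: since this script only enqueues the seed job and exits, progress can be checked later either in Flower at the logged URL or from a Python shell via the result backend, along these lines (the task ID is the one the script logs, left as a placeholder here):

```python
# Hedged example of polling the triggered seed task from a Python shell,
# using the same Celery app and result backend as the script above.
from celery_config import celery_app

result = celery_app.AsyncResult("<task-id-from-the-logs>")
print(result.state)  # e.g. PENDING, STARTED, SUCCESS, FAILURE
```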