From de30d4ce99ba18191398bbf7ef157179e3e309da Mon Sep 17 00:00:00 2001
From: bpmcdevitt
Date: Mon, 21 Jul 2025 13:24:38 -0500
Subject: [PATCH] CLEANUP: Remove legacy web application components and streamline for CLI-first architecture
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit completes the transformation to a CLI-first SIGMA rule generator
by removing all legacy web application components:

REMOVED COMPONENTS:
- Frontend React application (frontend/ directory)
- Docker Compose web orchestration (docker-compose.yml, Dockerfiles)
- FastAPI web backend (main.py, celery_config.py, bulk_seeder.py)
- Web-specific task schedulers and executors
- Initialization scripts for web deployment (start.sh, init.sql, Makefile)

SIMPLIFIED ARCHITECTURE:
- Created backend/database_models.py for migration-only database access
- Updated CLI commands to use simplified database models
- Retained core processing modules (sigma generator, PoC clients, NVD processor)
- Fixed import paths in CLI migration and process commands

The application now operates as a streamlined CLI tool with file-based SIGMA
rule storage, eliminating web application complexity while maintaining all
core CVE processing capabilities.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude
---
 CLAUDE.md                           |   39 +-
 Makefile                            |   70 -
 backend/Dockerfile                  |   26 -
 backend/bulk_seeder.py              |  463 -------
 backend/celery_config.py            |  222 ---
 backend/convert_lora_to_gguf.py     |  151 +++
 backend/database_models.py          |   91 ++
 backend/delete_sigma_rules.py       |   58 -
 backend/initial_setup.py            |  171 ---
 backend/job_executors.py            |  390 ------
 backend/job_scheduler.py            |  449 -------
 backend/main.py                     | 1945 ---------------------------
 backend/mcdevitt_poc_client.py      |    2 +-
 backend/scheduler_config.yaml       |  182 ---
 backend/tasks/__init__.py           |    3 -
 backend/tasks/bulk_tasks.py         |  235 ----
 backend/tasks/data_sync_tasks.py    |  504 -------
 backend/tasks/maintenance_tasks.py  |  444 ------
 backend/tasks/sigma_tasks.py        |  409 ------
 backend/test_enhanced_generation.py |  211 ---
 backend/yaml_metadata_generator.py  |  155 +++
 cli/commands/migrate_commands.py    |   28 +-
 cli/commands/process_commands.py    |   12 +-
 docker-compose.yml                  |  203 ---
 frontend/Dockerfile                 |   24 -
 frontend/package.json               |   47 -
 frontend/postcss.config.js          |    6 -
 frontend/public/index.html          |   18 -
 frontend/src/App.css                |  126 --
 frontend/src/App.js                 | 1226 -----------------
 frontend/src/index.js               |   11 -
 frontend/tailwind.config.js         |   33 -
 init.sql                            |  191 ---
 start.sh                            |   63 -
 34 files changed, 434 insertions(+), 7774 deletions(-)
 delete mode 100644 Makefile
 delete mode 100644 backend/Dockerfile
 delete mode 100644 backend/bulk_seeder.py
 delete mode 100644 backend/celery_config.py
 create mode 100644 backend/convert_lora_to_gguf.py
 create mode 100644 backend/database_models.py
 delete mode 100644 backend/delete_sigma_rules.py
 delete mode 100644 backend/initial_setup.py
 delete mode 100644 backend/job_executors.py
 delete mode 100644 backend/job_scheduler.py
 delete mode 100644 backend/main.py
 delete mode 100644 backend/scheduler_config.yaml
 delete mode 100644 backend/tasks/__init__.py
 delete mode 100644 backend/tasks/bulk_tasks.py
 delete mode 100644 backend/tasks/data_sync_tasks.py
 delete mode 100644 backend/tasks/maintenance_tasks.py
 delete mode 100644 backend/tasks/sigma_tasks.py
 delete mode 100644 backend/test_enhanced_generation.py
 create mode 100644 backend/yaml_metadata_generator.py
 delete mode 100644 docker-compose.yml
 delete mode 100644 frontend/Dockerfile
 delete mode
100644 frontend/package.json delete mode 100644 frontend/postcss.config.js delete mode 100644 frontend/public/index.html delete mode 100644 frontend/src/App.css delete mode 100644 frontend/src/App.js delete mode 100644 frontend/src/index.js delete mode 100644 frontend/tailwind.config.js delete mode 100644 init.sql delete mode 100755 start.sh diff --git a/CLAUDE.md b/CLAUDE.md index 0584fc9..d44bb4b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -29,12 +29,10 @@ This is an enhanced CVE-SIGMA Auto Generator that has been **transformed from a - `rule_*.sigma`: Multiple SIGMA rule variants (template, LLM, hybrid) - `poc_analysis.json`: Extracted exploit indicators and analysis -### **Legacy Web Architecture (Optional, for Migration)** -- **Backend**: FastAPI with SQLAlchemy ORM (`backend/main.py`) -- **Frontend**: React with Tailwind CSS (`frontend/src/App.js`) -- **Database**: PostgreSQL (used only for migration to file-based system) -- **Cache**: Redis (optional) -- **Deployment**: Docker Compose (maintained for migration purposes) +### **Database Components (For Migration Only)** +- **Database Models**: `backend/database_models.py` - SQLAlchemy models for data migration +- **Legacy Support**: Core data processors maintained for CLI integration +- **Migration Tools**: Complete CLI-based migration utilities from legacy database ## Common Development Commands @@ -91,16 +89,13 @@ chmod +x cli/sigma_cli.py ./cli/sigma_cli.py stats overview ``` -### **Legacy Web Interface (Optional)** +### **Database Migration Support** ```bash -# Start legacy web interface (for migration only) -docker-compose up -d db redis backend frontend +# If you have an existing PostgreSQL database with CVE data +export DATABASE_URL="postgresql://user:pass@localhost:5432/cve_sigma_db" -# Access points: -# - Frontend: http://localhost:3000 -# - API: http://localhost:8000 -# - API Docs: http://localhost:8000/docs -# - Flower (Celery): http://localhost:5555 +# Migrate database to CLI file structure +./cli/sigma_cli.py migrate from-database --database-url $DATABASE_URL ``` ### **Development and Testing** @@ -138,12 +133,9 @@ ls -la cves/2024/CVE-2024-0001/ # View individual CVE files - `reports/`: Generated statistics and exports - `cli/`: Command-line tool and modules -### Legacy Service URLs (If Using Web Interface) -- Frontend: http://localhost:3000 -- Backend API: http://localhost:8000 -- API Documentation: http://localhost:8000/docs -- Database: localhost:5432 -- Redis: localhost:6379 +### Database Connection (For Migration Only) +- **PostgreSQL**: localhost:5432 (if migrating from legacy database) +- **Connection String**: Set via DATABASE_URL environment variable ### Enhanced API Endpoints @@ -187,13 +179,14 @@ ls -la cves/2024/CVE-2024-0001/ # View individual CVE files - **Metadata Format**: JSON files with processing history and PoC data - **Reports**: Generated statistics and export outputs -### **Legacy Backend Structure (For Migration)** -- **main.py**: Core FastAPI application (maintained for migration) -- **Data Processors**: Reused by CLI for CVE fetching and analysis +### **Backend Data Processors (Reused by CLI)** +- **database_models.py**: SQLAlchemy models for data migration +- **Data Processors**: Core processing logic reused by CLI - `nvd_bulk_processor.py`: NVD JSON dataset processing - `nomi_sec_client.py`: nomi-sec PoC integration - `enhanced_sigma_generator.py`: SIGMA rule generation - `llm_client.py`: Multi-provider LLM integration + - `poc_analyzer.py`: PoC content analysis ### **CLI-Based Data Processing 
Flow** 1. **CVE Processing**: NVD data fetch โ†’ File storage โ†’ PoC analysis โ†’ Metadata generation diff --git a/Makefile b/Makefile deleted file mode 100644 index cfebe35..0000000 --- a/Makefile +++ /dev/null @@ -1,70 +0,0 @@ -.PHONY: help start stop restart build logs clean dev setup - -# Default target -help: - @echo "CVE-SIGMA Auto Generator - Available Commands:" - @echo "==============================================" - @echo " make start - Start the application" - @echo " make stop - Stop the application" - @echo " make restart - Restart the application" - @echo " make build - Build and start with fresh images" - @echo " make logs - Show application logs" - @echo " make clean - Stop and remove all containers/volumes" - @echo " make dev - Start in development mode" - @echo " make setup - Initial setup (copy .env, etc.)" - @echo " make help - Show this help message" - -# Initial setup -setup: - @echo "๐Ÿ”ง Setting up CVE-SIGMA Auto Generator..." - @if [ ! -f .env ]; then \ - cp .env.example .env; \ - echo "โœ… .env file created from .env.example"; \ - echo "๐Ÿ’ก Edit .env to add your NVD API key for better rate limits"; \ - else \ - echo "โœ… .env file already exists"; \ - fi - -# Start the application -start: setup - @echo "๐Ÿš€ Starting CVE-SIGMA Auto Generator..." - docker-compose up -d - @echo "โœ… Application started!" - @echo "๐ŸŒ Frontend: http://localhost:3000" - @echo "๐Ÿ”ง Backend: http://localhost:8000" - @echo "๐Ÿ“š API Docs: http://localhost:8000/docs" - -# Stop the application -stop: - @echo "๐Ÿ›‘ Stopping CVE-SIGMA Auto Generator..." - docker-compose down - @echo "โœ… Application stopped!" - -# Restart the application -restart: stop start - -# Build and start with fresh images -build: setup - @echo "๐Ÿ”จ Building and starting CVE-SIGMA Auto Generator..." - docker-compose up -d --build - @echo "โœ… Application built and started!" - -# Show logs -logs: - @echo "๐Ÿ“‹ Application logs (press Ctrl+C to exit):" - docker-compose logs -f - -# Clean everything -clean: - @echo "๐Ÿงน Cleaning up CVE-SIGMA Auto Generator..." - docker-compose down -v --remove-orphans - docker system prune -f - @echo "โœ… Cleanup complete!" - -# Development mode (with hot reload) -dev: setup - @echo "๐Ÿ”ง Starting in development mode..." - docker-compose -f docker-compose.yml up -d db redis - @echo "๐Ÿ’ก Database and Redis started. Run backend and frontend locally for development." - @echo " Backend: cd backend && pip install -r requirements.txt && uvicorn main:app --reload" - @echo " Frontend: cd frontend && npm install && npm start" diff --git a/backend/Dockerfile b/backend/Dockerfile deleted file mode 100644 index 927b36c..0000000 --- a/backend/Dockerfile +++ /dev/null @@ -1,26 +0,0 @@ -FROM python:3.11-slim - -WORKDIR /app - -# Install system dependencies -RUN apt-get update && apt-get install -y \ - gcc \ - postgresql-client \ - && rm -rf /var/lib/apt/lists/* - -# Copy requirements first for better caching -COPY requirements.txt . - -# Install Python dependencies -RUN pip install --no-cache-dir -r requirements.txt - -# Copy application code -COPY . . 
- -# Create non-root user -RUN useradd -m -u 1000 appuser && chown -R appuser:appuser /app -USER appuser - -EXPOSE 8000 - -CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/backend/bulk_seeder.py b/backend/bulk_seeder.py deleted file mode 100644 index bcf2dcc..0000000 --- a/backend/bulk_seeder.py +++ /dev/null @@ -1,463 +0,0 @@ -""" -Bulk Data Seeding Coordinator -Orchestrates the complete bulk seeding process using NVD JSON feeds and nomi-sec PoC data -""" - -import asyncio -import logging -from datetime import datetime, timedelta -from typing import Optional -from sqlalchemy.orm import Session -from nvd_bulk_processor import NVDBulkProcessor -from nomi_sec_client import NomiSecClient -from exploitdb_client_local import ExploitDBLocalClient -from cisa_kev_client import CISAKEVClient - -# Configure logging -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -class BulkSeeder: - """Coordinates bulk seeding operations""" - - def __init__(self, db_session: Session): - self.db_session = db_session - self.nvd_processor = NVDBulkProcessor(db_session) - self.nomi_sec_client = NomiSecClient(db_session) - self.exploitdb_client = ExploitDBLocalClient(db_session) - self.cisa_kev_client = CISAKEVClient(db_session) - - async def full_bulk_seed(self, start_year: int = 2002, - end_year: Optional[int] = None, - skip_nvd: bool = False, - skip_nomi_sec: bool = False, - skip_exploitdb: bool = False, - skip_cisa_kev: bool = False, - progress_callback: Optional[callable] = None) -> dict: - """ - Perform complete bulk seeding operation - - Args: - start_year: Starting year for NVD data (default: 2002) - end_year: Ending year for NVD data (default: current year) - skip_nvd: Skip NVD bulk processing (default: False) - skip_nomi_sec: Skip nomi-sec PoC synchronization (default: False) - skip_exploitdb: Skip ExploitDB synchronization (default: False) - skip_cisa_kev: Skip CISA KEV synchronization (default: False) - - Returns: - Dictionary containing operation results - """ - if end_year is None: - end_year = datetime.now().year - - results = { - 'start_time': datetime.utcnow(), - 'nvd_results': None, - 'nomi_sec_results': None, - 'exploitdb_results': None, - 'cisa_kev_results': None, - 'sigma_results': None, - 'total_time': None, - 'status': 'running' - } - - logger.info(f"Starting full bulk seed operation ({start_year}-{end_year})") - - try: - # Phase 1: NVD Bulk Processing - if not skip_nvd: - if progress_callback: - progress_callback("nvd_processing", 10, "Starting NVD bulk processing...") - logger.info("Phase 1: Starting NVD bulk processing...") - nvd_results = await self.nvd_processor.bulk_seed_database( - start_year=start_year, - end_year=end_year - ) - results['nvd_results'] = nvd_results - if progress_callback: - progress_callback("nvd_processing", 25, f"NVD processing complete: {nvd_results['total_processed']} CVEs processed") - logger.info(f"Phase 1 complete: {nvd_results['total_processed']} CVEs processed") - else: - logger.info("Phase 1: Skipping NVD bulk processing") - if progress_callback: - progress_callback("nvd_processing", 25, "Skipping NVD bulk processing") - - # Phase 2: nomi-sec PoC Synchronization - if not skip_nomi_sec: - if progress_callback: - progress_callback("nomi_sec_sync", 30, "Starting nomi-sec PoC synchronization...") - logger.info("Phase 2: Starting nomi-sec PoC synchronization...") - nomi_sec_results = await self.nomi_sec_client.bulk_sync_all_cves( - batch_size=50 # Smaller batches for API stability - ) - 
results['nomi_sec_results'] = nomi_sec_results - if progress_callback: - progress_callback("nomi_sec_sync", 50, f"Nomi-sec sync complete: {nomi_sec_results['total_pocs_found']} PoCs found") - logger.info(f"Phase 2 complete: {nomi_sec_results['total_pocs_found']} PoCs found") - else: - logger.info("Phase 2: Skipping nomi-sec PoC synchronization") - if progress_callback: - progress_callback("nomi_sec_sync", 50, "Skipping nomi-sec PoC synchronization") - - # Phase 3: ExploitDB Synchronization - if not skip_exploitdb: - if progress_callback: - progress_callback("exploitdb_sync", 55, "Starting ExploitDB synchronization...") - logger.info("Phase 3: Starting ExploitDB synchronization...") - exploitdb_results = await self.exploitdb_client.bulk_sync_exploitdb( - batch_size=30 # Smaller batches for git API stability - ) - results['exploitdb_results'] = exploitdb_results - if progress_callback: - progress_callback("exploitdb_sync", 70, f"ExploitDB sync complete: {exploitdb_results['total_exploits_found']} exploits found") - logger.info(f"Phase 3 complete: {exploitdb_results['total_exploits_found']} exploits found") - else: - logger.info("Phase 3: Skipping ExploitDB synchronization") - if progress_callback: - progress_callback("exploitdb_sync", 70, "Skipping ExploitDB synchronization") - - # Phase 4: CISA KEV Synchronization - if not skip_cisa_kev: - if progress_callback: - progress_callback("cisa_kev_sync", 75, "Starting CISA KEV synchronization...") - logger.info("Phase 4: Starting CISA KEV synchronization...") - cisa_kev_results = await self.cisa_kev_client.bulk_sync_kev_data( - batch_size=100 # Can handle larger batches since data is already filtered - ) - results['cisa_kev_results'] = cisa_kev_results - if progress_callback: - progress_callback("cisa_kev_sync", 85, f"CISA KEV sync complete: {cisa_kev_results['total_kev_found']} KEV entries found") - logger.info(f"Phase 4 complete: {cisa_kev_results['total_kev_found']} KEV entries found") - else: - logger.info("Phase 4: Skipping CISA KEV synchronization") - if progress_callback: - progress_callback("cisa_kev_sync", 85, "Skipping CISA KEV synchronization") - - # Phase 5: Generate Enhanced SIGMA Rules - if progress_callback: - progress_callback("sigma_rules", 90, "Generating enhanced SIGMA rules...") - logger.info("Phase 5: Generating enhanced SIGMA rules...") - sigma_results = await self.generate_enhanced_sigma_rules() - results['sigma_results'] = sigma_results - if progress_callback: - progress_callback("sigma_rules", 95, f"SIGMA rule generation complete: {sigma_results['rules_generated']} rules generated") - logger.info(f"Phase 5 complete: {sigma_results['rules_generated']} rules generated") - - results['status'] = 'completed' - results['end_time'] = datetime.utcnow() - results['total_time'] = (results['end_time'] - results['start_time']).total_seconds() - - logger.info(f"Full bulk seed operation completed in {results['total_time']:.2f} seconds") - - except Exception as e: - logger.error(f"Bulk seed operation failed: {e}") - results['status'] = 'failed' - results['error'] = str(e) - results['end_time'] = datetime.utcnow() - - return results - - async def incremental_update(self) -> dict: - """ - Perform incremental update operation - - Returns: - Dictionary containing update results - """ - results = { - 'start_time': datetime.utcnow(), - 'nvd_update': None, - 'nomi_sec_update': None, - 'exploitdb_update': None, - 'cisa_kev_update': None, - 'status': 'running' - } - - logger.info("Starting incremental update...") - - try: - # Update NVD data 
using modified/recent feeds - logger.info("Updating NVD data...") - nvd_update = await self.nvd_processor.incremental_update() - results['nvd_update'] = nvd_update - - # Update PoC data for newly added/modified CVEs - if nvd_update['total_processed'] > 0: - logger.info("Updating PoC data for modified CVEs...") - # Get recently modified CVEs and sync their PoCs - recent_cves = await self._get_recently_modified_cves() - nomi_sec_update = await self._sync_specific_cves(recent_cves) - results['nomi_sec_update'] = nomi_sec_update - - # Update ExploitDB data for modified CVEs - logger.info("Updating ExploitDB data for modified CVEs...") - exploitdb_update = await self._sync_specific_cves_exploitdb(recent_cves) - results['exploitdb_update'] = exploitdb_update - - # Update CISA KEV data for modified CVEs - logger.info("Updating CISA KEV data for modified CVEs...") - cisa_kev_update = await self._sync_specific_cves_cisa_kev(recent_cves) - results['cisa_kev_update'] = cisa_kev_update - - results['status'] = 'completed' - results['end_time'] = datetime.utcnow() - - except Exception as e: - logger.error(f"Incremental update failed: {e}") - results['status'] = 'failed' - results['error'] = str(e) - results['end_time'] = datetime.utcnow() - - return results - - async def generate_enhanced_sigma_rules(self) -> dict: - """Generate enhanced SIGMA rules using nomi-sec PoC data""" - from main import CVE, SigmaRule - - # Import the enhanced rule generator - from enhanced_sigma_generator import EnhancedSigmaGenerator - - generator = EnhancedSigmaGenerator(self.db_session) - - # Get all CVEs that have PoC data but no enhanced rules - cves_with_pocs = self.db_session.query(CVE).filter( - CVE.poc_count > 0 - ).all() - - rules_generated = 0 - rules_updated = 0 - - for cve in cves_with_pocs: - try: - # Check if we need to generate/update the rule - existing_rule = self.db_session.query(SigmaRule).filter( - SigmaRule.cve_id == cve.cve_id - ).first() - - if existing_rule and existing_rule.poc_source == 'nomi_sec': - # Rule already exists and is up to date - continue - - # Generate enhanced rule - rule_result = await generator.generate_enhanced_rule(cve) - - if rule_result['success']: - if existing_rule: - rules_updated += 1 - else: - rules_generated += 1 - - except Exception as e: - logger.error(f"Error generating rule for {cve.cve_id}: {e}") - continue - - self.db_session.commit() - - return { - 'rules_generated': rules_generated, - 'rules_updated': rules_updated, - 'total_processed': len(cves_with_pocs) - } - - async def _get_recently_modified_cves(self, hours: int = 24) -> list: - """Get CVEs modified within the last N hours""" - from main import CVE - - cutoff_time = datetime.utcnow() - timedelta(hours=hours) - - recent_cves = self.db_session.query(CVE).filter( - CVE.updated_at >= cutoff_time - ).all() - - return [cve.cve_id for cve in recent_cves] - - async def _sync_specific_cves(self, cve_ids: list) -> dict: - """Sync PoC data for specific CVEs""" - total_processed = 0 - total_pocs_found = 0 - - for cve_id in cve_ids: - try: - result = await self.nomi_sec_client.sync_cve_pocs(cve_id) - total_processed += 1 - total_pocs_found += result.get('pocs_found', 0) - - # Small delay to avoid overwhelming the API - await asyncio.sleep(0.5) - - except Exception as e: - logger.error(f"Error syncing PoCs for {cve_id}: {e}") - continue - - return { - 'total_processed': total_processed, - 'total_pocs_found': total_pocs_found - } - - async def _sync_specific_cves_exploitdb(self, cve_ids: list) -> dict: - """Sync ExploitDB data 
for specific CVEs""" - total_processed = 0 - total_exploits_found = 0 - - for cve_id in cve_ids: - try: - result = await self.exploitdb_client.sync_cve_exploits(cve_id) - total_processed += 1 - total_exploits_found += result.get('exploits_found', 0) - - # Small delay to avoid overwhelming the API - await asyncio.sleep(0.5) - - except Exception as e: - logger.error(f"Error syncing ExploitDB for {cve_id}: {e}") - continue - - return { - 'total_processed': total_processed, - 'total_exploits_found': total_exploits_found - } - - async def _sync_specific_cves_cisa_kev(self, cve_ids: list) -> dict: - """Sync CISA KEV data for specific CVEs""" - total_processed = 0 - total_kev_found = 0 - - for cve_id in cve_ids: - try: - result = await self.cisa_kev_client.sync_cve_kev_data(cve_id) - total_processed += 1 - if result.get('kev_found', False): - total_kev_found += 1 - - # Small delay to be respectful to CISA - await asyncio.sleep(0.2) - - except Exception as e: - logger.error(f"Error syncing CISA KEV for {cve_id}: {e}") - continue - - return { - 'total_processed': total_processed, - 'total_kev_found': total_kev_found - } - - async def get_seeding_status(self) -> dict: - """Get current seeding status and statistics""" - from main import CVE, SigmaRule, BulkProcessingJob - - # Get database statistics - total_cves = self.db_session.query(CVE).count() - bulk_processed_cves = self.db_session.query(CVE).filter( - CVE.bulk_processed == True - ).count() - - cves_with_pocs = self.db_session.query(CVE).filter( - CVE.poc_count > 0 - ).count() - - total_rules = self.db_session.query(SigmaRule).count() - nomi_sec_rules = self.db_session.query(SigmaRule).filter( - SigmaRule.poc_source == 'nomi_sec' - ).count() - - # Get recent job status - recent_jobs = self.db_session.query(BulkProcessingJob).order_by( - BulkProcessingJob.created_at.desc() - ).limit(5).all() - - job_status = [] - for job in recent_jobs: - job_status.append({ - 'id': str(job.id), - 'job_type': job.job_type, - 'status': job.status, - 'created_at': job.created_at, - 'completed_at': job.completed_at, - 'processed_items': job.processed_items, - 'total_items': job.total_items, - 'failed_items': job.failed_items - }) - - return { - 'database_stats': { - 'total_cves': total_cves, - 'bulk_processed_cves': bulk_processed_cves, - 'cves_with_pocs': cves_with_pocs, - 'total_rules': total_rules, - 'nomi_sec_rules': nomi_sec_rules, - 'poc_coverage': (cves_with_pocs / total_cves * 100) if total_cves > 0 else 0, - 'nomi_sec_coverage': (nomi_sec_rules / total_rules * 100) if total_rules > 0 else 0 - }, - 'recent_jobs': job_status, - 'nvd_data_status': await self._get_nvd_data_status(), - 'nomi_sec_status': await self.nomi_sec_client.get_sync_status(), - 'exploitdb_status': await self.exploitdb_client.get_exploitdb_sync_status(), - 'cisa_kev_status': await self.cisa_kev_client.get_kev_sync_status() - } - - async def _get_nvd_data_status(self) -> dict: - """Get NVD data status""" - from main import CVE - - # Get year distribution - year_counts = {} - cves = self.db_session.query(CVE).all() - - for cve in cves: - if cve.published_date: - year = cve.published_date.year - year_counts[year] = year_counts.get(year, 0) + 1 - - # Get source distribution - source_counts = {} - for cve in cves: - source = cve.data_source or 'unknown' - source_counts[source] = source_counts.get(source, 0) + 1 - - return { - 'year_distribution': year_counts, - 'source_distribution': source_counts, - 'total_cves': len(cves), - 'date_range': { - 'earliest': min(cve.published_date for cve in 
cves if cve.published_date), - 'latest': max(cve.published_date for cve in cves if cve.published_date) - } if cves else None - } - - -# Standalone script functionality -async def main(): - """Main function for standalone execution""" - from main import SessionLocal, engine, Base - - # Create tables - Base.metadata.create_all(bind=engine) - - # Create database session - db_session = SessionLocal() - - try: - # Create bulk seeder - seeder = BulkSeeder(db_session) - - # Get current status - status = await seeder.get_seeding_status() - print(f"Current Status: {status['database_stats']['total_cves']} CVEs in database") - - # Perform full bulk seed if database is empty - if status['database_stats']['total_cves'] == 0: - print("Database is empty. Starting full bulk seed...") - results = await seeder.full_bulk_seed(start_year=2020) # Start from 2020 for faster testing - print(f"Bulk seed completed: {results}") - else: - print("Database contains data. Running incremental update...") - results = await seeder.incremental_update() - print(f"Incremental update completed: {results}") - - finally: - db_session.close() - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/backend/celery_config.py b/backend/celery_config.py deleted file mode 100644 index 664802a..0000000 --- a/backend/celery_config.py +++ /dev/null @@ -1,222 +0,0 @@ -""" -Celery configuration for the Auto SIGMA Rule Generator -""" -import os -from celery import Celery -from celery.schedules import crontab -from kombu import Queue - -# Celery configuration -broker_url = os.getenv('CELERY_BROKER_URL', 'redis://redis:6379/0') -result_backend = os.getenv('CELERY_RESULT_BACKEND', 'redis://redis:6379/0') - -# Create Celery app -celery_app = Celery( - 'sigma_generator', - broker=broker_url, - backend=result_backend, - include=[ - 'tasks.bulk_tasks', - 'tasks.sigma_tasks', - 'tasks.data_sync_tasks', - 'tasks.maintenance_tasks' - ] -) - -# Celery configuration -celery_app.conf.update( - # Serialization - task_serializer='json', - accept_content=['json'], - result_serializer='json', - - # Timezone - timezone='UTC', - enable_utc=True, - - # Task tracking - task_track_started=True, - task_send_sent_event=True, - - # Result backend settings - result_expires=3600, # Results expire after 1 hour - result_backend_transport_options={ - 'master_name': 'mymaster', - 'visibility_timeout': 3600, - }, - - # Worker settings - worker_prefetch_multiplier=1, - task_acks_late=True, - worker_max_tasks_per_child=1000, - - # Task routes - different queues for different types of tasks - task_routes={ - 'tasks.bulk_tasks.*': {'queue': 'bulk_processing'}, - 'tasks.sigma_tasks.*': {'queue': 'sigma_generation'}, - 'tasks.data_sync_tasks.*': {'queue': 'data_sync'}, - }, - - # Queue definitions - task_default_queue='default', - task_queues=( - Queue('default', routing_key='default'), - Queue('bulk_processing', routing_key='bulk_processing'), - Queue('sigma_generation', routing_key='sigma_generation'), - Queue('data_sync', routing_key='data_sync'), - ), - - # Retry settings - task_default_retry_delay=60, # 1 minute - task_max_retries=3, - - # Monitoring - worker_send_task_events=True, - - # Optimized Beat schedule for daily workflow - # WORKFLOW: NVD incremental -> Exploit syncs -> Reference sync -> SIGMA rules - beat_schedule={ - # STEP 1: NVD Incremental Update - Daily at 2:00 AM - # This runs first to get the latest CVE data from NVD - 'daily-nvd-incremental-update': { - 'task': 'bulk_tasks.incremental_update_task', - 'schedule': crontab(minute=0, hour=2), # 
Daily at 2:00 AM - 'options': {'queue': 'bulk_processing'}, - 'kwargs': {'batch_size': 100, 'skip_nvd': False, 'skip_nomi_sec': True} - }, - - # STEP 2: Exploit Data Syncing - Daily starting at 3:00 AM - # These run in parallel but start at different times to avoid conflicts - - # CISA KEV Sync - Daily at 3:00 AM (15 minutes after NVD) - 'daily-cisa-kev-sync': { - 'task': 'data_sync_tasks.sync_cisa_kev', - 'schedule': crontab(minute=0, hour=3), # Daily at 3:00 AM - 'options': {'queue': 'data_sync'}, - 'kwargs': {'batch_size': 100} - }, - - # Nomi-sec PoC Sync - Daily at 3:15 AM - 'daily-nomi-sec-sync': { - 'task': 'data_sync_tasks.sync_nomi_sec', - 'schedule': crontab(minute=15, hour=3), # Daily at 3:15 AM - 'options': {'queue': 'data_sync'}, - 'kwargs': {'batch_size': 100} - }, - - # GitHub PoC Sync - Daily at 3:30 AM - 'daily-github-poc-sync': { - 'task': 'data_sync_tasks.sync_github_poc', - 'schedule': crontab(minute=30, hour=3), # Daily at 3:30 AM - 'options': {'queue': 'data_sync'}, - 'kwargs': {'batch_size': 50} - }, - - # ExploitDB Sync - Daily at 3:45 AM - 'daily-exploitdb-sync': { - 'task': 'data_sync_tasks.sync_exploitdb', - 'schedule': crontab(minute=45, hour=3), # Daily at 3:45 AM - 'options': {'queue': 'data_sync'}, - 'kwargs': {'batch_size': 30} - }, - - # CVE2CAPEC MITRE ATT&CK Mapping Sync - Daily at 4:00 AM - 'daily-cve2capec-sync': { - 'task': 'data_sync_tasks.sync_cve2capec', - 'schedule': crontab(minute=0, hour=4), # Daily at 4:00 AM - 'options': {'queue': 'data_sync'}, - 'kwargs': {'force_refresh': False} # Only refresh if cache is stale - }, - - # ExploitDB Index Rebuild - Daily at 4:15 AM - 'daily-exploitdb-index-build': { - 'task': 'data_sync_tasks.build_exploitdb_index', - 'schedule': crontab(minute=15, hour=4), # Daily at 4:15 AM - 'options': {'queue': 'data_sync'} - }, - - # STEP 3: Reference Content Sync - Daily at 5:00 AM - # This is the longest-running task, starts after exploit syncs have time to complete - 'daily-reference-content-sync': { - 'task': 'data_sync_tasks.sync_reference_content', - 'schedule': crontab(minute=0, hour=5), # Daily at 5:00 AM - 'options': {'queue': 'data_sync'}, - 'kwargs': {'batch_size': 30, 'max_cves': 200, 'force_resync': False} - }, - - # STEP 4: SIGMA Rule Generation - Daily at 8:00 AM - # This runs LAST after all other daily data sync jobs - 'daily-sigma-rule-generation': { - 'task': 'bulk_tasks.generate_enhanced_sigma_rules', - 'schedule': crontab(minute=0, hour=8), # Daily at 8:00 AM - 'options': {'queue': 'sigma_generation'} - }, - - # LLM-Enhanced SIGMA Rule Generation - Daily at 9:00 AM - # Additional LLM-based rule generation after standard rules - 'daily-llm-sigma-generation': { - 'task': 'sigma_tasks.generate_enhanced_rules', - 'schedule': crontab(minute=0, hour=9), # Daily at 9:00 AM - 'options': {'queue': 'sigma_generation'}, - 'kwargs': {'cve_ids': None} # Process all CVEs with PoCs - }, - - # MAINTENANCE TASKS - - # Database Cleanup - Weekly on Sunday at 1:00 AM (before daily workflow) - 'weekly-database-cleanup': { - 'task': 'tasks.maintenance_tasks.database_cleanup_comprehensive', - 'schedule': crontab(minute=0, hour=1, day_of_week=0), # Sunday at 1:00 AM - 'options': {'queue': 'default'}, - 'kwargs': {'days_to_keep': 30, 'cleanup_failed_jobs': True, 'cleanup_logs': True} - }, - - # Health Check - Every 15 minutes - 'health-check-detailed': { - 'task': 'tasks.maintenance_tasks.health_check_detailed', - 'schedule': crontab(minute='*/15'), # Every 15 minutes - 'options': {'queue': 'default'} - }, - - # Celery result 
cleanup - Daily at 1:30 AM - 'daily-cleanup-old-results': { - 'task': 'tasks.maintenance_tasks.cleanup_old_results', - 'schedule': crontab(minute=30, hour=1), # Daily at 1:30 AM - 'options': {'queue': 'default'} - }, - }, -) - -# Configure logging -celery_app.conf.update( - worker_log_format='[%(asctime)s: %(levelname)s/%(processName)s] %(message)s', - worker_task_log_format='[%(asctime)s: %(levelname)s/%(processName)s][%(task_name)s(%(task_id)s)] %(message)s', -) - -# Database session configuration for tasks -from sqlalchemy import create_engine -from sqlalchemy.orm import sessionmaker - -# Database configuration -DATABASE_URL = os.getenv('DATABASE_URL', 'postgresql://cve_user:cve_password@db:5432/cve_sigma_db') - -# Create engine and session factory -engine = create_engine(DATABASE_URL) -SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) - -def get_db_session(): - """Get database session for tasks""" - return SessionLocal() - -# Import all task modules to register them -def register_tasks(): - """Register all task modules""" - try: - from tasks import bulk_tasks, sigma_tasks, data_sync_tasks, maintenance_tasks - print("All task modules registered successfully") - except ImportError as e: - print(f"Warning: Could not import some task modules: {e}") - -# Auto-register tasks when module is imported -if __name__ != "__main__": - register_tasks() diff --git a/backend/convert_lora_to_gguf.py b/backend/convert_lora_to_gguf.py new file mode 100644 index 0000000..f50c8e3 --- /dev/null +++ b/backend/convert_lora_to_gguf.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python3 +""" +Convert LoRA adapter to GGUF format for better Ollama integration +""" + +import os +import sys +import subprocess +import tempfile +import shutil +from pathlib import Path +import logging + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +def check_llama_cpp_tools(): + """Check if llama.cpp tools are available""" + tools_needed = [ + 'convert_hf_to_gguf.py', + 'convert_lora_to_gguf.py', + 'llama-export-lora' + ] + + missing_tools = [] + for tool in tools_needed: + if not shutil.which(tool): + missing_tools.append(tool) + + if missing_tools: + logger.error(f"Missing required tools: {missing_tools}") + logger.error("Please install llama.cpp and ensure tools are in PATH") + logger.error("See: https://github.com/ggerganov/llama.cpp") + return False + + return True + +def convert_lora_to_gguf(lora_path: str, base_model_name: str = "meta-llama/Llama-3.2-3B-Instruct"): + """Convert LoRA adapter to GGUF format""" + + if not check_llama_cpp_tools(): + logger.error("Cannot convert LoRA - missing llama.cpp tools") + return None + + try: + # Create temporary directory for conversion + with tempfile.TemporaryDirectory() as temp_dir: + logger.info(f"Converting LoRA adapter from {lora_path}") + + # Step 1: Convert LoRA to GGUF + lora_gguf_path = os.path.join(temp_dir, "adapter.gguf") + + cmd = [ + 'convert_lora_to_gguf.py', + '--base', base_model_name, + '--outtype', 'q8_0', # High quality quantization + '--outfile', lora_gguf_path, + lora_path + ] + + logger.info(f"Running: {' '.join(cmd)}") + result = subprocess.run(cmd, capture_output=True, text=True) + + if result.returncode != 0: + logger.error(f"LoRA conversion failed: {result.stderr}") + return None + + # Step 2: Download/prepare base model in GGUF format + base_model_path = os.path.join(temp_dir, "base_model.gguf") + + # This would need to download the base model in GGUF format + # For now, we'll assume it's available or use 
Ollama's version + + # Step 3: Merge LoRA with base model + merged_model_path = os.path.join(temp_dir, "merged_model.gguf") + + merge_cmd = [ + 'llama-export-lora', + '-m', base_model_path, + '-o', merged_model_path, + '--lora', lora_gguf_path + ] + + logger.info(f"Running: {' '.join(merge_cmd)}") + merge_result = subprocess.run(merge_cmd, capture_output=True, text=True) + + if merge_result.returncode != 0: + logger.error(f"LoRA merge failed: {merge_result.stderr}") + return None + + # Copy merged model to output location + output_path = "/app/models/sigma_llama_merged.gguf" + shutil.copy2(merged_model_path, output_path) + + logger.info(f"โœ… Successfully converted and merged LoRA to {output_path}") + return output_path + + except Exception as e: + logger.error(f"Error converting LoRA: {e}") + return None + +def create_gguf_modelfile(gguf_path: str) -> str: + """Create Modelfile for GGUF merged model""" + + modelfile_content = f"""FROM {gguf_path} + +TEMPLATE \"\"\"<|begin_of_text|><|start_header_id|>system<|end_header_id|> + +You are a cybersecurity expert specializing in SIGMA rule creation. You have been fine-tuned specifically for generating high-quality SIGMA detection rules. + +Generate valid SIGMA rules in YAML format based on the provided CVE and exploit information. Output ONLY valid YAML starting with 'title:' and ending with the last YAML line. + +Focus on: +- Accurate logsource identification +- Precise detection logic +- Relevant fields and values +- Proper YAML formatting +- Contextual understanding from CVE details<|eot_id|><|start_header_id|>user<|end_header_id|> + +{{ .Prompt }}<|eot_id|><|start_header_id|>assistant<|end_header_id|> + +\"\"\" + +# Optimized parameters for SIGMA rule generation +PARAMETER temperature 0.1 +PARAMETER top_p 0.9 +PARAMETER top_k 40 +PARAMETER repeat_penalty 1.1 +PARAMETER num_ctx 4096 +PARAMETER stop "<|eot_id|>" +PARAMETER stop "<|end_of_text|>" + +SYSTEM \"\"\"You are a specialized SIGMA rule generation model. Your training has optimized you for creating accurate, contextual SIGMA detection rules. Generate only valid YAML format rules based on the provided context.\"\"\" +""" + + return modelfile_content + +if __name__ == "__main__": + # Test conversion + lora_path = "/app/models/sigma_llama_finetuned/checkpoint-2268" + + if os.path.exists(lora_path): + converted_path = convert_lora_to_gguf(lora_path) + if converted_path: + print(f"โœ… Conversion successful: {converted_path}") + print("Use this path in your Ollama Modelfile:") + print(create_gguf_modelfile(converted_path)) + else: + print("โŒ Conversion failed") + else: + print(f"โŒ LoRA path not found: {lora_path}") \ No newline at end of file diff --git a/backend/database_models.py b/backend/database_models.py new file mode 100644 index 0000000..ac40487 --- /dev/null +++ b/backend/database_models.py @@ -0,0 +1,91 @@ +""" +Database Models for CVE-SIGMA Auto Generator CLI + +Maintains database model definitions for migration and data processing purposes. +Used by CLI migration tools to export data from legacy web application database. 
+""" + +import uuid +from datetime import datetime +from sqlalchemy import create_engine, Column, String, Text, DECIMAL, TIMESTAMP, Boolean, ARRAY, Integer, JSON +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import sessionmaker +from sqlalchemy.dialects.postgresql import UUID +import os + +# Database setup +DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://cve_user:cve_password@localhost:5432/cve_sigma_db") +engine = create_engine(DATABASE_URL) +SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) +Base = declarative_base() + +# Database Models (for migration purposes) +class CVE(Base): + __tablename__ = "cves" + + id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + cve_id = Column(String(20), unique=True, nullable=False) + description = Column(Text) + cvss_score = Column(DECIMAL(3, 1)) + severity = Column(String(20)) + published_date = Column(TIMESTAMP) + modified_date = Column(TIMESTAMP) + affected_products = Column(ARRAY(String)) + reference_urls = Column(ARRAY(String)) + # Bulk processing fields + data_source = Column(String(20), default='nvd_api') # 'nvd_api', 'nvd_bulk', 'manual' + nvd_json_version = Column(String(10), default='2.0') + bulk_processed = Column(Boolean, default=False) + # nomi-sec PoC fields + poc_count = Column(Integer, default=0) + poc_data = Column(JSON) # Store nomi-sec PoC metadata + # Reference data fields + reference_data = Column(JSON) # Store extracted reference content and analysis + reference_sync_status = Column(String(20), default='pending') # 'pending', 'processing', 'completed', 'failed' + reference_last_synced = Column(TIMESTAMP) + created_at = Column(TIMESTAMP, default=datetime.utcnow) + updated_at = Column(TIMESTAMP, default=datetime.utcnow) + +class SigmaRule(Base): + __tablename__ = "sigma_rules" + + id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + cve_id = Column(String(20)) + rule_name = Column(String(255), nullable=False) + rule_content = Column(Text, nullable=False) + detection_type = Column(String(50)) + log_source = Column(String(100)) + confidence_level = Column(String(20)) + auto_generated = Column(Boolean, default=True) + exploit_based = Column(Boolean, default=False) + github_repos = Column(ARRAY(String)) + exploit_indicators = Column(Text) # JSON string of extracted indicators + # Enhanced fields for new data sources + poc_source = Column(String(20), default='github_search') # 'github_search', 'nomi_sec', 'manual' + poc_quality_score = Column(Integer, default=0) # Based on star count, activity, etc. 
+ nomi_sec_data = Column(JSON) # Store nomi-sec PoC metadata + created_at = Column(TIMESTAMP, default=datetime.utcnow) + updated_at = Column(TIMESTAMP, default=datetime.utcnow) + +class RuleTemplate(Base): + __tablename__ = "rule_templates" + + id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + template_name = Column(String(255), nullable=False) + template_content = Column(Text, nullable=False) + applicable_product_patterns = Column(ARRAY(String)) + description = Column(Text) + created_at = Column(TIMESTAMP, default=datetime.utcnow) + +def get_db(): + """Get database session for migration purposes""" + db = SessionLocal() + try: + yield db + finally: + db.close() + +# Create all tables (for migration purposes) +def create_tables(): + """Create database tables""" + Base.metadata.create_all(bind=engine) \ No newline at end of file diff --git a/backend/delete_sigma_rules.py b/backend/delete_sigma_rules.py deleted file mode 100644 index 2003a1c..0000000 --- a/backend/delete_sigma_rules.py +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env python3 -""" -Script to delete all SIGMA rules from the database -This will clear existing rules so they can be regenerated with the improved LLM client -""" - -from main import SigmaRule, SessionLocal -import logging - -# Setup logging -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -def delete_all_sigma_rules(): - """Delete all SIGMA rules from the database""" - - db = SessionLocal() - - try: - # Count existing rules - total_rules = db.query(SigmaRule).count() - logger.info(f"Found {total_rules} SIGMA rules in database") - - if total_rules == 0: - logger.info("No SIGMA rules to delete") - return 0 - - # Delete all SIGMA rules - logger.info("Deleting all SIGMA rules...") - deleted_count = db.query(SigmaRule).delete() - db.commit() - - logger.info(f"โœ… Successfully deleted {deleted_count} SIGMA rules") - - # Verify deletion - remaining_rules = db.query(SigmaRule).count() - logger.info(f"Remaining rules in database: {remaining_rules}") - - return deleted_count - - except Exception as e: - logger.error(f"Error deleting SIGMA rules: {e}") - db.rollback() - raise - finally: - db.close() - -if __name__ == "__main__": - print("๐Ÿ—‘๏ธ Deleting all SIGMA rules from database...") - print("This will allow regeneration with the improved LLM client.") - - deleted_count = delete_all_sigma_rules() - - if deleted_count > 0: - print(f"\n๐ŸŽ‰ Successfully deleted {deleted_count} SIGMA rules!") - print("You can now regenerate them with the fixed LLM prompts.") - else: - print("\nโœ… No SIGMA rules were found to delete.") \ No newline at end of file diff --git a/backend/initial_setup.py b/backend/initial_setup.py deleted file mode 100644 index 954b78f..0000000 --- a/backend/initial_setup.py +++ /dev/null @@ -1,171 +0,0 @@ -#!/usr/bin/env python3 -""" -Initial setup script that runs once on first boot to populate the database. -This script checks if initial data seeding is needed and triggers it via Celery. 
-""" -import os -import sys -import time -import logging -from datetime import datetime, timedelta -from sqlalchemy import create_engine, text -from sqlalchemy.orm import sessionmaker -from sqlalchemy.exc import OperationalError - -# Add the current directory to path so we can import our modules -sys.path.append(os.path.dirname(os.path.abspath(__file__))) - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -# Database configuration -DATABASE_URL = os.getenv('DATABASE_URL', 'postgresql://cve_user:cve_password@db:5432/cve_sigma_db') - -def wait_for_database(max_retries: int = 30, delay: int = 5) -> bool: - """Wait for database to be ready""" - logger.info("Waiting for database to be ready...") - - for attempt in range(max_retries): - try: - engine = create_engine(DATABASE_URL) - with engine.connect() as conn: - conn.execute(text("SELECT 1")) - logger.info("โœ… Database is ready!") - return True - except OperationalError as e: - logger.info(f"Attempt {attempt + 1}/{max_retries}: Database not ready yet ({e})") - except Exception as e: - logger.error(f"Unexpected error connecting to database: {e}") - - if attempt < max_retries - 1: - time.sleep(delay) - - logger.error("โŒ Database failed to become ready") - return False - -def check_initial_setup_needed() -> bool: - """Check if initial setup is needed by examining the database state""" - try: - engine = create_engine(DATABASE_URL) - SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) - - with SessionLocal() as session: - # Check if we have any CVEs in the database - result = session.execute(text("SELECT COUNT(*) FROM cves")).fetchone() - cve_count = result[0] if result else 0 - - logger.info(f"Current CVE count in database: {cve_count}") - - # Check if we have any bulk processing jobs that completed successfully - bulk_jobs_result = session.execute(text(""" - SELECT COUNT(*) FROM bulk_processing_jobs - WHERE job_type = 'nvd_bulk_seed' - AND status = 'completed' - AND created_at > NOW() - INTERVAL '30 days' - """)).fetchone() - - recent_bulk_jobs = bulk_jobs_result[0] if bulk_jobs_result else 0 - - logger.info(f"Recent successful bulk seed jobs: {recent_bulk_jobs}") - - # Initial setup needed if: - # 1. Very few CVEs (less than 1000) AND - # 2. 
No recent successful bulk seed jobs - initial_setup_needed = cve_count < 1000 and recent_bulk_jobs == 0 - - if initial_setup_needed: - logger.info("๐Ÿ”„ Initial setup is needed - will trigger full NVD sync") - else: - logger.info("โœ… Initial setup already completed - database has sufficient data") - - return initial_setup_needed - - except Exception as e: - logger.error(f"Error checking initial setup status: {e}") - # If we can't check, assume setup is needed - return True - -def trigger_initial_bulk_seed(): - """Trigger initial bulk seed via Celery""" - try: - # Import here to avoid circular dependencies - from celery_config import celery_app - from tasks.bulk_tasks import full_bulk_seed_task - - logger.info("๐Ÿš€ Triggering initial full NVD bulk seed...") - - # Start a comprehensive bulk seed job - # Start from 2020 for faster initial setup, can be adjusted - task_result = full_bulk_seed_task.delay( - start_year=2020, # Start from 2020 for faster initial setup - end_year=None, # Current year - skip_nvd=False, - skip_nomi_sec=True, # Skip nomi-sec initially, will be done daily - skip_exploitdb=True, # Skip exploitdb initially, will be done daily - skip_cisa_kev=True # Skip CISA KEV initially, will be done daily - ) - - logger.info(f"โœ… Initial bulk seed task started with ID: {task_result.id}") - logger.info(f"Monitor progress at: http://localhost:5555/task/{task_result.id}") - - return task_result.id - - except Exception as e: - logger.error(f"โŒ Failed to trigger initial bulk seed: {e}") - return None - -def create_initial_setup_marker(): - """Create a marker to indicate initial setup was attempted""" - try: - engine = create_engine(DATABASE_URL) - SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) - - with SessionLocal() as session: - # Insert a marker record - session.execute(text(""" - INSERT INTO bulk_processing_jobs (job_type, status, job_metadata, created_at, started_at) - VALUES ('initial_setup_marker', 'completed', '{"purpose": "initial_setup_marker"}', NOW(), NOW()) - ON CONFLICT DO NOTHING - """)) - session.commit() - - logger.info("โœ… Created initial setup marker") - - except Exception as e: - logger.error(f"Error creating initial setup marker: {e}") - -def main(): - """Main initial setup function""" - logger.info("๐Ÿš€ Starting initial setup check...") - - # Step 1: Wait for database - if not wait_for_database(): - logger.error("โŒ Initial setup failed: Database not available") - sys.exit(1) - - # Step 2: Check if initial setup is needed - if not check_initial_setup_needed(): - logger.info("๐ŸŽ‰ Initial setup not needed - database already populated") - sys.exit(0) - - # Step 3: Wait for Celery to be ready - logger.info("Waiting for Celery workers to be ready...") - time.sleep(10) # Give Celery workers time to start - - # Step 4: Trigger initial bulk seed - task_id = trigger_initial_bulk_seed() - - if task_id: - # Step 5: Create marker - create_initial_setup_marker() - - logger.info("๐ŸŽ‰ Initial setup triggered successfully!") - logger.info(f"Task ID: {task_id}") - logger.info("The system will begin daily scheduled tasks once initial setup completes.") - sys.exit(0) - else: - logger.error("โŒ Initial setup failed") - sys.exit(1) - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/backend/job_executors.py b/backend/job_executors.py deleted file mode 100644 index 4a0ec41..0000000 --- a/backend/job_executors.py +++ /dev/null @@ -1,390 +0,0 @@ -""" -Job Executors for Scheduled Tasks -Integrates existing job functions with 
the scheduler -""" - -import asyncio -import logging -from typing import Dict, Any -from sqlalchemy.orm import Session -from datetime import datetime, timedelta - -logger = logging.getLogger(__name__) - -class JobExecutors: - """Collection of job executor functions for the scheduler""" - - @staticmethod - async def incremental_update(db_session: Session, parameters: Dict[str, Any]) -> Dict[str, Any]: - """Execute NVD incremental update job""" - try: - from bulk_seeder import BulkSeeder - - seeder = BulkSeeder(db_session) - - # Extract parameters - batch_size = parameters.get('batch_size', 100) - skip_nvd = parameters.get('skip_nvd', False) - skip_nomi_sec = parameters.get('skip_nomi_sec', True) - - logger.info(f"Starting incremental update - batch_size: {batch_size}") - - result = await seeder.incremental_update() - - return { - 'status': 'completed', - 'result': result, - 'job_type': 'incremental_update', - 'completed_at': datetime.utcnow().isoformat() - } - - except Exception as e: - logger.error(f"Incremental update job failed: {e}") - return { - 'status': 'failed', - 'error': str(e), - 'job_type': 'incremental_update', - 'completed_at': datetime.utcnow().isoformat() - } - - @staticmethod - async def cisa_kev_sync(db_session: Session, parameters: Dict[str, Any]) -> Dict[str, Any]: - """Execute CISA KEV sync job""" - try: - from cisa_kev_client import CISAKEVClient - - client = CISAKEVClient(db_session) - - # Extract parameters - batch_size = parameters.get('batch_size', 100) - - logger.info(f"Starting CISA KEV sync - batch_size: {batch_size}") - - result = await client.bulk_sync_kev_data(batch_size=batch_size) - - return { - 'status': 'completed', - 'result': result, - 'job_type': 'cisa_kev_sync', - 'completed_at': datetime.utcnow().isoformat() - } - - except Exception as e: - logger.error(f"CISA KEV sync job failed: {e}") - return { - 'status': 'failed', - 'error': str(e), - 'job_type': 'cisa_kev_sync', - 'completed_at': datetime.utcnow().isoformat() - } - - @staticmethod - async def nomi_sec_sync(db_session: Session, parameters: Dict[str, Any]) -> Dict[str, Any]: - """Execute optimized nomi-sec PoC sync job""" - try: - from nomi_sec_client import NomiSecClient - - client = NomiSecClient(db_session) - - # Extract parameters with optimized defaults - batch_size = parameters.get('batch_size', 100) - max_cves = parameters.get('max_cves', 1000) - force_resync = parameters.get('force_resync', False) - - logger.info(f"Starting optimized nomi-sec sync - batch_size: {batch_size}, max_cves: {max_cves}") - - result = await client.bulk_sync_poc_data( - batch_size=batch_size, - max_cves=max_cves, - force_resync=force_resync - ) - - return { - 'status': 'completed', - 'result': result, - 'job_type': 'nomi_sec_sync', - 'completed_at': datetime.utcnow().isoformat() - } - - except Exception as e: - logger.error(f"Nomi-sec sync job failed: {e}") - return { - 'status': 'failed', - 'error': str(e), - 'job_type': 'nomi_sec_sync', - 'completed_at': datetime.utcnow().isoformat() - } - - @staticmethod - async def github_poc_sync(db_session: Session, parameters: Dict[str, Any]) -> Dict[str, Any]: - """Execute GitHub PoC sync job""" - try: - from mcdevitt_poc_client import GitHubPoCClient - - client = GitHubPoCClient(db_session) - - # Extract parameters - batch_size = parameters.get('batch_size', 50) - - logger.info(f"Starting GitHub PoC sync - batch_size: {batch_size}") - - result = await client.bulk_sync_poc_data(batch_size=batch_size) - - return { - 'status': 'completed', - 'result': result, - 'job_type': 
'github_poc_sync', - 'completed_at': datetime.utcnow().isoformat() - } - - except Exception as e: - logger.error(f"GitHub PoC sync job failed: {e}") - return { - 'status': 'failed', - 'error': str(e), - 'job_type': 'github_poc_sync', - 'completed_at': datetime.utcnow().isoformat() - } - - @staticmethod - async def exploitdb_sync(db_session: Session, parameters: Dict[str, Any]) -> Dict[str, Any]: - """Execute ExploitDB sync job""" - try: - from exploitdb_client_local import ExploitDBLocalClient - - client = ExploitDBLocalClient(db_session) - - # Extract parameters - batch_size = parameters.get('batch_size', 30) - - logger.info(f"Starting ExploitDB sync - batch_size: {batch_size}") - - result = await client.bulk_sync_exploitdb_data(batch_size=batch_size) - - return { - 'status': 'completed', - 'result': result, - 'job_type': 'exploitdb_sync', - 'completed_at': datetime.utcnow().isoformat() - } - - except Exception as e: - logger.error(f"ExploitDB sync job failed: {e}") - return { - 'status': 'failed', - 'error': str(e), - 'job_type': 'exploitdb_sync', - 'completed_at': datetime.utcnow().isoformat() - } - - @staticmethod - async def reference_sync(db_session: Session, parameters: Dict[str, Any]) -> Dict[str, Any]: - """Execute reference data sync job""" - try: - from reference_client import ReferenceClient - - client = ReferenceClient(db_session) - - # Extract parameters - batch_size = parameters.get('batch_size', 30) - max_cves = parameters.get('max_cves', 200) - force_resync = parameters.get('force_resync', False) - - logger.info(f"Starting reference sync - batch_size: {batch_size}, max_cves: {max_cves}") - - result = await client.bulk_sync_references( - batch_size=batch_size, - max_cves=max_cves, - force_resync=force_resync - ) - - return { - 'status': 'completed', - 'result': result, - 'job_type': 'reference_sync', - 'completed_at': datetime.utcnow().isoformat() - } - - except Exception as e: - logger.error(f"Reference sync job failed: {e}") - return { - 'status': 'failed', - 'error': str(e), - 'job_type': 'reference_sync', - 'completed_at': datetime.utcnow().isoformat() - } - - @staticmethod - async def rule_regeneration(db_session: Session, parameters: Dict[str, Any]) -> Dict[str, Any]: - """Execute SIGMA rule regeneration job""" - try: - from enhanced_sigma_generator import EnhancedSigmaGenerator - - generator = EnhancedSigmaGenerator(db_session) - - # Extract parameters - force = parameters.get('force', False) - - logger.info(f"Starting rule regeneration - force: {force}") - - # Get CVEs that need rule regeneration - from main import CVE - if force: - # Regenerate all rules - cves = db_session.query(CVE).all() - else: - # Only regenerate for CVEs with new data - cves = db_session.query(CVE).filter( - CVE.updated_at > CVE.created_at - ).all() - - total_processed = 0 - total_generated = 0 - - for cve in cves: - try: - # Generate enhanced rule - rule_content = await generator.generate_enhanced_sigma_rule(cve.cve_id) - if rule_content: - total_generated += 1 - total_processed += 1 - - # Small delay to prevent overwhelming the system - await asyncio.sleep(0.1) - - except Exception as e: - logger.error(f"Error regenerating rule for {cve.cve_id}: {e}") - - result = { - 'total_processed': total_processed, - 'total_generated': total_generated, - 'generation_rate': total_generated / total_processed if total_processed > 0 else 0 - } - - return { - 'status': 'completed', - 'result': result, - 'job_type': 'rule_regeneration', - 'completed_at': datetime.utcnow().isoformat() - } - - except Exception 
as e: - logger.error(f"Rule regeneration job failed: {e}") - return { - 'status': 'failed', - 'error': str(e), - 'job_type': 'rule_regeneration', - 'completed_at': datetime.utcnow().isoformat() - } - - @staticmethod - async def bulk_seed(db_session: Session, parameters: Dict[str, Any]) -> Dict[str, Any]: - """Execute full bulk seed job""" - try: - from bulk_seeder import BulkSeeder - - seeder = BulkSeeder(db_session) - - # Extract parameters - start_year = parameters.get('start_year', 2020) - end_year = parameters.get('end_year', 2025) - batch_size = parameters.get('batch_size', 100) - skip_nvd = parameters.get('skip_nvd', False) - skip_nomi_sec = parameters.get('skip_nomi_sec', False) - - logger.info(f"Starting full bulk seed - years: {start_year}-{end_year}") - - result = await seeder.full_bulk_seed( - start_year=start_year, - end_year=end_year - ) - - return { - 'status': 'completed', - 'result': result, - 'job_type': 'bulk_seed', - 'completed_at': datetime.utcnow().isoformat() - } - - except Exception as e: - logger.error(f"Bulk seed job failed: {e}") - return { - 'status': 'failed', - 'error': str(e), - 'job_type': 'bulk_seed', - 'completed_at': datetime.utcnow().isoformat() - } - - @staticmethod - async def database_cleanup(db_session: Session, parameters: Dict[str, Any]) -> Dict[str, Any]: - """Execute database cleanup job""" - try: - from main import BulkProcessingJob - - # Extract parameters - days_to_keep = parameters.get('days_to_keep', 30) - cleanup_failed_jobs = parameters.get('cleanup_failed_jobs', True) - cleanup_logs = parameters.get('cleanup_logs', True) - - logger.info(f"Starting database cleanup - keep {days_to_keep} days") - - cutoff_date = datetime.utcnow() - timedelta(days=days_to_keep) - - deleted_jobs = 0 - - # Clean up old bulk processing jobs - if cleanup_failed_jobs: - # Delete failed jobs older than cutoff - deleted = db_session.query(BulkProcessingJob).filter( - BulkProcessingJob.status == 'failed', - BulkProcessingJob.created_at < cutoff_date - ).delete() - deleted_jobs += deleted - - # Delete completed jobs older than cutoff (keep some recent ones) - very_old_cutoff = datetime.utcnow() - timedelta(days=days_to_keep * 2) - deleted = db_session.query(BulkProcessingJob).filter( - BulkProcessingJob.status == 'completed', - BulkProcessingJob.created_at < very_old_cutoff - ).delete() - deleted_jobs += deleted - - db_session.commit() - - result = { - 'deleted_jobs': deleted_jobs, - 'cutoff_date': cutoff_date.isoformat(), - 'cleanup_type': 'bulk_processing_jobs' - } - - return { - 'status': 'completed', - 'result': result, - 'job_type': 'database_cleanup', - 'completed_at': datetime.utcnow().isoformat() - } - - except Exception as e: - logger.error(f"Database cleanup job failed: {e}") - return { - 'status': 'failed', - 'error': str(e), - 'job_type': 'database_cleanup', - 'completed_at': datetime.utcnow().isoformat() - } - -def register_all_executors(scheduler): - """Register all job executors with the scheduler""" - executors = JobExecutors() - - scheduler.register_job_executor('incremental_update', executors.incremental_update) - scheduler.register_job_executor('cisa_kev_sync', executors.cisa_kev_sync) - scheduler.register_job_executor('nomi_sec_sync', executors.nomi_sec_sync) - scheduler.register_job_executor('github_poc_sync', executors.github_poc_sync) - scheduler.register_job_executor('exploitdb_sync', executors.exploitdb_sync) - scheduler.register_job_executor('reference_sync', executors.reference_sync) - scheduler.register_job_executor('rule_regeneration', 
executors.rule_regeneration) - scheduler.register_job_executor('bulk_seed', executors.bulk_seed) - scheduler.register_job_executor('database_cleanup', executors.database_cleanup) - - logger.info("All job executors registered successfully") \ No newline at end of file diff --git a/backend/job_scheduler.py b/backend/job_scheduler.py deleted file mode 100644 index 8411172..0000000 --- a/backend/job_scheduler.py +++ /dev/null @@ -1,449 +0,0 @@ -""" -CVE-SIGMA Auto Generator - Cron-like Job Scheduler -Automated scheduling and execution of data processing jobs -""" - -import asyncio -import yaml -import logging -import threading -import time -from datetime import datetime, timedelta -from typing import Dict, List, Optional, Callable, Any -from dataclasses import dataclass, field -from croniter import croniter -from sqlalchemy.orm import Session -import pytz -import uuid -import json - -# Configure logging -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -@dataclass -class ScheduledJob: - """Represents a scheduled job configuration""" - job_id: str - name: str - enabled: bool - schedule: str # Cron expression - description: str - job_type: str - parameters: Dict[str, Any] - priority: str - timeout_minutes: int - retry_on_failure: bool - last_run: Optional[datetime] = None - next_run: Optional[datetime] = None - run_count: int = 0 - failure_count: int = 0 - is_running: bool = False - max_retries: int = 2 - - def __post_init__(self): - if self.next_run is None: - self.calculate_next_run() - - def calculate_next_run(self, base_time: datetime = None) -> datetime: - """Calculate the next run time based on cron schedule""" - if base_time is None: - base_time = datetime.now(pytz.UTC) - - try: - cron = croniter(self.schedule, base_time) - self.next_run = cron.get_next(datetime) - return self.next_run - except Exception as e: - logger.error(f"Error calculating next run for job {self.name}: {e}") - # Fallback to 1 hour from now - self.next_run = base_time + timedelta(hours=1) - return self.next_run - - def should_run(self, current_time: datetime = None) -> bool: - """Check if job should run now""" - if not self.enabled or self.is_running: - return False - - if current_time is None: - current_time = datetime.now(pytz.UTC) - - return self.next_run and current_time >= self.next_run - - def mark_started(self): - """Mark job as started""" - self.is_running = True - self.last_run = datetime.now(pytz.UTC) - self.run_count += 1 - - def mark_completed(self, success: bool = True): - """Mark job as completed""" - self.is_running = False - if not success: - self.failure_count += 1 - self.calculate_next_run() - - def to_dict(self) -> Dict[str, Any]: - """Convert to dictionary for serialization""" - return { - 'job_id': self.job_id, - 'name': self.name, - 'enabled': self.enabled, - 'schedule': self.schedule, - 'description': self.description, - 'job_type': self.job_type, - 'parameters': self.parameters, - 'priority': self.priority, - 'timeout_minutes': self.timeout_minutes, - 'retry_on_failure': self.retry_on_failure, - 'last_run': self.last_run.isoformat() if self.last_run else None, - 'next_run': self.next_run.isoformat() if self.next_run else None, - 'run_count': self.run_count, - 'failure_count': self.failure_count, - 'is_running': self.is_running, - 'max_retries': self.max_retries - } - -class JobRegistry: - """Registry of available job executors""" - - def __init__(self): - self.executors: Dict[str, Callable] = {} - self.db_session_factory = None - - def register_executor(self, 
job_type: str, executor_func: Callable): - """Register a job executor function""" - self.executors[job_type] = executor_func - logger.info(f"Registered executor for job type: {job_type}") - - def set_db_session_factory(self, session_factory): - """Set database session factory""" - self.db_session_factory = session_factory - - async def execute_job(self, job: ScheduledJob) -> bool: - """Execute a scheduled job""" - if job.job_type not in self.executors: - logger.error(f"No executor found for job type: {job.job_type}") - return False - - try: - logger.info(f"Executing scheduled job: {job.name} (type: {job.job_type})") - - # Get database session - if self.db_session_factory: - db_session = self.db_session_factory() - else: - logger.error("No database session factory available") - return False - - try: - # Execute the job - executor = self.executors[job.job_type] - result = await executor(db_session, job.parameters) - - # Check result - if isinstance(result, dict): - success = result.get('status') in ['completed', 'success'] - if not success: - logger.warning(f"Job {job.name} completed with status: {result.get('status')}") - else: - success = bool(result) - - logger.info(f"Job {job.name} completed successfully: {success}") - return success - - finally: - db_session.close() - - except Exception as e: - logger.error(f"Error executing job {job.name}: {e}") - return False - -class JobScheduler: - """Main job scheduler with cron-like functionality""" - - def __init__(self, config_path: str = "scheduler_config.yaml"): - self.config_path = config_path - self.config: Dict[str, Any] = {} - self.jobs: Dict[str, ScheduledJob] = {} - self.registry = JobRegistry() - self.is_running = False - self.scheduler_thread: Optional[threading.Thread] = None - self.stop_event = threading.Event() - self.timezone = pytz.UTC - self.max_concurrent_jobs = 3 - self.current_jobs = 0 - self.job_lock = threading.Lock() - - # Load configuration - self.load_config() - - # Setup logging - self.setup_logging() - - def load_config(self): - """Load scheduler configuration from YAML file""" - try: - with open(self.config_path, 'r') as f: - self.config = yaml.safe_load(f) - - # Extract scheduler settings - scheduler_config = self.config.get('scheduler', {}) - self.timezone = pytz.timezone(scheduler_config.get('timezone', 'UTC')) - self.max_concurrent_jobs = scheduler_config.get('max_concurrent_jobs', 3) - - # Load job configurations - self.load_jobs() - - logger.info(f"Loaded scheduler configuration with {len(self.jobs)} jobs") - - except Exception as e: - logger.error(f"Error loading scheduler config: {e}") - self.config = {} - - def load_jobs(self): - """Load job configurations from config""" - jobs_config = self.config.get('jobs', {}) - - for job_name, job_config in jobs_config.items(): - try: - job = ScheduledJob( - job_id=str(uuid.uuid4()), - name=job_name, - enabled=job_config.get('enabled', True), - schedule=job_config.get('schedule', '0 0 * * *'), - description=job_config.get('description', ''), - job_type=job_config.get('job_type', job_name), - parameters=job_config.get('parameters', {}), - priority=job_config.get('priority', 'medium'), - timeout_minutes=job_config.get('timeout_minutes', 60), - retry_on_failure=job_config.get('retry_on_failure', True), - max_retries=job_config.get('max_retries', 2) - ) - - self.jobs[job_name] = job - logger.info(f"Loaded job: {job_name} - Next run: {job.next_run}") - - except Exception as e: - logger.error(f"Error loading job {job_name}: {e}") - - def setup_logging(self): - """Setup 
scheduler-specific logging""" - log_config = self.config.get('logging', {}) - if log_config.get('enabled', True): - log_level = getattr(logging, log_config.get('level', 'INFO')) - logger.setLevel(log_level) - - def register_job_executor(self, job_type: str, executor_func: Callable): - """Register a job executor""" - self.registry.register_executor(job_type, executor_func) - - def set_db_session_factory(self, session_factory): - """Set database session factory""" - self.registry.set_db_session_factory(session_factory) - - def start(self): - """Start the job scheduler""" - if self.is_running: - logger.warning("Scheduler is already running") - return - - if not self.config.get('scheduler', {}).get('enabled', True): - logger.info("Scheduler is disabled in configuration") - return - - self.is_running = True - self.stop_event.clear() - - self.scheduler_thread = threading.Thread(target=self._scheduler_loop, daemon=True) - self.scheduler_thread.start() - - logger.info("Job scheduler started") - - def stop(self): - """Stop the job scheduler""" - if not self.is_running: - return - - self.is_running = False - self.stop_event.set() - - if self.scheduler_thread: - self.scheduler_thread.join(timeout=5) - - logger.info("Job scheduler stopped") - - def _scheduler_loop(self): - """Main scheduler loop""" - logger.info("Scheduler loop started") - - while self.is_running and not self.stop_event.is_set(): - try: - current_time = datetime.now(self.timezone) - - # Check each job - for job_name, job in self.jobs.items(): - if job.should_run(current_time) and self.current_jobs < self.max_concurrent_jobs: - # Execute job in background - threading.Thread( - target=self._execute_job_wrapper, - args=(job,), - daemon=True - ).start() - - # Sleep for 60 seconds (check every minute) - self.stop_event.wait(60) - - except Exception as e: - logger.error(f"Error in scheduler loop: {e}") - self.stop_event.wait(60) - - logger.info("Scheduler loop stopped") - - def _execute_job_wrapper(self, job: ScheduledJob): - """Wrapper for job execution with proper error handling""" - with self.job_lock: - self.current_jobs += 1 - - try: - job.mark_started() - - # Create asyncio event loop for this thread - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - - try: - # Execute job with timeout - success = loop.run_until_complete( - asyncio.wait_for( - self.registry.execute_job(job), - timeout=job.timeout_minutes * 60 - ) - ) - - job.mark_completed(success) - - if not success and job.retry_on_failure and job.failure_count < job.max_retries: - logger.info(f"Job {job.name} failed, will retry later") - # Schedule retry (next run will be calculated) - job.calculate_next_run(datetime.now(self.timezone) + timedelta(minutes=30)) - - except asyncio.TimeoutError: - logger.error(f"Job {job.name} timed out after {job.timeout_minutes} minutes") - job.mark_completed(False) - finally: - loop.close() - - except Exception as e: - logger.error(f"Error executing job {job.name}: {e}") - job.mark_completed(False) - finally: - with self.job_lock: - self.current_jobs -= 1 - - def get_job_status(self, job_name: str = None) -> Dict[str, Any]: - """Get status of jobs""" - if job_name: - job = self.jobs.get(job_name) - if job: - return job.to_dict() - else: - return {"error": f"Job {job_name} not found"} - - return { - "scheduler_running": self.is_running, - "total_jobs": len(self.jobs), - "enabled_jobs": sum(1 for job in self.jobs.values() if job.enabled), - "running_jobs": sum(1 for job in self.jobs.values() if job.is_running), - "jobs": {name: 
job.to_dict() for name, job in self.jobs.items()} - } - - def enable_job(self, job_name: str) -> bool: - """Enable a job""" - if job_name in self.jobs: - self.jobs[job_name].enabled = True - self.jobs[job_name].calculate_next_run() - logger.info(f"Enabled job: {job_name}") - return True - return False - - def disable_job(self, job_name: str) -> bool: - """Disable a job""" - if job_name in self.jobs: - self.jobs[job_name].enabled = False - logger.info(f"Disabled job: {job_name}") - return True - return False - - def trigger_job(self, job_name: str) -> bool: - """Manually trigger a job""" - if job_name not in self.jobs: - return False - - job = self.jobs[job_name] - if job.is_running: - logger.warning(f"Job {job_name} is already running") - return False - - if self.current_jobs >= self.max_concurrent_jobs: - logger.warning(f"Maximum concurrent jobs reached, cannot start {job_name}") - return False - - # Execute job immediately - threading.Thread( - target=self._execute_job_wrapper, - args=(job,), - daemon=True - ).start() - - logger.info(f"Manually triggered job: {job_name}") - return True - - def update_job_schedule(self, job_name: str, new_schedule: str) -> bool: - """Update job schedule""" - if job_name not in self.jobs: - return False - - try: - # Validate cron expression - croniter(new_schedule) - - job = self.jobs[job_name] - job.schedule = new_schedule - job.calculate_next_run() - - logger.info(f"Updated schedule for job {job_name}: {new_schedule}") - return True - - except Exception as e: - logger.error(f"Invalid cron expression {new_schedule}: {e}") - return False - - def reload_config(self) -> bool: - """Reload configuration from file""" - try: - self.load_config() - logger.info("Configuration reloaded successfully") - return True - except Exception as e: - logger.error(f"Error reloading configuration: {e}") - return False - -# Global scheduler instance -scheduler_instance: Optional[JobScheduler] = None - -def get_scheduler() -> JobScheduler: - """Get the global scheduler instance""" - global scheduler_instance - if scheduler_instance is None: - scheduler_instance = JobScheduler() - return scheduler_instance - -def initialize_scheduler(config_path: str = None) -> JobScheduler: - """Initialize the global scheduler""" - global scheduler_instance - if config_path: - scheduler_instance = JobScheduler(config_path) - else: - scheduler_instance = JobScheduler() - return scheduler_instance \ No newline at end of file diff --git a/backend/main.py b/backend/main.py deleted file mode 100644 index 8e5936a..0000000 --- a/backend/main.py +++ /dev/null @@ -1,1945 +0,0 @@ -from fastapi import FastAPI, HTTPException, BackgroundTasks, Depends -from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import JSONResponse -from sqlalchemy import create_engine, Column, String, Text, DECIMAL, TIMESTAMP, Boolean, ARRAY, Integer, JSON, func, or_ -from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import sessionmaker, Session -from sqlalchemy.dialects.postgresql import UUID -import uuid -from datetime import datetime, timedelta -import requests -import json -import re -import os -from typing import List, Optional -from pydantic import BaseModel -import asyncio -from contextlib import asynccontextmanager -import base64 -from github import Github -from urllib.parse import urlparse -import hashlib -import logging -import threading -from mcdevitt_poc_client import GitHubPoCClient -from cve2capec_client import CVE2CAPECClient - -# Setup logging 
-logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - - -# Database setup -DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://cve_user:cve_password@localhost:5432/cve_sigma_db") -engine = create_engine(DATABASE_URL) -SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) -Base = declarative_base() - -# Database Models -class CVE(Base): - __tablename__ = "cves" - - id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) - cve_id = Column(String(20), unique=True, nullable=False) - description = Column(Text) - cvss_score = Column(DECIMAL(3, 1)) - severity = Column(String(20)) - published_date = Column(TIMESTAMP) - modified_date = Column(TIMESTAMP) - affected_products = Column(ARRAY(String)) - reference_urls = Column(ARRAY(String)) - # Bulk processing fields - data_source = Column(String(20), default='nvd_api') # 'nvd_api', 'nvd_bulk', 'manual' - nvd_json_version = Column(String(10), default='2.0') - bulk_processed = Column(Boolean, default=False) - # nomi-sec PoC fields - poc_count = Column(Integer, default=0) - poc_data = Column(JSON) # Store nomi-sec PoC metadata - # Reference data fields - reference_data = Column(JSON) # Store extracted reference content and analysis - reference_sync_status = Column(String(20), default='pending') # 'pending', 'processing', 'completed', 'failed' - reference_last_synced = Column(TIMESTAMP) - created_at = Column(TIMESTAMP, default=datetime.utcnow) - updated_at = Column(TIMESTAMP, default=datetime.utcnow) - -class SigmaRule(Base): - __tablename__ = "sigma_rules" - - id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) - cve_id = Column(String(20)) - rule_name = Column(String(255), nullable=False) - rule_content = Column(Text, nullable=False) - detection_type = Column(String(50)) - log_source = Column(String(100)) - confidence_level = Column(String(20)) - auto_generated = Column(Boolean, default=True) - exploit_based = Column(Boolean, default=False) - github_repos = Column(ARRAY(String)) - exploit_indicators = Column(Text) # JSON string of extracted indicators - # Enhanced fields for new data sources - poc_source = Column(String(20), default='github_search') # 'github_search', 'nomi_sec', 'manual' - poc_quality_score = Column(Integer, default=0) # Based on star count, activity, etc. 
- nomi_sec_data = Column(JSON) # Store nomi-sec PoC metadata - created_at = Column(TIMESTAMP, default=datetime.utcnow) - updated_at = Column(TIMESTAMP, default=datetime.utcnow) - -class RuleTemplate(Base): - __tablename__ = "rule_templates" - - id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) - template_name = Column(String(255), nullable=False) - template_content = Column(Text, nullable=False) - applicable_product_patterns = Column(ARRAY(String)) - description = Column(Text) - created_at = Column(TIMESTAMP, default=datetime.utcnow) - - -# Pydantic models -class CVEResponse(BaseModel): - id: str - cve_id: str - description: Optional[str] = None - cvss_score: Optional[float] = None - severity: Optional[str] = None - published_date: Optional[datetime] = None - affected_products: Optional[List[str]] = None - reference_urls: Optional[List[str]] = None - poc_count: Optional[int] = 0 - poc_data: Optional[dict] = {} - - class Config: - from_attributes = True - -class SigmaRuleResponse(BaseModel): - id: str - cve_id: str - rule_name: str - rule_content: str - detection_type: Optional[str] = None - log_source: Optional[str] = None - confidence_level: Optional[str] = None - auto_generated: bool = True - exploit_based: bool = False - github_repos: Optional[List[str]] = None - exploit_indicators: Optional[str] = None - created_at: datetime - - class Config: - from_attributes = True - -# Request models -class BulkSeedRequest(BaseModel): - start_year: int = 2002 - end_year: Optional[int] = None - skip_nvd: bool = False - skip_nomi_sec: bool = True - -class NomiSecSyncRequest(BaseModel): - cve_id: Optional[str] = None - batch_size: int = 50 - -class GitHubPoCSyncRequest(BaseModel): - cve_id: Optional[str] = None - batch_size: int = 50 - -class ExploitDBSyncRequest(BaseModel): - cve_id: Optional[str] = None - batch_size: int = 30 - -class CISAKEVSyncRequest(BaseModel): - cve_id: Optional[str] = None - batch_size: int = 100 - -class ReferenceSyncRequest(BaseModel): - cve_id: Optional[str] = None - batch_size: int = 30 - max_cves: Optional[int] = None - force_resync: bool = False - -class RuleRegenRequest(BaseModel): - force: bool = False - -# GitHub Exploit Analysis Service -class GitHubExploitAnalyzer: - def __init__(self): - self.github_token = os.getenv("GITHUB_TOKEN") - self.github = Github(self.github_token) if self.github_token else None - - async def search_exploits_for_cve(self, cve_id: str) -> List[dict]: - """Search GitHub for exploit code related to a CVE""" - if not self.github: - print(f"No GitHub token configured, skipping exploit search for {cve_id}") - return [] - - try: - print(f"Searching GitHub for exploits for {cve_id}") - - # Search queries to find exploit code - search_queries = [ - f"{cve_id} exploit", - f"{cve_id} poc", - f"{cve_id} vulnerability", - f'"{cve_id}" exploit code', - f"{cve_id.replace('-', '_')} exploit" - ] - - exploits = [] - seen_repos = set() - - for query in search_queries[:2]: # Limit to 2 queries to avoid rate limits - try: - # Search repositories - repos = self.github.search_repositories( - query=query, - sort="updated", - order="desc" - ) - - # Get top 5 results per query - for repo in repos[:5]: - if repo.full_name in seen_repos: - continue - seen_repos.add(repo.full_name) - - # Analyze repository - exploit_info = await self._analyze_repository(repo, cve_id) - if exploit_info: - exploits.append(exploit_info) - - if len(exploits) >= 10: # Limit total exploits - break - - if len(exploits) >= 10: - break - - except Exception as e: - 
print(f"Error searching GitHub with query '{query}': {str(e)}") - continue - - print(f"Found {len(exploits)} potential exploits for {cve_id}") - return exploits - - except Exception as e: - print(f"Error searching GitHub for {cve_id}: {str(e)}") - return [] - - async def _analyze_repository(self, repo, cve_id: str) -> Optional[dict]: - """Analyze a GitHub repository for exploit code""" - try: - # Check if repo name or description mentions the CVE - repo_text = f"{repo.name} {repo.description or ''}".lower() - if cve_id.lower() not in repo_text and cve_id.replace('-', '_').lower() not in repo_text: - return None - - # Get repository contents - exploit_files = [] - indicators = { - 'processes': set(), - 'files': set(), - 'registry': set(), - 'network': set(), - 'commands': set(), - 'powershell': set(), - 'urls': set() - } - - try: - contents = repo.get_contents("") - for content in contents[:20]: # Limit files to analyze - if content.type == "file" and self._is_exploit_file(content.name): - file_analysis = await self._analyze_file_content(repo, content, cve_id) - if file_analysis: - exploit_files.append(file_analysis) - # Merge indicators - for key, values in file_analysis.get('indicators', {}).items(): - if key in indicators: - indicators[key].update(values) - - except Exception as e: - print(f"Error analyzing repo contents for {repo.full_name}: {str(e)}") - - if not exploit_files: - return None - - return { - 'repo_name': repo.full_name, - 'repo_url': repo.html_url, - 'description': repo.description, - 'language': repo.language, - 'stars': repo.stargazers_count, - 'updated': repo.updated_at.isoformat(), - 'files': exploit_files, - 'indicators': {k: list(v) for k, v in indicators.items()} - } - - except Exception as e: - print(f"Error analyzing repository {repo.full_name}: {str(e)}") - return None - - def _is_exploit_file(self, filename: str) -> bool: - """Check if a file is likely to contain exploit code""" - exploit_extensions = ['.py', '.ps1', '.sh', '.c', '.cpp', '.js', '.rb', '.pl', '.php', '.java'] - exploit_names = ['exploit', 'poc', 'payload', 'shell', 'reverse', 'bind', 'attack'] - - filename_lower = filename.lower() - - # Check extension - if not any(filename_lower.endswith(ext) for ext in exploit_extensions): - return False - - # Check filename for exploit-related terms - return any(term in filename_lower for term in exploit_names) or 'cve' in filename_lower - - async def _analyze_file_content(self, repo, file_content, cve_id: str) -> Optional[dict]: - """Analyze individual file content for exploit indicators""" - try: - if file_content.size > 100000: # Skip files larger than 100KB - return None - - # Decode file content - content = file_content.decoded_content.decode('utf-8', errors='ignore') - - # Check if file actually mentions the CVE - if cve_id.lower() not in content.lower() and cve_id.replace('-', '_').lower() not in content.lower(): - return None - - indicators = self._extract_indicators_from_code(content, file_content.name) - - if not any(indicators.values()): - return None - - return { - 'filename': file_content.name, - 'path': file_content.path, - 'size': file_content.size, - 'indicators': indicators - } - - except Exception as e: - print(f"Error analyzing file {file_content.name}: {str(e)}") - return None - - def _extract_indicators_from_code(self, content: str, filename: str) -> dict: - """Extract security indicators from exploit code""" - indicators = { - 'processes': set(), - 'files': set(), - 'registry': set(), - 'network': set(), - 'commands': set(), - 
'powershell': set(), - 'urls': set() - } - - # Process patterns - process_patterns = [ - r'CreateProcess[AW]?\s*\(\s*["\']([^"\']+)["\']', - r'ShellExecute[AW]?\s*\([^,]*,\s*["\']([^"\']+)["\']', - r'system\s*\(\s*["\']([^"\']+)["\']', - r'exec\s*\(\s*["\']([^"\']+)["\']', - r'subprocess\.(?:call|run|Popen)\s*\(\s*["\']([^"\']+)["\']' - ] - - # File patterns - file_patterns = [ - r'(?:fopen|CreateFile|WriteFile|ReadFile)\s*\(\s*["\']([^"\']+\.[a-zA-Z0-9]+)["\']', - r'(?:copy|move|del|rm)\s+["\']?([^\s"\']+\.[a-zA-Z0-9]+)["\']?', - r'\\\\[^\\]+\\[^\\]+\\([^\\]+\.[a-zA-Z0-9]+)', - r'[C-Z]:\\\\[^\\]+\\\\([^\\]+\.[a-zA-Z0-9]+)' - ] - - # Registry patterns - registry_patterns = [ - r'(?:RegOpenKey|RegSetValue|RegCreateKey)\s*\([^,]*,\s*["\']([^"\']+)["\']', - r'HKEY_[A-Z_]+\\\\([^"\'\\]+)', - r'reg\s+add\s+["\']?([^"\'\\]+\\\\[^"\']+)["\']?' - ] - - # Network patterns - network_patterns = [ - r'(?:connect|bind|listen)\s*\([^,]*,\s*(\d+)', - r'socket\.connect\s*\(\s*\(["\']?([^"\']+)["\']?,\s*(\d+)\)', - r'(?:http|https|ftp)://([^\s"\'<>]+)', - r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+)' - ] - - # PowerShell patterns - powershell_patterns = [ - r'(?:powershell|pwsh)\s+(?:-[a-zA-Z]+\s+)*["\']?([^"\']+)["\']?', - r'Invoke-(?:Expression|Command|WebRequest|RestMethod)\s+["\']?([^"\']+)["\']?', - r'Start-Process\s+["\']?([^"\']+)["\']?', - r'Get-Process\s+["\']?([^"\']+)["\']?' - ] - - # Command patterns - command_patterns = [ - r'(?:cmd|command)\s+(?:/[a-zA-Z]+\s+)*["\']?([^"\']+)["\']?', - r'(?:ping|nslookup|netstat|tasklist|wmic)\s+([^\s"\']+)', - r'(?:net|sc|schtasks)\s+[a-zA-Z]+\s+([^\s"\']+)' - ] - - # Extract indicators using regex patterns - patterns = { - 'processes': process_patterns, - 'files': file_patterns, - 'registry': registry_patterns, - 'powershell': powershell_patterns, - 'commands': command_patterns - } - - for category, pattern_list in patterns.items(): - for pattern in pattern_list: - matches = re.findall(pattern, content, re.IGNORECASE | re.MULTILINE) - for match in matches: - if isinstance(match, tuple): - indicators[category].add(match[0]) - else: - indicators[category].add(match) - - # Special handling for network indicators - for pattern in network_patterns: - matches = re.findall(pattern, content, re.IGNORECASE) - for match in matches: - if isinstance(match, tuple): - if len(match) >= 2: - indicators['network'].add(f"{match[0]}:{match[1]}") - else: - indicators['network'].add(match[0]) - else: - indicators['network'].add(match) - - # Convert sets to lists and filter out empty/invalid indicators - cleaned_indicators = {} - for key, values in indicators.items(): - cleaned_values = [v for v in values if v and len(v.strip()) > 2 and len(v) < 200] - if cleaned_values: - cleaned_indicators[key] = cleaned_values[:10] # Limit to 10 per category - - return cleaned_indicators -class CVESigmaService: - def __init__(self, db: Session): - self.db = db - self.nvd_api_key = os.getenv("NVD_API_KEY") - - async def fetch_recent_cves(self, days_back: int = 7): - """Fetch recent CVEs from NVD API""" - end_date = datetime.utcnow() - start_date = end_date - timedelta(days=days_back) - - url = "https://services.nvd.nist.gov/rest/json/cves/2.0" - params = { - "pubStartDate": start_date.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z", - "pubEndDate": end_date.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z", - "resultsPerPage": 100 - } - - headers = {} - if self.nvd_api_key: - headers["apiKey"] = self.nvd_api_key - - try: - response = requests.get(url, params=params, headers=headers, timeout=30) - 
response.raise_for_status() - data = response.json() - - new_cves = [] - for vuln in data.get("vulnerabilities", []): - cve_data = vuln.get("cve", {}) - cve_id = cve_data.get("id") - - # Check if CVE already exists - existing = self.db.query(CVE).filter(CVE.cve_id == cve_id).first() - if existing: - continue - - # Extract CVE information - description = "" - if cve_data.get("descriptions"): - description = cve_data["descriptions"][0].get("value", "") - - cvss_score = None - severity = None - if cve_data.get("metrics", {}).get("cvssMetricV31"): - cvss_data = cve_data["metrics"]["cvssMetricV31"][0] - cvss_score = cvss_data.get("cvssData", {}).get("baseScore") - severity = cvss_data.get("cvssData", {}).get("baseSeverity") - - affected_products = [] - if cve_data.get("configurations"): - for config in cve_data["configurations"]: - for node in config.get("nodes", []): - for cpe_match in node.get("cpeMatch", []): - if cpe_match.get("vulnerable"): - affected_products.append(cpe_match.get("criteria", "")) - - reference_urls = [] - if cve_data.get("references"): - reference_urls = [ref.get("url", "") for ref in cve_data["references"]] - - cve_obj = CVE( - cve_id=cve_id, - description=description, - cvss_score=cvss_score, - severity=severity, - published_date=datetime.fromisoformat(cve_data.get("published", "").replace("Z", "+00:00")), - modified_date=datetime.fromisoformat(cve_data.get("lastModified", "").replace("Z", "+00:00")), - affected_products=affected_products, - reference_urls=reference_urls - ) - - self.db.add(cve_obj) - new_cves.append(cve_obj) - - self.db.commit() - return new_cves - - except Exception as e: - print(f"Error fetching CVEs: {str(e)}") - return [] - - def generate_sigma_rule(self, cve: CVE) -> Optional[SigmaRule]: - """Generate SIGMA rule based on CVE data""" - if not cve.description: - return None - - # Analyze CVE to determine appropriate template - description_lower = cve.description.lower() - affected_products = [p.lower() for p in (cve.affected_products or [])] - - template = self._select_template(description_lower, affected_products) - if not template: - return None - - # Generate rule content - rule_content = self._populate_template(cve, template) - if not rule_content: - return None - - # Determine detection type and confidence - detection_type = self._determine_detection_type(description_lower) - confidence_level = self._calculate_confidence(cve) - - sigma_rule = SigmaRule( - cve_id=cve.cve_id, - rule_name=f"CVE-{cve.cve_id.split('-')[1]}-{cve.cve_id.split('-')[2]} Detection", - rule_content=rule_content, - detection_type=detection_type, - log_source=template.template_name.lower().replace(" ", "_"), - confidence_level=confidence_level, - auto_generated=True - ) - - self.db.add(sigma_rule) - return sigma_rule - - def _select_template(self, description: str, affected_products: List[str], exploit_indicators: dict = None): - """Select appropriate SIGMA rule template based on CVE and exploit analysis""" - templates = self.db.query(RuleTemplate).all() - - # If we have exploit indicators, use them to determine the best template - if exploit_indicators: - if exploit_indicators.get('powershell'): - powershell_template = next((t for t in templates if "PowerShell" in t.template_name), None) - if powershell_template: - return powershell_template - - if exploit_indicators.get('network'): - network_template = next((t for t in templates if "Network Connection" in t.template_name), None) - if network_template: - return network_template - - if exploit_indicators.get('files'): - 
file_template = next((t for t in templates if "File Modification" in t.template_name), None) - if file_template: - return file_template - - if exploit_indicators.get('processes') or exploit_indicators.get('commands'): - process_template = next((t for t in templates if "Process Execution" in t.template_name), None) - if process_template: - return process_template - - # Fallback to original logic - if any("windows" in p or "microsoft" in p for p in affected_products): - if "process" in description or "execution" in description: - return next((t for t in templates if "Process Execution" in t.template_name), None) - elif "network" in description or "remote" in description: - return next((t for t in templates if "Network Connection" in t.template_name), None) - elif "file" in description or "write" in description: - return next((t for t in templates if "File Modification" in t.template_name), None) - - # Default to process execution template - return next((t for t in templates if "Process Execution" in t.template_name), None) - - def _populate_template(self, cve: CVE, template: RuleTemplate, exploit_indicators: dict = None) -> str: - """Populate template with CVE-specific data and exploit indicators""" - try: - # Use exploit indicators if available, otherwise extract from description - if exploit_indicators: - suspicious_processes = exploit_indicators.get('processes', []) + exploit_indicators.get('commands', []) - suspicious_ports = [] - file_patterns = exploit_indicators.get('files', []) - - # Extract ports from network indicators - for net_indicator in exploit_indicators.get('network', []): - if ':' in str(net_indicator): - try: - port = int(str(net_indicator).split(':')[-1]) - suspicious_ports.append(port) - except ValueError: - pass - else: - # Fallback to original extraction - suspicious_processes = self._extract_suspicious_indicators(cve.description, "process") - suspicious_ports = self._extract_suspicious_indicators(cve.description, "port") - file_patterns = self._extract_suspicious_indicators(cve.description, "file") - - # Determine severity level - level = "high" if cve.cvss_score and cve.cvss_score >= 7.0 else "medium" - - # Create enhanced description - enhanced_description = cve.description[:200] + "..." 
if len(cve.description) > 200 else cve.description - if exploit_indicators: - enhanced_description += " [Enhanced with GitHub exploit analysis]" - - # Build tags - tags = [f"attack.{self._get_mitre_technique(cve.description, exploit_indicators)}", cve.cve_id.lower()] - if exploit_indicators: - tags.append("exploit.github") - - rule_content = template.template_content.format( - title=f"CVE-{cve.cve_id} {'Exploit-Based ' if exploit_indicators else ''}Detection", - description=enhanced_description, - rule_id=str(uuid.uuid4()), - date=datetime.utcnow().strftime("%Y/%m/%d"), - cve_url=f"https://nvd.nist.gov/vuln/detail/{cve.cve_id}", - cve_id=cve.cve_id.lower(), - tags="\n - ".join(tags), - suspicious_processes=suspicious_processes or ["suspicious.exe", "malware.exe"], - suspicious_ports=suspicious_ports or [4444, 8080, 9999], - file_patterns=file_patterns or ["temp", "malware", "exploit"], - level=level - ) - - return rule_content - - except Exception as e: - print(f"Error populating template: {str(e)}") - return None - - def _get_mitre_technique(self, description: str, exploit_indicators: dict = None) -> str: - """Map CVE and exploit indicators to MITRE ATT&CK techniques""" - desc_lower = description.lower() - - # Check exploit indicators first - if exploit_indicators: - if exploit_indicators.get('powershell'): - return "t1059.001" # PowerShell - elif exploit_indicators.get('commands'): - return "t1059.003" # Windows Command Shell - elif exploit_indicators.get('network'): - return "t1071.001" # Web Protocols - elif exploit_indicators.get('files'): - return "t1105" # Ingress Tool Transfer - elif exploit_indicators.get('processes'): - return "t1106" # Native API - - # Fallback to description analysis - if "powershell" in desc_lower: - return "t1059.001" - elif "command" in desc_lower or "cmd" in desc_lower: - return "t1059.003" - elif "network" in desc_lower or "remote" in desc_lower: - return "t1071.001" - elif "file" in desc_lower or "upload" in desc_lower: - return "t1105" - elif "process" in desc_lower or "execution" in desc_lower: - return "t1106" - else: - return "execution" # Generic - - def _extract_suspicious_indicators(self, description: str, indicator_type: str) -> List: - """Extract suspicious indicators from CVE description""" - if indicator_type == "process": - # Look for executable names or process patterns - exe_pattern = re.findall(r'(\w+\.exe)', description, re.IGNORECASE) - return exe_pattern[:5] if exe_pattern else None - - elif indicator_type == "port": - # Look for port numbers - port_pattern = re.findall(r'port\s+(\d+)', description, re.IGNORECASE) - return [int(p) for p in port_pattern[:3]] if port_pattern else None - - elif indicator_type == "file": - # Look for file extensions or paths - file_pattern = re.findall(r'(\w+\.\w{3,4})', description, re.IGNORECASE) - return file_pattern[:5] if file_pattern else None - - return None - - def _determine_detection_type(self, description: str, exploit_indicators: dict = None) -> str: - """Determine detection type based on CVE description and exploit indicators""" - if exploit_indicators: - if exploit_indicators.get('powershell'): - return "powershell" - elif exploit_indicators.get('network'): - return "network" - elif exploit_indicators.get('files'): - return "file" - elif exploit_indicators.get('processes') or exploit_indicators.get('commands'): - return "process" - - # Fallback to original logic - if "remote" in description or "network" in description: - return "network" - elif "process" in description or "execution" in 
description: - return "process" - elif "file" in description or "filesystem" in description: - return "file" - else: - return "general" - - def _calculate_confidence(self, cve: CVE, exploit_based: bool = False) -> str: - """Calculate confidence level for the generated rule""" - base_confidence = 0 - - # CVSS score contributes to confidence - if cve.cvss_score: - if cve.cvss_score >= 9.0: - base_confidence += 3 - elif cve.cvss_score >= 7.0: - base_confidence += 2 - else: - base_confidence += 1 - - # Exploit-based rules get higher confidence - if exploit_based: - base_confidence += 2 - - # Map to confidence levels - if base_confidence >= 4: - return "high" - elif base_confidence >= 2: - return "medium" - else: - return "low" - -# Dependency -def get_db(): - db = SessionLocal() - try: - yield db - finally: - db.close() - -# Background task to fetch CVEs and generate rules -async def background_cve_fetch(): - retry_count = 0 - max_retries = 3 - - while True: - try: - db = SessionLocal() - service = CVESigmaService(db) - current_time = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S') - print(f"[{current_time}] Starting CVE fetch cycle...") - - # Use a longer initial period (30 days) to find CVEs - new_cves = await service.fetch_recent_cves(days_back=30) - - if new_cves: - print(f"Found {len(new_cves)} new CVEs, generating SIGMA rules...") - rules_generated = 0 - for cve in new_cves: - try: - sigma_rule = service.generate_sigma_rule(cve) - if sigma_rule: - rules_generated += 1 - print(f"Generated SIGMA rule for {cve.cve_id}") - else: - print(f"Could not generate rule for {cve.cve_id} - insufficient data") - except Exception as e: - print(f"Error generating rule for {cve.cve_id}: {str(e)}") - - db.commit() - print(f"Successfully generated {rules_generated} SIGMA rules") - retry_count = 0 # Reset retry count on success - else: - print("No new CVEs found in this cycle") - # After first successful run, reduce to 7 days for regular updates - if retry_count == 0: - print("Switching to 7-day lookback for future runs...") - - db.close() - - except Exception as e: - retry_count += 1 - print(f"Background task error (attempt {retry_count}/{max_retries}): {str(e)}") - if retry_count >= max_retries: - print(f"Max retries reached, waiting longer before next attempt...") - await asyncio.sleep(1800) # Wait 30 minutes on repeated failures - retry_count = 0 - else: - await asyncio.sleep(300) # Wait 5 minutes before retry - continue - - # Wait 1 hour before next fetch (or 30 minutes if there were errors) - wait_time = 3600 if retry_count == 0 else 1800 - print(f"Next CVE fetch in {wait_time//60} minutes...") - await asyncio.sleep(wait_time) - -@asynccontextmanager -async def lifespan(app: FastAPI): - # Initialize database - Base.metadata.create_all(bind=engine) - - # Initialize rule templates - db = SessionLocal() - try: - existing_templates = db.query(RuleTemplate).count() - if existing_templates == 0: - logger.info("No rule templates found. 
Database initialization will handle template creation.") - except Exception as e: - logger.error(f"Error checking rule templates: {e}") - finally: - db.close() - - # Note: Job scheduling is now handled by Celery Beat - # All scheduled tasks are defined in celery_config.py - logger.info("Application startup complete - scheduled tasks handled by Celery Beat") - - yield - - # Shutdown - logger.info("Application shutdown complete") - -# FastAPI app -app = FastAPI(title="CVE-SIGMA Auto Generator", lifespan=lifespan) - -app.add_middleware( - CORSMiddleware, - allow_origins=["http://localhost:3000"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) - -# Include Celery job management routes -try: - from routers.celery_jobs import router as celery_router - app.include_router(celery_router, prefix="/api") - logger.info("Celery job routes loaded successfully") -except ImportError as e: - logger.warning(f"Celery job routes not available: {e}") -except Exception as e: - logger.error(f"Error loading Celery job routes: {e}") - -@app.get("/api/cves", response_model=dict) -async def get_cves( - skip: int = 0, - limit: int = 50, - search: Optional[str] = None, - severity: Optional[str] = None, - db: Session = Depends(get_db) -): - # Build query with filters - query = db.query(CVE) - - # Search filter - if search: - search_filter = f"%{search}%" - query = query.filter( - or_( - CVE.cve_id.ilike(search_filter), - CVE.description.ilike(search_filter), - CVE.affected_products.any(search_filter) - ) - ) - - # Severity filter - if severity: - query = query.filter(CVE.severity.ilike(severity)) - - # Get total count for pagination - total_count = query.count() - - # Apply pagination and ordering - cves = query.order_by(CVE.published_date.desc()).offset(skip).limit(limit).all() - - # Convert UUID to string for each CVE - result = [] - for cve in cves: - cve_dict = { - 'id': str(cve.id), - 'cve_id': cve.cve_id, - 'description': cve.description, - 'cvss_score': float(cve.cvss_score) if cve.cvss_score else None, - 'severity': cve.severity, - 'published_date': cve.published_date, - 'affected_products': cve.affected_products, - 'reference_urls': cve.reference_urls, - 'poc_count': cve.poc_count or 0, - 'poc_data': cve.poc_data or {} - } - result.append(CVEResponse(**cve_dict)) - - return { - 'cves': result, - 'total': total_count, - 'skip': skip, - 'limit': limit, - 'has_more': skip + limit < total_count - } - -@app.get("/api/cves/{cve_id}", response_model=CVEResponse) -async def get_cve(cve_id: str, db: Session = Depends(get_db)): - cve = db.query(CVE).filter(CVE.cve_id == cve_id).first() - if not cve: - raise HTTPException(status_code=404, detail="CVE not found") - - cve_dict = { - 'id': str(cve.id), - 'cve_id': cve.cve_id, - 'description': cve.description, - 'cvss_score': float(cve.cvss_score) if cve.cvss_score else None, - 'severity': cve.severity, - 'published_date': cve.published_date, - 'affected_products': cve.affected_products, - 'reference_urls': cve.reference_urls, - 'poc_count': cve.poc_count or 0, - 'poc_data': cve.poc_data or {} - } - return CVEResponse(**cve_dict) - -@app.get("/api/sigma-rules", response_model=List[SigmaRuleResponse]) -async def get_sigma_rules(skip: int = 0, limit: int = 50, db: Session = Depends(get_db)): - rules = db.query(SigmaRule).order_by(SigmaRule.created_at.desc()).offset(skip).limit(limit).all() - # Convert UUID to string for each rule - result = [] - for rule in rules: - rule_dict = { - 'id': str(rule.id), - 'cve_id': rule.cve_id, - 'rule_name': 
rule.rule_name, - 'rule_content': rule.rule_content, - 'detection_type': rule.detection_type, - 'log_source': rule.log_source, - 'confidence_level': rule.confidence_level, - 'auto_generated': rule.auto_generated, - 'exploit_based': rule.exploit_based or False, - 'github_repos': rule.github_repos or [], - 'exploit_indicators': rule.exploit_indicators, - 'created_at': rule.created_at - } - result.append(SigmaRuleResponse(**rule_dict)) - return result - -@app.get("/api/sigma-rules/{cve_id}", response_model=List[SigmaRuleResponse]) -async def get_sigma_rules_by_cve(cve_id: str, db: Session = Depends(get_db)): - rules = db.query(SigmaRule).filter(SigmaRule.cve_id == cve_id).all() - # Convert UUID to string for each rule - result = [] - for rule in rules: - rule_dict = { - 'id': str(rule.id), - 'cve_id': rule.cve_id, - 'rule_name': rule.rule_name, - 'rule_content': rule.rule_content, - 'detection_type': rule.detection_type, - 'log_source': rule.log_source, - 'confidence_level': rule.confidence_level, - 'auto_generated': rule.auto_generated, - 'exploit_based': rule.exploit_based or False, - 'github_repos': rule.github_repos or [], - 'exploit_indicators': rule.exploit_indicators, - 'created_at': rule.created_at - } - result.append(SigmaRuleResponse(**rule_dict)) - return result - -@app.post("/api/fetch-cves") -async def manual_fetch_cves(background_tasks: BackgroundTasks, db: Session = Depends(get_db)): - async def fetch_task(): - try: - service = CVESigmaService(db) - print("Manual CVE fetch initiated...") - # Use 30 days for manual fetch to get more results - new_cves = await service.fetch_recent_cves(days_back=30) - - rules_generated = 0 - for cve in new_cves: - sigma_rule = service.generate_sigma_rule(cve) - if sigma_rule: - rules_generated += 1 - - db.commit() - print(f"Manual fetch complete: {len(new_cves)} CVEs, {rules_generated} rules generated") - except Exception as e: - print(f"Manual fetch error: {str(e)}") - import traceback - traceback.print_exc() - - background_tasks.add_task(fetch_task) - return {"message": "CVE fetch initiated (30-day lookback)", "status": "started"} - -@app.get("/api/test-nvd") -async def test_nvd_connection(): - """Test endpoint to check NVD API connectivity""" - try: - # Test with a simple request using current date - end_date = datetime.utcnow() - start_date = end_date - timedelta(days=30) - - url = "https://services.nvd.nist.gov/rest/json/cves/2.0/" - params = { - "lastModStartDate": start_date.strftime("%Y-%m-%dT%H:%M:%S.000+00:00"), - "lastModEndDate": end_date.strftime("%Y-%m-%dT%H:%M:%S.000+00:00"), - "resultsPerPage": 5, - "startIndex": 0 - } - - headers = { - "User-Agent": "CVE-SIGMA-Generator/1.0", - "Accept": "application/json" - } - - nvd_api_key = os.getenv("NVD_API_KEY") - if nvd_api_key: - headers["apiKey"] = nvd_api_key - - print(f"Testing NVD API with URL: {url}") - print(f"Test params: {params}") - print(f"Test headers: {headers}") - - response = requests.get(url, params=params, headers=headers, timeout=15) - - result = { - "status": "success" if response.status_code == 200 else "error", - "status_code": response.status_code, - "has_api_key": bool(nvd_api_key), - "request_url": f"{url}?{requests.compat.urlencode(params)}", - "response_headers": dict(response.headers) - } - - if response.status_code == 200: - data = response.json() - result.update({ - "total_results": data.get("totalResults", 0), - "results_per_page": data.get("resultsPerPage", 0), - "vulnerabilities_returned": len(data.get("vulnerabilities", [])), - "message": "NVD API is 
accessible and returning data" - }) - else: - result.update({ - "error_message": response.text[:200], - "message": f"NVD API returned {response.status_code}" - }) - - # Try fallback without date filters if we get 404 - if response.status_code == 404: - print("Trying fallback without date filters...") - fallback_params = { - "resultsPerPage": 5, - "startIndex": 0 - } - fallback_response = requests.get(url, params=fallback_params, headers=headers, timeout=15) - result["fallback_status_code"] = fallback_response.status_code - - if fallback_response.status_code == 200: - fallback_data = fallback_response.json() - result.update({ - "fallback_success": True, - "fallback_total_results": fallback_data.get("totalResults", 0), - "message": "NVD API works without date filters" - }) - - return result - - except Exception as e: - print(f"NVD API test error: {str(e)}") - return { - "status": "error", - "message": f"Failed to connect to NVD API: {str(e)}" - } - -@app.get("/api/stats") -async def get_stats(db: Session = Depends(get_db)): - total_cves = db.query(CVE).count() - total_rules = db.query(SigmaRule).count() - recent_cves = db.query(CVE).filter(CVE.published_date >= datetime.utcnow() - timedelta(days=7)).count() - - # Enhanced stats with bulk processing info - bulk_processed_cves = db.query(CVE).filter(CVE.bulk_processed == True).count() - cves_with_pocs = db.query(CVE).filter(CVE.poc_count > 0).count() - nomi_sec_rules = db.query(SigmaRule).filter(SigmaRule.poc_source == 'nomi_sec').count() - - return { - "total_cves": total_cves, - "total_sigma_rules": total_rules, - "recent_cves_7_days": recent_cves, - "bulk_processed_cves": bulk_processed_cves, - "cves_with_pocs": cves_with_pocs, - "nomi_sec_rules": nomi_sec_rules, - "poc_coverage": (cves_with_pocs / total_cves * 100) if total_cves > 0 else 0, - "nomi_sec_coverage": (nomi_sec_rules / total_rules * 100) if total_rules > 0 else 0 - } - -# New bulk processing endpoints -@app.post("/api/bulk-seed") -async def start_bulk_seed(request: BulkSeedRequest): - """Start bulk seeding process - redirects to async endpoint""" - try: - from routers.celery_jobs import start_bulk_seed as async_bulk_seed - from routers.celery_jobs import BulkSeedRequest as CeleryBulkSeedRequest - - # Convert request to Celery format - celery_request = CeleryBulkSeedRequest( - start_year=request.start_year, - end_year=request.end_year, - skip_nvd=request.skip_nvd, - skip_nomi_sec=request.skip_nomi_sec, - skip_exploitdb=getattr(request, 'skip_exploitdb', False), - skip_cisa_kev=getattr(request, 'skip_cisa_kev', False) - ) - - # Call async endpoint - result = await async_bulk_seed(celery_request) - - return { - "message": "Bulk seeding process started (async)", - "status": "started", - "task_id": result.task_id, - "start_year": request.start_year, - "end_year": request.end_year or datetime.now().year, - "skip_nvd": request.skip_nvd, - "skip_nomi_sec": request.skip_nomi_sec, - "async_endpoint": f"/api/task-status/{result.task_id}" - } - except Exception as e: - logger.error(f"Error starting bulk seed: {e}") - raise HTTPException(status_code=500, detail=f"Failed to start bulk seed: {e}") - -@app.post("/api/incremental-update") -async def start_incremental_update(): - """Start incremental update process - redirects to async endpoint""" - try: - from routers.celery_jobs import start_incremental_update as async_incremental_update - - # Call async endpoint - result = await async_incremental_update() - - return { - "message": "Incremental update process started (async)", - "status": 
"started", - "task_id": result.task_id, - "async_endpoint": f"/api/task-status/{result.task_id}" - } - except Exception as e: - logger.error(f"Error starting incremental update: {e}") - raise HTTPException(status_code=500, detail=f"Failed to start incremental update: {e}") - -@app.post("/api/sync-nomi-sec") -async def sync_nomi_sec(request: NomiSecSyncRequest): - """Synchronize nomi-sec PoC data - redirects to async endpoint""" - try: - from routers.celery_jobs import start_nomi_sec_sync as async_nomi_sec_sync - from routers.celery_jobs import DataSyncRequest as CeleryDataSyncRequest - - # Convert request to Celery format - celery_request = CeleryDataSyncRequest( - batch_size=request.batch_size - ) - - # Call async endpoint - result = await async_nomi_sec_sync(celery_request) - - return { - "message": f"Nomi-sec sync started (async)" + (f" for {request.cve_id}" if request.cve_id else " for all CVEs"), - "status": "started", - "task_id": result.task_id, - "cve_id": request.cve_id, - "batch_size": request.batch_size, - "async_endpoint": f"/api/task-status/{result.task_id}" - } - except Exception as e: - logger.error(f"Error starting nomi-sec sync: {e}") - raise HTTPException(status_code=500, detail=f"Failed to start nomi-sec sync: {e}") - -@app.post("/api/sync-github-pocs") -async def sync_github_pocs(request: GitHubPoCSyncRequest, - db: Session = Depends(get_db)): - """Synchronize GitHub PoC data using Celery task""" - try: - from celery_config import celery_app - from tasks.data_sync_tasks import sync_github_poc_task - - # Launch Celery task - if request.cve_id: - # For specific CVE sync, we'll still use the general task - task_result = sync_github_poc_task.delay(batch_size=request.batch_size) - else: - # For bulk sync - task_result = sync_github_poc_task.delay(batch_size=request.batch_size) - - return { - "message": f"GitHub PoC sync started via Celery" + (f" for {request.cve_id}" if request.cve_id else " for all CVEs"), - "status": "started", - "task_id": task_result.id, - "cve_id": request.cve_id, - "batch_size": request.batch_size, - "monitor_url": "http://localhost:5555/task/" + task_result.id - } - - except Exception as e: - logger.error(f"Error starting GitHub PoC sync via Celery: {e}") - raise HTTPException(status_code=500, detail=f"Failed to start GitHub PoC sync: {e}") - -@app.post("/api/sync-exploitdb") -async def sync_exploitdb(request: ExploitDBSyncRequest, db: Session = Depends(get_db)): - """Synchronize ExploitDB data from git mirror""" - - try: - # Import Celery task - from tasks.data_sync_tasks import sync_exploitdb_task - - # Start Celery task - task_result = sync_exploitdb_task.delay(batch_size=request.batch_size) - - return { - "message": f"ExploitDB sync started" + (f" for {request.cve_id}" if request.cve_id else " for all CVEs"), - "status": "started", - "task_id": task_result.id, - "monitor_url": f"http://localhost:5555/task/{task_result.id}", - "batch_size": request.batch_size - } - except Exception as e: - logger.error(f"Error starting ExploitDB sync via Celery: {e}") - raise HTTPException(status_code=500, detail=f"Failed to start ExploitDB sync: {e}") - -@app.post("/api/sync-cisa-kev") -async def sync_cisa_kev(request: CISAKEVSyncRequest, db: Session = Depends(get_db)): - """Synchronize CISA Known Exploited Vulnerabilities data""" - - try: - # Import Celery task - from tasks.data_sync_tasks import sync_cisa_kev_task - - # Start Celery task - task_result = sync_cisa_kev_task.delay(batch_size=request.batch_size) - - return { - "message": f"CISA KEV sync started" + 
(f" for {request.cve_id}" if request.cve_id else " for all CVEs"), - "status": "started", - "task_id": task_result.id, - "monitor_url": f"http://localhost:5555/task/{task_result.id}", - "batch_size": request.batch_size - } - except Exception as e: - logger.error(f"Error starting CISA KEV sync via Celery: {e}") - raise HTTPException(status_code=500, detail=f"Failed to start CISA KEV sync: {e}") - -@app.post("/api/sync-references") -async def sync_references(request: ReferenceSyncRequest, background_tasks: BackgroundTasks, db: Session = Depends(get_db)): - """Start reference data synchronization""" - - try: - from reference_client import ReferenceClient - client = ReferenceClient(db) - - # Create job ID - job_id = str(uuid.uuid4()) - - # Add job to tracking - running_jobs[job_id] = { - 'type': 'reference_sync', - 'status': 'running', - 'cve_id': request.cve_id, - 'batch_size': request.batch_size, - 'max_cves': request.max_cves, - 'force_resync': request.force_resync, - 'started_at': datetime.utcnow() - } - - # Create cancellation flag - job_cancellation_flags[job_id] = False - - async def sync_task(): - try: - if request.cve_id: - # Single CVE sync - result = await client.sync_cve_references(request.cve_id) - running_jobs[job_id]['result'] = result - running_jobs[job_id]['status'] = 'completed' - else: - # Bulk sync - result = await client.bulk_sync_references( - batch_size=request.batch_size, - max_cves=request.max_cves, - force_resync=request.force_resync, - cancellation_flag=lambda: job_cancellation_flags.get(job_id, False) - ) - running_jobs[job_id]['result'] = result - running_jobs[job_id]['status'] = 'completed' - - running_jobs[job_id]['completed_at'] = datetime.utcnow() - - except Exception as e: - logger.error(f"Reference sync task failed: {e}") - running_jobs[job_id]['status'] = 'failed' - running_jobs[job_id]['error'] = str(e) - running_jobs[job_id]['completed_at'] = datetime.utcnow() - finally: - # Clean up cancellation flag - job_cancellation_flags.pop(job_id, None) - - background_tasks.add_task(sync_task) - - return { - "message": f"Reference sync started" + (f" for {request.cve_id}" if request.cve_id else " for all CVEs"), - "status": "started", - "job_id": job_id, - "cve_id": request.cve_id, - "batch_size": request.batch_size, - "max_cves": request.max_cves, - "force_resync": request.force_resync - } - - except Exception as e: - logger.error(f"Failed to start reference sync: {e}") - raise HTTPException(status_code=500, detail=f"Failed to start reference sync: {str(e)}") - -@app.get("/api/reference-stats") -async def get_reference_stats(db: Session = Depends(get_db)): - """Get reference synchronization statistics""" - - try: - from reference_client import ReferenceClient - client = ReferenceClient(db) - - # Get sync status - status = await client.get_reference_sync_status() - - # Get quality distribution from reference data - quality_distribution = {} - from sqlalchemy import text - cves_with_references = db.query(CVE).filter( - text("reference_data::text LIKE '%\"reference_analysis\"%'") - ).all() - - for cve in cves_with_references: - if cve.reference_data and 'reference_analysis' in cve.reference_data: - ref_analysis = cve.reference_data['reference_analysis'] - high_conf_refs = ref_analysis.get('high_confidence_references', 0) - total_refs = ref_analysis.get('reference_count', 0) - - if total_refs > 0: - quality_ratio = high_conf_refs / total_refs - if quality_ratio >= 0.8: - quality_tier = 'excellent' - elif quality_ratio >= 0.6: - quality_tier = 'good' - elif 
quality_ratio >= 0.4: - quality_tier = 'fair' - else: - quality_tier = 'poor' - - quality_distribution[quality_tier] = quality_distribution.get(quality_tier, 0) + 1 - - # Get reference type distribution - reference_type_distribution = {} - for cve in cves_with_references: - if cve.reference_data and 'reference_analysis' in cve.reference_data: - ref_analysis = cve.reference_data['reference_analysis'] - ref_types = ref_analysis.get('reference_types', []) - for ref_type in ref_types: - reference_type_distribution[ref_type] = reference_type_distribution.get(ref_type, 0) + 1 - - return { - 'reference_sync_status': status, - 'quality_distribution': quality_distribution, - 'reference_type_distribution': reference_type_distribution, - 'total_with_reference_analysis': len(cves_with_references), - 'source': 'reference_extraction' - } - - except Exception as e: - logger.error(f"Failed to get reference stats: {e}") - raise HTTPException(status_code=500, detail=f"Failed to get reference stats: {str(e)}") - -@app.get("/api/exploitdb-stats") -async def get_exploitdb_stats(db: Session = Depends(get_db)): - """Get ExploitDB-related statistics""" - - try: - from exploitdb_client_local import ExploitDBLocalClient - client = ExploitDBLocalClient(db) - - # Get sync status - status = await client.get_exploitdb_sync_status() - - # Get quality distribution from ExploitDB data - quality_distribution = {} - from sqlalchemy import text - cves_with_exploitdb = db.query(CVE).filter( - text("poc_data::text LIKE '%\"exploitdb\"%'") - ).all() - - for cve in cves_with_exploitdb: - if cve.poc_data and 'exploitdb' in cve.poc_data: - exploits = cve.poc_data['exploitdb'].get('exploits', []) - for exploit in exploits: - quality_tier = exploit.get('quality_analysis', {}).get('quality_tier', 'unknown') - quality_distribution[quality_tier] = quality_distribution.get(quality_tier, 0) + 1 - - # Get category distribution - category_distribution = {} - for cve in cves_with_exploitdb: - if cve.poc_data and 'exploitdb' in cve.poc_data: - exploits = cve.poc_data['exploitdb'].get('exploits', []) - for exploit in exploits: - category = exploit.get('category', 'unknown') - category_distribution[category] = category_distribution.get(category, 0) + 1 - - return { - "exploitdb_sync_status": status, - "quality_distribution": quality_distribution, - "category_distribution": category_distribution, - "total_exploitdb_cves": len(cves_with_exploitdb), - "total_exploits": sum( - len(cve.poc_data.get('exploitdb', {}).get('exploits', [])) - for cve in cves_with_exploitdb - if cve.poc_data and 'exploitdb' in cve.poc_data - ) - } - - except Exception as e: - logger.error(f"Error getting ExploitDB stats: {e}") - raise HTTPException(status_code=500, detail=str(e)) - -@app.get("/api/github-poc-stats") -async def get_github_poc_stats(db: Session = Depends(get_db)): - """Get GitHub PoC-related statistics""" - - try: - # Get basic statistics - github_poc_rules = db.query(SigmaRule).filter(SigmaRule.poc_source == 'github_poc').count() - cves_with_github_pocs = db.query(CVE).filter( - CVE.poc_data.isnot(None), # Check if poc_data exists - func.json_extract_path_text(CVE.poc_data, '0', 'source') == 'github_poc' - ).count() - - # Get quality distribution - quality_distribution = {} - try: - quality_results = db.query( - func.json_extract_path_text(CVE.poc_data, '0', 'quality_analysis', 'quality_tier').label('tier'), - func.count().label('count') - ).filter( - CVE.poc_data.isnot(None), - func.json_extract_path_text(CVE.poc_data, '0', 'source') == 'github_poc' - 
).group_by('tier').all() - - for tier, count in quality_results: - if tier: - quality_distribution[tier] = count - except Exception as e: - logger.warning(f"Error getting quality distribution: {e}") - quality_distribution = {} - - # Calculate average quality score - try: - avg_quality = db.query( - func.avg(func.json_extract_path_text(CVE.poc_data, '0', 'quality_analysis', 'quality_score').cast(Integer)) - ).filter( - CVE.poc_data.isnot(None), - func.json_extract_path_text(CVE.poc_data, '0', 'source') == 'github_poc' - ).scalar() or 0 - except Exception as e: - logger.warning(f"Error calculating average quality: {e}") - avg_quality = 0 - - return { - 'github_poc_rules': github_poc_rules, - 'cves_with_github_pocs': cves_with_github_pocs, - 'quality_distribution': quality_distribution, - 'average_quality_score': float(avg_quality) if avg_quality else 0, - 'source': 'github_poc' - } - except Exception as e: - logger.error(f"Error getting GitHub PoC stats: {e}") - return {"error": str(e)} - -@app.get("/api/github-poc-status") -async def get_github_poc_status(db: Session = Depends(get_db)): - """Get GitHub PoC data availability status""" - - try: - client = GitHubPoCClient(db) - - # Check if GitHub PoC data is available - github_poc_data = client.load_github_poc_data() - - return { - 'github_poc_data_available': len(github_poc_data) > 0, - 'total_cves_with_pocs': len(github_poc_data), - 'sample_cve_ids': list(github_poc_data.keys())[:10], # First 10 CVE IDs - 'data_path': str(client.github_poc_path), - 'path_exists': client.github_poc_path.exists() - } - except Exception as e: - logger.error(f"Error checking GitHub PoC status: {e}") - return {"error": str(e)} - -@app.get("/api/cisa-kev-stats") -async def get_cisa_kev_stats(db: Session = Depends(get_db)): - """Get CISA KEV-related statistics""" - - try: - from cisa_kev_client import CISAKEVClient - client = CISAKEVClient(db) - - # Get sync status - status = await client.get_kev_sync_status() - - # Get threat level distribution from CISA KEV data - threat_level_distribution = {} - from sqlalchemy import text - cves_with_kev = db.query(CVE).filter( - text("poc_data::text LIKE '%\"cisa_kev\"%'") - ).all() - - for cve in cves_with_kev: - if cve.poc_data and 'cisa_kev' in cve.poc_data: - vuln_data = cve.poc_data['cisa_kev'].get('vulnerability_data', {}) - threat_level = vuln_data.get('threat_level', 'unknown') - threat_level_distribution[threat_level] = threat_level_distribution.get(threat_level, 0) + 1 - - # Get vulnerability category distribution - category_distribution = {} - for cve in cves_with_kev: - if cve.poc_data and 'cisa_kev' in cve.poc_data: - vuln_data = cve.poc_data['cisa_kev'].get('vulnerability_data', {}) - category = vuln_data.get('vulnerability_category', 'unknown') - category_distribution[category] = category_distribution.get(category, 0) + 1 - - # Get ransomware usage statistics - ransomware_stats = {'known': 0, 'unknown': 0} - for cve in cves_with_kev: - if cve.poc_data and 'cisa_kev' in cve.poc_data: - vuln_data = cve.poc_data['cisa_kev'].get('vulnerability_data', {}) - ransomware_use = vuln_data.get('known_ransomware_use', 'Unknown').lower() - if ransomware_use == 'known': - ransomware_stats['known'] += 1 - else: - ransomware_stats['unknown'] += 1 - - # Calculate average threat score - threat_scores = [] - for cve in cves_with_kev: - if cve.poc_data and 'cisa_kev' in cve.poc_data: - vuln_data = cve.poc_data['cisa_kev'].get('vulnerability_data', {}) - threat_score = vuln_data.get('threat_score', 0) - if threat_score: - 
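
The removed GitHub PoC statistics endpoint above filters the JSON PoC payload with SQLAlchemy's `json_extract_path_text` helper. For anyone reproducing those counts against the legacy database during migration, here is a minimal raw-SQL sketch of the same filter; the `cves` table name and the connection string are assumptions, while the JSON paths mirror the removed query.

```python
# Minimal sketch of the removed github-poc quality query as raw SQL.
# Assumptions: the legacy table is named "cves" and the DSN is a placeholder.
from sqlalchemy import create_engine, text

engine = create_engine("postgresql://user:pass@localhost/cve_sigma")

query = text("""
    SELECT poc_data #>> '{0,quality_analysis,quality_tier}' AS tier,
           count(*) AS cnt
    FROM cves
    WHERE poc_data IS NOT NULL
      AND poc_data #>> '{0,source}' = 'github_poc'
    GROUP BY tier
""")

with engine.connect() as conn:
    for tier, cnt in conn.execute(query):
        print(tier or "unknown", cnt)
```
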
threat_scores.append(threat_score) - - avg_threat_score = sum(threat_scores) / len(threat_scores) if threat_scores else 0 - - return { - "cisa_kev_sync_status": status, - "threat_level_distribution": threat_level_distribution, - "category_distribution": category_distribution, - "ransomware_stats": ransomware_stats, - "average_threat_score": round(avg_threat_score, 2), - "total_kev_cves": len(cves_with_kev), - "total_with_threat_scores": len(threat_scores) - } - - except Exception as e: - logger.error(f"Error getting CISA KEV stats: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - - -@app.get("/api/poc-stats") -async def get_poc_stats(db: Session = Depends(get_db)): - """Get PoC-related statistics""" - - try: - from nomi_sec_client import NomiSecClient - client = NomiSecClient(db) - stats = await client.get_sync_status() - - # Additional PoC statistics - high_quality_cves = db.query(CVE).filter( - CVE.poc_count > 0, - func.json_extract_path_text(CVE.poc_data, '0', 'quality_analysis', 'quality_score').cast(Integer) > 60 - ).count() - - stats.update({ - 'high_quality_cves': high_quality_cves, - 'avg_poc_count': db.query(func.avg(CVE.poc_count)).filter(CVE.poc_count > 0).scalar() or 0 - }) - - return stats - except Exception as e: - logger.error(f"Error getting PoC stats: {e}") - return {"error": str(e)} - -@app.post("/api/sync-cve2capec") -async def sync_cve2capec(force_refresh: bool = False): - """Synchronize CVE2CAPEC MITRE ATT&CK mappings using Celery task""" - try: - from celery_config import celery_app - from tasks.data_sync_tasks import sync_cve2capec_task - - # Launch Celery task - task_result = sync_cve2capec_task.delay(force_refresh=force_refresh) - - return { - "message": "CVE2CAPEC MITRE ATT&CK mapping sync started via Celery", - "status": "started", - "task_id": task_result.id, - "force_refresh": force_refresh, - "monitor_url": f"http://localhost:5555/task/{task_result.id}" - } - - except ImportError as e: - logger.error(f"Failed to import Celery components: {e}") - raise HTTPException(status_code=500, detail="Celery not properly configured") - except Exception as e: - logger.error(f"Error starting CVE2CAPEC sync: {e}") - raise HTTPException(status_code=500, detail=f"Failed to start CVE2CAPEC sync: {e}") - -@app.post("/api/build-exploitdb-index") -async def build_exploitdb_index(): - """Build/rebuild ExploitDB file index using Celery task""" - try: - from celery_config import celery_app - from tasks.data_sync_tasks import build_exploitdb_index_task - - # Launch Celery task - task_result = build_exploitdb_index_task.delay() - - return { - "message": "ExploitDB file index build started via Celery", - "status": "started", - "task_id": task_result.id, - "monitor_url": f"http://localhost:5555/task/{task_result.id}" - } - - except ImportError as e: - logger.error(f"Failed to import Celery components: {e}") - raise HTTPException(status_code=500, detail="Celery not properly configured") - except Exception as e: - logger.error(f"Error starting ExploitDB index build: {e}") - raise HTTPException(status_code=500, detail=f"Failed to start ExploitDB index build: {e}") - -@app.get("/api/cve2capec-stats") -async def get_cve2capec_stats(): - """Get CVE2CAPEC MITRE ATT&CK mapping statistics""" - - try: - client = CVE2CAPECClient() - stats = client.get_stats() - - return { - "status": "success", - "data": stats, - "description": "CVE to MITRE ATT&CK technique mappings from CVE2CAPEC repository" - } - except Exception as e: - logger.error(f"Error getting CVE2CAPEC stats: {e}") - return 
{"error": str(e)} - -@app.post("/api/regenerate-rules") -async def regenerate_sigma_rules(background_tasks: BackgroundTasks, - request: RuleRegenRequest, - db: Session = Depends(get_db)): - """Regenerate SIGMA rules using enhanced nomi-sec data""" - - async def regenerate_task(): - try: - from enhanced_sigma_generator import EnhancedSigmaGenerator - generator = EnhancedSigmaGenerator(db) - - # Get CVEs with PoC data - cves_with_pocs = db.query(CVE).filter(CVE.poc_count > 0).all() - - rules_generated = 0 - rules_updated = 0 - - for cve in cves_with_pocs: - # Check if we should regenerate - existing_rule = db.query(SigmaRule).filter( - SigmaRule.cve_id == cve.cve_id - ).first() - - if existing_rule and existing_rule.poc_source == 'nomi_sec' and not request.force: - continue - - # Generate enhanced rule - result = await generator.generate_enhanced_rule(cve) - - if result['success']: - if existing_rule: - rules_updated += 1 - else: - rules_generated += 1 - - logger.info(f"Rule regeneration completed: {rules_generated} new, {rules_updated} updated") - - except Exception as e: - logger.error(f"Rule regeneration failed: {e}") - import traceback - traceback.print_exc() - - background_tasks.add_task(regenerate_task) - - return { - "message": "SIGMA rule regeneration started", - "status": "started", - "force": request.force - } - -@app.post("/api/llm-enhanced-rules") -async def generate_llm_enhanced_rules(request: dict, background_tasks: BackgroundTasks, db: Session = Depends(get_db)): - """Generate SIGMA rules using LLM API for enhanced analysis""" - - # Parse request parameters - cve_id = request.get('cve_id') - force = request.get('force', False) - llm_provider = request.get('provider', os.getenv('LLM_PROVIDER')) - llm_model = request.get('model', os.getenv('LLM_MODEL')) - - # Validation - if cve_id and not re.match(r'^CVE-\d{4}-\d{4,}$', cve_id): - raise HTTPException(status_code=400, detail="Invalid CVE ID format") - - async def llm_generation_task(): - """Background task for LLM-enhanced rule generation""" - try: - from enhanced_sigma_generator import EnhancedSigmaGenerator - - generator = EnhancedSigmaGenerator(db, llm_provider, llm_model) - - # Process specific CVE or all CVEs with PoC data - if cve_id: - cve = db.query(CVE).filter(CVE.cve_id == cve_id).first() - if not cve: - logger.error(f"CVE {cve_id} not found") - return - - cves_to_process = [cve] - else: - # Process CVEs with PoC data that either have no rules or force update - query = db.query(CVE).filter(CVE.poc_count > 0) - - if not force: - # Only process CVEs without existing LLM-generated rules - existing_llm_rules = db.query(SigmaRule).filter( - SigmaRule.detection_type.like('llm_%') - ).all() - existing_cve_ids = {rule.cve_id for rule in existing_llm_rules} - cves_to_process = [cve for cve in query.all() if cve.cve_id not in existing_cve_ids] - else: - cves_to_process = query.all() - - logger.info(f"Processing {len(cves_to_process)} CVEs for LLM-enhanced rule generation using {llm_provider}") - - rules_generated = 0 - rules_updated = 0 - failures = 0 - - for cve in cves_to_process: - try: - # Check if CVE has sufficient PoC data - if not cve.poc_data or not cve.poc_count: - logger.debug(f"Skipping {cve.cve_id} - no PoC data") - continue - - # Generate LLM-enhanced rule - result = await generator.generate_enhanced_rule(cve, use_llm=True) - - if result.get('success'): - if result.get('updated'): - rules_updated += 1 - else: - rules_generated += 1 - - logger.info(f"Successfully generated LLM-enhanced rule for {cve.cve_id}") - 
else: - failures += 1 - logger.warning(f"Failed to generate LLM-enhanced rule for {cve.cve_id}: {result.get('error')}") - - except Exception as e: - failures += 1 - logger.error(f"Error generating LLM-enhanced rule for {cve.cve_id}: {e}") - continue - - logger.info(f"LLM-enhanced rule generation completed: {rules_generated} new, {rules_updated} updated, {failures} failures") - - except Exception as e: - logger.error(f"LLM-enhanced rule generation failed: {e}") - import traceback - traceback.print_exc() - - background_tasks.add_task(llm_generation_task) - - return { - "message": "LLM-enhanced SIGMA rule generation started", - "status": "started", - "cve_id": cve_id, - "force": force, - "provider": llm_provider, - "model": llm_model, - "note": "Requires appropriate LLM API key to be set" - } - -@app.get("/api/llm-status") -async def get_llm_status(): - """Check LLM API availability status""" - try: - from llm_client import LLMClient - - # Get current provider configuration - provider = os.getenv('LLM_PROVIDER') - model = os.getenv('LLM_MODEL') - - client = LLMClient(provider=provider, model=model) - provider_info = client.get_provider_info() - - # Get all available providers - all_providers = LLMClient.get_available_providers() - - return { - "current_provider": provider_info, - "available_providers": all_providers, - "status": "ready" if client.is_available() else "unavailable" - } - except Exception as e: - logger.error(f"Error checking LLM status: {e}") - return { - "current_provider": {"provider": "unknown", "available": False}, - "available_providers": [], - "status": "error", - "error": str(e) - } - -@app.post("/api/llm-switch") -async def switch_llm_provider(request: dict): - """Switch LLM provider and model""" - try: - from llm_client import LLMClient - - provider = request.get('provider') - model = request.get('model') - - if not provider: - raise HTTPException(status_code=400, detail="Provider is required") - - # Validate provider - if provider not in LLMClient.SUPPORTED_PROVIDERS: - raise HTTPException(status_code=400, detail=f"Unsupported provider: {provider}") - - # Test the new configuration - client = LLMClient(provider=provider, model=model) - - if not client.is_available(): - raise HTTPException(status_code=400, detail=f"Provider {provider} is not available or not configured") - - # Update environment variables (note: this only affects the current session) - os.environ['LLM_PROVIDER'] = provider - if model: - os.environ['LLM_MODEL'] = model - - provider_info = client.get_provider_info() - - return { - "message": f"Switched to {provider}", - "provider_info": provider_info, - "status": "success" - } - - except HTTPException: - raise - except Exception as e: - logger.error(f"Error switching LLM provider: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - - -@app.post("/api/ollama-pull-model") -async def pull_ollama_model(request: dict, background_tasks: BackgroundTasks): - """Pull an Ollama model""" - try: - from llm_client import LLMClient - - model = request.get('model') - if not model: - raise HTTPException(status_code=400, detail="Model name is required") - - # Create a background task to pull the model - def pull_model_task(): - try: - client = LLMClient(provider='ollama', model=model) - base_url = os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434') - - if client._pull_ollama_model(base_url, model): - logger.info(f"Successfully pulled Ollama model: {model}") - else: - logger.error(f"Failed to pull Ollama model: {model}") - except Exception as e: - 
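
The removed /api/llm-status and /api/llm-switch handlers above are thin wrappers around the retained `llm_client` module. A short sketch of performing the same availability check from a script follows; it assumes the module keeps the interface these endpoints called (`SUPPORTED_PROVIDERS`, `get_provider_info()`, `is_available()`), and the provider value is only an example.

```python
# Sketch: checking LLM provider availability directly with the retained
# llm_client module (same calls the removed /api/llm-status endpoint made).
# The default provider below is an example; real use still needs the
# provider's API key or a running local service.
import os
from llm_client import LLMClient

provider = os.getenv("LLM_PROVIDER", "ollama")
model = os.getenv("LLM_MODEL")  # optional; the client picks its own default

client = LLMClient(provider=provider, model=model)
print("supported:", list(LLMClient.SUPPORTED_PROVIDERS))
print("current:", client.get_provider_info())
print("status:", "ready" if client.is_available() else "unavailable")
```
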
logger.error(f"Error in model pull task: {e}") - - background_tasks.add_task(pull_model_task) - - return { - "message": f"Started pulling model {model}", - "status": "started", - "model": model - } - - except Exception as e: - logger.error(f"Error starting model pull: {e}") - raise HTTPException(status_code=500, detail=str(e)) - -@app.get("/api/ollama-models") -async def get_ollama_models(): - """Get available Ollama models""" - try: - from llm_client import LLMClient - - client = LLMClient(provider='ollama') - available_models = client._get_ollama_available_models() - - return { - "available_models": available_models, - "total_models": len(available_models), - "status": "success" - } - - except Exception as e: - logger.error(f"Error getting Ollama models: {e}") - raise HTTPException(status_code=500, detail=str(e)) - -# ============================================================================ -# NOTE: SCHEDULER ENDPOINTS REMOVED -# ============================================================================ -# -# Job scheduling is now handled by Celery Beat with periodic tasks. -# All scheduled tasks are defined in celery_config.py beat_schedule. -# -# To manage scheduled tasks: -# - View tasks: Use Celery monitoring tools (Flower, Celery events) -# - Control tasks: Use Celery control commands or through Celery job management endpoints -# - Schedule changes: Update celery_config.py and restart Celery Beat -# -# Available Celery job management endpoints: -# - GET /api/celery/tasks - List all active tasks -# - POST /api/celery/tasks/{task_id}/revoke - Cancel a running task -# - GET /api/celery/workers - View worker status -# -# ============================================================================ - -if __name__ == "__main__": - import uvicorn - uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/backend/mcdevitt_poc_client.py b/backend/mcdevitt_poc_client.py index 7d1b356..1eef765 100644 --- a/backend/mcdevitt_poc_client.py +++ b/backend/mcdevitt_poc_client.py @@ -514,7 +514,7 @@ class GitHubPoCClient: async def bulk_sync_all_cves(self, batch_size: int = 50) -> dict: """Bulk synchronize all CVEs with GitHub PoC data""" - from main import CVE + from database_models import CVE # Load all GitHub PoC data first github_poc_data = self.load_github_poc_data() diff --git a/backend/scheduler_config.yaml b/backend/scheduler_config.yaml deleted file mode 100644 index 43f54e0..0000000 --- a/backend/scheduler_config.yaml +++ /dev/null @@ -1,182 +0,0 @@ -# CVE-SIGMA Auto Generator - Job Scheduler Configuration -# Cron-like scheduling for automated jobs -# -# Cron Format: minute hour day_of_month month day_of_week -# Special values: -# * = any value -# */N = every N units -# N-M = range from N to M -# N,M,O = specific values N, M, and O -# -# Examples: -# "0 */6 * * *" = Every 6 hours -# "0 2 * * *" = Daily at 2 AM -# "0 2 * * 1" = Weekly on Monday at 2 AM -# "0 0 1 * *" = Monthly on the 1st at midnight -# "*/30 * * * *" = Every 30 minutes - -scheduler: - enabled: true - timezone: "UTC" - max_concurrent_jobs: 3 - job_timeout_hours: 4 - retry_failed_jobs: true - max_retries: 2 - -jobs: - # NVD Incremental Updates - Fetch new CVEs regularly - nvd_incremental_update: - enabled: true - schedule: "0 */6 * * *" # Every 6 hours - description: "Fetch new CVEs from NVD modified feeds" - job_type: "incremental_update" - parameters: - batch_size: 100 - skip_nvd: false - skip_nomi_sec: true - priority: "high" - timeout_minutes: 60 - retry_on_failure: true - - # CISA KEV Sync - Update known exploited 
vulnerabilities - cisa_kev_sync: - enabled: true - schedule: "0 3 * * *" # Daily at 3 AM - description: "Sync CISA Known Exploited Vulnerabilities" - job_type: "cisa_kev_sync" - parameters: - batch_size: 100 - priority: "high" - timeout_minutes: 30 - retry_on_failure: true - - # Nomi-sec PoC Sync - Update proof-of-concept data (OPTIMIZED) - nomi_sec_sync: - enabled: true - schedule: "0 4 * * 1" # Weekly on Monday at 4 AM - description: "Sync nomi-sec Proof-of-Concept data (optimized)" - job_type: "nomi_sec_sync" - parameters: - batch_size: 100 # Increased batch size - max_cves: 1000 # Limit to recent/important CVEs - force_resync: false # Skip recently synced CVEs - priority: "high" # Increased priority - timeout_minutes: 60 # Reduced timeout due to optimizations - retry_on_failure: true - - # GitHub PoC Sync - Update GitHub proof-of-concept data - github_poc_sync: - enabled: true - schedule: "0 5 * * 1" # Weekly on Monday at 5 AM - description: "Sync GitHub Proof-of-Concept data" - job_type: "github_poc_sync" - parameters: - batch_size: 50 - priority: "medium" - timeout_minutes: 120 - retry_on_failure: true - - # ExploitDB Sync - Update exploit database - exploitdb_sync: - enabled: true - schedule: "0 6 * * 2" # Weekly on Tuesday at 6 AM - description: "Sync ExploitDB data" - job_type: "exploitdb_sync" - parameters: - batch_size: 30 - priority: "medium" - timeout_minutes: 90 - retry_on_failure: true - - # Reference Data Sync - Extract content from CVE references - reference_sync: - enabled: true - schedule: "0 2 * * 3" # Weekly on Wednesday at 2 AM - description: "Extract and analyze CVE reference content" - job_type: "reference_sync" - parameters: - batch_size: 30 - max_cves: 200 - force_resync: false - priority: "medium" - timeout_minutes: 180 - retry_on_failure: true - - # Rule Regeneration - Regenerate SIGMA rules with latest data - rule_regeneration: - enabled: true - schedule: "0 7 * * 4" # Weekly on Thursday at 7 AM - description: "Regenerate SIGMA rules with enhanced data" - job_type: "rule_regeneration" - parameters: - force: false - priority: "low" - timeout_minutes: 240 - retry_on_failure: false - - # Full Bulk Seed - Complete data refresh (monthly) - full_bulk_seed: - enabled: false # Disabled by default due to resource intensity - schedule: "0 1 1 * *" # Monthly on the 1st at 1 AM - description: "Complete bulk seed of all data sources" - job_type: "bulk_seed" - parameters: - start_year: 2020 - end_year: 2025 - batch_size: 100 - skip_nvd: false - skip_nomi_sec: false - priority: "low" - timeout_minutes: 1440 # 24 hours - retry_on_failure: false - - # Database Cleanup - Clean old job records and logs - database_cleanup: - enabled: true - schedule: "0 0 * * 0" # Weekly on Sunday at midnight - description: "Clean up old job records and temporary data" - job_type: "database_cleanup" - parameters: - days_to_keep: 30 - cleanup_failed_jobs: true - cleanup_logs: true - priority: "low" - timeout_minutes: 30 - retry_on_failure: false - -# Job execution policies -policies: - # Prevent overlapping jobs of the same type - prevent_overlap: true - - # Maximum job execution time before forced termination - max_execution_time_hours: 6 - - # Retry policy for failed jobs - retry_policy: - enabled: true - max_retries: 2 - retry_delay_minutes: 30 - exponential_backoff: true - - # Resource management - resource_limits: - max_memory_mb: 2048 - max_cpu_percent: 80 - - # Notification settings (future enhancement) - notifications: - enabled: false - on_success: false - on_failure: true - webhook_url: 
"" - email_recipients: [] - -# Logging configuration for scheduler -logging: - enabled: true - level: "INFO" - log_file: "/app/logs/scheduler.log" - max_log_size_mb: 100 - backup_count: 5 - log_format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" \ No newline at end of file diff --git a/backend/tasks/__init__.py b/backend/tasks/__init__.py deleted file mode 100644 index 6f650c0..0000000 --- a/backend/tasks/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -""" -Celery tasks for the Auto SIGMA Rule Generator -""" \ No newline at end of file diff --git a/backend/tasks/bulk_tasks.py b/backend/tasks/bulk_tasks.py deleted file mode 100644 index 3506b96..0000000 --- a/backend/tasks/bulk_tasks.py +++ /dev/null @@ -1,235 +0,0 @@ -""" -Bulk processing tasks for Celery -""" -import asyncio -import logging -from typing import Optional, Dict, Any -from celery import current_task -from celery_config import celery_app, get_db_session -import sys -import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from bulk_seeder import BulkSeeder - -logger = logging.getLogger(__name__) - -@celery_app.task(bind=True, name='bulk_tasks.full_bulk_seed') -def full_bulk_seed_task(self, start_year: int = 2002, end_year: Optional[int] = None, - skip_nvd: bool = False, skip_nomi_sec: bool = False, - skip_exploitdb: bool = False, skip_cisa_kev: bool = False) -> Dict[str, Any]: - """ - Celery task for full bulk seeding operation - - Args: - start_year: Starting year for NVD data - end_year: Ending year for NVD data - skip_nvd: Skip NVD bulk processing - skip_nomi_sec: Skip nomi-sec PoC synchronization - skip_exploitdb: Skip ExploitDB synchronization - skip_cisa_kev: Skip CISA KEV synchronization - - Returns: - Dictionary containing operation results - """ - db_session = get_db_session() - - try: - # Update task progress - self.update_state( - state='PROGRESS', - meta={ - 'stage': 'initializing', - 'progress': 0, - 'message': 'Starting bulk seeding operation' - } - ) - - logger.info(f"Starting full bulk seed task: {start_year}-{end_year}") - - # Create seeder instance - seeder = BulkSeeder(db_session) - - # Create progress callback - def update_progress(stage: str, progress: int, message: str = None): - self.update_state( - state='PROGRESS', - meta={ - 'stage': stage, - 'progress': progress, - 'message': message or f'Processing {stage}' - } - ) - - # Run the bulk seeding operation - # Note: We need to handle the async nature of bulk_seeder - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - - try: - result = loop.run_until_complete( - seeder.full_bulk_seed( - start_year=start_year, - end_year=end_year, - skip_nvd=skip_nvd, - skip_nomi_sec=skip_nomi_sec, - skip_exploitdb=skip_exploitdb, - skip_cisa_kev=skip_cisa_kev, - progress_callback=update_progress - ) - ) - finally: - loop.close() - - # Update final progress - self.update_state( - state='SUCCESS', - meta={ - 'stage': 'completed', - 'progress': 100, - 'message': 'Bulk seeding completed successfully' - } - ) - - logger.info(f"Full bulk seed task completed: {result}") - return result - - except Exception as e: - logger.error(f"Full bulk seed task failed: {e}") - self.update_state( - state='FAILURE', - meta={ - 'stage': 'error', - 'progress': 0, - 'message': f'Task failed: {str(e)}', - 'error': str(e) - } - ) - raise - finally: - db_session.close() - -@celery_app.task(bind=True, name='bulk_tasks.incremental_update_task') -def incremental_update_task(self) -> Dict[str, Any]: - """ - Celery task for incremental updates - - 
Returns: - Dictionary containing update results - """ - db_session = get_db_session() - - try: - # Update task progress - self.update_state( - state='PROGRESS', - meta={ - 'stage': 'incremental_update', - 'progress': 0, - 'message': 'Starting incremental update' - } - ) - - logger.info("Starting incremental update task") - - # Create seeder instance - seeder = BulkSeeder(db_session) - - # Run the incremental update - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - - try: - result = loop.run_until_complete(seeder.incremental_update()) - finally: - loop.close() - - # Update final progress - self.update_state( - state='SUCCESS', - meta={ - 'stage': 'completed', - 'progress': 100, - 'message': 'Incremental update completed successfully' - } - ) - - logger.info(f"Incremental update task completed: {result}") - return result - - except Exception as e: - logger.error(f"Incremental update task failed: {e}") - self.update_state( - state='FAILURE', - meta={ - 'stage': 'error', - 'progress': 0, - 'message': f'Task failed: {str(e)}', - 'error': str(e) - } - ) - raise - finally: - db_session.close() - -@celery_app.task(bind=True, name='bulk_tasks.generate_enhanced_sigma_rules') -def generate_enhanced_sigma_rules_task(self) -> Dict[str, Any]: - """ - Celery task for generating enhanced SIGMA rules - - Returns: - Dictionary containing generation results - """ - db_session = get_db_session() - - try: - # Update task progress - self.update_state( - state='PROGRESS', - meta={ - 'stage': 'generating_rules', - 'progress': 0, - 'message': 'Starting enhanced SIGMA rule generation' - } - ) - - logger.info("Starting enhanced SIGMA rule generation task") - - # Create seeder instance - seeder = BulkSeeder(db_session) - - # Run the rule generation - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - - try: - result = loop.run_until_complete(seeder.generate_enhanced_sigma_rules()) - finally: - loop.close() - - # Update final progress - self.update_state( - state='SUCCESS', - meta={ - 'stage': 'completed', - 'progress': 100, - 'message': 'Enhanced SIGMA rule generation completed successfully' - } - ) - - logger.info(f"Enhanced SIGMA rule generation task completed: {result}") - return result - - except Exception as e: - logger.error(f"Enhanced SIGMA rule generation task failed: {e}") - self.update_state( - state='FAILURE', - meta={ - 'stage': 'error', - 'progress': 0, - 'message': f'Task failed: {str(e)}', - 'error': str(e) - } - ) - raise - finally: - db_session.close() diff --git a/backend/tasks/data_sync_tasks.py b/backend/tasks/data_sync_tasks.py deleted file mode 100644 index 10a5eec..0000000 --- a/backend/tasks/data_sync_tasks.py +++ /dev/null @@ -1,504 +0,0 @@ -""" -Data synchronization tasks for Celery -""" -import asyncio -import logging -import sys -import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from typing import Dict, Any -from celery import current_task -from celery_config import celery_app, get_db_session -from nomi_sec_client import NomiSecClient -from exploitdb_client_local import ExploitDBLocalClient -from cisa_kev_client import CISAKEVClient -from mcdevitt_poc_client import GitHubPoCClient - -logger = logging.getLogger(__name__) - -@celery_app.task(bind=True, name='data_sync_tasks.sync_nomi_sec') -def sync_nomi_sec_task(self, batch_size: int = 50) -> Dict[str, Any]: - """ - Celery task for nomi-sec PoC synchronization - - Args: - batch_size: Number of CVEs to process in each batch - - Returns: - Dictionary containing sync results 
- """ - db_session = get_db_session() - - try: - # Update task progress - self.update_state( - state='PROGRESS', - meta={ - 'stage': 'sync_nomi_sec', - 'progress': 0, - 'message': 'Starting nomi-sec PoC synchronization' - } - ) - - logger.info(f"Starting nomi-sec sync task with batch size: {batch_size}") - - # Create client instance - client = NomiSecClient(db_session) - - # Run the synchronization - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - - try: - result = loop.run_until_complete( - client.bulk_sync_all_cves(batch_size=batch_size) - ) - finally: - loop.close() - - # Update final progress - self.update_state( - state='SUCCESS', - meta={ - 'stage': 'completed', - 'progress': 100, - 'message': 'Nomi-sec synchronization completed successfully' - } - ) - - logger.info(f"Nomi-sec sync task completed: {result}") - return result - - except Exception as e: - logger.error(f"Nomi-sec sync task failed: {e}") - self.update_state( - state='FAILURE', - meta={ - 'stage': 'error', - 'progress': 0, - 'message': f'Task failed: {str(e)}', - 'error': str(e) - } - ) - raise - finally: - db_session.close() - - -@celery_app.task(bind=True, name='data_sync_tasks.sync_github_poc') -def sync_github_poc_task(self, batch_size: int = 50) -> Dict[str, Any]: - """ - Celery task for GitHub PoC synchronization - - Args: - batch_size: Number of CVEs to process in each batch - - Returns: - Dictionary containing sync results - """ - db_session = get_db_session() - - try: - # Update task progress - self.update_state( - state='PROGRESS', - meta={ - 'stage': 'sync_github_poc', - 'progress': 0, - 'message': 'Starting GitHub PoC synchronization' - } - ) - - logger.info(f"Starting GitHub PoC sync task with batch size: {batch_size}") - - # Create client instance - client = GitHubPoCClient(db_session) - - # Run the synchronization - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - - try: - result = loop.run_until_complete( - client.bulk_sync_all_cves(batch_size=batch_size) - ) - finally: - loop.close() - - # Update final progress - self.update_state( - state='SUCCESS', - meta={ - 'stage': 'completed', - 'progress': 100, - 'message': 'GitHub PoC synchronization completed successfully' - } - ) - - logger.info(f"GitHub PoC sync task completed: {result}") - return result - - except Exception as e: - logger.error(f"GitHub PoC sync task failed: {e}") - self.update_state( - state='FAILURE', - meta={ - 'stage': 'error', - 'progress': 0, - 'message': f'Task failed: {str(e)}', - 'error': str(e) - } - ) - raise - finally: - db_session.close() - - -@celery_app.task(bind=True, name='data_sync_tasks.sync_reference_content') -def sync_reference_content_task(self, batch_size: int = 30, max_cves: int = 200, - force_resync: bool = False) -> Dict[str, Any]: - """ - Celery task for CVE reference content extraction and analysis - - Args: - batch_size: Number of CVEs to process in each batch - max_cves: Maximum number of CVEs to process - force_resync: Force re-sync of recently processed CVEs - - Returns: - Dictionary containing sync results - """ - db_session = get_db_session() - - try: - # Import here to avoid circular imports - import sys - import os - sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - from main import CVE - - # Update task progress - self.update_state( - state='PROGRESS', - meta={ - 'stage': 'sync_reference_content', - 'progress': 0, - 'message': 'Starting CVE reference content extraction' - } - ) - - logger.info(f"Starting reference content sync task - batch_size: 
{batch_size}, max_cves: {max_cves}") - - # Get CVEs to process (prioritize those with references but no extracted content) - query = db_session.query(CVE) - - if not force_resync: - # Skip CVEs that were recently processed - from datetime import datetime, timedelta - cutoff_date = datetime.utcnow() - timedelta(days=7) - query = query.filter( - (CVE.reference_content_extracted_at.is_(None)) | - (CVE.reference_content_extracted_at < cutoff_date) - ) - - # Prioritize CVEs with references - cves = query.filter(CVE.references.isnot(None)).limit(max_cves).all() - - if not cves: - logger.info("No CVEs found for reference content extraction") - return {'total_processed': 0, 'successful_extractions': 0, 'failed_extractions': 0} - - total_processed = 0 - successful_extractions = 0 - failed_extractions = 0 - - # Process CVEs in batches - for i in range(0, len(cves), batch_size): - batch = cves[i:i + batch_size] - - for j, cve in enumerate(batch): - try: - # Update progress - overall_progress = int(((i + j) / len(cves)) * 100) - self.update_state( - state='PROGRESS', - meta={ - 'stage': 'sync_reference_content', - 'progress': overall_progress, - 'message': f'Processing CVE {cve.cve_id} ({i + j + 1}/{len(cves)})', - 'current_cve': cve.cve_id, - 'processed': i + j, - 'total': len(cves) - } - ) - - # For now, simulate reference content extraction - # In a real implementation, you would create a ReferenceContentExtractor - # and extract content from CVE references - - # Mark CVE as processed - from datetime import datetime - cve.reference_content_extracted_at = datetime.utcnow() - - successful_extractions += 1 - total_processed += 1 - - # Small delay between requests - import time - time.sleep(2) - - except Exception as e: - logger.error(f"Error processing reference content for CVE {cve.cve_id}: {e}") - failed_extractions += 1 - total_processed += 1 - - # Commit after each batch - db_session.commit() - logger.info(f"Processed batch {i//batch_size + 1}/{(len(cves) + batch_size - 1)//batch_size}") - - # Final results - result = { - 'total_processed': total_processed, - 'successful_extractions': successful_extractions, - 'failed_extractions': failed_extractions, - 'extraction_rate': (successful_extractions / total_processed * 100) if total_processed > 0 else 0 - } - - # Update final progress - self.update_state( - state='SUCCESS', - meta={ - 'stage': 'completed', - 'progress': 100, - 'message': f'Reference content extraction completed: {successful_extractions} successful, {failed_extractions} failed', - 'results': result - } - ) - - logger.info(f"Reference content sync task completed: {result}") - return result - - except Exception as e: - logger.error(f"Reference content sync task failed: {e}") - self.update_state( - state='FAILURE', - meta={ - 'stage': 'error', - 'progress': 0, - 'message': f'Task failed: {str(e)}', - 'error': str(e) - } - ) - raise - finally: - db_session.close() - -@celery_app.task(bind=True, name='data_sync_tasks.sync_exploitdb') -def sync_exploitdb_task(self, batch_size: int = 30) -> Dict[str, Any]: - """ - Celery task for ExploitDB synchronization - - Args: - batch_size: Number of CVEs to process in each batch - - Returns: - Dictionary containing sync results - """ - db_session = get_db_session() - - try: - # Update task progress - self.update_state( - state='PROGRESS', - meta={ - 'stage': 'sync_exploitdb', - 'progress': 0, - 'message': 'Starting ExploitDB synchronization' - } - ) - - logger.info(f"Starting ExploitDB sync task with batch size: {batch_size}") - - # Create client 
instance - client = ExploitDBLocalClient(db_session) - - # Run the synchronization - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - - try: - result = loop.run_until_complete( - client.bulk_sync_exploitdb(batch_size=batch_size) - ) - finally: - loop.close() - - # Update final progress - self.update_state( - state='SUCCESS', - meta={ - 'stage': 'completed', - 'progress': 100, - 'message': 'ExploitDB synchronization completed successfully' - } - ) - - logger.info(f"ExploitDB sync task completed: {result}") - return result - - except Exception as e: - logger.error(f"ExploitDB sync task failed: {e}") - self.update_state( - state='FAILURE', - meta={ - 'stage': 'error', - 'progress': 0, - 'message': f'Task failed: {str(e)}', - 'error': str(e) - } - ) - raise - finally: - db_session.close() - - -@celery_app.task(bind=True, name='data_sync_tasks.sync_cisa_kev') -def sync_cisa_kev_task(self, batch_size: int = 100) -> Dict[str, Any]: - """ - Celery task for CISA KEV synchronization - - Args: - batch_size: Number of CVEs to process in each batch - - Returns: - Dictionary containing sync results - """ - db_session = get_db_session() - - try: - # Update task progress - self.update_state( - state='PROGRESS', - meta={ - 'stage': 'sync_cisa_kev', - 'progress': 0, - 'message': 'Starting CISA KEV synchronization' - } - ) - - logger.info(f"Starting CISA KEV sync task with batch size: {batch_size}") - - # Create client instance - client = CISAKEVClient(db_session) - - # Run the synchronization - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - - try: - result = loop.run_until_complete( - client.bulk_sync_kev_data(batch_size=batch_size) - ) - finally: - loop.close() - - # Update final progress - self.update_state( - state='SUCCESS', - meta={ - 'stage': 'completed', - 'progress': 100, - 'message': 'CISA KEV synchronization completed successfully' - } - ) - - logger.info(f"CISA KEV sync task completed: {result}") - return result - - except Exception as e: - logger.error(f"CISA KEV sync task failed: {e}") - self.update_state( - state='FAILURE', - meta={ - 'stage': 'error', - 'progress': 0, - 'message': f'Task failed: {str(e)}', - 'error': str(e) - } - ) - raise - finally: - db_session.close() - - -@celery_app.task(bind=True, name='data_sync_tasks.build_exploitdb_index') -def build_exploitdb_index_task(self) -> Dict[str, Any]: - """ - Celery task for building/rebuilding ExploitDB file index - - Returns: - Dictionary containing build results - """ - try: - # Update task progress - self.update_state( - state='PROGRESS', - meta={ - 'stage': 'build_exploitdb_index', - 'progress': 0, - 'message': 'Starting ExploitDB file index building' - } - ) - - logger.info("Starting ExploitDB index build task") - - # Import here to avoid circular dependencies - from exploitdb_client_local import ExploitDBLocalClient - - # Create client instance with lazy_load=False to force index building - client = ExploitDBLocalClient(None, lazy_load=False) - - # Update progress - self.update_state( - state='PROGRESS', - meta={ - 'stage': 'build_exploitdb_index', - 'progress': 50, - 'message': 'Building file index...' 
- } - ) - - # Force index rebuild - client._build_file_index() - - # Update progress to completion - self.update_state( - state='PROGRESS', - meta={ - 'stage': 'build_exploitdb_index', - 'progress': 100, - 'message': 'ExploitDB index building completed successfully' - } - ) - - result = { - 'status': 'completed', - 'total_exploits_indexed': len(client.file_index), - 'index_updated': True - } - - logger.info(f"ExploitDB index build task completed: {result}") - return result - - except Exception as e: - logger.error(f"ExploitDB index build task failed: {e}") - self.update_state( - state='FAILURE', - meta={ - 'stage': 'error', - 'progress': 0, - 'message': f'Task failed: {str(e)}', - 'error': str(e) - } - ) - raise diff --git a/backend/tasks/maintenance_tasks.py b/backend/tasks/maintenance_tasks.py deleted file mode 100644 index be1dcdb..0000000 --- a/backend/tasks/maintenance_tasks.py +++ /dev/null @@ -1,444 +0,0 @@ -""" -Maintenance tasks for Celery -""" -import logging -from datetime import datetime, timedelta -from typing import Dict, Any -from celery_config import celery_app, get_db_session - -logger = logging.getLogger(__name__) - -@celery_app.task(name='tasks.maintenance_tasks.cleanup_old_results') -def cleanup_old_results(): - """ - Periodic task to clean up old Celery results and logs - """ - try: - logger.info("Starting cleanup of old Celery results") - - # This would clean up old results from Redis - # For now, we'll just log the action - cutoff_date = datetime.utcnow() - timedelta(days=7) - - # Clean up old task results (this would be Redis cleanup) - # celery_app.backend.cleanup() - - logger.info(f"Cleanup completed for results older than {cutoff_date}") - - return { - 'status': 'completed', - 'cutoff_date': cutoff_date.isoformat(), - 'message': 'Old results cleanup completed' - } - - except Exception as e: - logger.error(f"Cleanup task failed: {e}") - raise - -@celery_app.task(name='tasks.maintenance_tasks.health_check') -def health_check(): - """ - Health check task to verify system components - """ - try: - db_session = get_db_session() - - # Check database connectivity - try: - from sqlalchemy import text - db_session.execute(text("SELECT 1")) - db_status = "healthy" - except Exception as e: - db_status = f"unhealthy: {e}" - finally: - db_session.close() - - # Check Redis connectivity - try: - import redis - redis_client = redis.Redis.from_url(celery_app.conf.broker_url) - redis_client.ping() - redis_status = "healthy" - except Exception as e: - redis_status = f"unhealthy: {e}" - - result = { - 'timestamp': datetime.utcnow().isoformat(), - 'database': db_status, - 'redis': redis_status, - 'celery': 'healthy' - } - - logger.info(f"Health check completed: {result}") - return result - - except Exception as e: - logger.error(f"Health check failed: {e}") - raise - -@celery_app.task(bind=True, name='tasks.maintenance_tasks.database_cleanup_comprehensive') -def database_cleanup_comprehensive(self, days_to_keep: int = 30, cleanup_failed_jobs: bool = True, - cleanup_logs: bool = True) -> Dict[str, Any]: - """ - Comprehensive database cleanup task - - Args: - days_to_keep: Number of days to keep old records - cleanup_failed_jobs: Whether to clean up failed job records - cleanup_logs: Whether to clean up old log entries - - Returns: - Dictionary containing cleanup results - """ - try: - from datetime import datetime, timedelta - from typing import Dict, Any - - db_session = get_db_session() - - # Update task progress - self.update_state( - state='PROGRESS', - meta={ - 'stage': 
'database_cleanup', - 'progress': 0, - 'message': 'Starting comprehensive database cleanup' - } - ) - - logger.info(f"Starting comprehensive database cleanup - keeping {days_to_keep} days") - - cutoff_date = datetime.utcnow() - timedelta(days=days_to_keep) - cleanup_results = { - 'cutoff_date': cutoff_date.isoformat(), - 'cleaned_tables': {}, - 'total_records_cleaned': 0 - } - - try: - # Import models here to avoid circular imports - import sys - import os - sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - from main import BulkProcessingJob - - # Clean up old bulk processing jobs - self.update_state( - state='PROGRESS', - meta={ - 'stage': 'database_cleanup', - 'progress': 20, - 'message': 'Cleaning up old bulk processing jobs' - } - ) - - old_jobs_query = db_session.query(BulkProcessingJob).filter( - BulkProcessingJob.created_at < cutoff_date - ) - - if cleanup_failed_jobs: - # Clean all old jobs - old_jobs_count = old_jobs_query.count() - old_jobs_query.delete() - else: - # Only clean completed jobs - old_jobs_query = old_jobs_query.filter( - BulkProcessingJob.status.in_(['completed', 'cancelled']) - ) - old_jobs_count = old_jobs_query.count() - old_jobs_query.delete() - - cleanup_results['cleaned_tables']['bulk_processing_jobs'] = old_jobs_count - cleanup_results['total_records_cleaned'] += old_jobs_count - - # Clean up old Celery task results from Redis - self.update_state( - state='PROGRESS', - meta={ - 'stage': 'database_cleanup', - 'progress': 40, - 'message': 'Cleaning up old Celery task results' - } - ) - - try: - # This would clean up old results from Redis backend - # For now, we'll simulate this - celery_cleanup_count = 0 - # celery_app.backend.cleanup() - cleanup_results['cleaned_tables']['celery_results'] = celery_cleanup_count - except Exception as e: - logger.warning(f"Could not clean Celery results: {e}") - cleanup_results['cleaned_tables']['celery_results'] = 0 - - # Clean up old temporary data (if any) - self.update_state( - state='PROGRESS', - meta={ - 'stage': 'database_cleanup', - 'progress': 60, - 'message': 'Cleaning up temporary data' - } - ) - - # Add any custom temporary table cleanup here - # Example: Clean up old session data, temporary files, etc. 
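
A few lines below, the removed cleanup task issues `VACUUM` through the ORM session; PostgreSQL rejects VACUUM inside a transaction block, so that call would normally land in the surrounding except-branch and report `database_optimized: False`. If the same optimization is ever wanted from the CLI, it has to go through an autocommit connection, roughly as sketched here (the connection string is a placeholder):

```python
# VACUUM must run outside a transaction; use an autocommit connection.
from sqlalchemy import create_engine, text

engine = create_engine("postgresql://localhost/cve_sigma")

with engine.connect().execution_options(isolation_level="AUTOCOMMIT") as conn:
    conn.execute(text("VACUUM;"))
```
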
- temp_cleanup_count = 0 - cleanup_results['cleaned_tables']['temporary_data'] = temp_cleanup_count - - # Vacuum/optimize database (PostgreSQL) - self.update_state( - state='PROGRESS', - meta={ - 'stage': 'database_cleanup', - 'progress': 80, - 'message': 'Optimizing database' - } - ) - - try: - # Run VACUUM on PostgreSQL to reclaim space - from sqlalchemy import text - db_session.execute(text("VACUUM;")) - cleanup_results['database_optimized'] = True - except Exception as e: - logger.warning(f"Could not vacuum database: {e}") - cleanup_results['database_optimized'] = False - - # Commit all changes - db_session.commit() - - # Update final progress - self.update_state( - state='SUCCESS', - meta={ - 'stage': 'completed', - 'progress': 100, - 'message': f'Database cleanup completed - removed {cleanup_results["total_records_cleaned"]} records', - 'results': cleanup_results - } - ) - - logger.info(f"Database cleanup completed: {cleanup_results}") - return cleanup_results - - finally: - db_session.close() - - except Exception as e: - logger.error(f"Database cleanup failed: {e}") - self.update_state( - state='FAILURE', - meta={ - 'stage': 'error', - 'progress': 0, - 'message': f'Cleanup failed: {str(e)}', - 'error': str(e) - } - ) - raise - -@celery_app.task(bind=True, name='tasks.maintenance_tasks.health_check_detailed') -def health_check_detailed(self) -> Dict[str, Any]: - """ - Detailed health check task for all system components - - Returns: - Dictionary containing detailed health status - """ - try: - from datetime import datetime - import psutil - import redis - - # Update task progress - self.update_state( - state='PROGRESS', - meta={ - 'stage': 'health_check', - 'progress': 0, - 'message': 'Starting detailed health check' - } - ) - - logger.info("Starting detailed health check") - - health_status = { - 'timestamp': datetime.utcnow().isoformat(), - 'overall_status': 'healthy', - 'components': {} - } - - # Check database connectivity and performance - self.update_state( - state='PROGRESS', - meta={ - 'stage': 'health_check', - 'progress': 20, - 'message': 'Checking database health' - } - ) - - db_session = get_db_session() - try: - from sqlalchemy import text - start_time = datetime.utcnow() - db_session.execute(text("SELECT 1")) - db_response_time = (datetime.utcnow() - start_time).total_seconds() - - # Check database size and connections - db_size_result = db_session.execute(text("SELECT pg_size_pretty(pg_database_size(current_database()));")).fetchone() - db_connections_result = db_session.execute(text("SELECT count(*) FROM pg_stat_activity;")).fetchone() - - health_status['components']['database'] = { - 'status': 'healthy', - 'response_time_seconds': db_response_time, - 'database_size': db_size_result[0] if db_size_result else 'unknown', - 'active_connections': db_connections_result[0] if db_connections_result else 0, - 'details': 'Database responsive and accessible' - } - except Exception as e: - health_status['components']['database'] = { - 'status': 'unhealthy', - 'error': str(e), - 'details': 'Database connection failed' - } - health_status['overall_status'] = 'degraded' - finally: - db_session.close() - - # Check Redis connectivity and performance - self.update_state( - state='PROGRESS', - meta={ - 'stage': 'health_check', - 'progress': 40, - 'message': 'Checking Redis health' - } - ) - - try: - import redis - start_time = datetime.utcnow() - redis_client = redis.Redis.from_url(celery_app.conf.broker_url) - redis_client.ping() - redis_response_time = (datetime.utcnow() - 
start_time).total_seconds() - - # Get Redis info - redis_client = redis.Redis.from_url(celery_app.conf.broker_url) - redis_info = redis_client.info() - - health_status['components']['redis'] = { - 'status': 'healthy', - 'response_time_seconds': redis_response_time, - 'memory_usage_mb': redis_info.get('used_memory', 0) / (1024 * 1024), - 'connected_clients': redis_info.get('connected_clients', 0), - 'uptime_seconds': redis_info.get('uptime_in_seconds', 0), - 'details': 'Redis responsive and accessible' - } - except Exception as e: - health_status['components']['redis'] = { - 'status': 'unhealthy', - 'error': str(e), - 'details': 'Redis connection failed' - } - health_status['overall_status'] = 'degraded' - - # Check system resources - self.update_state( - state='PROGRESS', - meta={ - 'stage': 'health_check', - 'progress': 60, - 'message': 'Checking system resources' - } - ) - - try: - cpu_percent = psutil.cpu_percent(interval=1) - memory = psutil.virtual_memory() - disk = psutil.disk_usage('/') - - health_status['components']['system'] = { - 'status': 'healthy', - 'cpu_percent': cpu_percent, - 'memory_percent': memory.percent, - 'memory_available_gb': memory.available / (1024**3), - 'disk_percent': disk.percent, - 'disk_free_gb': disk.free / (1024**3), - 'details': 'System resources within normal ranges' - } - - # Mark as degraded if resources are high - if cpu_percent > 80 or memory.percent > 85 or disk.percent > 90: - health_status['components']['system']['status'] = 'degraded' - health_status['overall_status'] = 'degraded' - health_status['components']['system']['details'] = 'High resource usage detected' - - except Exception as e: - health_status['components']['system'] = { - 'status': 'unknown', - 'error': str(e), - 'details': 'Could not check system resources' - } - - # Check Celery worker status - self.update_state( - state='PROGRESS', - meta={ - 'stage': 'health_check', - 'progress': 80, - 'message': 'Checking Celery workers' - } - ) - - try: - inspect = celery_app.control.inspect() - active_workers = inspect.active() - stats = inspect.stats() - - health_status['components']['celery'] = { - 'status': 'healthy', - 'active_workers': len(active_workers) if active_workers else 0, - 'worker_stats': stats, - 'details': 'Celery workers responding' - } - - if not active_workers: - health_status['components']['celery']['status'] = 'degraded' - health_status['components']['celery']['details'] = 'No active workers found' - health_status['overall_status'] = 'degraded' - - except Exception as e: - health_status['components']['celery'] = { - 'status': 'unknown', - 'error': str(e), - 'details': 'Could not check Celery workers' - } - - # Update final progress - self.update_state( - state='SUCCESS', - meta={ - 'stage': 'completed', - 'progress': 100, - 'message': f'Health check completed - overall status: {health_status["overall_status"]}', - 'results': health_status - } - ) - - logger.info(f"Detailed health check completed: {health_status['overall_status']}") - return health_status - - except Exception as e: - logger.error(f"Detailed health check failed: {e}") - self.update_state( - state='FAILURE', - meta={ - 'stage': 'error', - 'progress': 0, - 'message': f'Health check failed: {str(e)}', - 'error': str(e) - } - ) - raise \ No newline at end of file diff --git a/backend/tasks/sigma_tasks.py b/backend/tasks/sigma_tasks.py deleted file mode 100644 index be481e7..0000000 --- a/backend/tasks/sigma_tasks.py +++ /dev/null @@ -1,409 +0,0 @@ -""" -SIGMA rule generation tasks for Celery -""" -import 
asyncio -import logging -from typing import Dict, Any, List, Optional -from celery import current_task -from celery_config import celery_app, get_db_session -from enhanced_sigma_generator import EnhancedSigmaGenerator -from llm_client import LLMClient - -logger = logging.getLogger(__name__) - -@celery_app.task(bind=True, name='sigma_tasks.generate_enhanced_rules') -def generate_enhanced_rules_task(self, cve_ids: Optional[List[str]] = None) -> Dict[str, Any]: - """ - Celery task for enhanced SIGMA rule generation - - Args: - cve_ids: Optional list of specific CVE IDs to process - - Returns: - Dictionary containing generation results - """ - db_session = get_db_session() - - try: - # Import here to avoid circular imports - import sys - import os - sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - from main import CVE - - # Update task progress - self.update_state( - state='PROGRESS', - meta={ - 'stage': 'generating_rules', - 'progress': 0, - 'message': 'Starting enhanced SIGMA rule generation' - } - ) - - logger.info(f"Starting enhanced rule generation task for CVEs: {cve_ids}") - - # Create generator instance - generator = EnhancedSigmaGenerator(db_session) - - # Get CVEs to process - if cve_ids: - cves = db_session.query(CVE).filter(CVE.cve_id.in_(cve_ids)).all() - else: - cves = db_session.query(CVE).filter(CVE.poc_count > 0).all() - - total_cves = len(cves) - processed_cves = 0 - successful_rules = 0 - failed_rules = 0 - results = [] - - # Process each CVE - for i, cve in enumerate(cves): - try: - # Update progress - progress = int((i / total_cves) * 100) - self.update_state( - state='PROGRESS', - meta={ - 'stage': 'generating_rules', - 'progress': progress, - 'message': f'Processing CVE {cve.cve_id}', - 'current_cve': cve.cve_id, - 'processed': processed_cves, - 'total': total_cves - } - ) - - # Generate rule using asyncio - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - - try: - result = loop.run_until_complete( - generator.generate_enhanced_rule(cve) - ) - - if result.get('success', False): - successful_rules += 1 - else: - failed_rules += 1 - - results.append({ - 'cve_id': cve.cve_id, - 'success': result.get('success', False), - 'message': result.get('message', 'No message'), - 'rule_id': result.get('rule_id') - }) - - finally: - loop.close() - - processed_cves += 1 - - except Exception as e: - logger.error(f"Error processing CVE {cve.cve_id}: {e}") - failed_rules += 1 - results.append({ - 'cve_id': cve.cve_id, - 'success': False, - 'message': f'Error: {str(e)}', - 'rule_id': None - }) - - # Final results - final_result = { - 'total_processed': processed_cves, - 'successful_rules': successful_rules, - 'failed_rules': failed_rules, - 'results': results - } - - # Update final progress - self.update_state( - state='SUCCESS', - meta={ - 'stage': 'completed', - 'progress': 100, - 'message': f'Generated {successful_rules} rules from {processed_cves} CVEs', - 'results': final_result - } - ) - - logger.info(f"Enhanced rule generation task completed: {final_result}") - return final_result - - except Exception as e: - logger.error(f"Enhanced rule generation task failed: {e}") - self.update_state( - state='FAILURE', - meta={ - 'stage': 'error', - 'progress': 0, - 'message': f'Task failed: {str(e)}', - 'error': str(e) - } - ) - raise - finally: - db_session.close() - -@celery_app.task(bind=True, name='sigma_tasks.llm_enhanced_generation') -def llm_enhanced_generation_task(self, cve_id: str, provider: str = 'ollama', - model: Optional[str] = None) -> 
Dict[str, Any]: - """ - Celery task for LLM-enhanced rule generation - - Args: - cve_id: CVE identifier - provider: LLM provider (openai, anthropic, ollama, finetuned) - model: Specific model to use - - Returns: - Dictionary containing generation result - """ - db_session = get_db_session() - - try: - # Import here to avoid circular imports - import sys - import os - sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - from main import CVE - - # Update task progress - self.update_state( - state='PROGRESS', - meta={ - 'stage': 'llm_generation', - 'progress': 10, - 'message': f'Starting LLM rule generation for {cve_id}', - 'cve_id': cve_id, - 'provider': provider, - 'model': model - } - ) - - logger.info(f"Starting LLM rule generation for {cve_id} using {provider}") - - # Get CVE from database - cve = db_session.query(CVE).filter(CVE.cve_id == cve_id).first() - if not cve: - raise ValueError(f"CVE {cve_id} not found in database") - - # Update progress - self.update_state( - state='PROGRESS', - meta={ - 'stage': 'llm_generation', - 'progress': 25, - 'message': f'Initializing LLM client ({provider})', - 'cve_id': cve_id - } - ) - - # Create LLM client - llm_client = LLMClient(provider=provider, model=model) - - if not llm_client.is_available(): - raise ValueError(f"LLM client not available for provider: {provider}") - - # Update progress - self.update_state( - state='PROGRESS', - meta={ - 'stage': 'llm_generation', - 'progress': 50, - 'message': f'Generating rule with LLM for {cve_id}', - 'cve_id': cve_id - } - ) - - # Generate rule using asyncio - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - - try: - rule_content = loop.run_until_complete( - llm_client.generate_sigma_rule( - cve_id=cve.cve_id, - poc_content=cve.poc_data or '', - cve_description=cve.description or '' - ) - ) - finally: - loop.close() - - # Update progress - self.update_state( - state='PROGRESS', - meta={ - 'stage': 'llm_generation', - 'progress': 75, - 'message': f'Validating generated rule for {cve_id}', - 'cve_id': cve_id - } - ) - - # Validate the generated rule - is_valid = False - if rule_content: - is_valid = llm_client.validate_sigma_rule(rule_content, cve_id) - - # Prepare result - result = { - 'cve_id': cve_id, - 'rule_content': rule_content, - 'is_valid': is_valid, - 'provider': provider, - 'model': model or llm_client.model, - 'success': bool(rule_content and is_valid) - } - - # Update final progress - self.update_state( - state='SUCCESS', - meta={ - 'stage': 'completed', - 'progress': 100, - 'message': f'LLM rule generation completed for {cve_id}', - 'cve_id': cve_id, - 'success': result['success'], - 'result': result - } - ) - - logger.info(f"LLM rule generation task completed for {cve_id}: {result['success']}") - return result - - except Exception as e: - logger.error(f"LLM rule generation task failed for {cve_id}: {e}") - self.update_state( - state='FAILURE', - meta={ - 'stage': 'error', - 'progress': 0, - 'message': f'Task failed for {cve_id}: {str(e)}', - 'cve_id': cve_id, - 'error': str(e) - } - ) - raise - finally: - db_session.close() - -@celery_app.task(bind=True, name='sigma_tasks.batch_llm_generation') -def batch_llm_generation_task(self, cve_ids: List[str], provider: str = 'ollama', - model: Optional[str] = None) -> Dict[str, Any]: - """ - Celery task for batch LLM rule generation - - Args: - cve_ids: List of CVE identifiers - provider: LLM provider (openai, anthropic, ollama, finetuned) - model: Specific model to use - - Returns: - Dictionary containing 
batch generation results - """ - db_session = get_db_session() - - try: - # Update task progress - self.update_state( - state='PROGRESS', - meta={ - 'stage': 'batch_llm_generation', - 'progress': 0, - 'message': f'Starting batch LLM generation for {len(cve_ids)} CVEs', - 'total_cves': len(cve_ids), - 'provider': provider, - 'model': model - } - ) - - logger.info(f"Starting batch LLM generation for {len(cve_ids)} CVEs using {provider}") - - # Initialize results - results = [] - successful_rules = 0 - failed_rules = 0 - - # Process each CVE - for i, cve_id in enumerate(cve_ids): - try: - # Update progress - progress = int((i / len(cve_ids)) * 100) - self.update_state( - state='PROGRESS', - meta={ - 'stage': 'batch_llm_generation', - 'progress': progress, - 'message': f'Processing CVE {cve_id} ({i+1}/{len(cve_ids)})', - 'current_cve': cve_id, - 'processed': i, - 'total': len(cve_ids) - } - ) - - # Generate rule for this CVE - result = llm_enhanced_generation_task.apply( - args=[cve_id, provider, model] - ).get() - - if result.get('success', False): - successful_rules += 1 - else: - failed_rules += 1 - - results.append(result) - - except Exception as e: - logger.error(f"Error processing CVE {cve_id} in batch: {e}") - failed_rules += 1 - results.append({ - 'cve_id': cve_id, - 'success': False, - 'error': str(e), - 'provider': provider, - 'model': model - }) - - # Final results - final_result = { - 'total_processed': len(cve_ids), - 'successful_rules': successful_rules, - 'failed_rules': failed_rules, - 'provider': provider, - 'model': model, - 'results': results - } - - # Update final progress - self.update_state( - state='SUCCESS', - meta={ - 'stage': 'completed', - 'progress': 100, - 'message': f'Batch generation completed: {successful_rules} successful, {failed_rules} failed', - 'results': final_result - } - ) - - logger.info(f"Batch LLM generation task completed: {final_result}") - return final_result - - except Exception as e: - logger.error(f"Batch LLM generation task failed: {e}") - self.update_state( - state='FAILURE', - meta={ - 'stage': 'error', - 'progress': 0, - 'message': f'Batch task failed: {str(e)}', - 'error': str(e) - } - ) - raise - finally: - db_session.close() \ No newline at end of file diff --git a/backend/test_enhanced_generation.py b/backend/test_enhanced_generation.py deleted file mode 100644 index 6ef29e2..0000000 --- a/backend/test_enhanced_generation.py +++ /dev/null @@ -1,211 +0,0 @@ -#!/usr/bin/env python3 -""" -Test script for enhanced SIGMA rule generation -""" - -import asyncio -import json -from datetime import datetime -from main import SessionLocal, CVE, SigmaRule, Base, engine -from enhanced_sigma_generator import EnhancedSigmaGenerator -from nomi_sec_client import NomiSecClient -from initialize_templates import initialize_templates - -# Create tables if they don't exist -Base.metadata.create_all(bind=engine) - -async def test_enhanced_rule_generation(): - """Test the enhanced rule generation with mock data""" - - # Initialize templates - print("Initializing templates...") - initialize_templates() - - db = SessionLocal() - - try: - # Check if CVE already exists, if not create it - test_cve = db.query(CVE).filter(CVE.cve_id == "CVE-2014-7236").first() - - if not test_cve: - # Create a test CVE with mock PoC data - test_cve = CVE( - cve_id="CVE-2014-7236", - description="Remote code execution vulnerability in Microsoft Office", - cvss_score=8.5, - severity="high", - published_date=datetime(2014, 10, 15), - affected_products=["Microsoft Office", "Windows"], - 
poc_count=2, - poc_data=[ - { - "id": "test1", - "name": "CVE-2014-7236-exploit", - "owner": "security-researcher", - "full_name": "security-researcher/CVE-2014-7236-exploit", - "html_url": "https://github.com/security-researcher/CVE-2014-7236-exploit", - "description": "PowerShell exploit for CVE-2014-7236 using cmd.exe and powershell.exe", - "stargazers_count": 15, - "created_at": "2014-11-01T00:00:00Z", - "updated_at": "2014-11-15T00:00:00Z", - "quality_analysis": { - "quality_score": 75, - "quality_tier": "good", - "factors": { - "star_score": 30, - "recency_score": 10, - "description_score": 15, - "vuln_description_score": 15, - "name_relevance_score": 10 - } - }, - "exploit_indicators": { - "processes": ["powershell.exe", "cmd.exe"], - "files": ["exploit.ps1", "payload.exe"], - "commands": ["Invoke-Expression", "DownloadString", "whoami"], - "network": ["192.168.1.100", "8080"], - "urls": ["http://malicious.com/payload"], - "registry": ["HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft"] - } - }, - { - "id": "test2", - "name": "office-exploit-poc", - "owner": "hacker", - "full_name": "hacker/office-exploit-poc", - "html_url": "https://github.com/hacker/office-exploit-poc", - "description": "Office document exploit with malicious macro", - "stargazers_count": 8, - "created_at": "2014-12-01T00:00:00Z", - "updated_at": "2014-12-10T00:00:00Z", - "quality_analysis": { - "quality_score": 45, - "quality_tier": "fair", - "factors": { - "star_score": 16, - "recency_score": 8, - "description_score": 12, - "vuln_description_score": 0, - "name_relevance_score": 5 - } - }, - "exploit_indicators": { - "processes": ["winword.exe", "excel.exe"], - "files": ["document.docx", "malicious.xlsm"], - "commands": ["CreateObject", "Shell.Application"], - "network": ["10.0.0.1"], - "urls": ["http://evil.com/download"], - "registry": ["HKEY_CURRENT_USER\\Software\\Microsoft\\Office"] - } - } - ] - ) - - # Add to database - db.add(test_cve) - db.commit() - else: - # Update existing CVE with our mock PoC data - test_cve.poc_count = 2 - test_cve.poc_data = [ - { - "id": "test1", - "name": "CVE-2014-7236-exploit", - "owner": "security-researcher", - "full_name": "security-researcher/CVE-2014-7236-exploit", - "html_url": "https://github.com/security-researcher/CVE-2014-7236-exploit", - "description": "PowerShell exploit for CVE-2014-7236 using cmd.exe and powershell.exe", - "stargazers_count": 15, - "created_at": "2014-11-01T00:00:00Z", - "updated_at": "2014-11-15T00:00:00Z", - "quality_analysis": { - "quality_score": 75, - "quality_tier": "good", - "factors": { - "star_score": 30, - "recency_score": 10, - "description_score": 15, - "vuln_description_score": 15, - "name_relevance_score": 10 - } - }, - "exploit_indicators": { - "processes": ["powershell.exe", "cmd.exe"], - "files": ["exploit.ps1", "payload.exe"], - "commands": ["Invoke-Expression", "DownloadString", "whoami"], - "network": ["192.168.1.100", "8080"], - "urls": ["http://malicious.com/payload"], - "registry": ["HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft"] - } - }, - { - "id": "test2", - "name": "office-exploit-poc", - "owner": "hacker", - "full_name": "hacker/office-exploit-poc", - "html_url": "https://github.com/hacker/office-exploit-poc", - "description": "Office document exploit with malicious macro", - "stargazers_count": 8, - "created_at": "2014-12-01T00:00:00Z", - "updated_at": "2014-12-10T00:00:00Z", - "quality_analysis": { - "quality_score": 45, - "quality_tier": "fair", - "factors": { - "star_score": 16, - "recency_score": 8, - "description_score": 12, - 
"vuln_description_score": 0, - "name_relevance_score": 5 - } - }, - "exploit_indicators": { - "processes": ["winword.exe", "excel.exe"], - "files": ["document.docx", "malicious.xlsm"], - "commands": ["CreateObject", "Shell.Application"], - "network": ["10.0.0.1"], - "urls": ["http://evil.com/download"], - "registry": ["HKEY_CURRENT_USER\\Software\\Microsoft\\Office"] - } - } - ] - db.commit() - - print(f"Using CVE: {test_cve.cve_id} with {test_cve.poc_count} PoCs") - - # Generate enhanced rule - print("Generating enhanced SIGMA rule...") - generator = EnhancedSigmaGenerator(db) - result = await generator.generate_enhanced_rule(test_cve) - - print(f"Generation result: {result}") - - if result.get('success'): - # Fetch the generated rule - sigma_rule = db.query(SigmaRule).filter(SigmaRule.cve_id == test_cve.cve_id).first() - if sigma_rule: - print("\n" + "="*60) - print("GENERATED SIGMA RULE:") - print("="*60) - print(sigma_rule.rule_content) - print("="*60) - print(f"Detection Type: {sigma_rule.detection_type}") - print(f"Log Source: {sigma_rule.log_source}") - print(f"Confidence Level: {sigma_rule.confidence_level}") - print(f"PoC Quality Score: {sigma_rule.poc_quality_score}") - print(f"Exploit Indicators: {sigma_rule.exploit_indicators}") - print("="*60) - else: - print("No SIGMA rule found in database") - else: - print(f"Rule generation failed: {result.get('error')}") - - except Exception as e: - print(f"Error during test: {e}") - import traceback - traceback.print_exc() - finally: - db.close() - -if __name__ == "__main__": - asyncio.run(test_enhanced_rule_generation()) \ No newline at end of file diff --git a/backend/yaml_metadata_generator.py b/backend/yaml_metadata_generator.py new file mode 100644 index 0000000..c0aaca7 --- /dev/null +++ b/backend/yaml_metadata_generator.py @@ -0,0 +1,155 @@ +""" +YAML Metadata Generator for SIGMA Rules +Generates YAML metadata sections for SIGMA rules based on CVE and PoC data +""" + +import logging +from typing import Dict, List, Optional, Any +from sqlalchemy.orm import Session +from datetime import datetime + +logger = logging.getLogger(__name__) + +class YAMLMetadataGenerator: + """Generates YAML metadata sections for SIGMA rules""" + + def __init__(self, db_session: Session): + self.db_session = db_session + + def generate_metadata(self, cve, poc_data: List[Dict[str, Any]]) -> Dict[str, Any]: + """ + Generate YAML metadata for a SIGMA rule based on CVE and PoC data + + Args: + cve: CVE database object + poc_data: List of PoC data dictionaries + + Returns: + Dictionary containing YAML metadata sections + """ + try: + # Extract CVE information + cve_id = cve.cve_id + description = cve.description or "" + cvss_score = getattr(cve, 'cvss_score', None) + published_date = getattr(cve, 'published_date', None) + + # Determine attack techniques from PoC data + attack_techniques = self._extract_attack_techniques(poc_data) + + # Generate basic metadata + metadata = { + 'title': f"Potential {cve_id} Exploitation", + 'id': f"sigma-{cve_id.lower().replace('-', '_')}", + 'description': self._generate_description(cve_id, description), + 'references': [ + f"https://cve.mitre.org/cgi-bin/cvename.cgi?name={cve_id}", + f"https://nvd.nist.gov/vuln/detail/{cve_id}" + ], + 'author': "Auto-generated SIGMA Rule", + 'date': datetime.now().strftime("%Y/%m/%d"), + 'tags': self._generate_tags(cve_id, attack_techniques), + 'level': self._determine_level(cvss_score), + 'status': "experimental" + } + + # Add PoC-specific references + if poc_data: + for poc in poc_data[:3]: # 
Add up to 3 PoC references + if 'html_url' in poc: + metadata['references'].append(poc['html_url']) + + # Add MITRE ATT&CK techniques if available + if attack_techniques: + metadata['falsepositives'] = [ + "Legitimate use of affected software", + "Administrative activities" + ] + metadata['fields'] = ["CommandLine", "ProcessName", "ParentProcessName"] + + return metadata + + except Exception as e: + logger.error(f"Error generating metadata for {cve.cve_id}: {e}") + return self._generate_fallback_metadata(cve.cve_id) + + def _extract_attack_techniques(self, poc_data: List[Dict[str, Any]]) -> List[str]: + """Extract MITRE ATT&CK techniques from PoC data""" + techniques = [] + + for poc in poc_data: + # Look for common attack patterns in PoC descriptions + description = poc.get('description', '').lower() + + if 'remote code execution' in description or 'rce' in description: + techniques.append('T1203') # Exploitation for Client Execution + if 'privilege escalation' in description: + techniques.append('T1068') # Exploitation for Privilege Escalation + if 'sql injection' in description: + techniques.append('T1190') # Exploit Public-Facing Application + if 'xss' in description or 'cross-site scripting' in description: + techniques.append('T1185') # Browser Session Hijacking + if 'buffer overflow' in description: + techniques.append('T1203') # Exploitation for Client Execution + if 'deserialization' in description: + techniques.append('T1190') # Exploit Public-Facing Application + + return list(set(techniques)) + + def _generate_description(self, cve_id: str, description: str) -> str: + """Generate a concise description for the SIGMA rule""" + if description: + # Take first sentence or first 200 characters + first_sentence = description.split('.')[0] + if len(first_sentence) > 200: + return first_sentence[:200] + "..." + return first_sentence + "." + else: + return f"Detects potential exploitation of {cve_id}" + + def _generate_tags(self, cve_id: str, attack_techniques: List[str]) -> List[str]: + """Generate tags for the SIGMA rule""" + tags = [ + "attack.t1203", # Default to exploitation technique + "cve." 
+ cve_id.lower().replace('-', '_') + ] + + # Add specific technique tags + for technique in attack_techniques: + tags.append(f"attack.{technique.lower()}") + + return tags + + def _determine_level(self, cvss_score: Optional[float]) -> str: + """Determine the severity level based on CVSS score""" + if cvss_score is None: + return "medium" + + if cvss_score >= 9.0: + return "critical" + elif cvss_score >= 7.0: + return "high" + elif cvss_score >= 4.0: + return "medium" + else: + return "low" + + def _generate_fallback_metadata(self, cve_id: str) -> Dict[str, Any]: + """Generate minimal fallback metadata when primary generation fails""" + return { + 'title': f"Potential {cve_id} Exploitation", + 'id': f"sigma-{cve_id.lower().replace('-', '_')}", + 'description': f"Detects potential exploitation of {cve_id}", + 'references': [ + f"https://cve.mitre.org/cgi-bin/cvename.cgi?name={cve_id}", + f"https://nvd.nist.gov/vuln/detail/{cve_id}" + ], + 'author': "Auto-generated SIGMA Rule", + 'date': datetime.now().strftime("%Y/%m/%d"), + 'tags': [ + "attack.t1203", + f"cve.{cve_id.lower().replace('-', '_')}" + ], + 'level': "medium", + 'status': "experimental" + } \ No newline at end of file diff --git a/cli/commands/migrate_commands.py b/cli/commands/migrate_commands.py index 64077de..c98609a 100644 --- a/cli/commands/migrate_commands.py +++ b/cli/commands/migrate_commands.py @@ -27,21 +27,21 @@ class MigrateCommands(BaseCommand): """Migrate data from existing database to file structure""" try: - # Import database components - from sqlalchemy import create_engine - from sqlalchemy.orm import sessionmaker - from main import CVE, SigmaRule, RuleTemplate # Import from existing main.py + # Import database components + from database_models import CVE, SigmaRule, RuleTemplate, SessionLocal - # Use provided database URL or default - if not database_url: - database_url = os.getenv("DATABASE_URL", "postgresql://cve_user:cve_password@localhost:5432/cve_sigma_db") - - self.info(f"Connecting to database: {database_url.split('@')[1] if '@' in database_url else database_url}") - - # Create database session - engine = create_engine(database_url) - SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) - db = SessionLocal() + # Use existing database session factory + if database_url: + self.info(f"Using provided database URL") + # Create new engine with provided URL + from sqlalchemy import create_engine + from sqlalchemy.orm import sessionmaker + engine = create_engine(database_url) + SessionFactory = sessionmaker(autocommit=False, autoflush=False, bind=engine) + db = SessionFactory() + else: + # Use default session factory + db = SessionLocal() # Get total counts cve_count = db.query(CVE).count() diff --git a/cli/commands/process_commands.py b/cli/commands/process_commands.py index ed8000c..4203a50 100644 --- a/cli/commands/process_commands.py +++ b/cli/commands/process_commands.py @@ -208,7 +208,7 @@ class ProcessCommands(BaseCommand): try: # Use the existing NVD bulk processor - from main import SessionLocal # Import session factory + from database_models import SessionLocal # Import session factory db_session = SessionLocal() try: @@ -242,7 +242,7 @@ class ProcessCommands(BaseCommand): self.info(f"Fetching data for {cve_id}...") try: - from main import SessionLocal + from database_models import SessionLocal db_session = SessionLocal() try: @@ -267,7 +267,7 @@ class ProcessCommands(BaseCommand): async def _sync_database_to_files(self, db_session, year: int): """Sync database records to file 
structure for a specific year""" try: - from main import CVE + from database_models import CVE # Get all CVEs for the year from database year_pattern = f"CVE-{year}-%" @@ -282,7 +282,7 @@ class ProcessCommands(BaseCommand): async def _sync_single_cve_to_files(self, db_session, cve_id: str): """Sync a single CVE from database to file structure""" try: - from main import CVE + from database_models import CVE cve = db_session.query(CVE).filter(CVE.cve_id == cve_id).first() if cve: @@ -368,7 +368,7 @@ class ProcessCommands(BaseCommand): async def _generate_template_rule(self, cve_id: str, metadata: Dict) -> bool: """Generate template-based SIGMA rule""" try: - from main import SessionLocal + from database_models import SessionLocal db_session = SessionLocal() try: @@ -407,7 +407,7 @@ class ProcessCommands(BaseCommand): async def _generate_llm_rule(self, cve_id: str, metadata: Dict, provider: str = 'openai') -> bool: """Generate LLM-based SIGMA rule""" try: - from main import SessionLocal + from database_models import SessionLocal db_session = SessionLocal() try: diff --git a/docker-compose.yml b/docker-compose.yml deleted file mode 100644 index fca8421..0000000 --- a/docker-compose.yml +++ /dev/null @@ -1,203 +0,0 @@ -services: - db: - image: postgres:15 - environment: - POSTGRES_DB: cve_sigma_db - POSTGRES_USER: cve_user - POSTGRES_PASSWORD: cve_password - volumes: - - postgres_data:/var/lib/postgresql/data - - ./init.sql:/docker-entrypoint-initdb.d/init.sql - ports: - - "5432:5432" - healthcheck: - test: ["CMD-SHELL", "pg_isready -U cve_user -d cve_sigma_db"] - interval: 30s - timeout: 10s - retries: 3 - - backend: - build: ./backend - ports: - - "8000:8000" - environment: - DATABASE_URL: postgresql://cve_user:cve_password@db:5432/cve_sigma_db - CELERY_BROKER_URL: redis://redis:6379/0 - CELERY_RESULT_BACKEND: redis://redis:6379/0 - NVD_API_KEY: ${NVD_API_KEY:-} - GITHUB_TOKEN: ${GITHUB_TOKEN} - OPENAI_API_KEY: ${OPENAI_API_KEY:-} - ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY:-} - OLLAMA_BASE_URL: ${OLLAMA_BASE_URL:-http://ollama:11434} - LLM_PROVIDER: ${LLM_PROVIDER:-ollama} - LLM_MODEL: ${LLM_MODEL:-llama3.2} - LLM_ENABLED: ${LLM_ENABLED:-true} - FINETUNED_MODEL_PATH: ${FINETUNED_MODEL_PATH:-/app/models/sigma_llama_finetuned} - HUGGING_FACE_TOKEN: ${HUGGING_FACE_TOKEN} - depends_on: - db: - condition: service_healthy - redis: - condition: service_started - ollama-setup: - condition: service_completed_successfully - volumes: - - ./backend:/app - - ./github_poc_collector:/github_poc_collector - - ./exploit-db-mirror:/app/exploit-db-mirror - - ./models:/app/models - command: uvicorn main:app --host 0.0.0.0 --port 8000 --reload - - frontend: - build: ./frontend - ports: - - "3000:3000" - environment: - REACT_APP_API_URL: http://localhost:8000 - volumes: - - ./frontend:/app - - /app/node_modules - command: npm start - - redis: - image: redis:7-alpine - ports: - - "6379:6379" - command: redis-server --appendonly yes - volumes: - - redis_data:/data - - ollama: - image: ollama/ollama:latest - ports: - - "11434:11434" - volumes: - - ollama_data:/root/.ollama - environment: - - OLLAMA_HOST=0.0.0.0 - restart: unless-stopped - deploy: - resources: - limits: - memory: 5G - reservations: - memory: 3G - - ollama-setup: - build: ./backend - depends_on: - - ollama - environment: - OLLAMA_BASE_URL: http://ollama:11434 - LLM_MODEL: llama3.2 - volumes: - - ./backend:/app - - ./models:/app/models - command: python setup_ollama_with_sigma.py - restart: "no" - user: root - - initial-setup: - build: ./backend - 
depends_on: - db: - condition: service_healthy - redis: - condition: service_started - celery-worker: - condition: service_healthy - environment: - DATABASE_URL: postgresql://cve_user:cve_password@db:5432/cve_sigma_db - CELERY_BROKER_URL: redis://redis:6379/0 - CELERY_RESULT_BACKEND: redis://redis:6379/0 - volumes: - - ./backend:/app - command: python initial_setup.py - restart: "no" - - celery-worker: - build: ./backend - command: celery -A celery_config worker --loglevel=info --concurrency=4 - environment: - DATABASE_URL: postgresql://cve_user:cve_password@db:5432/cve_sigma_db - CELERY_BROKER_URL: redis://redis:6379/0 - CELERY_RESULT_BACKEND: redis://redis:6379/0 - NVD_API_KEY: ${NVD_API_KEY:-} - GITHUB_TOKEN: ${GITHUB_TOKEN} - OPENAI_API_KEY: ${OPENAI_API_KEY:-} - ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY:-} - OLLAMA_BASE_URL: ${OLLAMA_BASE_URL:-http://ollama:11434} - LLM_PROVIDER: ${LLM_PROVIDER:-ollama} - LLM_MODEL: ${LLM_MODEL:-llama3.2} - LLM_ENABLED: ${LLM_ENABLED:-true} - FINETUNED_MODEL_PATH: ${FINETUNED_MODEL_PATH:-/app/models/sigma_llama_finetuned} - HUGGING_FACE_TOKEN: ${HUGGING_FACE_TOKEN} - depends_on: - db: - condition: service_healthy - redis: - condition: service_started - ollama-setup: - condition: service_completed_successfully - volumes: - - ./backend:/app - - ./github_poc_collector:/github_poc_collector - - ./exploit-db-mirror:/app/exploit-db-mirror - - ./models:/app/models - restart: unless-stopped - healthcheck: - test: ["CMD", "celery", "-A", "celery_config", "inspect", "ping"] - interval: 30s - timeout: 10s - retries: 3 - - celery-beat: - build: ./backend - command: celery -A celery_config beat --loglevel=info --pidfile=/tmp/celerybeat.pid - environment: - DATABASE_URL: postgresql://cve_user:cve_password@db:5432/cve_sigma_db - CELERY_BROKER_URL: redis://redis:6379/0 - CELERY_RESULT_BACKEND: redis://redis:6379/0 - NVD_API_KEY: ${NVD_API_KEY:-} - GITHUB_TOKEN: ${GITHUB_TOKEN} - OPENAI_API_KEY: ${OPENAI_API_KEY:-} - ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY:-} - OLLAMA_BASE_URL: ${OLLAMA_BASE_URL:-http://ollama:11434} - LLM_PROVIDER: ${LLM_PROVIDER:-ollama} - LLM_MODEL: ${LLM_MODEL:-llama3.2} - LLM_ENABLED: ${LLM_ENABLED:-true} - FINETUNED_MODEL_PATH: ${FINETUNED_MODEL_PATH:-/app/models/sigma_llama_finetuned} - HUGGING_FACE_TOKEN: ${HUGGING_FACE_TOKEN} - depends_on: - db: - condition: service_healthy - redis: - condition: service_started - celery-worker: - condition: service_healthy - volumes: - - ./backend:/app - - ./github_poc_collector:/github_poc_collector - - ./exploit-db-mirror:/app/exploit-db-mirror - - ./models:/app/models - restart: unless-stopped - - flower: - build: ./backend - command: celery -A celery_config flower --port=5555 - ports: - - "5555:5555" - environment: - CELERY_BROKER_URL: redis://redis:6379/0 - CELERY_RESULT_BACKEND: redis://redis:6379/0 - depends_on: - redis: - condition: service_started - celery-worker: - condition: service_healthy - restart: unless-stopped - -volumes: - postgres_data: - redis_data: - ollama_data: diff --git a/frontend/Dockerfile b/frontend/Dockerfile deleted file mode 100644 index 4407e9b..0000000 --- a/frontend/Dockerfile +++ /dev/null @@ -1,24 +0,0 @@ -FROM node:18-alpine - -WORKDIR /app - -# Copy package files -COPY package*.json ./ - -# Install dependencies -RUN npm install - -# Copy source code -COPY . . 
- -# Create non-root user -RUN addgroup -g 1001 -S nodejs -RUN adduser -S reactuser -u 1001 - -# Change ownership -RUN chown -R reactuser:nodejs /app -USER reactuser - -EXPOSE 3000 - -CMD ["npm", "start"] diff --git a/frontend/package.json b/frontend/package.json deleted file mode 100644 index 172a63a..0000000 --- a/frontend/package.json +++ /dev/null @@ -1,47 +0,0 @@ -{ - "name": "cve-sigma-frontend", - "version": "0.1.0", - "private": true, - "dependencies": { - "@testing-library/jest-dom": "^5.16.4", - "@testing-library/react": "^13.3.0", - "@testing-library/user-event": "^13.5.0", - "react": "^18.2.0", - "react-dom": "^18.2.0", - "react-scripts": "5.0.1", - "axios": "^1.6.0", - "react-router-dom": "^6.8.0", - "react-syntax-highlighter": "^15.5.0", - "web-vitals": "^2.1.4" - }, - "devDependencies": { - "tailwindcss": "^3.3.0", - "autoprefixer": "^10.4.14", - "postcss": "^8.4.24" - }, - "scripts": { - "start": "react-scripts start", - "build": "react-scripts build", - "test": "react-scripts test", - "eject": "react-scripts eject" - }, - "eslintConfig": { - "extends": [ - "react-app", - "react-app/jest" - ] - }, - "browserslist": { - "production": [ - ">0.2%", - "not dead", - "not op_mini all" - ], - "development": [ - "last 1 chrome version", - "last 1 firefox version", - "last 1 safari version" - ] - }, - "proxy": "http://backend:8000" -} diff --git a/frontend/postcss.config.js b/frontend/postcss.config.js deleted file mode 100644 index 33ad091..0000000 --- a/frontend/postcss.config.js +++ /dev/null @@ -1,6 +0,0 @@ -module.exports = { - plugins: { - tailwindcss: {}, - autoprefixer: {}, - }, -} diff --git a/frontend/public/index.html b/frontend/public/index.html deleted file mode 100644 index da32bfe..0000000 --- a/frontend/public/index.html +++ /dev/null @@ -1,18 +0,0 @@ - - - - - - - - CVE-SIGMA Auto Generator - - - - -
- - diff --git a/frontend/src/App.css b/frontend/src/App.css deleted file mode 100644 index c7043fd..0000000 --- a/frontend/src/App.css +++ /dev/null @@ -1,126 +0,0 @@ -@tailwind base; -@tailwind components; -@tailwind utilities; - -.line-clamp-2 { - display: -webkit-box; - -webkit-line-clamp: 2; - -webkit-box-orient: vertical; - overflow: hidden; -} - -.animate-spin { - animation: spin 1s linear infinite; -} - -@keyframes spin { - from { - transform: rotate(0deg); - } - to { - transform: rotate(360deg); - } -} - -/* Custom scrollbar for syntax highlighter */ -.react-syntax-highlighter-line-number { - color: #6b7280 !important; -} - -/* Responsive table improvements */ -@media (max-width: 768px) { - .overflow-x-auto { - -webkit-overflow-scrolling: touch; - } - - table { - font-size: 0.875rem; - } - - .px-6 { - padding-left: 1rem; - padding-right: 1rem; - } -} - -/* Modal backdrop blur effect */ -.fixed.inset-0.bg-gray-600 { - backdrop-filter: blur(4px); -} - -/* Syntax highlighter theme overrides */ -.language-yaml { - border-radius: 0.375rem; - max-height: 400px; - overflow-y: auto; -} - -/* Loading spinner improvements */ -.animate-spin { - border-top-color: transparent; -} - -/* Badge hover effects */ -.inline-flex.px-2.py-1 { - transition: all 0.2s ease-in-out; -} - -.inline-flex.px-2.py-1:hover { - transform: translateY(-1px); - box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); -} - -/* Button hover effects */ -button { - transition: all 0.2s ease-in-out; -} - -button:hover { - transform: translateY(-1px); -} - -/* Card hover effects */ -.hover\:bg-gray-50:hover { - transform: translateY(-2px); - box-shadow: 0 4px 12px rgba(0, 0, 0, 0.05); - transition: all 0.2s ease-in-out; -} - -/* Smooth transitions for tab switching */ -.border-b-2 { - transition: border-color 0.2s ease-in-out; -} - -/* Custom focus styles for accessibility */ -button:focus, -.focus\:outline-none:focus { - outline: 2px solid #3b82f6; - outline-offset: 2px; -} - -/* Table row hover effects */ -tbody tr:hover { - background-color: #f9fafb; - transition: background-color 0.15s ease-in-out; -} - -/* Responsive grid improvements */ -@media (max-width: 640px) { - .grid-cols-1.md\:grid-cols-3 { - gap: 1rem; - } -} - -/* Loading state styles */ -.loading-pulse { - animation: pulse 2s cubic-bezier(0.4, 0, 0.6, 1) infinite; -} - -@keyframes pulse { - 0%, 100% { - opacity: 1; - } - 50% { - opacity: .5; - } -} diff --git a/frontend/src/App.js b/frontend/src/App.js deleted file mode 100644 index dae1e3b..0000000 --- a/frontend/src/App.js +++ /dev/null @@ -1,1226 +0,0 @@ -import React, { useState, useEffect } from 'react'; -import axios from 'axios'; -import { Prism as SyntaxHighlighter } from 'react-syntax-highlighter'; -import { tomorrow } from 'react-syntax-highlighter/dist/esm/styles/prism'; -import './App.css'; - -const API_BASE_URL = process.env.REACT_APP_API_URL || 'http://localhost:8000'; - -function App() { - const [cves, setCves] = useState([]); - const [cveSearch, setCveSearch] = useState(''); - const [cveFilters, setCveFilters] = useState({ severity: '' }); - const [cvePagination, setCvePagination] = useState({ skip: 0, limit: 20, total: 0, hasMore: false }); - const [loadingCves, setLoadingCves] = useState(false); - const [sigmaRules, setSigmaRules] = useState([]); - const [selectedCve, setSelectedCve] = useState(null); - const [stats, setStats] = useState({}); - const [loading, setLoading] = useState(true); - const [activeTab, setActiveTab] = useState('dashboard'); - const [fetchingCves, setFetchingCves] = 
useState(false); - const [testResult, setTestResult] = useState(null); - const [pocStats, setPocStats] = useState({}); - const [gitHubPocStats, setGitHubPocStats] = useState({}); - const [exploitdbStats, setExploitdbStats] = useState({}); - const [cisaKevStats, setCisaKevStats] = useState({}); - const [llmStatus, setLlmStatus] = useState({}); - const [exploitSyncDropdownOpen, setExploitSyncDropdownOpen] = useState(false); - - // Function to fetch CVEs with search and pagination - const fetchCves = async (search = '', filters = {}, pagination = { skip: 0, limit: 20 }) => { - setLoadingCves(true); - try { - const params = new URLSearchParams({ - skip: pagination.skip.toString(), - limit: pagination.limit.toString(), - }); - - if (search) params.append('search', search); - if (filters.severity) params.append('severity', filters.severity); - - const response = await axios.get(`${API_BASE_URL}/api/cves?${params}`); - setCves(response.data.cves || []); - setCvePagination({ - skip: response.data.skip || 0, - limit: response.data.limit || 20, - total: response.data.total || 0, - hasMore: response.data.has_more || false - }); - } catch (error) { - console.error('Error fetching CVEs:', error); - } finally { - setLoadingCves(false); - } - }; - - useEffect(() => { - fetchData(); - }, []); - - // Close dropdown when clicking outside - useEffect(() => { - const handleClickOutside = (event) => { - if (exploitSyncDropdownOpen && !event.target.closest('.relative')) { - setExploitSyncDropdownOpen(false); - } - }; - - document.addEventListener('mousedown', handleClickOutside); - return () => { - document.removeEventListener('mousedown', handleClickOutside); - }; - }, [exploitSyncDropdownOpen]); - - - // Note: Scheduler functionality removed - now handled by Celery Beat - // Monitoring available via Flower at http://localhost:5555 - - const fetchData = async () => { - try { - setLoading(true); - const [cvesRes, rulesRes, statsRes, pocStatsRes, githubPocStatsRes, exploitdbStatsRes, cisaKevStatsRes, llmStatusRes] = await Promise.all([ - axios.get(`${API_BASE_URL}/api/cves?limit=20`), - axios.get(`${API_BASE_URL}/api/sigma-rules`), - axios.get(`${API_BASE_URL}/api/stats`), - axios.get(`${API_BASE_URL}/api/poc-stats`), - axios.get(`${API_BASE_URL}/api/github-poc-stats`).catch(err => ({ data: {} })), - axios.get(`${API_BASE_URL}/api/exploitdb-stats`).catch(err => ({ data: {} })), - axios.get(`${API_BASE_URL}/api/cisa-kev-stats`).catch(err => ({ data: {} })), - axios.get(`${API_BASE_URL}/api/llm-status`).catch(err => ({ data: {} })) - ]); - - setCves(cvesRes.data.cves || []); - setCvePagination({ - skip: cvesRes.data.skip || 0, - limit: cvesRes.data.limit || 20, - total: cvesRes.data.total || 0, - hasMore: cvesRes.data.has_more || false - }); - setSigmaRules(rulesRes.data); - setStats(statsRes.data); - setPocStats(pocStatsRes.data); - setGitHubPocStats(githubPocStatsRes.data); - setExploitdbStats(exploitdbStatsRes.data); - setCisaKevStats(cisaKevStatsRes.data); - setLlmStatus(llmStatusRes.data); - } catch (error) { - console.error('Error fetching data:', error); - } finally { - setLoading(false); - } - }; - - - const handleFetchCves = async () => { - try { - setFetchingCves(true); - const response = await axios.post(`${API_BASE_URL}/api/fetch-cves`); - console.log('Fetch response:', response.data); - // Show success message and refresh after delay - setTimeout(() => { - fetchData(); - setFetchingCves(false); - }, 5000); // Wait a bit longer for background task to complete - } catch (error) { - console.error('Error 
fetching CVEs:', error); - setFetchingCves(false); - // Show error state - setTestResult({ - status: 'error', - message: 'Failed to initiate CVE fetch. Check console logs.' - }); - } - }; - - const testNvdConnection = async () => { - try { - const response = await axios.get(`${API_BASE_URL}/api/test-nvd`); - setTestResult(response.data); - } catch (error) { - console.error('Error testing NVD connection:', error); - setTestResult({ - status: 'error', - message: 'Failed to test NVD connection' - }); - } - }; - - const startBulkSeed = async (startYear = 2020, endYear = null) => { - try { - const response = await axios.post(`${API_BASE_URL}/api/bulk-seed`, { - start_year: startYear, - end_year: endYear, - skip_nomi_sec: true - }); - console.log('Bulk seed response:', response.data); - // Refresh data immediately to show job started - fetchData(); - } catch (error) { - console.error('Error starting bulk seed:', error); - } - }; - - const startIncrementalUpdate = async () => { - try { - const response = await axios.post(`${API_BASE_URL}/api/incremental-update`); - console.log('Incremental update response:', response.data); - fetchData(); - } catch (error) { - console.error('Error starting incremental update:', error); - } - }; - - const syncNomiSec = async (cveId = null) => { - try { - const response = await axios.post(`${API_BASE_URL}/api/sync-nomi-sec`, { - cve_id: cveId - }); - console.log('Nomi-sec sync response:', response.data); - fetchData(); - } catch (error) { - console.error('Error syncing nomi-sec:', error); - } - }; - - const syncGitHubPocs = async (cveId = null) => { - try { - const response = await axios.post(`${API_BASE_URL}/api/sync-github-pocs`, { - cve_id: cveId, - batch_size: 50 - }); - console.log('GitHub PoC sync response:', response.data); - - // Show success message with Celery task info - if (response.data.task_id) { - console.log(`GitHub PoC sync task started: ${response.data.task_id}`); - console.log(`Monitor at: ${response.data.monitor_url}`); - - // Show notification to user - alert(`GitHub PoC sync started successfully!\nTask ID: ${response.data.task_id}\n\nMonitor progress at http://localhost:5555 (Flower Dashboard)`); - } - - // Refresh data to show the task in bulk jobs - fetchData(); - } catch (error) { - console.error('Error syncing GitHub PoCs:', error); - alert('Failed to start GitHub PoC sync. 
Please check the console for details.'); - } - }; - - const syncExploitDB = async (cveId = null) => { - try { - const response = await axios.post(`${API_BASE_URL}/api/sync-exploitdb`, { - cve_id: cveId, - batch_size: 30 - }); - console.log('ExploitDB sync response:', response.data); - fetchData(); - } catch (error) { - console.error('Error syncing ExploitDB:', error); - } - }; - - const syncCISAKEV = async (cveId = null) => { - try { - const response = await axios.post(`${API_BASE_URL}/api/sync-cisa-kev`, { - cve_id: cveId, - batch_size: 100 - }); - console.log('CISA KEV sync response:', response.data); - fetchData(); - } catch (error) { - console.error('Error syncing CISA KEV:', error); - } - }; - - const syncReferences = async () => { - try { - const response = await axios.post('http://localhost:8000/api/sync-references', { - batch_size: 30, - max_cves: 100, - force_resync: false - }); - console.log('Reference sync response:', response.data); - fetchData(); - } catch (error) { - console.error('Error syncing references:', error); - alert('Error starting reference sync: ' + (error.response?.data?.detail || error.message)); - } - }; - - const regenerateRules = async (force = false) => { - try { - const response = await axios.post(`${API_BASE_URL}/api/regenerate-rules`, { - force: force - }); - console.log('Rule regeneration response:', response.data); - fetchData(); - } catch (error) { - console.error('Error regenerating rules:', error); - } - }; - - const generateLlmRules = async (force = false) => { - try { - const response = await axios.post(`${API_BASE_URL}/api/llm-enhanced-rules`, { - force: force - }); - console.log('LLM rule generation response:', response.data); - fetchData(); - } catch (error) { - console.error('Error generating LLM-enhanced rules:', error); - } - }; - - const switchLlmProvider = async (provider, model) => { - try { - const response = await axios.post(`${API_BASE_URL}/api/llm-switch`, { - provider: provider, - model: model - }); - console.log('LLM provider switch response:', response.data); - fetchData(); // Refresh to get updated status - } catch (error) { - console.error('Error switching LLM provider:', error); - alert('Failed to switch LLM provider. Please check configuration.'); - } - }; - - const getSeverityColor = (severity) => { - switch (severity?.toLowerCase()) { - case 'critical': return 'bg-red-100 text-red-800'; - case 'high': return 'bg-orange-100 text-orange-800'; - case 'medium': return 'bg-yellow-100 text-yellow-800'; - case 'low': return 'bg-green-100 text-green-800'; - default: return 'bg-gray-100 text-gray-800'; - } - }; - - const formatDate = (dateString) => { - return new Date(dateString).toLocaleDateString('en-US', { - year: 'numeric', - month: 'short', - day: 'numeric' - }); - }; - - const Dashboard = () => ( -
-  [Remainder of deleted frontend/src/App.js: JSX markup for the Dashboard view (Task Scheduling & Monitoring notice; statistics cards for Total CVEs, SIGMA Rules, CVEs with PoCs, Recent CVEs (7d), and High Quality PoCs; Data Synchronization controls for Phase 1: CVE Data Syncing, Phase 2: Exploit Data Syncing, Phase 3: Reference Data Syncing, and Phase 4: Rule Generation; LLM Configuration panel; Recent CVEs table), the CVEList view (search, severity filter, pagination), the SigmaRulesList and ExploitIndicators views, the CVEDetail modal, and the tabbed App layout]
- ); -} - -export default App; diff --git a/frontend/src/index.js b/frontend/src/index.js deleted file mode 100644 index 7006ec3..0000000 --- a/frontend/src/index.js +++ /dev/null @@ -1,11 +0,0 @@ -import React from 'react'; -import ReactDOM from 'react-dom/client'; -import './App.css'; -import App from './App'; - -const root = ReactDOM.createRoot(document.getElementById('root')); -root.render( - - - -); diff --git a/frontend/tailwind.config.js b/frontend/tailwind.config.js deleted file mode 100644 index 9514451..0000000 --- a/frontend/tailwind.config.js +++ /dev/null @@ -1,33 +0,0 @@ -/** @type {import('tailwindcss').Config} */ -module.exports = { - content: [ - "./src/**/*.{js,jsx,ts,tsx}", - "./public/index.html" - ], - theme: { - extend: { - colors: { - 'cve-blue': '#3b82f6', - 'cve-green': '#10b981', - 'cve-red': '#ef4444', - 'cve-orange': '#f97316', - 'cve-yellow': '#eab308', - }, - animation: { - 'fade-in': 'fadeIn 0.5s ease-in-out', - 'slide-up': 'slideUp 0.3s ease-out', - }, - keyframes: { - fadeIn: { - '0%': { opacity: '0' }, - '100%': { opacity: '1' }, - }, - slideUp: { - '0%': { transform: 'translateY(10px)', opacity: '0' }, - '100%': { transform: 'translateY(0)', opacity: '1' }, - }, - }, - }, - }, - plugins: [], -} diff --git a/init.sql b/init.sql deleted file mode 100644 index defed1d..0000000 --- a/init.sql +++ /dev/null @@ -1,191 +0,0 @@ --- Database initialization script - -CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; - --- CVEs table -CREATE TABLE cves ( - id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), - cve_id VARCHAR(20) UNIQUE NOT NULL, - description TEXT, - cvss_score DECIMAL(3,1), - severity VARCHAR(20), - published_date TIMESTAMP, - modified_date TIMESTAMP, - affected_products TEXT[], - reference_urls TEXT[], - -- Bulk processing fields - data_source VARCHAR(20) DEFAULT 'nvd_api', - nvd_json_version VARCHAR(10) DEFAULT '2.0', - bulk_processed BOOLEAN DEFAULT FALSE, - -- nomi-sec PoC fields - poc_count INTEGER DEFAULT 0, - poc_data JSON, - -- Reference data fields - reference_data JSON, - reference_sync_status VARCHAR(20) DEFAULT 'pending', - reference_last_synced TIMESTAMP, - created_at TIMESTAMP DEFAULT NOW(), - updated_at TIMESTAMP DEFAULT NOW() -); - --- SIGMA rules table -CREATE TABLE sigma_rules ( - id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), - cve_id VARCHAR(20) REFERENCES cves(cve_id), - rule_name VARCHAR(255) NOT NULL, - rule_content TEXT NOT NULL, - detection_type VARCHAR(50), - log_source VARCHAR(100), - confidence_level VARCHAR(20), - auto_generated BOOLEAN DEFAULT TRUE, - exploit_based BOOLEAN DEFAULT FALSE, - github_repos TEXT[], - exploit_indicators TEXT, - -- Enhanced fields for new data sources - poc_source VARCHAR(20) DEFAULT 'github_search', - poc_quality_score INTEGER DEFAULT 0, - nomi_sec_data JSON, - created_at TIMESTAMP DEFAULT NOW(), - updated_at TIMESTAMP DEFAULT NOW() -); - --- Rule templates table -CREATE TABLE rule_templates ( - id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), - template_name VARCHAR(255) NOT NULL, - template_content TEXT NOT NULL, - applicable_product_patterns TEXT[], - description TEXT, - created_at TIMESTAMP DEFAULT NOW() -); - --- Bulk processing jobs table -CREATE TABLE bulk_processing_jobs ( - id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), - job_type VARCHAR(50) NOT NULL, - status VARCHAR(20) DEFAULT 'pending', - year INTEGER, - total_items INTEGER DEFAULT 0, - processed_items INTEGER DEFAULT 0, - failed_items INTEGER DEFAULT 0, - error_message TEXT, - job_metadata JSON, - started_at TIMESTAMP, - completed_at 
TIMESTAMP, - cancelled_at TIMESTAMP, - created_at TIMESTAMP DEFAULT NOW() -); - --- Insert some basic rule templates -INSERT INTO rule_templates (template_name, template_content, applicable_product_patterns, description) VALUES -( - 'Windows Process Execution', - 'title: {title} -description: {description} -id: {rule_id} -status: experimental -author: CVE-SIGMA Auto Generator -date: {date} -references: - - {cve_url} -tags: - - {tags} -logsource: - category: process_creation - product: windows -detection: - selection: - Image|contains: {suspicious_processes} - condition: selection -falsepositives: - - Legitimate use of the software -level: {level}', - ARRAY['windows', 'microsoft'], - 'Template for Windows process execution detection' -), -( - 'Network Connection', - 'title: {title} -description: {description} -id: {rule_id} -status: experimental -author: CVE-SIGMA Auto Generator -date: {date} -references: - - {cve_url} -tags: - - {tags} -logsource: - category: network_connection - product: windows -detection: - selection: - Initiated: true - DestinationPort: {suspicious_ports} - condition: selection -falsepositives: - - Legitimate network connections -level: {level}', - ARRAY['network', 'connection', 'remote'], - 'Template for network connection detection' -), -( - 'File Modification', - 'title: {title} -description: {description} -id: {rule_id} -status: experimental -author: CVE-SIGMA Auto Generator -date: {date} -references: - - {cve_url} -tags: - - {tags} -logsource: - category: file_event - product: windows -detection: - selection: - EventType: creation - TargetFilename|contains: {file_patterns} - condition: selection -falsepositives: - - Legitimate file operations -level: {level}', - ARRAY['file', 'filesystem', 'modification'], - 'Template for file modification detection' -), -( - 'PowerShell Execution', - 'title: {title} -description: {description} -id: {rule_id} -status: experimental -author: CVE-SIGMA Auto Generator -date: {date} -references: - - {cve_url} -tags: - - {tags} -logsource: - product: windows - category: ps_script -detection: - selection: - ScriptBlockText|contains: {suspicious_processes} - condition: selection -falsepositives: - - Legitimate PowerShell scripts -level: {level}', - ARRAY['powershell', 'script', 'ps1'], - 'Template for PowerShell script execution detection' -); - --- Create indexes -CREATE INDEX idx_cves_cve_id ON cves(cve_id); -CREATE INDEX idx_cves_published_date ON cves(published_date); -CREATE INDEX idx_cves_severity ON cves(severity); -CREATE INDEX idx_cves_reference_sync_status ON cves(reference_sync_status); -CREATE INDEX idx_cves_reference_last_synced ON cves(reference_last_synced); -CREATE INDEX idx_sigma_rules_cve_id ON sigma_rules(cve_id); -CREATE INDEX idx_sigma_rules_detection_type ON sigma_rules(detection_type); diff --git a/start.sh b/start.sh deleted file mode 100755 index bc38c03..0000000 --- a/start.sh +++ /dev/null @@ -1,63 +0,0 @@ -#!/bin/bash - -# CVE-SIGMA Auto Generator Startup Script - -echo "๐Ÿš€ Starting CVE-SIGMA Auto Generator..." -echo "===============================================" - -# Check if Docker and Docker Compose are installed -if ! command -v docker &> /dev/null; then - echo "โŒ Docker is not installed. Please install Docker first." - exit 1 -fi - -if ! command -v docker-compose &> /dev/null; then - echo "โŒ Docker Compose is not installed. Please install Docker Compose first." - exit 1 -fi - -# Check if .env file exists, if not create from example -if [ ! 
-f .env ]; then - echo "๐Ÿ“ Creating .env file from .env.example..." - cp .env.example .env - echo "โœ… .env file created. Please edit it to add your NVD API key for better rate limits." -fi - -# Stop any existing containers -echo "๐Ÿ›‘ Stopping any existing containers..." -docker-compose down - -# Build and start the application -echo "๐Ÿ”จ Building and starting the application..." -docker-compose up -d --build - -# Wait for services to be ready -echo "โณ Waiting for services to start..." -sleep 10 - -# Check if services are running -echo "๐Ÿ” Checking service status..." -if docker-compose ps | grep -q "Up"; then - echo "โœ… Services are running!" - echo "" - echo "๐ŸŒ Access the application at:" - echo " Frontend: http://localhost:3000" - echo " Backend API: http://localhost:8000" - echo " API Documentation: http://localhost:8000/docs" - echo "" - echo "๐Ÿ“Š The application will automatically:" - echo " - Fetch recent CVEs from NVD" - echo " - Generate SIGMA rules" - echo " - Update every hour" - echo "" - echo "๐Ÿ’ก Tip: Add your NVD API key to .env for higher rate limits" - echo " Get one free at: https://nvd.nist.gov/developers/request-an-api-key" -else - echo "โŒ Some services failed to start. Check logs with:" - echo " docker-compose logs" -fi - -# Show logs -echo "" -echo "๐Ÿ“‹ Recent logs (press Ctrl+C to exit):" -docker-compose logs -f --tail=50