more updates for bulk

Brendan McDevitt 2025-07-08 17:50:01 -05:00
parent 5a9ae34996
commit 790e4bd91f
11 changed files with 2500 additions and 22 deletions

README.md

@@ -1,20 +1,27 @@
# CVE-SIGMA Auto Generator (Enhanced)

An automated platform that processes the complete NVD CVE dataset and generates enhanced SIGMA rules for threat detection using curated exploit intelligence.

## 🚀 Enhanced Features

### Data Processing
- **Bulk NVD Processing**: Downloads and processes complete NVD JSON datasets (2002-2025)
- **nomi-sec PoC Integration**: Uses curated PoC data from github.com/nomi-sec/PoC-in-GitHub
- **Incremental Updates**: Efficient updates using NVD modified/recent feeds
- **Quality Assessment**: PoC quality scoring based on star count, recency, and relevance analysis

### Intelligence Generation
- **Enhanced SIGMA Rules**: Creates rules using real exploit indicators from curated PoCs
- **Quality Tiers**: Excellent, Good, Fair, Poor, Very Poor classification system
- **Smart Template Selection**: Template matching driven by PoC characteristics
- **Advanced Indicator Extraction**: Process, file, network, registry, and command patterns
- **MITRE ATT&CK Mapping**: Automatic technique identification based on exploit analysis

### User Experience
- **Modern Web Interface**: React-based UI with enhanced bulk processing controls
- **Real-time Monitoring**: Live job tracking and progress monitoring
- **Comprehensive Statistics**: PoC coverage, quality metrics, and processing status
- **Bulk Operations Dashboard**: Centralized control for all data processing operations (see the example below)
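The workflow behind these features is driven by the bulk processing endpoints added to `backend/main.py`. A minimal sketch, assuming the backend is running locally on port 8000 as configured in `main.py` (the start year is illustrative):

```python
import requests

BASE = "http://localhost:8000"  # assumes the FastAPI backend from backend/main.py is running locally

# Start a full bulk seed (NVD JSON feeds + nomi-sec PoC sync) as a background task
resp = requests.post(f"{BASE}/api/bulk-seed", params={"start_year": 2020})
print(resp.json())  # {"message": "Bulk seeding process started", "status": "started", ...}

# Check overall seeding progress and database statistics at any time
print(requests.get(f"{BASE}/api/bulk-status").json())
```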
## Architecture

backend/bulk_seeder.py (new file, 340 lines)

@@ -0,0 +1,340 @@
"""
Bulk Data Seeding Coordinator
Orchestrates the complete bulk seeding process using NVD JSON feeds and nomi-sec PoC data
"""
import asyncio
import logging
from datetime import datetime, timedelta
from typing import Optional
from sqlalchemy.orm import Session
from nvd_bulk_processor import NVDBulkProcessor
from nomi_sec_client import NomiSecClient
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class BulkSeeder:
"""Coordinates bulk seeding operations"""
def __init__(self, db_session: Session):
self.db_session = db_session
self.nvd_processor = NVDBulkProcessor(db_session)
self.nomi_sec_client = NomiSecClient(db_session)
async def full_bulk_seed(self, start_year: int = 2002,
end_year: Optional[int] = None,
skip_nvd: bool = False,
skip_nomi_sec: bool = False) -> dict:
"""
Perform complete bulk seeding operation
Args:
start_year: Starting year for NVD data (default: 2002)
end_year: Ending year for NVD data (default: current year)
skip_nvd: Skip NVD bulk processing (default: False)
skip_nomi_sec: Skip nomi-sec PoC synchronization (default: False)
Returns:
Dictionary containing operation results
"""
if end_year is None:
end_year = datetime.now().year
results = {
'start_time': datetime.utcnow(),
'nvd_results': None,
'nomi_sec_results': None,
'total_time': None,
'status': 'running'
}
logger.info(f"Starting full bulk seed operation ({start_year}-{end_year})")
try:
# Phase 1: NVD Bulk Processing
if not skip_nvd:
logger.info("Phase 1: Starting NVD bulk processing...")
nvd_results = await self.nvd_processor.bulk_seed_database(
start_year=start_year,
end_year=end_year
)
results['nvd_results'] = nvd_results
logger.info(f"Phase 1 complete: {nvd_results['total_processed']} CVEs processed")
else:
logger.info("Phase 1: Skipping NVD bulk processing")
# Phase 2: nomi-sec PoC Synchronization
if not skip_nomi_sec:
logger.info("Phase 2: Starting nomi-sec PoC synchronization...")
nomi_sec_results = await self.nomi_sec_client.bulk_sync_all_cves(
batch_size=50 # Smaller batches for API stability
)
results['nomi_sec_results'] = nomi_sec_results
logger.info(f"Phase 2 complete: {nomi_sec_results['total_pocs_found']} PoCs found")
else:
logger.info("Phase 2: Skipping nomi-sec PoC synchronization")
# Phase 3: Generate Enhanced SIGMA Rules
logger.info("Phase 3: Generating enhanced SIGMA rules...")
sigma_results = await self.generate_enhanced_sigma_rules()
results['sigma_results'] = sigma_results
logger.info(f"Phase 3 complete: {sigma_results['rules_generated']} rules generated")
results['status'] = 'completed'
results['end_time'] = datetime.utcnow()
results['total_time'] = (results['end_time'] - results['start_time']).total_seconds()
logger.info(f"Full bulk seed operation completed in {results['total_time']:.2f} seconds")
except Exception as e:
logger.error(f"Bulk seed operation failed: {e}")
results['status'] = 'failed'
results['error'] = str(e)
results['end_time'] = datetime.utcnow()
return results
async def incremental_update(self) -> dict:
"""
Perform incremental update operation
Returns:
Dictionary containing update results
"""
results = {
'start_time': datetime.utcnow(),
'nvd_update': None,
'nomi_sec_update': None,
'status': 'running'
}
logger.info("Starting incremental update...")
try:
# Update NVD data using modified/recent feeds
logger.info("Updating NVD data...")
nvd_update = await self.nvd_processor.incremental_update()
results['nvd_update'] = nvd_update
# Update PoC data for newly added/modified CVEs
if nvd_update['total_processed'] > 0:
logger.info("Updating PoC data for modified CVEs...")
# Get recently modified CVEs and sync their PoCs
recent_cves = await self._get_recently_modified_cves()
nomi_sec_update = await self._sync_specific_cves(recent_cves)
results['nomi_sec_update'] = nomi_sec_update
results['status'] = 'completed'
results['end_time'] = datetime.utcnow()
except Exception as e:
logger.error(f"Incremental update failed: {e}")
results['status'] = 'failed'
results['error'] = str(e)
results['end_time'] = datetime.utcnow()
return results
async def generate_enhanced_sigma_rules(self) -> dict:
"""Generate enhanced SIGMA rules using nomi-sec PoC data"""
from main import CVE, SigmaRule
# Import the enhanced rule generator
from enhanced_sigma_generator import EnhancedSigmaGenerator
generator = EnhancedSigmaGenerator(self.db_session)
# Get all CVEs that have PoC data but no enhanced rules
cves_with_pocs = self.db_session.query(CVE).filter(
CVE.poc_count > 0
).all()
rules_generated = 0
rules_updated = 0
for cve in cves_with_pocs:
try:
# Check if we need to generate/update the rule
existing_rule = self.db_session.query(SigmaRule).filter(
SigmaRule.cve_id == cve.cve_id
).first()
if existing_rule and existing_rule.poc_source == 'nomi_sec':
# Rule already exists and is up to date
continue
# Generate enhanced rule
rule_result = await generator.generate_enhanced_rule(cve)
if rule_result['success']:
if existing_rule:
rules_updated += 1
else:
rules_generated += 1
except Exception as e:
logger.error(f"Error generating rule for {cve.cve_id}: {e}")
continue
self.db_session.commit()
return {
'rules_generated': rules_generated,
'rules_updated': rules_updated,
'total_processed': len(cves_with_pocs)
}
async def _get_recently_modified_cves(self, hours: int = 24) -> list:
"""Get CVEs modified within the last N hours"""
from main import CVE
cutoff_time = datetime.utcnow() - timedelta(hours=hours)
recent_cves = self.db_session.query(CVE).filter(
CVE.updated_at >= cutoff_time
).all()
return [cve.cve_id for cve in recent_cves]
async def _sync_specific_cves(self, cve_ids: list) -> dict:
"""Sync PoC data for specific CVEs"""
total_processed = 0
total_pocs_found = 0
for cve_id in cve_ids:
try:
result = await self.nomi_sec_client.sync_cve_pocs(cve_id)
total_processed += 1
total_pocs_found += result.get('pocs_found', 0)
# Small delay to avoid overwhelming the API
await asyncio.sleep(0.5)
except Exception as e:
logger.error(f"Error syncing PoCs for {cve_id}: {e}")
continue
return {
'total_processed': total_processed,
'total_pocs_found': total_pocs_found
}
async def get_seeding_status(self) -> dict:
"""Get current seeding status and statistics"""
from main import CVE, SigmaRule, BulkProcessingJob
# Get database statistics
total_cves = self.db_session.query(CVE).count()
bulk_processed_cves = self.db_session.query(CVE).filter(
CVE.bulk_processed == True
).count()
cves_with_pocs = self.db_session.query(CVE).filter(
CVE.poc_count > 0
).count()
total_rules = self.db_session.query(SigmaRule).count()
nomi_sec_rules = self.db_session.query(SigmaRule).filter(
SigmaRule.poc_source == 'nomi_sec'
).count()
# Get recent job status
recent_jobs = self.db_session.query(BulkProcessingJob).order_by(
BulkProcessingJob.created_at.desc()
).limit(5).all()
job_status = []
for job in recent_jobs:
job_status.append({
'id': str(job.id),
'job_type': job.job_type,
'status': job.status,
'created_at': job.created_at,
'completed_at': job.completed_at,
'processed_items': job.processed_items,
'total_items': job.total_items,
'failed_items': job.failed_items
})
return {
'database_stats': {
'total_cves': total_cves,
'bulk_processed_cves': bulk_processed_cves,
'cves_with_pocs': cves_with_pocs,
'total_rules': total_rules,
'nomi_sec_rules': nomi_sec_rules,
'poc_coverage': (cves_with_pocs / total_cves * 100) if total_cves > 0 else 0,
'nomi_sec_coverage': (nomi_sec_rules / total_rules * 100) if total_rules > 0 else 0
},
'recent_jobs': job_status,
'nvd_data_status': await self._get_nvd_data_status(),
'nomi_sec_status': await self.nomi_sec_client.get_sync_status()
}
async def _get_nvd_data_status(self) -> dict:
"""Get NVD data status"""
from main import CVE
# Get year distribution
year_counts = {}
cves = self.db_session.query(CVE).all()
for cve in cves:
if cve.published_date:
year = cve.published_date.year
year_counts[year] = year_counts.get(year, 0) + 1
# Get source distribution
source_counts = {}
for cve in cves:
source = cve.data_source or 'unknown'
source_counts[source] = source_counts.get(source, 0) + 1
return {
'year_distribution': year_counts,
'source_distribution': source_counts,
'total_cves': len(cves),
'date_range': {
'earliest': min(cve.published_date for cve in cves if cve.published_date),
'latest': max(cve.published_date for cve in cves if cve.published_date)
} if cves else None
}
# Standalone script functionality
async def main():
"""Main function for standalone execution"""
from main import SessionLocal, engine, Base
# Create tables
Base.metadata.create_all(bind=engine)
# Create database session
db_session = SessionLocal()
try:
# Create bulk seeder
seeder = BulkSeeder(db_session)
# Get current status
status = await seeder.get_seeding_status()
print(f"Current Status: {status['database_stats']['total_cves']} CVEs in database")
# Perform full bulk seed if database is empty
if status['database_stats']['total_cves'] == 0:
print("Database is empty. Starting full bulk seed...")
results = await seeder.full_bulk_seed(start_year=2020) # Start from 2020 for faster testing
print(f"Bulk seed completed: {results}")
else:
print("Database contains data. Running incremental update...")
results = await seeder.incremental_update()
print(f"Incremental update completed: {results}")
finally:
db_session.close()
if __name__ == "__main__":
asyncio.run(main())
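A minimal usage sketch for the coordinator above, assuming `SessionLocal` from main.py is importable: rerun only the nomi-sec PoC phase against an already-seeded database by skipping NVD processing.

import asyncio
from main import SessionLocal
from bulk_seeder import BulkSeeder

async def poc_only_sync():
    db = SessionLocal()
    try:
        seeder = BulkSeeder(db)
        # skip_nvd=True leaves existing CVE rows untouched; only PoC sync and rule generation run
        results = await seeder.full_bulk_seed(skip_nvd=True)
        print(results['status'], results.get('nomi_sec_results'))
    finally:
        db.close()

asyncio.run(poc_only_sync())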

backend/enhanced_sigma_generator.py (new file, 438 lines)

@@ -0,0 +1,438 @@
"""
Enhanced SIGMA Rule Generator
Generates improved SIGMA rules using nomi-sec PoC data and traditional indicators
"""
import json
import logging
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from sqlalchemy.orm import Session
import re
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class EnhancedSigmaGenerator:
"""Enhanced SIGMA rule generator using nomi-sec PoC data"""
def __init__(self, db_session: Session):
self.db_session = db_session
async def generate_enhanced_rule(self, cve) -> dict:
"""Generate enhanced SIGMA rule for a CVE using PoC data"""
from main import SigmaRule, RuleTemplate
try:
# Get PoC data
poc_data = cve.poc_data or []
# Find the best quality PoC
best_poc = None
if poc_data:
best_poc = max(poc_data, key=lambda x: x.get('quality_analysis', {}).get('quality_score', 0))
# Select appropriate template based on PoC analysis
template = await self._select_template(cve, best_poc)
if not template:
logger.warning(f"No suitable template found for {cve.cve_id}")
return {'success': False, 'error': 'No suitable template'}
# Generate rule content
rule_content = await self._generate_rule_content(cve, template, poc_data)
# Calculate confidence level
confidence_level = self._calculate_confidence_level(cve, poc_data)
# Store or update SIGMA rule
existing_rule = self.db_session.query(SigmaRule).filter(
SigmaRule.cve_id == cve.cve_id
).first()
rule_data = {
'cve_id': cve.cve_id,
'rule_name': f"{cve.cve_id} Enhanced Detection",
'rule_content': rule_content,
'detection_type': template.template_name,
'log_source': self._extract_log_source(template.template_name),
'confidence_level': confidence_level,
'auto_generated': True,
'exploit_based': len(poc_data) > 0,
'poc_source': 'nomi_sec',
'poc_quality_score': best_poc.get('quality_analysis', {}).get('quality_score', 0) if best_poc else 0,
'nomi_sec_data': {
'total_pocs': len(poc_data),
'best_poc_quality': best_poc.get('quality_analysis', {}).get('quality_score', 0) if best_poc else 0,
'total_stars': sum(p.get('stargazers_count', 0) for p in poc_data),
'avg_stars': sum(p.get('stargazers_count', 0) for p in poc_data) / len(poc_data) if poc_data else 0
},
'github_repos': [p.get('html_url', '') for p in poc_data],
'exploit_indicators': json.dumps(self._combine_exploit_indicators(poc_data)),
'updated_at': datetime.utcnow()
}
if existing_rule:
# Update existing rule
for key, value in rule_data.items():
setattr(existing_rule, key, value)
logger.info(f"Updated SIGMA rule for {cve.cve_id}")
else:
# Create new rule
new_rule = SigmaRule(**rule_data)
self.db_session.add(new_rule)
logger.info(f"Created new SIGMA rule for {cve.cve_id}")
self.db_session.commit()
return {
'success': True,
'cve_id': cve.cve_id,
'template': template.template_name,
'confidence_level': confidence_level,
'poc_count': len(poc_data),
'quality_score': best_poc.get('quality_analysis', {}).get('quality_score', 0) if best_poc else 0
}
except Exception as e:
logger.error(f"Error generating enhanced rule for {cve.cve_id}: {e}")
return {'success': False, 'error': str(e)}
async def _select_template(self, cve, best_poc: Optional[dict]) -> Optional[object]:
"""Select the most appropriate template based on CVE and PoC analysis"""
from main import RuleTemplate
templates = self.db_session.query(RuleTemplate).all()
if not templates:
logger.warning("No rule templates found in database")
return None
# Score templates based on relevance
template_scores = {}
for template in templates:
score = 0
# Score based on PoC indicators (highest priority)
if best_poc:
indicators = best_poc.get('exploit_indicators', {})
score += self._score_template_poc_match(template, indicators)
# Score based on CVE description
score += self._score_template_cve_match(template, cve)
# Score based on affected products
if cve.affected_products:
score += self._score_template_product_match(template, cve.affected_products)
template_scores[template] = score
# Return template with highest score
if template_scores:
best_template = max(template_scores, key=template_scores.get)
logger.info(f"Selected template {best_template.template_name} with score {template_scores[best_template]}")
return best_template
return None
def _score_template_poc_match(self, template: object, indicators: dict) -> int:
"""Score template based on PoC indicators"""
score = 0
template_name = template.template_name.lower()
# Process-based templates
if 'process' in template_name or 'execution' in template_name:
if indicators.get('processes') or indicators.get('commands'):
score += 30
# Network-based templates
if 'network' in template_name or 'connection' in template_name:
if indicators.get('network') or indicators.get('urls'):
score += 30
# File-based templates
if 'file' in template_name or 'modification' in template_name:
if indicators.get('files'):
score += 30
# PowerShell templates
if 'powershell' in template_name:
processes = indicators.get('processes', [])
if any('powershell' in p.lower() for p in processes):
score += 35
return score
def _score_template_cve_match(self, template: object, cve) -> int:
"""Score template based on CVE description"""
score = 0
template_name = template.template_name.lower()
description = (cve.description or '').lower()
# Keyword matching
if 'remote' in description and 'execution' in description:
if 'process' in template_name or 'execution' in template_name:
score += 20
if 'powershell' in description:
if 'powershell' in template_name:
score += 25
if 'network' in description or 'http' in description:
if 'network' in template_name:
score += 20
if 'file' in description or 'upload' in description:
if 'file' in template_name:
score += 20
return score
def _score_template_product_match(self, template: object, affected_products: list) -> int:
"""Score template based on affected products"""
score = 0
if not template.applicable_product_patterns:
return 0
for pattern in template.applicable_product_patterns:
pattern_lower = pattern.lower()
for product in affected_products:
product_lower = product.lower()
if pattern_lower in product_lower:
score += 10
break
return score
async def _generate_rule_content(self, cve, template: object, poc_data: list) -> str:
"""Generate the actual SIGMA rule content"""
# Combine all exploit indicators
combined_indicators = self._combine_exploit_indicators(poc_data)
# Get base template content
rule_content = template.template_content
# Replace template placeholders
replacements = {
'{{CVE_ID}}': cve.cve_id,
'{{TITLE}}': f"{cve.cve_id} Enhanced Detection",
'{{DESCRIPTION}}': self._generate_description(cve, poc_data),
'{{LEVEL}}': self._calculate_confidence_level(cve, poc_data).lower(),
'{{REFERENCES}}': self._generate_references(cve, poc_data),
'{{TAGS}}': self._generate_tags(cve, poc_data),
'{{PROCESSES}}': self._format_indicators(combined_indicators.get('processes', [])),
'{{FILES}}': self._format_indicators(combined_indicators.get('files', [])),
'{{COMMANDS}}': self._format_indicators(combined_indicators.get('commands', [])),
'{{NETWORK}}': self._format_indicators(combined_indicators.get('network', [])),
'{{URLS}}': self._format_indicators(combined_indicators.get('urls', [])),
'{{REGISTRY}}': self._format_indicators(combined_indicators.get('registry', []))
}
# Apply replacements
for placeholder, value in replacements.items():
rule_content = rule_content.replace(placeholder, value)
# Add enhanced detection based on PoC quality
if poc_data:
rule_content = self._enhance_detection_logic(rule_content, combined_indicators, poc_data)
return rule_content
def _combine_exploit_indicators(self, poc_data: list) -> dict:
"""Combine exploit indicators from all PoCs"""
combined = {
'processes': [],
'files': [],
'commands': [],
'network': [],
'urls': [],
'registry': []
}
for poc in poc_data:
indicators = poc.get('exploit_indicators', {})
for key in combined.keys():
if key in indicators:
combined[key].extend(indicators[key])
# Deduplicate and filter
for key in combined.keys():
combined[key] = list(set(combined[key]))
# Remove empty and invalid entries
combined[key] = [item for item in combined[key] if item and len(item) > 2]
return combined
def _generate_description(self, cve, poc_data: list) -> str:
"""Generate enhanced rule description"""
base_desc = f"Detection for {cve.cve_id}"
if cve.description:
# Extract key terms from CVE description
desc_words = cve.description.lower().split()
key_terms = [word for word in desc_words if word in [
'remote', 'execution', 'injection', 'bypass', 'privilege', 'escalation',
'overflow', 'disclosure', 'traversal', 'deserialization'
]]
if key_terms:
base_desc += f" involving {', '.join(set(key_terms[:3]))}"
if poc_data:
total_pocs = len(poc_data)
total_stars = sum(p.get('stargazers_count', 0) for p in poc_data)
base_desc += f" [Enhanced with {total_pocs} PoC(s), {total_stars} stars]"
return base_desc
def _generate_references(self, cve, poc_data: list) -> str:
"""Generate references section"""
refs = []
# Add CVE reference
refs.append(f"https://nvd.nist.gov/vuln/detail/{cve.cve_id}")
# Add top PoC references (max 3)
if poc_data:
sorted_pocs = sorted(poc_data, key=lambda x: x.get('stargazers_count', 0), reverse=True)
for poc in sorted_pocs[:3]:
if poc.get('html_url'):
refs.append(poc['html_url'])
return '\n'.join(f" - {ref}" for ref in refs)
def _generate_tags(self, cve, poc_data: list) -> str:
"""Generate MITRE ATT&CK tags and other tags"""
tags = []
# CVE tag
tags.append(cve.cve_id.lower())
# Add technique tags based on indicators
combined_indicators = self._combine_exploit_indicators(poc_data)
if combined_indicators.get('processes'):
tags.append('attack.t1059') # Command and Scripting Interpreter
if combined_indicators.get('network'):
tags.append('attack.t1071') # Application Layer Protocol
if combined_indicators.get('files'):
tags.append('attack.t1105') # Ingress Tool Transfer
if any('powershell' in p.lower() for p in combined_indicators.get('processes', [])):
tags.append('attack.t1059.001') # PowerShell
# Add PoC quality tags
if poc_data:
tags.append('exploit.poc')
best_poc = max(poc_data, key=lambda x: x.get('quality_analysis', {}).get('quality_score', 0))
quality_tier = best_poc.get('quality_analysis', {}).get('quality_tier', 'poor')
tags.append(f'poc.quality.{quality_tier}')
return '\n'.join(f" - {tag}" for tag in tags)
def _format_indicators(self, indicators: list) -> str:
"""Format indicators for SIGMA rule"""
if not indicators:
return ''
# Limit indicators to avoid overly complex rules
limited_indicators = indicators[:10]
formatted = []
for indicator in limited_indicators:
# Escape special characters for SIGMA
escaped = indicator.replace('\\', '\\\\').replace('*', '\\*').replace('?', '\\?')
formatted.append(f' - "{escaped}"')
return '\n'.join(formatted)
def _enhance_detection_logic(self, rule_content: str, indicators: dict, poc_data: list) -> str:
"""Enhance detection logic based on PoC quality and indicators"""
# If we have high-quality PoCs, add additional detection conditions
best_poc = max(poc_data, key=lambda x: x.get('quality_analysis', {}).get('quality_score', 0))
quality_score = best_poc.get('quality_analysis', {}).get('quality_score', 0)
if quality_score > 60: # High quality PoC
# Add more specific detection conditions
if indicators.get('processes') and indicators.get('commands'):
additional_condition = """
process_and_command:
Image|contains: {{PROCESSES}}
CommandLine|contains: {{COMMANDS}}"""
# Insert before the condition line
rule_content = rule_content.replace(
'condition: selection',
additional_condition + '\n condition: selection or process_and_command'
)
return rule_content
def _calculate_confidence_level(self, cve, poc_data: list) -> str:
"""Calculate confidence level based on CVE and PoC data"""
score = 0
# CVSS score factor
if cve.cvss_score:
if cve.cvss_score >= 9.0:
score += 40
elif cve.cvss_score >= 7.0:
score += 30
elif cve.cvss_score >= 5.0:
score += 20
else:
score += 10
# PoC quality factor
if poc_data:
total_stars = sum(p.get('stargazers_count', 0) for p in poc_data)
poc_count = len(poc_data)
score += min(total_stars, 30) # Max 30 points for stars
score += min(poc_count * 5, 20) # Max 20 points for PoC count
# Quality tier bonus
best_poc = max(poc_data, key=lambda x: x.get('quality_analysis', {}).get('quality_score', 0))
quality_tier = best_poc.get('quality_analysis', {}).get('quality_tier', 'poor')
tier_bonus = {
'excellent': 20,
'good': 15,
'fair': 10,
'poor': 5,
'very_poor': 0
}
score += tier_bonus.get(quality_tier, 0)
# Determine confidence level
if score >= 80:
return 'HIGH'
elif score >= 60:
return 'MEDIUM'
elif score >= 40:
return 'LOW'
else:
return 'INFORMATIONAL'
def _extract_log_source(self, template_name: str) -> str:
"""Extract log source from template name"""
template_lower = template_name.lower()
if 'process' in template_lower or 'execution' in template_lower:
return 'process_creation'
elif 'network' in template_lower:
return 'network_connection'
elif 'file' in template_lower:
return 'file_event'
elif 'powershell' in template_lower:
return 'powershell'
elif 'registry' in template_lower:
return 'registry_event'
else:
return 'generic'
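The rule templates themselves live in the database (RuleTemplate.template_content) and are not part of this commit. The following is a hypothetical template body, sketched only to show the placeholders that _generate_rule_content() substitutes and the "condition: selection" line that _enhance_detection_logic() extends:

# Hypothetical template shape for illustration; real templates are seeded separately.
EXAMPLE_TEMPLATE = """
title: {{TITLE}}
description: {{DESCRIPTION}}
references:
{{REFERENCES}}
tags:
{{TAGS}}
logsource:
    category: process_creation
detection:
    selection:
        Image|contains:
{{PROCESSES}}
        CommandLine|contains:
{{COMMANDS}}
    condition: selection
level: {{LEVEL}}
"""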

backend/main.py

@@ -1,7 +1,7 @@
from fastapi import FastAPI, HTTPException, BackgroundTasks, Depends
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from sqlalchemy import create_engine, Column, String, Text, DECIMAL, TIMESTAMP, Boolean, ARRAY, Integer, JSON, func
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, Session
from sqlalchemy.dialects.postgresql import UUID
@@ -19,6 +1,16 @@ import base64
from github import Github
from urllib.parse import urlparse
import hashlib
import logging
import threading
# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Global job tracking
running_jobs = {}
job_cancellation_flags = {}
# Database setup
DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://cve_user:cve_password@localhost:5432/cve_sigma_db")
@@ -39,6 +49,13 @@ class CVE(Base):
modified_date = Column(TIMESTAMP)
affected_products = Column(ARRAY(String))
reference_urls = Column(ARRAY(String))
# Bulk processing fields
data_source = Column(String(20), default='nvd_api') # 'nvd_api', 'nvd_bulk', 'manual'
nvd_json_version = Column(String(10), default='2.0')
bulk_processed = Column(Boolean, default=False)
# nomi-sec PoC fields
poc_count = Column(Integer, default=0)
poc_data = Column(JSON) # Store nomi-sec PoC metadata
created_at = Column(TIMESTAMP, default=datetime.utcnow)
updated_at = Column(TIMESTAMP, default=datetime.utcnow)
@@ -56,6 +73,10 @@ class SigmaRule(Base):
exploit_based = Column(Boolean, default=False)
github_repos = Column(ARRAY(String))
exploit_indicators = Column(Text)  # JSON string of extracted indicators
# Enhanced fields for new data sources
poc_source = Column(String(20), default='github_search') # 'github_search', 'nomi_sec', 'manual'
poc_quality_score = Column(Integer, default=0) # Based on star count, activity, etc.
nomi_sec_data = Column(JSON) # Store nomi-sec PoC metadata
created_at = Column(TIMESTAMP, default=datetime.utcnow)
updated_at = Column(TIMESTAMP, default=datetime.utcnow)
@@ -69,6 +90,23 @@ class RuleTemplate(Base):
description = Column(Text)
created_at = Column(TIMESTAMP, default=datetime.utcnow)
class BulkProcessingJob(Base):
__tablename__ = "bulk_processing_jobs"
id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
job_type = Column(String(50), nullable=False) # 'nvd_bulk_seed', 'nomi_sec_sync', 'incremental_update'
status = Column(String(20), default='pending') # 'pending', 'running', 'completed', 'failed', 'cancelled'
year = Column(Integer) # For year-based processing
total_items = Column(Integer, default=0)
processed_items = Column(Integer, default=0)
failed_items = Column(Integer, default=0)
error_message = Column(Text)
job_metadata = Column(JSON) # Additional job-specific data
started_at = Column(TIMESTAMP)
completed_at = Column(TIMESTAMP)
cancelled_at = Column(TIMESTAMP)
created_at = Column(TIMESTAMP, default=datetime.utcnow)
# Pydantic models
class CVEResponse(BaseModel):
id: str
@@ -941,12 +979,341 @@ async def get_stats(db: Session = Depends(get_db)):
total_rules = db.query(SigmaRule).count()
recent_cves = db.query(CVE).filter(CVE.published_date >= datetime.utcnow() - timedelta(days=7)).count()
# Enhanced stats with bulk processing info
bulk_processed_cves = db.query(CVE).filter(CVE.bulk_processed == True).count()
cves_with_pocs = db.query(CVE).filter(CVE.poc_count > 0).count()
nomi_sec_rules = db.query(SigmaRule).filter(SigmaRule.poc_source == 'nomi_sec').count()
return {
"total_cves": total_cves,
"total_sigma_rules": total_rules,
"recent_cves_7_days": recent_cves,
"bulk_processed_cves": bulk_processed_cves,
"cves_with_pocs": cves_with_pocs,
"nomi_sec_rules": nomi_sec_rules,
"poc_coverage": (cves_with_pocs / total_cves * 100) if total_cves > 0 else 0,
"nomi_sec_coverage": (nomi_sec_rules / total_rules * 100) if total_rules > 0 else 0
}
# New bulk processing endpoints
@app.post("/api/bulk-seed")
async def start_bulk_seed(background_tasks: BackgroundTasks,
start_year: int = 2002,
end_year: Optional[int] = None,
skip_nvd: bool = False,
skip_nomi_sec: bool = False,
db: Session = Depends(get_db)):
"""Start bulk seeding process"""
async def bulk_seed_task():
try:
from bulk_seeder import BulkSeeder
seeder = BulkSeeder(db)
result = await seeder.full_bulk_seed(
start_year=start_year,
end_year=end_year,
skip_nvd=skip_nvd,
skip_nomi_sec=skip_nomi_sec
)
logger.info(f"Bulk seed completed: {result}")
except Exception as e:
logger.error(f"Bulk seed failed: {e}")
import traceback
traceback.print_exc()
background_tasks.add_task(bulk_seed_task)
return {
"message": "Bulk seeding process started",
"status": "started",
"start_year": start_year,
"end_year": end_year or datetime.now().year,
"skip_nvd": skip_nvd,
"skip_nomi_sec": skip_nomi_sec
}
@app.post("/api/incremental-update")
async def start_incremental_update(background_tasks: BackgroundTasks, db: Session = Depends(get_db)):
"""Start incremental update process"""
async def incremental_update_task():
try:
from bulk_seeder import BulkSeeder
seeder = BulkSeeder(db)
result = await seeder.incremental_update()
logger.info(f"Incremental update completed: {result}")
except Exception as e:
logger.error(f"Incremental update failed: {e}")
import traceback
traceback.print_exc()
background_tasks.add_task(incremental_update_task)
return {
"message": "Incremental update process started",
"status": "started"
}
@app.post("/api/sync-nomi-sec")
async def sync_nomi_sec(background_tasks: BackgroundTasks,
cve_id: Optional[str] = None,
batch_size: int = 50,
db: Session = Depends(get_db)):
"""Synchronize nomi-sec PoC data"""
# Create job record
job = BulkProcessingJob(
job_type='nomi_sec_sync',
status='pending',
job_metadata={
'cve_id': cve_id,
'batch_size': batch_size
}
)
db.add(job)
db.commit()
db.refresh(job)
job_id = str(job.id)
running_jobs[job_id] = job
job_cancellation_flags[job_id] = False
async def sync_task():
try:
job.status = 'running'
job.started_at = datetime.utcnow()
db.commit()
from nomi_sec_client import NomiSecClient
client = NomiSecClient(db)
if cve_id:
# Sync specific CVE
if job_cancellation_flags.get(job_id, False):
logger.info(f"Job {job_id} cancelled before starting")
return
result = await client.sync_cve_pocs(cve_id)
logger.info(f"Nomi-sec sync for {cve_id}: {result}")
else:
# Sync all CVEs with cancellation support
result = await client.bulk_sync_all_cves(
batch_size=batch_size,
cancellation_flag=lambda: job_cancellation_flags.get(job_id, False)
)
logger.info(f"Nomi-sec bulk sync completed: {result}")
# Update job status if not cancelled
if not job_cancellation_flags.get(job_id, False):
job.status = 'completed'
job.completed_at = datetime.utcnow()
db.commit()
except Exception as e:
if not job_cancellation_flags.get(job_id, False):
job.status = 'failed'
job.error_message = str(e)
job.completed_at = datetime.utcnow()
db.commit()
logger.error(f"Nomi-sec sync failed: {e}")
import traceback
traceback.print_exc()
finally:
# Clean up tracking
running_jobs.pop(job_id, None)
job_cancellation_flags.pop(job_id, None)
background_tasks.add_task(sync_task)
return {
"message": f"Nomi-sec sync started" + (f" for {cve_id}" if cve_id else " for all CVEs"),
"status": "started",
"job_id": job_id,
"cve_id": cve_id,
"batch_size": batch_size
}
@app.get("/api/bulk-jobs")
async def get_bulk_jobs(limit: int = 10, db: Session = Depends(get_db)):
"""Get bulk processing job status"""
jobs = db.query(BulkProcessingJob).order_by(
BulkProcessingJob.created_at.desc()
).limit(limit).all()
result = []
for job in jobs:
job_dict = {
'id': str(job.id),
'job_type': job.job_type,
'status': job.status,
'year': job.year,
'total_items': job.total_items,
'processed_items': job.processed_items,
'failed_items': job.failed_items,
'error_message': job.error_message,
'metadata': job.job_metadata,
'started_at': job.started_at,
'completed_at': job.completed_at,
'created_at': job.created_at
}
result.append(job_dict)
return result
@app.get("/api/bulk-status")
async def get_bulk_status(db: Session = Depends(get_db)):
"""Get comprehensive bulk processing status"""
try:
from bulk_seeder import BulkSeeder
seeder = BulkSeeder(db)
status = await seeder.get_seeding_status()
return status
except Exception as e:
logger.error(f"Error getting bulk status: {e}")
return {"error": str(e)}
@app.get("/api/poc-stats")
async def get_poc_stats(db: Session = Depends(get_db)):
"""Get PoC-related statistics"""
try:
from nomi_sec_client import NomiSecClient
client = NomiSecClient(db)
stats = await client.get_sync_status()
# Additional PoC statistics
high_quality_cves = db.query(CVE).filter(
CVE.poc_count > 0,
func.json_extract_path_text(CVE.poc_data, '0', 'quality_analysis', 'quality_score').cast(Integer) > 60
).count()
stats.update({
'high_quality_cves': high_quality_cves,
'avg_poc_count': db.query(func.avg(CVE.poc_count)).filter(CVE.poc_count > 0).scalar() or 0
})
return stats
except Exception as e:
logger.error(f"Error getting PoC stats: {e}")
return {"error": str(e)}
@app.post("/api/regenerate-rules")
async def regenerate_sigma_rules(background_tasks: BackgroundTasks,
force: bool = False,
db: Session = Depends(get_db)):
"""Regenerate SIGMA rules using enhanced nomi-sec data"""
async def regenerate_task():
try:
from enhanced_sigma_generator import EnhancedSigmaGenerator
generator = EnhancedSigmaGenerator(db)
# Get CVEs with PoC data
cves_with_pocs = db.query(CVE).filter(CVE.poc_count > 0).all()
rules_generated = 0
rules_updated = 0
for cve in cves_with_pocs:
# Check if we should regenerate
existing_rule = db.query(SigmaRule).filter(
SigmaRule.cve_id == cve.cve_id
).first()
if existing_rule and existing_rule.poc_source == 'nomi_sec' and not force:
continue
# Generate enhanced rule
result = await generator.generate_enhanced_rule(cve)
if result['success']:
if existing_rule:
rules_updated += 1
else:
rules_generated += 1
logger.info(f"Rule regeneration completed: {rules_generated} new, {rules_updated} updated")
except Exception as e:
logger.error(f"Rule regeneration failed: {e}")
import traceback
traceback.print_exc()
background_tasks.add_task(regenerate_task)
return {
"message": "SIGMA rule regeneration started",
"status": "started",
"force": force
}
@app.post("/api/cancel-job/{job_id}")
async def cancel_job(job_id: str, db: Session = Depends(get_db)):
"""Cancel a running job"""
try:
# Find the job in the database
job = db.query(BulkProcessingJob).filter(BulkProcessingJob.id == job_id).first()
if not job:
raise HTTPException(status_code=404, detail="Job not found")
if job.status not in ['pending', 'running']:
raise HTTPException(status_code=400, detail=f"Cannot cancel job with status: {job.status}")
# Set cancellation flag
job_cancellation_flags[job_id] = True
# Update job status
job.status = 'cancelled'
job.cancelled_at = datetime.utcnow()
job.error_message = "Job cancelled by user"
db.commit()
logger.info(f"Job {job_id} cancellation requested")
return {
"message": f"Job {job_id} cancellation requested",
"status": "cancelled",
"job_id": job_id
}
except HTTPException:
raise
except Exception as e:
logger.error(f"Error cancelling job {job_id}: {e}")
raise HTTPException(status_code=500, detail=str(e))
@app.get("/api/running-jobs")
async def get_running_jobs(db: Session = Depends(get_db)):
"""Get all currently running jobs"""
try:
jobs = db.query(BulkProcessingJob).filter(
BulkProcessingJob.status.in_(['pending', 'running'])
).order_by(BulkProcessingJob.created_at.desc()).all()
result = []
for job in jobs:
result.append({
'id': str(job.id),
'job_type': job.job_type,
'status': job.status,
'year': job.year,
'total_items': job.total_items,
'processed_items': job.processed_items,
'failed_items': job.failed_items,
'error_message': job.error_message,
'started_at': job.started_at,
'created_at': job.created_at,
'can_cancel': job.status in ['pending', 'running']
})
return result
except Exception as e:
logger.error(f"Error getting running jobs: {e}")
raise HTTPException(status_code=500, detail=str(e))
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8000)
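A minimal sketch of driving the new job endpoints from Python, assuming the API above is reachable on localhost:8000; the CVE id is only an example value and must already exist in the database:

import requests

BASE = "http://localhost:8000"

# Start a nomi-sec PoC sync for a single CVE and capture the job id it returns
job = requests.post(f"{BASE}/api/sync-nomi-sec", params={"cve_id": "CVE-2021-44228"}).json()
job_id = job["job_id"]

# Inspect currently running jobs, then request cancellation if needed
print(requests.get(f"{BASE}/api/running-jobs").json())
print(requests.post(f"{BASE}/api/cancel-job/{job_id}").json())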

backend/nomi_sec_client.py (new file, 477 lines)

@@ -0,0 +1,477 @@
"""
Nomi-sec PoC-in-GitHub Integration Client
Interfaces with the nomi-sec PoC-in-GitHub API for curated exploit data
"""
import aiohttp
import asyncio
import json
import logging
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Tuple
from sqlalchemy.orm import Session
from sqlalchemy import and_, or_
import time
import re
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class NomiSecClient:
"""Client for interacting with nomi-sec PoC-in-GitHub API"""
def __init__(self, db_session: Session):
self.db_session = db_session
self.base_url = "https://poc-in-github.motikan2010.net/api/v1"
self.rss_url = "https://poc-in-github.motikan2010.net/rss"
# Rate limiting
self.rate_limit_delay = 1.0 # 1 second between requests
self.last_request_time = 0
# Cache for recently fetched data
self.cache = {}
self.cache_ttl = 300 # 5 minutes
async def _make_request(self, session: aiohttp.ClientSession,
url: str, params: dict = None) -> Optional[dict]:
"""Make a rate-limited request to the API"""
try:
# Rate limiting
current_time = time.time()
time_since_last = current_time - self.last_request_time
if time_since_last < self.rate_limit_delay:
await asyncio.sleep(self.rate_limit_delay - time_since_last)
async with session.get(url, params=params, timeout=30) as response:
self.last_request_time = time.time()
if response.status == 200:
return await response.json()
else:
logger.warning(f"API request failed: {response.status} for {url}")
return None
except Exception as e:
logger.error(f"Error making request to {url}: {e}")
return None
async def get_pocs_for_cve(self, cve_id: str) -> List[dict]:
"""Get all PoC repositories for a specific CVE"""
cache_key = f"cve_{cve_id}"
# Check cache
if cache_key in self.cache:
cached_data, timestamp = self.cache[cache_key]
if time.time() - timestamp < self.cache_ttl:
return cached_data
async with aiohttp.ClientSession() as session:
params = {"cve_id": cve_id}
data = await self._make_request(session, self.base_url, params)
if data and "pocs" in data:
pocs = data["pocs"]
# Cache the result
self.cache[cache_key] = (pocs, time.time())
logger.info(f"Found {len(pocs)} PoCs for {cve_id}")
return pocs
else:
logger.info(f"No PoCs found for {cve_id}")
return []
async def get_recent_pocs(self, limit: int = 100) -> List[dict]:
"""Get recent PoCs from the API"""
async with aiohttp.ClientSession() as session:
params = {"limit": limit, "sort": "created_at"}
data = await self._make_request(session, self.base_url, params)
if data and "pocs" in data:
return data["pocs"]
else:
return []
async def get_high_quality_pocs(self, min_stars: int = 5, limit: int = 100) -> List[dict]:
"""Get high-quality PoCs sorted by star count"""
async with aiohttp.ClientSession() as session:
params = {"limit": limit, "sort": "stargazers_count"}
data = await self._make_request(session, self.base_url, params)
if data and "pocs" in data:
# Filter by star count
filtered_pocs = [
poc for poc in data["pocs"]
if int(poc.get("stargazers_count", "0")) >= min_stars
]
return filtered_pocs
else:
return []
async def search_pocs(self, query: str, limit: int = 50) -> List[dict]:
"""Search for PoCs using a query string"""
async with aiohttp.ClientSession() as session:
params = {"limit": limit, "q": query}
data = await self._make_request(session, self.base_url, params)
if data and "pocs" in data:
return data["pocs"]
else:
return []
def analyze_poc_quality(self, poc: dict) -> dict:
"""Analyze the quality of a PoC repository"""
quality_score = 0
factors = {}
# Star count factor (0-40 points)
stars = int(poc.get("stargazers_count", "0"))
star_score = min(stars * 2, 40) # 2 points per star, max 40
quality_score += star_score
factors["star_score"] = star_score
# Recency factor (0-20 points)
try:
updated_at = datetime.fromisoformat(poc.get("updated_at", "").replace('Z', '+00:00'))
days_old = (datetime.now(updated_at.tzinfo) - updated_at).days
recency_score = max(20 - (days_old // 30), 0) # Lose 1 point per month
quality_score += recency_score
factors["recency_score"] = recency_score
except Exception:
factors["recency_score"] = 0
# Description quality factor (0-15 points)
description = poc.get("description", "")
desc_score = 0
if description:
desc_score = min(len(description) // 10, 15) # 1 point per 10 chars, max 15
quality_score += desc_score
factors["description_score"] = desc_score
# Vulnerability description factor (0-15 points)
vuln_desc = poc.get("vuln_description", "")
vuln_score = 0
if vuln_desc:
vuln_score = min(len(vuln_desc) // 20, 15) # 1 point per 20 chars, max 15
quality_score += vuln_score
factors["vuln_description_score"] = vuln_score
# Repository name relevance factor (0-10 points)
repo_name = poc.get("name", "").lower()
cve_id = poc.get("cve_id", "").lower()
name_score = 0
if cve_id and cve_id.replace("-", "") in repo_name.replace("-", ""):
name_score = 10
elif any(keyword in repo_name for keyword in ["exploit", "poc", "cve", "vuln"]):
name_score = 5
quality_score += name_score
factors["name_relevance_score"] = name_score
return {
"quality_score": quality_score,
"factors": factors,
"quality_tier": self._get_quality_tier(quality_score)
}
def _get_quality_tier(self, score: int) -> str:
"""Get quality tier based on score"""
if score >= 80:
return "excellent"
elif score >= 60:
return "good"
elif score >= 40:
return "fair"
elif score >= 20:
return "poor"
else:
return "very_poor"
def extract_exploit_indicators(self, poc: dict) -> dict:
"""Extract exploit indicators from PoC metadata"""
indicators = {
"processes": [],
"files": [],
"network": [],
"registry": [],
"commands": [],
"urls": [],
"techniques": []
}
# Extract from description and vulnerability description
text_sources = [
poc.get("description", ""),
poc.get("vuln_description", ""),
poc.get("name", "")
]
full_text = " ".join(text_sources).lower()
# Process patterns
process_patterns = [
r'\b(cmd\.exe|powershell\.exe|bash|sh|python\.exe|java\.exe)\b',
r'\b(createprocess|shellexecute|system)\b',
r'\b(reverse.?shell|bind.?shell)\b'
]
for pattern in process_patterns:
matches = re.findall(pattern, full_text, re.IGNORECASE)
indicators["processes"].extend(matches)
# File patterns
file_patterns = [
r'\b([a-zA-Z]:\\[^\\]+\\[^\\]+\.[a-zA-Z0-9]+)\b', # Windows paths
r'\b(/[^/\s]+/[^/\s]+\.[a-zA-Z0-9]+)\b', # Unix paths
r'\b(\w+\.(exe|dll|bat|ps1|py|sh|jar))\b' # Common executable files
]
for pattern in file_patterns:
matches = re.findall(pattern, full_text, re.IGNORECASE)
if matches and isinstance(matches[0], tuple):
indicators["files"].extend([m[0] for m in matches])
else:
indicators["files"].extend(matches)
# Network patterns
network_patterns = [
r'\b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\b', # IP addresses
r'\b((?:\d{1,5})|(?:0x[a-fA-F0-9]{1,4}))\b', # Ports
r'\b(http[s]?://[^\s]+)\b' # URLs
]
for pattern in network_patterns:
matches = re.findall(pattern, full_text, re.IGNORECASE)
if pattern.startswith(r'\b(http'):
indicators["urls"].extend(matches)
else:
indicators["network"].extend(matches)
# Command patterns
command_patterns = [
r'\b(curl|wget|nc|netcat|ncat)\b',
r'\b(whoami|id|uname|systeminfo)\b',
r'\b(cat|type|more|less)\b'
]
for pattern in command_patterns:
matches = re.findall(pattern, full_text, re.IGNORECASE)
indicators["commands"].extend(matches)
# Clean up and deduplicate
for key in indicators:
indicators[key] = list(set(indicators[key]))
return indicators
async def sync_cve_pocs(self, cve_id: str) -> dict:
"""Synchronize PoC data for a specific CVE"""
from main import CVE, SigmaRule
# Get existing CVE
cve = self.db_session.query(CVE).filter(CVE.cve_id == cve_id).first()
if not cve:
logger.warning(f"CVE {cve_id} not found in database")
return {"error": "CVE not found"}
# Fetch PoCs from nomi-sec API
pocs = await self.get_pocs_for_cve(cve_id)
if not pocs:
logger.info(f"No PoCs found for {cve_id}")
return {"cve_id": cve_id, "pocs_found": 0}
# Analyze and store PoC data
poc_data = []
github_repos = []
total_quality_score = 0
for poc in pocs:
quality_analysis = self.analyze_poc_quality(poc)
exploit_indicators = self.extract_exploit_indicators(poc)
poc_entry = {
"id": poc.get("id"),
"name": poc.get("name"),
"owner": poc.get("owner"),
"full_name": poc.get("full_name"),
"html_url": poc.get("html_url"),
"description": poc.get("description"),
"stargazers_count": int(poc.get("stargazers_count", "0")),
"created_at": poc.get("created_at"),
"updated_at": poc.get("updated_at"),
"quality_analysis": quality_analysis,
"exploit_indicators": exploit_indicators
}
poc_data.append(poc_entry)
github_repos.append(poc.get("html_url", ""))
total_quality_score += quality_analysis["quality_score"]
# Update CVE with PoC data
cve.poc_count = len(pocs)
cve.poc_data = poc_data
cve.updated_at = datetime.utcnow()
# Update or create SIGMA rule with enhanced PoC data
sigma_rule = self.db_session.query(SigmaRule).filter(
SigmaRule.cve_id == cve_id
).first()
if sigma_rule:
sigma_rule.poc_source = 'nomi_sec'
sigma_rule.poc_quality_score = total_quality_score // len(pocs) if pocs else 0
sigma_rule.nomi_sec_data = {
"total_pocs": len(pocs),
"average_quality": total_quality_score // len(pocs) if pocs else 0,
"best_poc": max(poc_data, key=lambda x: x["quality_analysis"]["quality_score"]) if poc_data else None,
"total_stars": sum(p["stargazers_count"] for p in poc_data)
}
sigma_rule.github_repos = github_repos
sigma_rule.updated_at = datetime.utcnow()
# Extract best exploit indicators
best_indicators = {}
for poc in poc_data:
for key, values in poc["exploit_indicators"].items():
if key not in best_indicators:
best_indicators[key] = []
best_indicators[key].extend(values)
# Deduplicate and store
for key in best_indicators:
best_indicators[key] = list(set(best_indicators[key]))
sigma_rule.exploit_indicators = json.dumps(best_indicators)
self.db_session.commit()
logger.info(f"Synchronized {len(pocs)} PoCs for {cve_id}")
return {
"cve_id": cve_id,
"pocs_found": len(pocs),
"total_quality_score": total_quality_score,
"average_quality": total_quality_score // len(pocs) if pocs else 0,
"github_repos": github_repos
}
async def bulk_sync_all_cves(self, batch_size: int = 100, cancellation_flag: Optional[callable] = None) -> dict:
"""Synchronize PoC data for all CVEs in database"""
from main import CVE, BulkProcessingJob
# Create bulk processing job
job = BulkProcessingJob(
job_type='nomi_sec_sync',
status='running',
started_at=datetime.utcnow(),
job_metadata={'batch_size': batch_size}
)
self.db_session.add(job)
self.db_session.commit()
total_processed = 0
total_found = 0
results = []
try:
# Get all CVEs from database
cves = self.db_session.query(CVE).all()
job.total_items = len(cves)
self.db_session.commit()
# Process in batches
for i in range(0, len(cves), batch_size):
# Check for cancellation before each batch
if cancellation_flag and cancellation_flag():
logger.info("Bulk sync cancelled by user")
job.status = 'cancelled'
job.cancelled_at = datetime.utcnow()
job.error_message = "Job cancelled by user"
break
batch = cves[i:i + batch_size]
for cve in batch:
# Check for cancellation before each CVE
if cancellation_flag and cancellation_flag():
logger.info("Bulk sync cancelled by user")
job.status = 'cancelled'
job.cancelled_at = datetime.utcnow()
job.error_message = "Job cancelled by user"
break
try:
result = await self.sync_cve_pocs(cve.cve_id)
total_processed += 1
if result.get("pocs_found", 0) > 0:
total_found += result["pocs_found"]
results.append(result)
job.processed_items += 1
# Small delay to avoid overwhelming the API
await asyncio.sleep(0.5)
except Exception as e:
logger.error(f"Error syncing PoCs for {cve.cve_id}: {e}")
job.failed_items += 1
# Break out of outer loop if cancelled
if job.status == 'cancelled':
break
# Commit after each batch
self.db_session.commit()
logger.info(f"Processed batch {i//batch_size + 1}/{(len(cves) + batch_size - 1)//batch_size}")
# Update job status (only if not cancelled)
if job.status != 'cancelled':
job.status = 'completed'
job.completed_at = datetime.utcnow()
# Reassign (rather than mutate in place) so SQLAlchemy detects the JSON change
job.job_metadata = {
**(job.job_metadata or {}),
'total_processed': total_processed,
'total_pocs_found': total_found,
'cves_with_pocs': len(results)
}
except Exception as e:
job.status = 'failed'
job.error_message = str(e)
job.completed_at = datetime.utcnow()
logger.error(f"Bulk PoC sync job failed: {e}")
finally:
self.db_session.commit()
return {
'job_id': str(job.id),
'status': job.status,
'total_processed': total_processed,
'total_pocs_found': total_found,
'cves_with_pocs': len(results)
}
async def get_sync_status(self) -> dict:
"""Get synchronization status"""
from main import CVE, SigmaRule
# Count CVEs with PoC data
total_cves = self.db_session.query(CVE).count()
cves_with_pocs = self.db_session.query(CVE).filter(CVE.poc_count > 0).count()
# Count SIGMA rules with nomi-sec data
total_rules = self.db_session.query(SigmaRule).count()
rules_with_nomi_sec = self.db_session.query(SigmaRule).filter(
SigmaRule.poc_source == 'nomi_sec'
).count()
return {
'total_cves': total_cves,
'cves_with_pocs': cves_with_pocs,
'poc_coverage': (cves_with_pocs / total_cves * 100) if total_cves > 0 else 0,
'total_rules': total_rules,
'rules_with_nomi_sec': rules_with_nomi_sec,
'nomi_sec_coverage': (rules_with_nomi_sec / total_rules * 100) if total_rules > 0 else 0
}
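A minimal sketch of exercising the quality scoring and indicator extraction helpers directly. The record below is an assumed shape based only on the fields this client reads; the live API may return more. The database session is not touched by these two methods, so None suffices here:

from nomi_sec_client import NomiSecClient

example_poc = {
    "id": "123456789",
    "cve_id": "CVE-2021-44228",
    "name": "CVE-2021-44228-poc",
    "owner": "example-user",
    "full_name": "example-user/CVE-2021-44228-poc",
    "html_url": "https://github.com/example-user/CVE-2021-44228-poc",
    "description": "PoC that spawns a reverse shell via cmd.exe and powershell.exe",
    "vuln_description": "Remote code execution via a crafted JNDI lookup string",
    "stargazers_count": "42",
    "created_at": "2021-12-10T12:00:00Z",
    "updated_at": "2024-01-15T08:30:00Z",
}

client = NomiSecClient(db_session=None)  # session is unused by these helpers
print(client.analyze_poc_quality(example_poc))         # quality_score, factors, quality_tier
print(client.extract_exploit_indicators(example_poc))  # processes, files, network, commands, ...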

backend/nvd_bulk_processor.py (new file, 483 lines)

@@ -0,0 +1,483 @@
"""
NVD JSON Dataset Bulk Processor
Downloads and processes NVD JSON data feeds for comprehensive CVE seeding
"""
import requests
import json
import gzip
import zipfile
import os
import logging
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Tuple
from sqlalchemy.orm import Session
from sqlalchemy import and_, or_
import asyncio
import aiohttp
from pathlib import Path
import hashlib
import time
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class NVDBulkProcessor:
"""Handles bulk downloading and processing of NVD JSON data feeds"""
def __init__(self, db_session: Session, data_dir: str = "./nvd_data"):
self.db_session = db_session
self.data_dir = Path(data_dir)
self.data_dir.mkdir(exist_ok=True)
self.api_key = os.getenv("NVD_API_KEY")
# NVD JSON 1.1 data feed URLs
self.base_url = "https://nvd.nist.gov/feeds/json/cve/1.1"
self.feed_urls = {
"modified": f"{self.base_url}/nvdcve-1.1-modified.json.gz",
"recent": f"{self.base_url}/nvdcve-1.1-recent.json.gz"
}
# Rate limiting
self.rate_limit_delay = 0.6 # 600ms between requests
self.last_request_time = 0
def get_year_feed_url(self, year: int) -> str:
"""Get the URL for a specific year's CVE feed"""
return f"{self.base_url}/nvdcve-1.1-{year}.json.gz"
def get_meta_url(self, feed_url: str) -> str:
"""Get the metadata URL for a feed"""
return feed_url.replace(".json.gz", ".meta")
async def download_file(self, session: aiohttp.ClientSession, url: str,
destination: Path, check_meta: bool = True) -> bool:
"""Download a file with metadata checking"""
try:
# Check if we should download based on metadata
if check_meta:
meta_url = self.get_meta_url(url)
should_download = await self._should_download_file(session, meta_url, destination)
if not should_download:
logger.info(f"Skipping {url} - file is up to date")
return True
# Rate limiting
current_time = time.time()
time_since_last = current_time - self.last_request_time
if time_since_last < self.rate_limit_delay:
await asyncio.sleep(self.rate_limit_delay - time_since_last)
# Download the file
headers = {}
if self.api_key:
headers["apiKey"] = self.api_key
async with session.get(url, headers=headers, timeout=30) as response:
if response.status == 200:
content = await response.read()
destination.write_bytes(content)
logger.info(f"Downloaded {url} -> {destination}")
self.last_request_time = time.time()
return True
else:
logger.error(f"Failed to download {url}: HTTP {response.status}")
return False
except Exception as e:
logger.error(f"Error downloading {url}: {e}")
return False
async def _should_download_file(self, session: aiohttp.ClientSession,
meta_url: str, destination: Path) -> bool:
"""Check if file should be downloaded based on metadata"""
try:
# Download metadata
async with session.get(meta_url, timeout=10) as response:
if response.status != 200:
return True # Download if we can't get metadata
meta_content = await response.text()
# Parse metadata
meta_data = {}
for line in meta_content.strip().split('\n'):
if ':' in line:
key, value = line.split(':', 1)
meta_data[key.strip()] = value.strip()
# Check if local file exists and matches
if destination.exists():
local_size = destination.stat().st_size
remote_size = int(meta_data.get('size', 0))
remote_sha256 = meta_data.get('sha256', '')
if local_size == remote_size and remote_sha256:
# Verify SHA256 if available
local_sha256 = self._calculate_sha256(destination)
if local_sha256 == remote_sha256:
return False # File is up to date
return True # Download needed
except Exception as e:
logger.warning(f"Error checking metadata for {meta_url}: {e}")
return True # Download if metadata check fails
def _calculate_sha256(self, file_path: Path) -> str:
"""Calculate SHA256 hash of a file"""
sha256_hash = hashlib.sha256()
with open(file_path, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
sha256_hash.update(chunk)
return sha256_hash.hexdigest()
async def download_all_feeds(self, start_year: int = 2002,
end_year: Optional[int] = None) -> List[Path]:
"""Download all NVD JSON feeds"""
if end_year is None:
end_year = datetime.now().year
downloaded_files = []
async with aiohttp.ClientSession() as session:
# Download year-based feeds
for year in range(start_year, end_year + 1):
url = self.get_year_feed_url(year)
filename = f"nvdcve-1.1-{year}.json.gz"
destination = self.data_dir / filename
if await self.download_file(session, url, destination):
downloaded_files.append(destination)
# Download modified and recent feeds
for feed_name, url in self.feed_urls.items():
filename = f"nvdcve-1.1-{feed_name}.json.gz"
destination = self.data_dir / filename
if await self.download_file(session, url, destination):
downloaded_files.append(destination)
return downloaded_files
def extract_json_file(self, compressed_file: Path) -> Path:
"""Extract JSON from compressed file"""
json_file = compressed_file.with_suffix('.json')
try:
if compressed_file.suffix == '.gz':
with gzip.open(compressed_file, 'rt', encoding='utf-8') as f_in:
with open(json_file, 'w', encoding='utf-8') as f_out:
f_out.write(f_in.read())
elif compressed_file.suffix == '.zip':
with zipfile.ZipFile(compressed_file, 'r') as zip_ref:
zip_ref.extractall(self.data_dir)
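                # Note: the .zip branch extracts everything into data_dir and assumes the
                # archive contains a JSON file matching the name derived above
                # (the .zip filename with a .json suffix).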
else:
# File is already uncompressed
return compressed_file
logger.info(f"Extracted {compressed_file} -> {json_file}")
return json_file
except Exception as e:
logger.error(f"Error extracting {compressed_file}: {e}")
raise
def process_json_file(self, json_file: Path) -> Tuple[int, int]:
"""Process a single JSON file and return (processed, failed) counts"""
from main import CVE, BulkProcessingJob
processed_count = 0
failed_count = 0
try:
with open(json_file, 'r', encoding='utf-8') as f:
data = json.load(f)
cve_items = data.get('CVE_Items', [])
logger.info(f"Processing {len(cve_items)} CVEs from {json_file}")
for cve_item in cve_items:
try:
cve_data = self._extract_cve_data(cve_item)
if cve_data:
self._store_cve_data(cve_data)
processed_count += 1
else:
failed_count += 1
except Exception as e:
logger.error(f"Error processing CVE item: {e}")
failed_count += 1
# Commit changes
self.db_session.commit()
logger.info(f"Processed {processed_count} CVEs, failed: {failed_count}")
except Exception as e:
logger.error(f"Error processing {json_file}: {e}")
self.db_session.rollback()
raise
return processed_count, failed_count
def _extract_cve_data(self, cve_item: dict) -> Optional[dict]:
"""Extract CVE data from JSON item"""
try:
cve = cve_item.get('cve', {})
impact = cve_item.get('impact', {})
cve_id = cve.get('CVE_data_meta', {}).get('ID', '')
if not cve_id:
return None
# Description
description_data = cve.get('description', {}).get('description_data', [])
description = ''
if description_data:
description = description_data[0].get('value', '')
# CVSS Score
cvss_score = None
severity = None
if 'baseMetricV3' in impact:
cvss_v3 = impact['baseMetricV3'].get('cvssV3', {})
cvss_score = cvss_v3.get('baseScore')
severity = cvss_v3.get('baseSeverity', '').lower()
elif 'baseMetricV2' in impact:
cvss_v2 = impact['baseMetricV2'].get('cvssV2', {})
cvss_score = cvss_v2.get('baseScore')
severity = impact['baseMetricV2'].get('severity', '').lower()
# Dates
published_date = None
modified_date = None
if 'publishedDate' in cve_item:
published_date = datetime.fromisoformat(
cve_item['publishedDate'].replace('Z', '+00:00')
)
if 'lastModifiedDate' in cve_item:
modified_date = datetime.fromisoformat(
cve_item['lastModifiedDate'].replace('Z', '+00:00')
)
# Affected products (from CPE data)
affected_products = []
configurations = cve_item.get('configurations', {})
for node in configurations.get('nodes', []):
for cpe_match in node.get('cpe_match', []):
if cpe_match.get('vulnerable', False):
cpe_uri = cpe_match.get('cpe23Uri', '')
if cpe_uri:
affected_products.append(cpe_uri)
# Reference URLs
reference_urls = []
references = cve.get('references', {}).get('reference_data', [])
for ref in references:
url = ref.get('url', '')
if url:
reference_urls.append(url)
return {
'cve_id': cve_id,
'description': description,
'cvss_score': cvss_score,
'severity': severity,
'published_date': published_date,
'modified_date': modified_date,
'affected_products': affected_products,
'reference_urls': reference_urls,
'data_source': 'nvd_bulk',
'nvd_json_version': '1.1',
'bulk_processed': True
}
except Exception as e:
logger.error(f"Error extracting CVE data: {e}")
return None
def _store_cve_data(self, cve_data: dict):
"""Store CVE data in database"""
from main import CVE
# Check if CVE already exists
existing_cve = self.db_session.query(CVE).filter(
CVE.cve_id == cve_data['cve_id']
).first()
if existing_cve:
# Update existing CVE
for key, value in cve_data.items():
setattr(existing_cve, key, value)
existing_cve.updated_at = datetime.utcnow()
logger.debug(f"Updated CVE {cve_data['cve_id']}")
else:
# Create new CVE
new_cve = CVE(**cve_data)
self.db_session.add(new_cve)
logger.debug(f"Created new CVE {cve_data['cve_id']}")
async def bulk_seed_database(self, start_year: int = 2002,
end_year: Optional[int] = None) -> dict:
"""Perform complete bulk seeding of the database"""
from main import BulkProcessingJob
if end_year is None:
end_year = datetime.now().year
# Create bulk processing job
job = BulkProcessingJob(
job_type='nvd_bulk_seed',
status='running',
started_at=datetime.utcnow(),
job_metadata={
'start_year': start_year,
'end_year': end_year,
'total_years': end_year - start_year + 1
}
)
self.db_session.add(job)
self.db_session.commit()
total_processed = 0
total_failed = 0
results = []
try:
# Download all feeds
logger.info(f"Starting bulk seed from {start_year} to {end_year}")
downloaded_files = await self.download_all_feeds(start_year, end_year)
job.total_items = len(downloaded_files)
self.db_session.commit()
# Process each file
for file_path in downloaded_files:
try:
# Extract JSON file
json_file = self.extract_json_file(file_path)
# Process the JSON file
processed, failed = self.process_json_file(json_file)
total_processed += processed
total_failed += failed
job.processed_items += 1
results.append({
'file': file_path.name,
'processed': processed,
'failed': failed
})
# Clean up extracted file if it's different from original
if json_file != file_path:
json_file.unlink()
self.db_session.commit()
except Exception as e:
logger.error(f"Error processing {file_path}: {e}")
job.failed_items += 1
total_failed += 1
self.db_session.commit()
# Update job status
job.status = 'completed'
job.completed_at = datetime.utcnow()
job.job_metadata.update({
'total_processed': total_processed,
'total_failed': total_failed,
'results': results
})
except Exception as e:
job.status = 'failed'
job.error_message = str(e)
job.completed_at = datetime.utcnow()
logger.error(f"Bulk seed job failed: {e}")
finally:
self.db_session.commit()
return {
'job_id': str(job.id),
'status': job.status,
'total_processed': total_processed,
'total_failed': total_failed,
'results': results
}
async def incremental_update(self) -> dict:
"""Perform incremental update using modified and recent feeds"""
from main import BulkProcessingJob
# Create incremental update job
job = BulkProcessingJob(
job_type='incremental_update',
status='running',
started_at=datetime.utcnow(),
job_metadata={'feeds': ['modified', 'recent']}
)
self.db_session.add(job)
self.db_session.commit()
total_processed = 0
total_failed = 0
results = []
try:
# Download modified and recent feeds
async with aiohttp.ClientSession() as session:
for feed_name, url in self.feed_urls.items():
filename = f"nvdcve-1.1-{feed_name}.json.gz"
destination = self.data_dir / filename
if await self.download_file(session, url, destination):
try:
json_file = self.extract_json_file(destination)
processed, failed = self.process_json_file(json_file)
total_processed += processed
total_failed += failed
results.append({
'feed': feed_name,
'processed': processed,
'failed': failed
})
# Clean up
if json_file != destination:
json_file.unlink()
except Exception as e:
logger.error(f"Error processing {feed_name} feed: {e}")
total_failed += 1
job.status = 'completed'
job.completed_at = datetime.utcnow()
job.job_metadata.update({
'total_processed': total_processed,
'total_failed': total_failed,
'results': results
})
except Exception as e:
job.status = 'failed'
job.error_message = str(e)
job.completed_at = datetime.utcnow()
logger.error(f"Incremental update job failed: {e}")
finally:
self.db_session.commit()
return {
'job_id': str(job.id),
'status': job.status,
'total_processed': total_processed,
'total_failed': total_failed,
'results': results
}
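# Minimal usage sketch (illustrative only, not part of this module): the import paths,
# class name and session factory below are assumptions -- adjust them to whatever this
# project actually exposes.
#
#   import asyncio
#   from main import SessionLocal                      # assumed SQLAlchemy session factory
#   from nvd_bulk_processor import NVDBulkProcessor    # assumed name of the class above
#
#   async def run_once():
#       db = SessionLocal()
#       try:
#           processor = NVDBulkProcessor(db)
#           result = await processor.bulk_seed_database(start_year=2020)
#           print(result['status'], result['total_processed'], result['total_failed'])
#       finally:
#           db.close()
#
#   asyncio.run(run_once())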

View file

@@ -12,3 +12,5 @@ pygithub==2.1.1
gitpython==3.1.40
beautifulsoup4==4.12.2
lxml==4.9.3
aiohttp==3.9.1
aiofiles

View file

@@ -1,5 +1,3 @@
version: '3.8'
services:
  db:
    image: postgres:15
@@ -25,6 +23,7 @@ services:
    environment:
      DATABASE_URL: postgresql://cve_user:cve_password@db:5432/cve_sigma_db
      NVD_API_KEY: ${NVD_API_KEY:-}
      GITHUB_TOKEN: ${GITHUB_TOKEN}
    depends_on:
      db:
        condition: service_healthy

View file

@@ -15,6 +15,10 @@ function App() {
  const [activeTab, setActiveTab] = useState('dashboard');
  const [fetchingCves, setFetchingCves] = useState(false);
  const [testResult, setTestResult] = useState(null);
const [bulkJobs, setBulkJobs] = useState([]);
const [bulkStatus, setBulkStatus] = useState({});
const [pocStats, setPocStats] = useState({});
const [bulkProcessing, setBulkProcessing] = useState(false);
  useEffect(() => {
    fetchData();
@@ -23,15 +27,21 @@ function App() {
  const fetchData = async () => {
    try {
      setLoading(true);
      const [cvesRes, rulesRes, statsRes] = await Promise.all([
      const [cvesRes, rulesRes, statsRes, bulkJobsRes, bulkStatusRes, pocStatsRes] = await Promise.all([
        axios.get(`${API_BASE_URL}/api/cves`),
        axios.get(`${API_BASE_URL}/api/sigma-rules`),
        axios.get(`${API_BASE_URL}/api/stats`)
        axios.get(`${API_BASE_URL}/api/stats`),
        axios.get(`${API_BASE_URL}/api/bulk-jobs`),
        axios.get(`${API_BASE_URL}/api/bulk-status`),
        axios.get(`${API_BASE_URL}/api/poc-stats`)
      ]);
      setCves(cvesRes.data);
      setSigmaRules(rulesRes.data);
      setStats(statsRes.data);
      setBulkJobs(bulkJobsRes.data);
      setBulkStatus(bulkStatusRes.data);
      setPocStats(pocStatsRes.data);
    } catch (error) {
      console.error('Error fetching data:', error);
    } finally {
@@ -39,6 +49,20 @@ function App() {
    }
  };
const cancelJob = async (jobId) => {
try {
const response = await axios.post(`${API_BASE_URL}/api/cancel-job/${jobId}`);
console.log('Cancel job response:', response.data);
// Refresh data after cancelling
setTimeout(() => {
fetchData();
}, 1000);
} catch (error) {
console.error('Error cancelling job:', error);
alert('Failed to cancel job. Please try again.');
}
};
  const handleFetchCves = async () => {
    try {
      setFetchingCves(true);
@@ -73,6 +97,73 @@ function App() {
    }
  };
const startBulkSeed = async (startYear = 2020, endYear = null) => {
try {
setBulkProcessing(true);
const response = await axios.post(`${API_BASE_URL}/api/bulk-seed`, {
start_year: startYear,
end_year: endYear
});
console.log('Bulk seed response:', response.data);
// Refresh data after starting
setTimeout(() => {
fetchData();
}, 2000);
} catch (error) {
console.error('Error starting bulk seed:', error);
setBulkProcessing(false);
}
};
const startIncrementalUpdate = async () => {
try {
setBulkProcessing(true);
const response = await axios.post(`${API_BASE_URL}/api/incremental-update`);
console.log('Incremental update response:', response.data);
setTimeout(() => {
fetchData();
setBulkProcessing(false);
}, 2000);
} catch (error) {
console.error('Error starting incremental update:', error);
setBulkProcessing(false);
}
};
const syncNomiSec = async (cveId = null) => {
try {
setBulkProcessing(true);
const response = await axios.post(`${API_BASE_URL}/api/sync-nomi-sec`, {
cve_id: cveId
});
console.log('Nomi-sec sync response:', response.data);
setTimeout(() => {
fetchData();
setBulkProcessing(false);
}, 2000);
} catch (error) {
console.error('Error syncing nomi-sec:', error);
setBulkProcessing(false);
}
};
const regenerateRules = async (force = false) => {
try {
setBulkProcessing(true);
const response = await axios.post(`${API_BASE_URL}/api/regenerate-rules`, {
force: force
});
console.log('Rule regeneration response:', response.data);
setTimeout(() => {
fetchData();
setBulkProcessing(false);
}, 2000);
} catch (error) {
console.error('Error regenerating rules:', error);
setBulkProcessing(false);
}
};
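  // Each bulk action above follows the same pattern: POST to its backend endpoint, then
  // re-fetch dashboard data shortly afterwards; the heavy work continues server-side and
  // can be monitored from the Bulk Jobs tab.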
  const getSeverityColor = (severity) => {
    switch (severity?.toLowerCase()) {
      case 'critical': return 'bg-red-100 text-red-800';
@@ -93,18 +184,81 @@ function App() {
  const Dashboard = () => (
    <div className="space-y-6">
      <div className="grid grid-cols-1 md:grid-cols-3 gap-6">
      <div className="grid grid-cols-1 md:grid-cols-5 gap-6">
        <div className="bg-white p-6 rounded-lg shadow">
          <h3 className="text-lg font-medium text-gray-900">Total CVEs</h3>
          <p className="text-3xl font-bold text-blue-600">{stats.total_cves || 0}</p>
<p className="text-sm text-gray-500">Bulk: {stats.bulk_processed_cves || 0}</p>
        </div>
        <div className="bg-white p-6 rounded-lg shadow">
          <h3 className="text-lg font-medium text-gray-900">SIGMA Rules</h3>
          <p className="text-3xl font-bold text-green-600">{stats.total_sigma_rules || 0}</p>
<p className="text-sm text-gray-500">Nomi-sec: {stats.nomi_sec_rules || 0}</p>
</div>
<div className="bg-white p-6 rounded-lg shadow">
<h3 className="text-lg font-medium text-gray-900">CVEs with PoCs</h3>
<p className="text-3xl font-bold text-purple-600">{stats.cves_with_pocs || 0}</p>
<p className="text-sm text-gray-500">{(stats.poc_coverage || 0).toFixed(1)}% coverage</p>
        </div>
        <div className="bg-white p-6 rounded-lg shadow">
          <h3 className="text-lg font-medium text-gray-900">Recent CVEs (7d)</h3>
          <p className="text-3xl font-bold text-purple-600">{stats.recent_cves_7_days || 0}</p>
          <p className="text-3xl font-bold text-orange-600">{stats.recent_cves_7_days || 0}</p>
</div>
<div className="bg-white p-6 rounded-lg shadow">
<h3 className="text-lg font-medium text-gray-900">High Quality PoCs</h3>
<p className="text-3xl font-bold text-indigo-600">{pocStats.high_quality_cves || 0}</p>
<p className="text-sm text-gray-500">Avg: {(pocStats.avg_poc_count || 0).toFixed(1)}</p>
</div>
</div>
{/* Bulk Processing Controls */}
<div className="bg-white rounded-lg shadow p-6">
<h2 className="text-xl font-bold text-gray-900 mb-4">Bulk Processing</h2>
<div className="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-4 gap-4">
<button
onClick={() => startBulkSeed(2020)}
disabled={bulkProcessing}
className={`px-4 py-2 rounded-md text-white ${
bulkProcessing
? 'bg-gray-400 cursor-not-allowed'
: 'bg-blue-600 hover:bg-blue-700'
}`}
>
{bulkProcessing ? 'Processing...' : 'Bulk Seed (2020+)'}
</button>
<button
onClick={startIncrementalUpdate}
disabled={bulkProcessing}
className={`px-4 py-2 rounded-md text-white ${
bulkProcessing
? 'bg-gray-400 cursor-not-allowed'
: 'bg-green-600 hover:bg-green-700'
}`}
>
{bulkProcessing ? 'Processing...' : 'Incremental Update'}
</button>
<button
onClick={() => syncNomiSec()}
disabled={bulkProcessing}
className={`px-4 py-2 rounded-md text-white ${
bulkProcessing
? 'bg-gray-400 cursor-not-allowed'
: 'bg-purple-600 hover:bg-purple-700'
}`}
>
{bulkProcessing ? 'Processing...' : 'Sync nomi-sec PoCs'}
</button>
<button
onClick={() => regenerateRules()}
disabled={bulkProcessing}
className={`px-4 py-2 rounded-md text-white ${
bulkProcessing
? 'bg-gray-400 cursor-not-allowed'
: 'bg-indigo-600 hover:bg-indigo-700'
}`}
>
{bulkProcessing ? 'Processing...' : 'Regenerate Rules'}
</button>
      </div>
    </div>
@@ -522,6 +676,178 @@ function App() {
    );
  };
const BulkJobsList = () => (
<div className="space-y-6">
<div className="flex justify-between items-center">
<h1 className="text-2xl font-bold text-gray-900">Bulk Processing Jobs</h1>
<button
onClick={fetchData}
className="bg-blue-600 hover:bg-blue-700 text-white px-4 py-2 rounded-md text-sm"
>
Refresh
</button>
</div>
{/* Bulk Status Overview */}
<div className="bg-white rounded-lg shadow p-6">
<h2 className="text-lg font-bold text-gray-900 mb-4">System Status</h2>
{bulkStatus.database_stats && (
<div className="grid grid-cols-2 md:grid-cols-4 gap-4">
<div className="text-center">
<div className="text-2xl font-bold text-blue-600">{bulkStatus.database_stats.total_cves}</div>
<div className="text-sm text-gray-500">Total CVEs</div>
</div>
<div className="text-center">
<div className="text-2xl font-bold text-green-600">{bulkStatus.database_stats.bulk_processed_cves}</div>
<div className="text-sm text-gray-500">Bulk Processed</div>
</div>
<div className="text-center">
<div className="text-2xl font-bold text-purple-600">{bulkStatus.database_stats.cves_with_pocs}</div>
<div className="text-sm text-gray-500">With PoCs</div>
</div>
<div className="text-center">
<div className="text-2xl font-bold text-indigo-600">{bulkStatus.database_stats.nomi_sec_rules}</div>
<div className="text-sm text-gray-500">Enhanced Rules</div>
</div>
</div>
)}
</div>
{/* Running Jobs */}
{bulkJobs.some(job => job.status === 'running' || job.status === 'pending') && (
<div className="bg-white rounded-lg shadow">
<div className="px-6 py-4 border-b border-gray-200">
<h2 className="text-lg font-bold text-gray-900">Running Jobs</h2>
</div>
<div className="divide-y divide-gray-200">
{bulkJobs
.filter(job => job.status === 'running' || job.status === 'pending')
.map((job) => (
<div key={job.id} className="px-6 py-4 bg-blue-50">
<div className="flex items-center justify-between">
<div className="flex-1">
<div className="flex items-center space-x-3">
<h3 className="text-lg font-medium text-gray-900">{job.job_type}</h3>
<span className={`inline-flex px-2 py-1 text-xs font-semibold rounded-full ${
job.status === 'running' ? 'bg-blue-100 text-blue-800' :
'bg-gray-100 text-gray-800'
}`}>
{job.status}
</span>
</div>
<div className="mt-2 flex items-center space-x-6 text-sm text-gray-500">
<span>Started: {formatDate(job.started_at)}</span>
{job.year && <span>Year: {job.year}</span>}
</div>
{job.total_items > 0 && (
<div className="mt-2">
<div className="flex items-center space-x-4 text-sm text-gray-600">
<span>Progress: {job.processed_items}/{job.total_items}</span>
{job.failed_items > 0 && (
<span className="text-red-600">Failed: {job.failed_items}</span>
)}
</div>
<div className="mt-1 w-full bg-gray-200 rounded-full h-2">
<div
className="bg-blue-600 h-2 rounded-full"
style={{ width: `${(job.processed_items / job.total_items) * 100}%` }}
></div>
</div>
</div>
)}
</div>
<div className="flex-shrink-0 ml-4">
<button
onClick={() => cancelJob(job.id)}
className="bg-red-600 hover:bg-red-700 text-white px-3 py-1 rounded-md text-sm font-medium"
>
Cancel
</button>
</div>
</div>
</div>
))}
</div>
</div>
)}
{/* Recent Jobs */}
<div className="bg-white rounded-lg shadow">
<div className="px-6 py-4 border-b border-gray-200">
<h2 className="text-lg font-bold text-gray-900">Recent Jobs</h2>
</div>
<div className="divide-y divide-gray-200">
{bulkJobs.length === 0 ? (
<div className="px-6 py-8 text-center text-gray-500">
No bulk processing jobs found
</div>
) : (
bulkJobs.map((job) => (
<div key={job.id} className="px-6 py-4">
<div className="flex items-center justify-between">
<div className="flex-1">
<div className="flex items-center space-x-3">
<h3 className="text-lg font-medium text-gray-900">{job.job_type}</h3>
<span className={`inline-flex px-2 py-1 text-xs font-semibold rounded-full ${
job.status === 'completed' ? 'bg-green-100 text-green-800' :
job.status === 'running' ? 'bg-blue-100 text-blue-800' :
job.status === 'failed' ? 'bg-red-100 text-red-800' :
job.status === 'cancelled' ? 'bg-orange-100 text-orange-800' :
'bg-gray-100 text-gray-800'
}`}>
{job.status}
</span>
</div>
<div className="mt-2 flex items-center space-x-6 text-sm text-gray-500">
<span>Started: {formatDate(job.started_at)}</span>
{job.completed_at && (
<span>Completed: {formatDate(job.completed_at)}</span>
)}
{job.year && (
<span>Year: {job.year}</span>
)}
</div>
{job.total_items > 0 && (
<div className="mt-2">
<div className="flex items-center space-x-4 text-sm text-gray-600">
<span>Progress: {job.processed_items}/{job.total_items}</span>
{job.failed_items > 0 && (
<span className="text-red-600">Failed: {job.failed_items}</span>
)}
</div>
<div className="mt-1 w-full bg-gray-200 rounded-full h-2">
<div
className="bg-blue-600 h-2 rounded-full"
style={{ width: `${(job.processed_items / job.total_items) * 100}%` }}
></div>
</div>
</div>
)}
{job.error_message && (
<div className="mt-2 p-2 bg-red-50 border border-red-200 rounded text-sm text-red-700">
{job.error_message}
</div>
)}
</div>
<div className="flex-shrink-0 ml-4">
{(job.status === 'running' || job.status === 'pending') && (
<button
onClick={() => cancelJob(job.id)}
className="bg-red-600 hover:bg-red-700 text-white px-3 py-1 rounded-md text-sm font-medium"
>
Cancel
</button>
)}
</div>
</div>
</div>
))
)}
</div>
</div>
</div>
);
  if (loading) {
    return (
      <div className="min-h-screen bg-gray-100 flex items-center justify-center">
@@ -573,6 +899,16 @@ function App() {
              >
                SIGMA Rules
              </button>
<button
onClick={() => setActiveTab('bulk-jobs')}
className={`inline-flex items-center px-1 pt-1 border-b-2 text-sm font-medium ${
activeTab === 'bulk-jobs'
? 'border-blue-500 text-gray-900'
: 'border-transparent text-gray-500 hover:text-gray-700 hover:border-gray-300'
}`}
>
Bulk Jobs
</button>
            </div>
          </div>
        </div>
@@ -584,6 +920,7 @@ function App() {
          {activeTab === 'dashboard' && <Dashboard />}
          {activeTab === 'cves' && <CVEList />}
          {activeTab === 'rules' && <SigmaRulesList />}
          {activeTab === 'bulk-jobs' && <BulkJobsList />}
        </div>
      </main>

View file

@@ -13,6 +13,13 @@ CREATE TABLE cves (
    modified_date TIMESTAMP,
    affected_products TEXT[],
    reference_urls TEXT[],
-- Bulk processing fields
data_source VARCHAR(20) DEFAULT 'nvd_api',
nvd_json_version VARCHAR(10) DEFAULT '2.0',
bulk_processed BOOLEAN DEFAULT FALSE,
-- nomi-sec PoC fields
poc_count INTEGER DEFAULT 0,
poc_data JSON,
    created_at TIMESTAMP DEFAULT NOW(),
    updated_at TIMESTAMP DEFAULT NOW()
);
@@ -30,6 +37,10 @@ CREATE TABLE sigma_rules (
    exploit_based BOOLEAN DEFAULT FALSE,
    github_repos TEXT[],
    exploit_indicators TEXT,
-- Enhanced fields for new data sources
poc_source VARCHAR(20) DEFAULT 'github_search',
poc_quality_score INTEGER DEFAULT 0,
nomi_sec_data JSON,
    created_at TIMESTAMP DEFAULT NOW(),
    updated_at TIMESTAMP DEFAULT NOW()
);
@@ -44,6 +55,23 @@ CREATE TABLE rule_templates (
    created_at TIMESTAMP DEFAULT NOW()
);
-- Bulk processing jobs table
CREATE TABLE bulk_processing_jobs (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
job_type VARCHAR(50) NOT NULL,
status VARCHAR(20) DEFAULT 'pending',
year INTEGER,
total_items INTEGER DEFAULT 0,
processed_items INTEGER DEFAULT 0,
failed_items INTEGER DEFAULT 0,
error_message TEXT,
job_metadata JSON,
started_at TIMESTAMP,
completed_at TIMESTAMP,
cancelled_at TIMESTAMP,
created_at TIMESTAMP DEFAULT NOW()
);
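-- Illustrative query (not part of the migration): inspect recent bulk job progress.
-- SELECT job_type, status, processed_items, total_items, failed_items
-- FROM bulk_processing_jobs ORDER BY created_at DESC LIMIT 10;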
-- Insert some basic rule templates
INSERT INTO rule_templates (template_name, template_content, applicable_product_patterns, description) VALUES
(

0 start.sh Normal file → Executable file
View file