auto_sigma_rule_generator/backend/tasks/data_sync_tasks.py

"""
Data synchronization tasks for Celery
"""
import asyncio
import logging
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from typing import Dict, Any
from celery import current_task
from celery_config import celery_app, get_db_session
from nomi_sec_client import NomiSecClient
from exploitdb_client_local import ExploitDBLocalClient
from cisa_kev_client import CISAKEVClient
from mcdevitt_poc_client import GitHubPoCClient

logger = logging.getLogger(__name__)

@celery_app.task(bind=True, name='data_sync_tasks.sync_nomi_sec')
def sync_nomi_sec_task(self, batch_size: int = 50) -> Dict[str, Any]:
    """
    Celery task for nomi-sec PoC synchronization

    Args:
        batch_size: Number of CVEs to process in each batch

    Returns:
        Dictionary containing sync results
    """
    db_session = get_db_session()

    try:
        # Update task progress
        self.update_state(
            state='PROGRESS',
            meta={
                'stage': 'sync_nomi_sec',
                'progress': 0,
                'message': 'Starting nomi-sec PoC synchronization'
            }
        )

        logger.info(f"Starting nomi-sec sync task with batch size: {batch_size}")

        # Create client instance
        client = NomiSecClient(db_session)

        # Run the synchronization
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

        try:
            result = loop.run_until_complete(
                client.bulk_sync_all_cves(batch_size=batch_size)
            )
        finally:
            loop.close()

        # Update final progress
        self.update_state(
            state='SUCCESS',
            meta={
                'stage': 'completed',
                'progress': 100,
                'message': 'Nomi-sec synchronization completed successfully'
            }
        )

        logger.info(f"Nomi-sec sync task completed: {result}")
        return result

    except Exception as e:
        logger.error(f"Nomi-sec sync task failed: {e}")
        self.update_state(
            state='FAILURE',
            meta={
                'stage': 'error',
                'progress': 0,
                'message': f'Task failed: {str(e)}',
                'error': str(e)
            }
        )
        raise
    finally:
        db_session.close()


@celery_app.task(bind=True, name='data_sync_tasks.sync_cve2capec')
def sync_cve2capec_task(self, force_refresh: bool = False) -> Dict[str, Any]:
    """
    Celery task for CVE2CAPEC MITRE ATT&CK mapping synchronization

    Args:
        force_refresh: Whether to force refresh the cache regardless of expiry

    Returns:
        Dictionary containing sync results
    """
    try:
        # Update task progress
        self.update_state(
            state='PROGRESS',
            meta={
                'stage': 'sync_cve2capec',
                'progress': 0,
                'message': 'Starting CVE2CAPEC MITRE ATT&CK mapping synchronization'
            }
        )

        logger.info(f"Starting CVE2CAPEC sync task with force_refresh: {force_refresh}")

        # Import here to avoid circular dependencies
        from cve2capec_client import CVE2CAPECClient

        # Create client instance
        client = CVE2CAPECClient()

        # Update progress
        self.update_state(
            state='PROGRESS',
            meta={
                'stage': 'sync_cve2capec',
                'progress': 10,
                'message': 'Fetching MITRE ATT&CK mappings...'
            }
        )

        # Force refresh if requested
        if force_refresh:
            client._fetch_fresh_data()

        # Update progress
        self.update_state(
            state='PROGRESS',
            meta={
                'stage': 'sync_cve2capec',
                'progress': 50,
                'message': 'Processing CVE mappings...'
            }
        )

        # Get statistics about the loaded data
        stats = client.get_stats()

        # Update progress to completion
        self.update_state(
            state='PROGRESS',
            meta={
                'stage': 'sync_cve2capec',
                'progress': 100,
                'message': 'CVE2CAPEC synchronization completed successfully'
            }
        )

        result = {
            'status': 'completed',
            'total_mappings': stats.get('total_mappings', 0),
            'total_techniques': stats.get('unique_techniques', 0),
            'cache_updated': True
        }

        logger.info(f"CVE2CAPEC sync task completed: {result}")
        return result

    except Exception as e:
        logger.error(f"CVE2CAPEC sync task failed: {e}")
        self.update_state(
            state='FAILURE',
            meta={
                'stage': 'error',
                'progress': 0,
                'message': f'Task failed: {str(e)}',
                'error': str(e)
            }
        )
        raise


@celery_app.task(bind=True, name='data_sync_tasks.sync_github_poc')
def sync_github_poc_task(self, batch_size: int = 50) -> Dict[str, Any]:
    """
    Celery task for GitHub PoC synchronization

    Args:
        batch_size: Number of CVEs to process in each batch

    Returns:
        Dictionary containing sync results
    """
    db_session = get_db_session()

    try:
        # Update task progress
        self.update_state(
            state='PROGRESS',
            meta={
                'stage': 'sync_github_poc',
                'progress': 0,
                'message': 'Starting GitHub PoC synchronization'
            }
        )

        logger.info(f"Starting GitHub PoC sync task with batch size: {batch_size}")

        # Create client instance
        client = GitHubPoCClient(db_session)

        # Run the synchronization
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

        try:
            result = loop.run_until_complete(
                client.bulk_sync_all_cves(batch_size=batch_size)
            )
        finally:
            loop.close()

        # Update final progress
        self.update_state(
            state='SUCCESS',
            meta={
                'stage': 'completed',
                'progress': 100,
                'message': 'GitHub PoC synchronization completed successfully'
            }
        )

        logger.info(f"GitHub PoC sync task completed: {result}")
        return result

    except Exception as e:
        logger.error(f"GitHub PoC sync task failed: {e}")
        self.update_state(
            state='FAILURE',
            meta={
                'stage': 'error',
                'progress': 0,
                'message': f'Task failed: {str(e)}',
                'error': str(e)
            }
        )
        raise
    finally:
        db_session.close()


@celery_app.task(bind=True, name='data_sync_tasks.sync_cve2capec')
def sync_cve2capec_task(self, force_refresh: bool = False) -> Dict[str, Any]:
    """
    Celery task for CVE2CAPEC MITRE ATT&CK mapping synchronization

    Args:
        force_refresh: Whether to force refresh the cache regardless of expiry

    Returns:
        Dictionary containing sync results
    """
    try:
        # Update task progress
        self.update_state(
            state='PROGRESS',
            meta={
                'stage': 'sync_cve2capec',
                'progress': 0,
                'message': 'Starting CVE2CAPEC MITRE ATT&CK mapping synchronization'
            }
        )

        logger.info(f"Starting CVE2CAPEC sync task with force_refresh: {force_refresh}")

        # Import here to avoid circular dependencies
        from cve2capec_client import CVE2CAPECClient

        # Create client instance
        client = CVE2CAPECClient()

        # Update progress
        self.update_state(
            state='PROGRESS',
            meta={
                'stage': 'sync_cve2capec',
                'progress': 10,
                'message': 'Fetching MITRE ATT&CK mappings...'
            }
        )

        # Force refresh if requested
        if force_refresh:
            client._fetch_fresh_data()

        # Update progress
        self.update_state(
            state='PROGRESS',
            meta={
                'stage': 'sync_cve2capec',
                'progress': 50,
                'message': 'Processing CVE mappings...'
            }
        )

        # Get statistics about the loaded data
        stats = client.get_stats()

        # Update progress to completion
        self.update_state(
            state='PROGRESS',
            meta={
                'stage': 'sync_cve2capec',
                'progress': 100,
                'message': 'CVE2CAPEC synchronization completed successfully'
            }
        )

        result = {
            'status': 'completed',
            'total_mappings': stats.get('total_mappings', 0),
            'total_techniques': stats.get('unique_techniques', 0),
            'cache_updated': True
        }

        logger.info(f"CVE2CAPEC sync task completed: {result}")
        return result

    except Exception as e:
        logger.error(f"CVE2CAPEC sync task failed: {e}")
        self.update_state(
            state='FAILURE',
            meta={
                'stage': 'error',
                'progress': 0,
                'message': f'Task failed: {str(e)}',
                'error': str(e)
            }
        )
        raise


@celery_app.task(bind=True, name='data_sync_tasks.sync_reference_content')
def sync_reference_content_task(self, batch_size: int = 30, max_cves: int = 200,
                               force_resync: bool = False) -> Dict[str, Any]:
    """
    Celery task for CVE reference content extraction and analysis

    Args:
        batch_size: Number of CVEs to process in each batch
        max_cves: Maximum number of CVEs to process
        force_resync: Force re-sync of recently processed CVEs

    Returns:
        Dictionary containing sync results
    """
    db_session = get_db_session()

    try:
        # Import here to avoid circular imports
        import sys
        import os
        sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        from main import CVE

        # Update task progress
        self.update_state(
            state='PROGRESS',
            meta={
                'stage': 'sync_reference_content',
                'progress': 0,
                'message': 'Starting CVE reference content extraction'
            }
        )

        logger.info(f"Starting reference content sync task - batch_size: {batch_size}, max_cves: {max_cves}")

        # Get CVEs to process (prioritize those with references but no extracted content)
        query = db_session.query(CVE)

        if not force_resync:
            # Skip CVEs that were recently processed
            from datetime import datetime, timedelta
            cutoff_date = datetime.utcnow() - timedelta(days=7)
            query = query.filter(
                (CVE.reference_content_extracted_at.is_(None)) |
                (CVE.reference_content_extracted_at < cutoff_date)
            )

        # Prioritize CVEs with references
        cves = query.filter(CVE.references.isnot(None)).limit(max_cves).all()

        if not cves:
            logger.info("No CVEs found for reference content extraction")
            return {'total_processed': 0, 'successful_extractions': 0, 'failed_extractions': 0}

        total_processed = 0
        successful_extractions = 0
        failed_extractions = 0

        # Process CVEs in batches
        for i in range(0, len(cves), batch_size):
            batch = cves[i:i + batch_size]

            for j, cve in enumerate(batch):
                try:
                    # Update progress
                    overall_progress = int(((i + j) / len(cves)) * 100)
                    self.update_state(
                        state='PROGRESS',
                        meta={
                            'stage': 'sync_reference_content',
                            'progress': overall_progress,
                            'message': f'Processing CVE {cve.cve_id} ({i + j + 1}/{len(cves)})',
                            'current_cve': cve.cve_id,
                            'processed': i + j,
                            'total': len(cves)
                        }
                    )

                    # For now, simulate reference content extraction
                    # In a real implementation, you would create a ReferenceContentExtractor
                    # and extract content from CVE references

                    # Mark CVE as processed
                    from datetime import datetime
                    cve.reference_content_extracted_at = datetime.utcnow()

                    successful_extractions += 1
                    total_processed += 1

                    # Small delay between requests
                    import time
                    time.sleep(2)

                except Exception as e:
                    logger.error(f"Error processing reference content for CVE {cve.cve_id}: {e}")
                    failed_extractions += 1
                    total_processed += 1

            # Commit after each batch
            db_session.commit()
            logger.info(f"Processed batch {i//batch_size + 1}/{(len(cves) + batch_size - 1)//batch_size}")

        # Final results
        result = {
            'total_processed': total_processed,
            'successful_extractions': successful_extractions,
            'failed_extractions': failed_extractions,
            'extraction_rate': (successful_extractions / total_processed * 100) if total_processed > 0 else 0
        }

        # Update final progress
        self.update_state(
            state='SUCCESS',
            meta={
                'stage': 'completed',
                'progress': 100,
                'message': f'Reference content extraction completed: {successful_extractions} successful, {failed_extractions} failed',
                'results': result
            }
        )

        logger.info(f"Reference content sync task completed: {result}")
        return result

    except Exception as e:
        logger.error(f"Reference content sync task failed: {e}")
        self.update_state(
            state='FAILURE',
            meta={
                'stage': 'error',
                'progress': 0,
                'message': f'Task failed: {str(e)}',
                'error': str(e)
            }
        )
        raise
    finally:
        db_session.close()


@celery_app.task(bind=True, name='data_sync_tasks.sync_cve2capec')
def sync_cve2capec_task(self, force_refresh: bool = False) -> Dict[str, Any]:
    """
    Celery task for CVE2CAPEC MITRE ATT&CK mapping synchronization

    Args:
        force_refresh: Whether to force refresh the cache regardless of expiry

    Returns:
        Dictionary containing sync results
    """
    try:
        # Update task progress
        self.update_state(
            state='PROGRESS',
            meta={
                'stage': 'sync_cve2capec',
                'progress': 0,
                'message': 'Starting CVE2CAPEC MITRE ATT&CK mapping synchronization'
            }
        )

        logger.info(f"Starting CVE2CAPEC sync task with force_refresh: {force_refresh}")

        # Import here to avoid circular dependencies
        from cve2capec_client import CVE2CAPECClient

        # Create client instance
        client = CVE2CAPECClient()

        # Update progress
        self.update_state(
            state='PROGRESS',
            meta={
                'stage': 'sync_cve2capec',
                'progress': 10,
                'message': 'Fetching MITRE ATT&CK mappings...'
            }
        )

        # Force refresh if requested
        if force_refresh:
            client._fetch_fresh_data()

        # Update progress
        self.update_state(
            state='PROGRESS',
            meta={
                'stage': 'sync_cve2capec',
                'progress': 50,
                'message': 'Processing CVE mappings...'
            }
        )

        # Get statistics about the loaded data
        stats = client.get_stats()

        # Update progress to completion
        self.update_state(
            state='PROGRESS',
            meta={
                'stage': 'sync_cve2capec',
                'progress': 100,
                'message': 'CVE2CAPEC synchronization completed successfully'
            }
        )

        result = {
            'status': 'completed',
            'total_mappings': stats.get('total_mappings', 0),
            'total_techniques': stats.get('unique_techniques', 0),
            'cache_updated': True
        }

        logger.info(f"CVE2CAPEC sync task completed: {result}")
        return result

    except Exception as e:
        logger.error(f"CVE2CAPEC sync task failed: {e}")
        self.update_state(
            state='FAILURE',
            meta={
                'stage': 'error',
                'progress': 0,
                'message': f'Task failed: {str(e)}',
                'error': str(e)
            }
        )
        raise

@celery_app.task(bind=True, name='data_sync_tasks.sync_exploitdb')
def sync_exploitdb_task(self, batch_size: int = 30) -> Dict[str, Any]:
    """
    Celery task for ExploitDB synchronization

    Args:
        batch_size: Number of CVEs to process in each batch

    Returns:
        Dictionary containing sync results
    """
    db_session = get_db_session()

    try:
        # Update task progress
        self.update_state(
            state='PROGRESS',
            meta={
                'stage': 'sync_exploitdb',
                'progress': 0,
                'message': 'Starting ExploitDB synchronization'
            }
        )

        logger.info(f"Starting ExploitDB sync task with batch size: {batch_size}")

        # Create client instance
        client = ExploitDBLocalClient(db_session)

        # Run the synchronization
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

        try:
            result = loop.run_until_complete(
                client.bulk_sync_exploitdb(batch_size=batch_size)
            )
        finally:
            loop.close()

        # Update final progress
        self.update_state(
            state='SUCCESS',
            meta={
                'stage': 'completed',
                'progress': 100,
                'message': 'ExploitDB synchronization completed successfully'
            }
        )

        logger.info(f"ExploitDB sync task completed: {result}")
        return result

    except Exception as e:
        logger.error(f"ExploitDB sync task failed: {e}")
        self.update_state(
            state='FAILURE',
            meta={
                'stage': 'error',
                'progress': 0,
                'message': f'Task failed: {str(e)}',
                'error': str(e)
            }
        )
        raise
    finally:
        db_session.close()


@celery_app.task(bind=True, name='data_sync_tasks.sync_cve2capec')
def sync_cve2capec_task(self, force_refresh: bool = False) -> Dict[str, Any]:
    """
    Celery task for CVE2CAPEC MITRE ATT&CK mapping synchronization

    Args:
        force_refresh: Whether to force refresh the cache regardless of expiry

    Returns:
        Dictionary containing sync results
    """
    try:
        # Update task progress
        self.update_state(
            state='PROGRESS',
            meta={
                'stage': 'sync_cve2capec',
                'progress': 0,
                'message': 'Starting CVE2CAPEC MITRE ATT&CK mapping synchronization'
            }
        )

        logger.info(f"Starting CVE2CAPEC sync task with force_refresh: {force_refresh}")

        # Import here to avoid circular dependencies
        from cve2capec_client import CVE2CAPECClient

        # Create client instance
        client = CVE2CAPECClient()

        # Update progress
        self.update_state(
            state='PROGRESS',
            meta={
                'stage': 'sync_cve2capec',
                'progress': 10,
                'message': 'Fetching MITRE ATT&CK mappings...'
            }
        )

        # Force refresh if requested
        if force_refresh:
            client._fetch_fresh_data()

        # Update progress
        self.update_state(
            state='PROGRESS',
            meta={
                'stage': 'sync_cve2capec',
                'progress': 50,
                'message': 'Processing CVE mappings...'
            }
        )

        # Get statistics about the loaded data
        stats = client.get_stats()

        # Update progress to completion
        self.update_state(
            state='PROGRESS',
            meta={
                'stage': 'sync_cve2capec',
                'progress': 100,
                'message': 'CVE2CAPEC synchronization completed successfully'
            }
        )

        result = {
            'status': 'completed',
            'total_mappings': stats.get('total_mappings', 0),
            'total_techniques': stats.get('unique_techniques', 0),
            'cache_updated': True
        }

        logger.info(f"CVE2CAPEC sync task completed: {result}")
        return result

    except Exception as e:
        logger.error(f"CVE2CAPEC sync task failed: {e}")
        self.update_state(
            state='FAILURE',
            meta={
                'stage': 'error',
                'progress': 0,
                'message': f'Task failed: {str(e)}',
                'error': str(e)
            }
        )
        raise


@celery_app.task(bind=True, name='data_sync_tasks.sync_cisa_kev')
def sync_cisa_kev_task(self, batch_size: int = 100) -> Dict[str, Any]:
    """
    Celery task for CISA KEV synchronization

    Args:
        batch_size: Number of CVEs to process in each batch

    Returns:
        Dictionary containing sync results
    """
    db_session = get_db_session()

    try:
        # Update task progress
        self.update_state(
            state='PROGRESS',
            meta={
                'stage': 'sync_cisa_kev',
                'progress': 0,
                'message': 'Starting CISA KEV synchronization'
            }
        )

        logger.info(f"Starting CISA KEV sync task with batch size: {batch_size}")

        # Create client instance
        client = CISAKEVClient(db_session)

        # Run the synchronization
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

        try:
            result = loop.run_until_complete(
                client.bulk_sync_kev_data(batch_size=batch_size)
            )
        finally:
            loop.close()

        # Update final progress
        self.update_state(
            state='SUCCESS',
            meta={
                'stage': 'completed',
                'progress': 100,
                'message': 'CISA KEV synchronization completed successfully'
            }
        )

        logger.info(f"CISA KEV sync task completed: {result}")
        return result

    except Exception as e:
        logger.error(f"CISA KEV sync task failed: {e}")
        self.update_state(
            state='FAILURE',
            meta={
                'stage': 'error',
                'progress': 0,
                'message': f'Task failed: {str(e)}',
                'error': str(e)
            }
        )
        raise
    finally:
        db_session.close()


@celery_app.task(bind=True, name='data_sync_tasks.sync_cve2capec')
def sync_cve2capec_task(self, force_refresh: bool = False) -> Dict[str, Any]:
    """
    Celery task for CVE2CAPEC MITRE ATT&CK mapping synchronization

    Args:
        force_refresh: Whether to force refresh the cache regardless of expiry

    Returns:
        Dictionary containing sync results
    """
    try:
        # Update task progress
        self.update_state(
            state='PROGRESS',
            meta={
                'stage': 'sync_cve2capec',
                'progress': 0,
                'message': 'Starting CVE2CAPEC MITRE ATT&CK mapping synchronization'
            }
        )

        logger.info(f"Starting CVE2CAPEC sync task with force_refresh: {force_refresh}")

        # Import here to avoid circular dependencies
        from cve2capec_client import CVE2CAPECClient

        # Create client instance
        client = CVE2CAPECClient()

        # Update progress
        self.update_state(
            state='PROGRESS',
            meta={
                'stage': 'sync_cve2capec',
                'progress': 10,
                'message': 'Fetching MITRE ATT&CK mappings...'
            }
        )

        # Force refresh if requested
        if force_refresh:
            client._fetch_fresh_data()

        # Update progress
        self.update_state(
            state='PROGRESS',
            meta={
                'stage': 'sync_cve2capec',
                'progress': 50,
                'message': 'Processing CVE mappings...'
            }
        )

        # Get statistics about the loaded data
        stats = client.get_stats()

        # Update progress to completion
        self.update_state(
            state='PROGRESS',
            meta={
                'stage': 'sync_cve2capec',
                'progress': 100,
                'message': 'CVE2CAPEC synchronization completed successfully'
            }
        )

        result = {
            'status': 'completed',
            'total_mappings': stats.get('total_mappings', 0),
            'total_techniques': stats.get('unique_techniques', 0),
            'cache_updated': True
        }

        logger.info(f"CVE2CAPEC sync task completed: {result}")
        return result

    except Exception as e:
        logger.error(f"CVE2CAPEC sync task failed: {e}")
        self.update_state(
            state='FAILURE',
            meta={
                'stage': 'error',
                'progress': 0,
                'message': f'Task failed: {str(e)}',
                'error': str(e)
            }
        )
        raise


@celery_app.task(bind=True, name='data_sync_tasks.build_exploitdb_index')
def build_exploitdb_index_task(self) -> Dict[str, Any]:
    """
    Celery task for building/rebuilding ExploitDB file index

    Returns:
        Dictionary containing build results
    """
    try:
        # Update task progress
        self.update_state(
            state='PROGRESS',
            meta={
                'stage': 'build_exploitdb_index',
                'progress': 0,
                'message': 'Starting ExploitDB file index building'
            }
        )

        logger.info("Starting ExploitDB index build task")

        # Import here to avoid circular dependencies
        from exploitdb_client_local import ExploitDBLocalClient

        # Create client instance with lazy_load=False to force index building
        client = ExploitDBLocalClient(None, lazy_load=False)

        # Update progress
        self.update_state(
            state='PROGRESS',
            meta={
                'stage': 'build_exploitdb_index',
                'progress': 50,
                'message': 'Building file index...'
            }
        )

        # Force index rebuild
        client._build_file_index()

        # Update progress to completion
        self.update_state(
            state='PROGRESS',
            meta={
                'stage': 'build_exploitdb_index',
                'progress': 100,
                'message': 'ExploitDB index building completed successfully'
            }
        )

        result = {
            'status': 'completed',
            'total_exploits_indexed': len(client.file_index),
            'index_updated': True
        }

        logger.info(f"ExploitDB index build task completed: {result}")
        return result

    except Exception as e:
        logger.error(f"ExploitDB index build task failed: {e}")
        self.update_state(
            state='FAILURE',
            meta={
                'stage': 'error',
                'progress': 0,
                'message': f'Task failed: {str(e)}',
                'error': str(e)
            }
        )
        raise