add reference data gathering

Brendan McDevitt 2025-07-10 17:30:12 -05:00
parent c1bbea09fe
commit 3c120462ac
5 changed files with 197 additions and 5 deletions

View file

@@ -172,7 +172,10 @@ class CISAKEVClient:
'required_actions': [],
'ransomware_indicators': [],
'vulnerability_types': [],
'mitigation_techniques': []
'mitigation_techniques': [],
'technical_details': [],
'impact_analysis': [],
'urgency_indicators': []
}
# Extract from vulnerability name and description
@@ -247,6 +250,46 @@ class CISAKEVClient:
matches = re.findall(pattern, full_text, re.IGNORECASE)
indicators['vulnerability_types'].extend(matches)
# Technical details extraction
technical_patterns = [
r'\b(port|service)\s+(\d+)\b',
r'\b(protocol)\s+(\w+)\b',
r'\b(version)\s+([\d\.]+)\b',
r'\b(cve-\d{4}-\d{4,7})\b',
r'\b(application|software|system)\s+(\w+)\b'
]
for pattern in technical_patterns:
matches = re.findall(pattern, full_text, re.IGNORECASE)
for match in matches:
if isinstance(match, tuple):
indicators['technical_details'].append(' '.join(match))
else:
indicators['technical_details'].append(match)
# Impact analysis
impact_keywords = [
'critical system', 'data exfiltration', 'system compromise',
'unauthorized access', 'privilege escalation', 'lateral movement',
'ransomware deployment', 'data encryption', 'service disruption'
]
for keyword in impact_keywords:
if keyword in full_text:
indicators['impact_analysis'].append(keyword)
# Urgency indicators
urgency_patterns = [
r'\b(immediate|urgent|critical|emergency)\b',
r'\b(actively exploited|in-the-wild|widespread)\b',
r'\b(patch.{0,10}available|fix.{0,10}available)\b',
r'\b(due.{0,10}date|deadline|must.{0,10}complete)\b'
]
for pattern in urgency_patterns:
matches = re.findall(pattern, full_text, re.IGNORECASE)
indicators['urgency_indicators'].extend(matches)
# Clean up and deduplicate
for key in indicators:
indicators[key] = list(set([item.strip() for item in indicators[key] if item and len(item.strip()) > 2]))
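A minimal sketch (not part of the commit) of how the extraction loop above behaves on a KEV-style description; the regexes are copied from the hunk, the sample text is invented:

import re

technical_patterns = [
    r'\b(port|service)\s+(\d+)\b',
    r'\b(version)\s+([\d\.]+)\b',
    r'\b(cve-\d{4}-\d{4,7})\b',
]

sample = ("CVE-2024-1234: attackers target service 445 on version 3.2.1 "
          "installations; actively exploited, patch available.")

details = []
for pattern in technical_patterns:
    for match in re.findall(pattern, sample, re.IGNORECASE):
        # Patterns with multiple capture groups yield tuples; join them back together
        details.append(' '.join(match) if isinstance(match, tuple) else match)

print(details)  # ['service 445', 'version 3.2.1', 'CVE-2024-1234']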

View file

@@ -57,6 +57,10 @@ class CVE(Base):
# nomi-sec PoC fields
poc_count = Column(Integer, default=0)
poc_data = Column(JSON) # Store nomi-sec PoC metadata
# Reference data fields
reference_data = Column(JSON) # Store extracted reference content and analysis
reference_sync_status = Column(String(20), default='pending') # 'pending', 'processing', 'completed', 'failed'
reference_last_synced = Column(TIMESTAMP)
created_at = Column(TIMESTAMP, default=datetime.utcnow)
updated_at = Column(TIMESTAMP, default=datetime.utcnow)
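Not part of the diff: a sketch of the work-queue query these columns enable. The import paths (database, SessionLocal) are assumptions based on a typical FastAPI layout:

from database import SessionLocal, CVE  # hypothetical import paths

def pending_reference_sync(limit=30):
    """Return CVEs still awaiting reference data (status column added above)."""
    db = SessionLocal()
    try:
        return (db.query(CVE)
                  .filter(CVE.reference_sync_status == 'pending')
                  .limit(limit)
                  .all())
    finally:
        db.close()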
@@ -162,6 +166,12 @@ class CISAKEVSyncRequest(BaseModel):
cve_id: Optional[str] = None
batch_size: int = 100
class ReferenceSyncRequest(BaseModel):
cve_id: Optional[str] = None
batch_size: int = 30
max_cves: Optional[int] = None
force_resync: bool = False
class RuleRegenRequest(BaseModel):
force: bool = False
@@ -1414,6 +1424,133 @@ async def sync_cisa_kev(background_tasks: BackgroundTasks,
"batch_size": request.batch_size
}
@app.post("/api/sync-references")
async def sync_references(request: ReferenceSyncRequest, background_tasks: BackgroundTasks, db: Session = Depends(get_db)):
"""Start reference data synchronization"""
try:
from reference_client import ReferenceClient
client = ReferenceClient(db)
# Create job ID
job_id = str(uuid.uuid4())
# Add job to tracking
running_jobs[job_id] = {
'type': 'reference_sync',
'status': 'running',
'cve_id': request.cve_id,
'batch_size': request.batch_size,
'max_cves': request.max_cves,
'force_resync': request.force_resync,
'started_at': datetime.utcnow()
}
# Create cancellation flag
job_cancellation_flags[job_id] = False
async def sync_task():
try:
if request.cve_id:
# Single CVE sync
result = await client.sync_cve_references(request.cve_id)
running_jobs[job_id]['result'] = result
running_jobs[job_id]['status'] = 'completed'
else:
# Bulk sync
result = await client.bulk_sync_references(
batch_size=request.batch_size,
max_cves=request.max_cves,
cancellation_flag=lambda: job_cancellation_flags.get(job_id, False)
)
running_jobs[job_id]['result'] = result
running_jobs[job_id]['status'] = 'completed'
running_jobs[job_id]['completed_at'] = datetime.utcnow()
except Exception as e:
logger.error(f"Reference sync task failed: {e}")
running_jobs[job_id]['status'] = 'failed'
running_jobs[job_id]['error'] = str(e)
running_jobs[job_id]['completed_at'] = datetime.utcnow()
finally:
# Clean up cancellation flag
job_cancellation_flags.pop(job_id, None)
background_tasks.add_task(sync_task)
return {
"message": f"Reference sync started" + (f" for {request.cve_id}" if request.cve_id else " for all CVEs"),
"status": "started",
"job_id": job_id,
"cve_id": request.cve_id,
"batch_size": request.batch_size,
"max_cves": request.max_cves,
"force_resync": request.force_resync
}
except Exception as e:
logger.error(f"Failed to start reference sync: {e}")
raise HTTPException(status_code=500, detail=f"Failed to start reference sync: {str(e)}")
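Not shown in the diff: the endpoint can be exercised from Python once the API is up (localhost:8000 assumed, `requests` installed):

import requests

resp = requests.post(
    "http://localhost:8000/api/sync-references",
    json={"batch_size": 30, "max_cves": 100, "force_resync": False},
)
job = resp.json()
print(job["status"], job["job_id"])  # job_id can be used to track the background sync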
@app.get("/api/reference-stats")
async def get_reference_stats(db: Session = Depends(get_db)):
"""Get reference synchronization statistics"""
try:
from reference_client import ReferenceClient
client = ReferenceClient(db)
# Get sync status
status = await client.get_reference_sync_status()
# Get quality distribution from reference data
quality_distribution = {}
from sqlalchemy import text
cves_with_references = db.query(CVE).filter(
text("reference_data::text LIKE '%\"reference_analysis\"%'")
).all()
for cve in cves_with_references:
if cve.reference_data and 'reference_analysis' in cve.reference_data:
ref_analysis = cve.reference_data['reference_analysis']
high_conf_refs = ref_analysis.get('high_confidence_references', 0)
total_refs = ref_analysis.get('reference_count', 0)
if total_refs > 0:
quality_ratio = high_conf_refs / total_refs
if quality_ratio >= 0.8:
quality_tier = 'excellent'
elif quality_ratio >= 0.6:
quality_tier = 'good'
elif quality_ratio >= 0.4:
quality_tier = 'fair'
else:
quality_tier = 'poor'
quality_distribution[quality_tier] = quality_distribution.get(quality_tier, 0) + 1
# Get reference type distribution
reference_type_distribution = {}
for cve in cves_with_references:
if cve.reference_data and 'reference_analysis' in cve.reference_data:
ref_analysis = cve.reference_data['reference_analysis']
ref_types = ref_analysis.get('reference_types', [])
for ref_type in ref_types:
reference_type_distribution[ref_type] = reference_type_distribution.get(ref_type, 0) + 1
return {
'reference_sync_status': status,
'quality_distribution': quality_distribution,
'reference_type_distribution': reference_type_distribution,
'total_with_reference_analysis': len(cves_with_references),
'source': 'reference_extraction'
}
except Exception as e:
logger.error(f"Failed to get reference stats: {e}")
raise HTTPException(status_code=500, detail=f"Failed to get reference stats: {str(e)}")
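And the companion stats call, whose keys match the return dict above (values in the comment are illustrative only):

import requests

stats = requests.get("http://localhost:8000/api/reference-stats").json()
print(stats["quality_distribution"])         # e.g. {'excellent': 12, 'good': 30}
print(stats["reference_type_distribution"])
print(stats["total_with_reference_analysis"])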
@app.get("/api/exploitdb-stats")
async def get_exploitdb_stats(db: Session = Depends(get_db)):
"""Get ExploitDB-related statistics"""

View file

@@ -22,3 +22,4 @@ langchain-community==0.2.0
langchain-core>=0.2.20
openai>=1.32.0
anthropic==0.40.0
certifi==2024.2.2
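The certifi pin is presumably there so the reference client can verify TLS when fetching external reference URLs; a common pattern (an assumption, not shown in this commit) is:

import ssl
import certifi

# Validate outbound HTTPS against certifi's CA bundle,
# e.g. via aiohttp.TCPConnector(ssl=ssl_context) when fetching reference pages
ssl_context = ssl.create_default_context(cafile=certifi.where())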

View file

@@ -248,11 +248,16 @@ function App() {
const syncReferences = async () => {
try {
// Placeholder for future implementation
console.log('Sync References - Not implemented yet');
alert('Sync References functionality will be implemented in a future update');
const response = await axios.post('http://localhost:8000/api/sync-references', {
batch_size: 30,
max_cves: 100,
force_resync: false
});
console.log('Reference sync response:', response.data);
fetchData();
} catch (error) {
console.error('Error syncing references:', error);
alert('Error starting reference sync: ' + (error.response?.data?.detail || error.message));
}
};
@@ -499,7 +504,7 @@ function App() {
: 'bg-orange-600 hover:bg-orange-700'
}`}
>
{hasRunningJobs ? 'Processing...' : 'Sync References (Coming Soon)'}
{hasRunningJobs ? 'Processing...' : 'Sync References'}
</button>
</div>

View file

@@ -20,6 +20,10 @@ CREATE TABLE cves (
-- nomi-sec PoC fields
poc_count INTEGER DEFAULT 0,
poc_data JSON,
-- Reference data fields
reference_data JSON,
reference_sync_status VARCHAR(20) DEFAULT 'pending',
reference_last_synced TIMESTAMP,
created_at TIMESTAMP DEFAULT NOW(),
updated_at TIMESTAMP DEFAULT NOW()
);
@@ -181,5 +185,7 @@ level: {level}',
CREATE INDEX idx_cves_cve_id ON cves(cve_id);
CREATE INDEX idx_cves_published_date ON cves(published_date);
CREATE INDEX idx_cves_severity ON cves(severity);
CREATE INDEX idx_cves_reference_sync_status ON cves(reference_sync_status);
CREATE INDEX idx_cves_reference_last_synced ON cves(reference_last_synced);
CREATE INDEX idx_sigma_rules_cve_id ON sigma_rules(cve_id);
CREATE INDEX idx_sigma_rules_detection_type ON sigma_rules(detection_type);
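The two reference indexes back the bulk-sync work queue; a sketch of the kind of lookup they serve, written with SQLAlchemy (the connection URL is a placeholder):

from sqlalchemy import create_engine, text

engine = create_engine("postgresql://user:pass@localhost/cve_db")  # placeholder URL

with engine.connect() as conn:
    # Both predicates can use the new reference_sync_status / reference_last_synced indexes
    stale = conn.execute(text(
        "SELECT cve_id FROM cves "
        "WHERE reference_sync_status = 'pending' "
        "OR reference_last_synced < NOW() - INTERVAL '30 days' "
        "LIMIT 30"
    )).fetchall()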