From 3c120462ac9118d3ffeb117358e6147f9035a184 Mon Sep 17 00:00:00 2001
From: bpmcdevitt <brendan@mcdevitt.tech>
Date: Thu, 10 Jul 2025 17:30:12 -0500
Subject: [PATCH] add reference data gathering

---
 backend/cisa_kev_client.py |  45 +++++++++++-
 backend/main.py            | 137 +++++++++++++++++++++++++++++++++++++
 backend/requirements.txt   |   1 +
 frontend/src/App.js        |  13 ++--
 init.sql                   |   6 ++
 5 files changed, 197 insertions(+), 5 deletions(-)

diff --git a/backend/cisa_kev_client.py b/backend/cisa_kev_client.py
index 204bde9..cb51228 100644
--- a/backend/cisa_kev_client.py
+++ b/backend/cisa_kev_client.py
@@ -172,7 +172,10 @@ class CISAKEVClient:
             'required_actions': [],
             'ransomware_indicators': [],
             'vulnerability_types': [],
-            'mitigation_techniques': []
+            'mitigation_techniques': [],
+            'technical_details': [],
+            'impact_analysis': [],
+            'urgency_indicators': []
         }
         
         # Extract from vulnerability name and description
@@ -247,6 +250,46 @@ class CISAKEVClient:
             matches = re.findall(pattern, full_text, re.IGNORECASE)
             indicators['vulnerability_types'].extend(matches)
         
+        # Technical details extraction
+        technical_patterns = [
+            r'\b(port|service)\s+(\d+)\b',
+            r'\b(protocol)\s+(\w+)\b',
+            r'\b(version)\s+([\d\.]+)\b',
+            r'\b(cve-\d{4}-\d{4,7})\b',
+            r'\b(application|software|system)\s+(\w+)\b'
+        ]
+        
+        for pattern in technical_patterns:
+            matches = re.findall(pattern, full_text, re.IGNORECASE)
+            for match in matches:
+                if isinstance(match, tuple):
+                    indicators['technical_details'].append(' '.join(match))
+                else:
+                    indicators['technical_details'].append(match)
+        
+        # Impact analysis
+        impact_keywords = [
+            'critical system', 'data exfiltration', 'system compromise',
+            'unauthorized access', 'privilege escalation', 'lateral movement',
+            'ransomware deployment', 'data encryption', 'service disruption'
+        ]
+        
+        for keyword in impact_keywords:
+            if keyword in full_text:
+                indicators['impact_analysis'].append(keyword)
+        
+        # Urgency indicators
+        urgency_patterns = [
+            r'\b(immediate|urgent|critical|emergency)\b',
+            r'\b(actively exploited|in-the-wild|widespread)\b',
+            r'\b(patch.{0,10}available|fix.{0,10}available)\b',
+            r'\b(due.{0,10}date|deadline|must.{0,10}complete)\b'
+        ]
+        
+        for pattern in urgency_patterns:
+            matches = re.findall(pattern, full_text, re.IGNORECASE)
+            indicators['urgency_indicators'].extend(matches)
+        
         # Clean up and deduplicate
         for key in indicators:
             indicators[key] = list(set([item.strip() for item in indicators[key] if item and len(item.strip()) > 2]))
diff --git a/backend/main.py b/backend/main.py
index 876b3da..f59bc3a 100644
--- a/backend/main.py
+++ b/backend/main.py
@@ -57,6 +57,10 @@ class CVE(Base):
     # nomi-sec PoC fields
     poc_count = Column(Integer, default=0)
     poc_data = Column(JSON)  # Store nomi-sec PoC metadata
+    # Reference data fields
+    reference_data = Column(JSON)  # Store extracted reference content and analysis
+    reference_sync_status = Column(String(20), default='pending')  # 'pending', 'processing', 'completed', 'failed'
+    reference_last_synced = Column(TIMESTAMP)
     created_at = Column(TIMESTAMP, default=datetime.utcnow)
     updated_at = Column(TIMESTAMP, default=datetime.utcnow)
 
@@ -162,6 +166,12 @@ class CISAKEVSyncRequest(BaseModel):
     cve_id: Optional[str] = None
     batch_size: int = 100
 
+class ReferenceSyncRequest(BaseModel):  # request body of POST /api/sync-references
+    cve_id: Optional[str] = None  # when set, only this CVE is synced; otherwise a bulk sync runs
+    batch_size: int = 30  # passed to the bulk sync (not used when cve_id is set)
+    max_cves: Optional[int] = None  # passed to the bulk sync (not used when cve_id is set)
+    force_resync: bool = False  # NOTE(review): stored on the job and echoed back, but never passed to the client
+
 class RuleRegenRequest(BaseModel):
     force: bool = False
 
@@ -1414,6 +1424,133 @@ async def sync_cisa_kev(background_tasks: BackgroundTasks,
         "batch_size": request.batch_size
     }
 
+@app.post("/api/sync-references")
+async def sync_references(request: ReferenceSyncRequest, background_tasks: BackgroundTasks, db: Session = Depends(get_db)):
+    """Register a reference sync job and schedule it as a background task.
+
+    Syncs a single CVE when ``request.cve_id`` is set, otherwise runs a bulk
+    sync given ``request.batch_size`` and ``request.max_cves``. The job is
+    tracked in ``running_jobs`` under a fresh UUID.
+
+    Returns a dict echoing the request plus the new ``job_id``; raises
+    HTTPException (500) when the job cannot be set up.
+    """
+    try:
+        from reference_client import ReferenceClient
+        # NOTE(review): db comes from Depends(get_db) but is used by a task that
+        # runs in the background - confirm the session is still usable then.
+        client = ReferenceClient(db)
+
+        job_id = str(uuid.uuid4())
+        running_jobs[job_id] = {
+            'type': 'reference_sync',
+            'status': 'running',
+            'cve_id': request.cve_id,
+            'batch_size': request.batch_size,
+            'max_cves': request.max_cves,
+            # NOTE(review): recorded and echoed only; never passed to the client.
+            'force_resync': request.force_resync,
+            'started_at': datetime.utcnow()
+        }
+        job_cancellation_flags[job_id] = False
+
+        async def sync_task():
+            """Run the sync and record its outcome in ``running_jobs``."""
+            try:
+                if request.cve_id:
+                    result = await client.sync_cve_references(request.cve_id)
+                else:
+                    result = await client.bulk_sync_references(
+                        batch_size=request.batch_size,
+                        max_cves=request.max_cves,
+                        cancellation_flag=lambda: job_cancellation_flags.get(job_id, False)
+                    )
+                running_jobs[job_id]['result'] = result
+                running_jobs[job_id]['status'] = 'completed'
+                running_jobs[job_id]['completed_at'] = datetime.utcnow()
+            except Exception as e:
+                # logger.exception also records the traceback.
+                logger.exception(f"Reference sync task failed: {e}")
+                running_jobs[job_id]['status'] = 'failed'
+                running_jobs[job_id]['error'] = str(e)
+                running_jobs[job_id]['completed_at'] = datetime.utcnow()
+            finally:
+                # Clean up cancellation flag
+                job_cancellation_flags.pop(job_id, None)
+
+        background_tasks.add_task(sync_task)
+
+        return {
+            "message": "Reference sync started" + (f" for {request.cve_id}" if request.cve_id else " for all CVEs"),
+            "status": "started",
+            "job_id": job_id,
+            "cve_id": request.cve_id,
+            "batch_size": request.batch_size,
+            "max_cves": request.max_cves,
+            "force_resync": request.force_resync
+        }
+    except Exception as e:
+        logger.exception(f"Failed to start reference sync: {e}")
+        raise HTTPException(status_code=500, detail=f"Failed to start reference sync: {str(e)}") from e
+
+@app.get("/api/reference-stats")
+async def get_reference_stats(db: Session = Depends(get_db)):
+    """Return reference sync status plus quality and type distributions.
+
+    Every CVE whose ``reference_data`` contains a ``reference_analysis`` entry
+    is placed in a quality tier by its share of high-confidence references
+    (>= 0.8 excellent, >= 0.6 good, >= 0.4 fair, otherwise poor), and its
+    ``reference_types`` are tallied. CVEs reporting no references are left
+    out of the quality tiers. Raises HTTPException (500) on any failure.
+    """
+    try:
+        from reference_client import ReferenceClient
+        from sqlalchemy import text
+
+        status = await ReferenceClient(db).get_reference_sync_status()
+
+        # Textual pre-filter in SQL; the loop re-checks the parsed JSON.
+        cves_with_references = db.query(CVE).filter(
+            text("reference_data::text LIKE '%\"reference_analysis\"%'")
+        ).all()
+
+        quality_distribution = {}
+        reference_type_distribution = {}
+        # One pass feeds both tallies.
+        for cve in cves_with_references:
+            if not cve.reference_data or 'reference_analysis' not in cve.reference_data:
+                continue
+            ref_analysis = cve.reference_data['reference_analysis']
+
+            # "or 0" / "or []" also cover keys stored with an explicit null.
+            high_conf_refs = ref_analysis.get('high_confidence_references') or 0
+            total_refs = ref_analysis.get('reference_count') or 0
+            if total_refs > 0:
+                quality_ratio = high_conf_refs / total_refs
+                if quality_ratio >= 0.8:
+                    quality_tier = 'excellent'
+                elif quality_ratio >= 0.6:
+                    quality_tier = 'good'
+                elif quality_ratio >= 0.4:
+                    quality_tier = 'fair'
+                else:
+                    quality_tier = 'poor'
+                quality_distribution[quality_tier] = quality_distribution.get(quality_tier, 0) + 1
+
+            for ref_type in ref_analysis.get('reference_types') or []:
+                reference_type_distribution[ref_type] = reference_type_distribution.get(ref_type, 0) + 1
+
+        return {
+            'reference_sync_status': status,
+            'quality_distribution': quality_distribution,
+            'reference_type_distribution': reference_type_distribution,
+            'total_with_reference_analysis': len(cves_with_references),
+            'source': 'reference_extraction'
+        }
+    except Exception as e:
+        logger.exception(f"Failed to get reference stats: {e}")
+        raise HTTPException(status_code=500, detail=f"Failed to get reference stats: {str(e)}") from e
+
 @app.get("/api/exploitdb-stats")
 async def get_exploitdb_stats(db: Session = Depends(get_db)):
     """Get ExploitDB-related statistics"""
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 6a03a5d..dc2d6e0 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -22,3 +22,4 @@ langchain-community==0.2.0
 langchain-core>=0.2.20
 openai>=1.32.0
 anthropic==0.40.0
+certifi==2024.2.2
diff --git a/frontend/src/App.js b/frontend/src/App.js
index 4480cd1..d98ef26 100644
--- a/frontend/src/App.js
+++ b/frontend/src/App.js
@@ -248,11 +248,16 @@ function App() {
 
   const syncReferences = async () => {
     try {
-      // Placeholder for future implementation
-      console.log('Sync References - Not implemented yet');
-      alert('Sync References functionality will be implemented in a future update');
+      const response = await axios.post('http://localhost:8000/api/sync-references', {  // no cve_id is sent, so the backend takes its bulk-sync path
+        batch_size: 30,
+        max_cves: 100,
+        force_resync: false
+      });
+      console.log('Reference sync response:', response.data);  // response.data carries the job_id of the scheduled job
+      fetchData();
     } catch (error) {
       console.error('Error syncing references:', error);
+      alert('Error starting reference sync: ' + (error.response?.data?.detail || error.message));  // `detail` is the backend HTTPException message, when present
     }
   };
 
@@ -499,7 +504,7 @@ function App() {
                 : 'bg-orange-600 hover:bg-orange-700'
             }`}
           >
-            {hasRunningJobs ? 'Processing...' : 'Sync References (Coming Soon)'}
+            {hasRunningJobs ? 'Processing...' : 'Sync References'}
           </button>
         </div>
 
diff --git a/init.sql b/init.sql
index 8e9e7d5..defed1d 100644
--- a/init.sql
+++ b/init.sql
@@ -20,6 +20,10 @@ CREATE TABLE cves (
     -- nomi-sec PoC fields
     poc_count INTEGER DEFAULT 0,
     poc_data JSON,
+    -- Reference data fields
+    reference_data JSON,
+    reference_sync_status VARCHAR(20) DEFAULT 'pending',
+    reference_last_synced TIMESTAMP,
     created_at TIMESTAMP DEFAULT NOW(),
     updated_at TIMESTAMP DEFAULT NOW()
 );
@@ -181,5 +185,7 @@ level: {level}',
 CREATE INDEX idx_cves_cve_id ON cves(cve_id);
 CREATE INDEX idx_cves_published_date ON cves(published_date);
 CREATE INDEX idx_cves_severity ON cves(severity);
+CREATE INDEX idx_cves_reference_sync_status ON cves(reference_sync_status);
+CREATE INDEX idx_cves_reference_last_synced ON cves(reference_last_synced);
 CREATE INDEX idx_sigma_rules_cve_id ON sigma_rules(cve_id);
 CREATE INDEX idx_sigma_rules_detection_type ON sigma_rules(detection_type);