add reference data gathering

Brendan McDevitt 2025-07-10 17:30:12 -05:00
parent c1bbea09fe
commit 3c120462ac
5 changed files with 197 additions and 5 deletions

View file

@@ -172,7 +172,10 @@ class CISAKEVClient:
'required_actions': [],
'ransomware_indicators': [],
'vulnerability_types': [],
'mitigation_techniques': []
'mitigation_techniques': [],
'technical_details': [],
'impact_analysis': [],
'urgency_indicators': []
}
# Extract from vulnerability name and description
@@ -247,6 +250,46 @@ class CISAKEVClient:
matches = re.findall(pattern, full_text, re.IGNORECASE)
indicators['vulnerability_types'].extend(matches)
# Technical details extraction
technical_patterns = [
r'\b(port|service)\s+(\d+)\b',
r'\b(protocol)\s+(\w+)\b',
r'\b(version)\s+([\d\.]+)\b',
r'\b(cve-\d{4}-\d{4,7})\b',
r'\b(application|software|system)\s+(\w+)\b'
]
for pattern in technical_patterns:
matches = re.findall(pattern, full_text, re.IGNORECASE)
for match in matches:
if isinstance(match, tuple):
indicators['technical_details'].append(' '.join(match))
else:
indicators['technical_details'].append(match)
# Impact analysis
impact_keywords = [
'critical system', 'data exfiltration', 'system compromise',
'unauthorized access', 'privilege escalation', 'lateral movement',
'ransomware deployment', 'data encryption', 'service disruption'
]
for keyword in impact_keywords:
if keyword in full_text:
indicators['impact_analysis'].append(keyword)
# Urgency indicators
urgency_patterns = [
r'\b(immediate|urgent|critical|emergency)\b',
r'\b(actively exploited|in-the-wild|widespread)\b',
r'\b(patch.{0,10}available|fix.{0,10}available)\b',
r'\b(due.{0,10}date|deadline|must.{0,10}complete)\b'
]
for pattern in urgency_patterns:
matches = re.findall(pattern, full_text, re.IGNORECASE)
indicators['urgency_indicators'].extend(matches)
# Clean up and deduplicate
for key in indicators:
indicators[key] = list(set([item.strip() for item in indicators[key] if item and len(item.strip()) > 2]))
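A minimal sketch (not part of the commit) of how the extraction loop above behaves on a KEV-style description; the regexes are copied from the hunk, the sample text is invented:

import re

technical_patterns = [
    r'\b(port|service)\s+(\d+)\b',
    r'\b(version)\s+([\d\.]+)\b',
    r'\b(cve-\d{4}-\d{4,7})\b',
]

sample = ("CVE-2024-1234: attackers target service 445 on version 3.2.1 "
          "installations; actively exploited, patch available.")

details = []
for pattern in technical_patterns:
    for match in re.findall(pattern, sample, re.IGNORECASE):
        # Patterns with multiple capture groups yield tuples; join them back together
        details.append(' '.join(match) if isinstance(match, tuple) else match)

print(details)  # ['service 445', 'version 3.2.1', 'CVE-2024-1234']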

View file

@@ -57,6 +57,10 @@ class CVE(Base):
# nomi-sec PoC fields
poc_count = Column(Integer, default=0)
poc_data = Column(JSON) # Store nomi-sec PoC metadata
# Reference data fields
reference_data = Column(JSON) # Store extracted reference content and analysis
reference_sync_status = Column(String(20), default='pending') # 'pending', 'processing', 'completed', 'failed'
reference_last_synced = Column(TIMESTAMP)
created_at = Column(TIMESTAMP, default=datetime.utcnow)
updated_at = Column(TIMESTAMP, default=datetime.utcnow)
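Not part of the diff: a sketch of the work-queue query these columns enable. The import paths (database, SessionLocal) are assumptions based on a typical FastAPI layout:

from database import SessionLocal, CVE  # hypothetical import paths

def pending_reference_sync(limit=30):
    """Return CVEs still awaiting reference data (status column added above)."""
    db = SessionLocal()
    try:
        return (db.query(CVE)
                  .filter(CVE.reference_sync_status == 'pending')
                  .limit(limit)
                  .all())
    finally:
        db.close()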
@@ -162,6 +166,12 @@ class CISAKEVSyncRequest(BaseModel):
cve_id: Optional[str] = None
batch_size: int = 100
class ReferenceSyncRequest(BaseModel):
cve_id: Optional[str] = None
batch_size: int = 30
max_cves: Optional[int] = None
force_resync: bool = False
class RuleRegenRequest(BaseModel):
force: bool = False
@@ -1414,6 +1424,133 @@ async def sync_cisa_kev(background_tasks: BackgroundTasks,
"batch_size": request.batch_size
}
@app.post("/api/sync-references")
async def sync_references(request: ReferenceSyncRequest, background_tasks: BackgroundTasks, db: Session = Depends(get_db)):
"""Start reference data synchronization"""
try:
from reference_client import ReferenceClient
client = ReferenceClient(db)
# Create job ID
job_id = str(uuid.uuid4())
# Add job to tracking
running_jobs[job_id] = {
'type': 'reference_sync',
'status': 'running',
'cve_id': request.cve_id,
'batch_size': request.batch_size,
'max_cves': request.max_cves,
'force_resync': request.force_resync,
'started_at': datetime.utcnow()
}
# Create cancellation flag
job_cancellation_flags[job_id] = False
async def sync_task():
try:
if request.cve_id:
# Single CVE sync
result = await client.sync_cve_references(request.cve_id)
running_jobs[job_id]['result'] = result
running_jobs[job_id]['status'] = 'completed'
else:
# Bulk sync
result = await client.bulk_sync_references(
batch_size=request.batch_size,
max_cves=request.max_cves,
cancellation_flag=lambda: job_cancellation_flags.get(job_id, False)
)
running_jobs[job_id]['result'] = result
running_jobs[job_id]['status'] = 'completed'
running_jobs[job_id]['completed_at'] = datetime.utcnow()
except Exception as e:
logger.error(f"Reference sync task failed: {e}")
running_jobs[job_id]['status'] = 'failed'
running_jobs[job_id]['error'] = str(e)
running_jobs[job_id]['completed_at'] = datetime.utcnow()
finally:
# Clean up cancellation flag
job_cancellation_flags.pop(job_id, None)
background_tasks.add_task(sync_task)
return {
"message": f"Reference sync started" + (f" for {request.cve_id}" if request.cve_id else " for all CVEs"),
"status": "started",
"job_id": job_id,
"cve_id": request.cve_id,
"batch_size": request.batch_size,
"max_cves": request.max_cves,
"force_resync": request.force_resync
}
except Exception as e:
logger.error(f"Failed to start reference sync: {e}")
raise HTTPException(status_code=500, detail=f"Failed to start reference sync: {str(e)}")
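Not shown in the diff: the endpoint can be exercised from Python once the API is up (localhost:8000 assumed, `requests` installed):

import requests

resp = requests.post(
    "http://localhost:8000/api/sync-references",
    json={"batch_size": 30, "max_cves": 100, "force_resync": False},
)
job = resp.json()
print(job["status"], job["job_id"])  # job_id can be used to track the background sync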
@app.get("/api/reference-stats")
async def get_reference_stats(db: Session = Depends(get_db)):
"""Get reference synchronization statistics"""
try:
from reference_client import ReferenceClient
client = ReferenceClient(db)
# Get sync status
status = await client.get_reference_sync_status()
# Get quality distribution from reference data
quality_distribution = {}
from sqlalchemy import text
cves_with_references = db.query(CVE).filter(
text("reference_data::text LIKE '%\"reference_analysis\"%'")
).all()
for cve in cves_with_references:
if cve.reference_data and 'reference_analysis' in cve.reference_data:
ref_analysis = cve.reference_data['reference_analysis']
high_conf_refs = ref_analysis.get('high_confidence_references', 0)
total_refs = ref_analysis.get('reference_count', 0)
if total_refs > 0:
quality_ratio = high_conf_refs / total_refs
if quality_ratio >= 0.8:
quality_tier = 'excellent'
elif quality_ratio >= 0.6:
quality_tier = 'good'
elif quality_ratio >= 0.4:
quality_tier = 'fair'
else:
quality_tier = 'poor'
quality_distribution[quality_tier] = quality_distribution.get(quality_tier, 0) + 1
# Get reference type distribution
reference_type_distribution = {}
for cve in cves_with_references:
if cve.reference_data and 'reference_analysis' in cve.reference_data:
ref_analysis = cve.reference_data['reference_analysis']
ref_types = ref_analysis.get('reference_types', [])
for ref_type in ref_types:
reference_type_distribution[ref_type] = reference_type_distribution.get(ref_type, 0) + 1
return {
'reference_sync_status': status,
'quality_distribution': quality_distribution,
'reference_type_distribution': reference_type_distribution,
'total_with_reference_analysis': len(cves_with_references),
'source': 'reference_extraction'
}
except Exception as e:
logger.error(f"Failed to get reference stats: {e}")
raise HTTPException(status_code=500, detail=f"Failed to get reference stats: {str(e)}")
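And the companion stats call, whose keys match the return dict above (values in the comment are illustrative only):

import requests

stats = requests.get("http://localhost:8000/api/reference-stats").json()
print(stats["quality_distribution"])         # e.g. {'excellent': 12, 'good': 30}
print(stats["reference_type_distribution"])
print(stats["total_with_reference_analysis"])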
@app.get("/api/exploitdb-stats")
async def get_exploitdb_stats(db: Session = Depends(get_db)):
"""Get ExploitDB-related statistics"""

View file

@@ -22,3 +22,4 @@ langchain-community==0.2.0
langchain-core>=0.2.20
openai>=1.32.0
anthropic==0.40.0
certifi==2024.2.2
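The certifi pin is presumably there so the reference client can verify TLS when fetching external reference URLs; a common pattern (an assumption, not shown in this commit) is:

import ssl
import certifi

# Validate outbound HTTPS against certifi's CA bundle,
# e.g. via aiohttp.TCPConnector(ssl=ssl_context) when fetching reference pages
ssl_context = ssl.create_default_context(cafile=certifi.where())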

View file

@@ -248,11 +248,16 @@ function App() {
const syncReferences = async () => {
try {
// Placeholder for future implementation
console.log('Sync References - Not implemented yet');
alert('Sync References functionality will be implemented in a future update');
const response = await axios.post('http://localhost:8000/api/sync-references', {
batch_size: 30,
max_cves: 100,
force_resync: false
});
console.log('Reference sync response:', response.data);
fetchData();
} catch (error) {
console.error('Error syncing references:', error);
alert('Error starting reference sync: ' + (error.response?.data?.detail || error.message));
}
};
@@ -499,7 +504,7 @@ function App() {
: 'bg-orange-600 hover:bg-orange-700'
}`}
>
{hasRunningJobs ? 'Processing...' : 'Sync References (Coming Soon)'}
{hasRunningJobs ? 'Processing...' : 'Sync References'}
</button>
</div>

View file

@@ -20,6 +20,10 @@ CREATE TABLE cves (
-- nomi-sec PoC fields
poc_count INTEGER DEFAULT 0,
poc_data JSON,
-- Reference data fields
reference_data JSON,
reference_sync_status VARCHAR(20) DEFAULT 'pending',
reference_last_synced TIMESTAMP,
created_at TIMESTAMP DEFAULT NOW(),
updated_at TIMESTAMP DEFAULT NOW()
);
@@ -181,5 +185,7 @@ level: {level}',
CREATE INDEX idx_cves_cve_id ON cves(cve_id);
CREATE INDEX idx_cves_published_date ON cves(published_date);
CREATE INDEX idx_cves_severity ON cves(severity);
CREATE INDEX idx_cves_reference_sync_status ON cves(reference_sync_status);
CREATE INDEX idx_cves_reference_last_synced ON cves(reference_last_synced);
CREATE INDEX idx_sigma_rules_cve_id ON sigma_rules(cve_id);
CREATE INDEX idx_sigma_rules_detection_type ON sigma_rules(detection_type);
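The two reference indexes back the bulk-sync work queue; a sketch of the kind of lookup they serve, written with SQLAlchemy (the connection URL is a placeholder):

from sqlalchemy import create_engine, text

engine = create_engine("postgresql://user:pass@localhost/cve_db")  # placeholder URL

with engine.connect() as conn:
    # Both predicates can use the new reference_sync_status / reference_last_synced indexes
    stale = conn.execute(text(
        "SELECT cve_id FROM cves "
        "WHERE reference_sync_status = 'pending' "
        "OR reference_last_synced < NOW() - INTERVAL '30 days' "
        "LIMIT 30"
    )).fetchall()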