more updates for bulk

Brendan McDevitt 2025-07-08 17:50:01 -05:00
parent 5a9ae34996
commit 790e4bd91f
11 changed files with 2500 additions and 22 deletions

README.md

@@ -1,20 +1,27 @@
# CVE-SIGMA Auto Generator (Enhanced)

An automated platform that processes the complete NVD CVE dataset and generates enhanced SIGMA rules for threat detection using curated exploit intelligence.

## 🚀 Enhanced Features

### Data Processing
- **Bulk NVD Processing**: Downloads and processes complete NVD JSON datasets (2002-2025)
- **nomi-sec PoC Integration**: Uses curated PoC data from github.com/nomi-sec/PoC-in-GitHub
- **Incremental Updates**: Efficient updates using NVD modified/recent feeds
- **Quality Assessment**: PoC quality scoring based on star count, recency, and relevance analysis

### Intelligence Generation
- **Enhanced SIGMA Rules**: Creates rules using real exploit indicators from curated PoCs
- **Quality Tiers**: Excellent, Good, Fair, Poor, Very Poor classification system
- **Smart Template Selection**: Template matching driven by PoC characteristics
- **Advanced Indicator Extraction**: Process, file, network, registry, and command patterns
- **MITRE ATT&CK Mapping**: Automatic technique identification based on exploit analysis

### User Experience
- **Modern Web Interface**: React-based UI with enhanced bulk processing controls
- **Real-time Monitoring**: Live job tracking and progress monitoring
- **Comprehensive Statistics**: PoC coverage, quality metrics, and processing status
- **Bulk Operations Dashboard**: Centralized control for all data processing operations (see the example below)
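The workflow behind these features is driven by the bulk processing endpoints added to `backend/main.py`. A minimal sketch, assuming the backend is running locally on port 8000 as configured in `main.py` (the start year is illustrative):

```python
import requests

BASE = "http://localhost:8000"  # assumes the FastAPI backend from backend/main.py is running locally

# Start a full bulk seed (NVD JSON feeds + nomi-sec PoC sync) as a background task
resp = requests.post(f"{BASE}/api/bulk-seed", params={"start_year": 2020})
print(resp.json())  # {"message": "Bulk seeding process started", "status": "started", ...}

# Check overall seeding progress and database statistics at any time
print(requests.get(f"{BASE}/api/bulk-status").json())
```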
## Architecture

backend/bulk_seeder.py (new file, 340 lines)

@@ -0,0 +1,340 @@
"""
Bulk Data Seeding Coordinator
Orchestrates the complete bulk seeding process using NVD JSON feeds and nomi-sec PoC data
"""
import asyncio
import logging
from datetime import datetime, timedelta
from typing import Optional
from sqlalchemy.orm import Session
from nvd_bulk_processor import NVDBulkProcessor
from nomi_sec_client import NomiSecClient
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class BulkSeeder:
"""Coordinates bulk seeding operations"""
def __init__(self, db_session: Session):
self.db_session = db_session
self.nvd_processor = NVDBulkProcessor(db_session)
self.nomi_sec_client = NomiSecClient(db_session)
async def full_bulk_seed(self, start_year: int = 2002,
end_year: Optional[int] = None,
skip_nvd: bool = False,
skip_nomi_sec: bool = False) -> dict:
"""
Perform complete bulk seeding operation
Args:
start_year: Starting year for NVD data (default: 2002)
end_year: Ending year for NVD data (default: current year)
skip_nvd: Skip NVD bulk processing (default: False)
skip_nomi_sec: Skip nomi-sec PoC synchronization (default: False)
Returns:
Dictionary containing operation results
"""
if end_year is None:
end_year = datetime.now().year
results = {
'start_time': datetime.utcnow(),
'nvd_results': None,
'nomi_sec_results': None,
'total_time': None,
'status': 'running'
}
logger.info(f"Starting full bulk seed operation ({start_year}-{end_year})")
try:
# Phase 1: NVD Bulk Processing
if not skip_nvd:
logger.info("Phase 1: Starting NVD bulk processing...")
nvd_results = await self.nvd_processor.bulk_seed_database(
start_year=start_year,
end_year=end_year
)
results['nvd_results'] = nvd_results
logger.info(f"Phase 1 complete: {nvd_results['total_processed']} CVEs processed")
else:
logger.info("Phase 1: Skipping NVD bulk processing")
# Phase 2: nomi-sec PoC Synchronization
if not skip_nomi_sec:
logger.info("Phase 2: Starting nomi-sec PoC synchronization...")
nomi_sec_results = await self.nomi_sec_client.bulk_sync_all_cves(
batch_size=50 # Smaller batches for API stability
)
results['nomi_sec_results'] = nomi_sec_results
logger.info(f"Phase 2 complete: {nomi_sec_results['total_pocs_found']} PoCs found")
else:
logger.info("Phase 2: Skipping nomi-sec PoC synchronization")
# Phase 3: Generate Enhanced SIGMA Rules
logger.info("Phase 3: Generating enhanced SIGMA rules...")
sigma_results = await self.generate_enhanced_sigma_rules()
results['sigma_results'] = sigma_results
logger.info(f"Phase 3 complete: {sigma_results['rules_generated']} rules generated")
results['status'] = 'completed'
results['end_time'] = datetime.utcnow()
results['total_time'] = (results['end_time'] - results['start_time']).total_seconds()
logger.info(f"Full bulk seed operation completed in {results['total_time']:.2f} seconds")
except Exception as e:
logger.error(f"Bulk seed operation failed: {e}")
results['status'] = 'failed'
results['error'] = str(e)
results['end_time'] = datetime.utcnow()
return results
async def incremental_update(self) -> dict:
"""
Perform incremental update operation
Returns:
Dictionary containing update results
"""
results = {
'start_time': datetime.utcnow(),
'nvd_update': None,
'nomi_sec_update': None,
'status': 'running'
}
logger.info("Starting incremental update...")
try:
# Update NVD data using modified/recent feeds
logger.info("Updating NVD data...")
nvd_update = await self.nvd_processor.incremental_update()
results['nvd_update'] = nvd_update
# Update PoC data for newly added/modified CVEs
if nvd_update['total_processed'] > 0:
logger.info("Updating PoC data for modified CVEs...")
# Get recently modified CVEs and sync their PoCs
recent_cves = await self._get_recently_modified_cves()
nomi_sec_update = await self._sync_specific_cves(recent_cves)
results['nomi_sec_update'] = nomi_sec_update
results['status'] = 'completed'
results['end_time'] = datetime.utcnow()
except Exception as e:
logger.error(f"Incremental update failed: {e}")
results['status'] = 'failed'
results['error'] = str(e)
results['end_time'] = datetime.utcnow()
return results
async def generate_enhanced_sigma_rules(self) -> dict:
"""Generate enhanced SIGMA rules using nomi-sec PoC data"""
from main import CVE, SigmaRule
# Import the enhanced rule generator
from enhanced_sigma_generator import EnhancedSigmaGenerator
generator = EnhancedSigmaGenerator(self.db_session)
# Get all CVEs that have PoC data but no enhanced rules
cves_with_pocs = self.db_session.query(CVE).filter(
CVE.poc_count > 0
).all()
rules_generated = 0
rules_updated = 0
for cve in cves_with_pocs:
try:
# Check if we need to generate/update the rule
existing_rule = self.db_session.query(SigmaRule).filter(
SigmaRule.cve_id == cve.cve_id
).first()
if existing_rule and existing_rule.poc_source == 'nomi_sec':
# Rule already exists and is up to date
continue
# Generate enhanced rule
rule_result = await generator.generate_enhanced_rule(cve)
if rule_result['success']:
if existing_rule:
rules_updated += 1
else:
rules_generated += 1
except Exception as e:
logger.error(f"Error generating rule for {cve.cve_id}: {e}")
continue
self.db_session.commit()
return {
'rules_generated': rules_generated,
'rules_updated': rules_updated,
'total_processed': len(cves_with_pocs)
}
async def _get_recently_modified_cves(self, hours: int = 24) -> list:
"""Get CVEs modified within the last N hours"""
from main import CVE
cutoff_time = datetime.utcnow() - timedelta(hours=hours)
recent_cves = self.db_session.query(CVE).filter(
CVE.updated_at >= cutoff_time
).all()
return [cve.cve_id for cve in recent_cves]
async def _sync_specific_cves(self, cve_ids: list) -> dict:
"""Sync PoC data for specific CVEs"""
total_processed = 0
total_pocs_found = 0
for cve_id in cve_ids:
try:
result = await self.nomi_sec_client.sync_cve_pocs(cve_id)
total_processed += 1
total_pocs_found += result.get('pocs_found', 0)
# Small delay to avoid overwhelming the API
await asyncio.sleep(0.5)
except Exception as e:
logger.error(f"Error syncing PoCs for {cve_id}: {e}")
continue
return {
'total_processed': total_processed,
'total_pocs_found': total_pocs_found
}
async def get_seeding_status(self) -> dict:
"""Get current seeding status and statistics"""
from main import CVE, SigmaRule, BulkProcessingJob
# Get database statistics
total_cves = self.db_session.query(CVE).count()
bulk_processed_cves = self.db_session.query(CVE).filter(
CVE.bulk_processed == True
).count()
cves_with_pocs = self.db_session.query(CVE).filter(
CVE.poc_count > 0
).count()
total_rules = self.db_session.query(SigmaRule).count()
nomi_sec_rules = self.db_session.query(SigmaRule).filter(
SigmaRule.poc_source == 'nomi_sec'
).count()
# Get recent job status
recent_jobs = self.db_session.query(BulkProcessingJob).order_by(
BulkProcessingJob.created_at.desc()
).limit(5).all()
job_status = []
for job in recent_jobs:
job_status.append({
'id': str(job.id),
'job_type': job.job_type,
'status': job.status,
'created_at': job.created_at,
'completed_at': job.completed_at,
'processed_items': job.processed_items,
'total_items': job.total_items,
'failed_items': job.failed_items
})
return {
'database_stats': {
'total_cves': total_cves,
'bulk_processed_cves': bulk_processed_cves,
'cves_with_pocs': cves_with_pocs,
'total_rules': total_rules,
'nomi_sec_rules': nomi_sec_rules,
'poc_coverage': (cves_with_pocs / total_cves * 100) if total_cves > 0 else 0,
'nomi_sec_coverage': (nomi_sec_rules / total_rules * 100) if total_rules > 0 else 0
},
'recent_jobs': job_status,
'nvd_data_status': await self._get_nvd_data_status(),
'nomi_sec_status': await self.nomi_sec_client.get_sync_status()
}
async def _get_nvd_data_status(self) -> dict:
"""Get NVD data status"""
from main import CVE
# Get year distribution
year_counts = {}
cves = self.db_session.query(CVE).all()
for cve in cves:
if cve.published_date:
year = cve.published_date.year
year_counts[year] = year_counts.get(year, 0) + 1
# Get source distribution
source_counts = {}
for cve in cves:
source = cve.data_source or 'unknown'
source_counts[source] = source_counts.get(source, 0) + 1
return {
'year_distribution': year_counts,
'source_distribution': source_counts,
'total_cves': len(cves),
'date_range': {
'earliest': min(cve.published_date for cve in cves if cve.published_date),
'latest': max(cve.published_date for cve in cves if cve.published_date)
} if cves else None
}
# Standalone script functionality
async def main():
"""Main function for standalone execution"""
from main import SessionLocal, engine, Base
# Create tables
Base.metadata.create_all(bind=engine)
# Create database session
db_session = SessionLocal()
try:
# Create bulk seeder
seeder = BulkSeeder(db_session)
# Get current status
status = await seeder.get_seeding_status()
print(f"Current Status: {status['database_stats']['total_cves']} CVEs in database")
# Perform full bulk seed if database is empty
if status['database_stats']['total_cves'] == 0:
print("Database is empty. Starting full bulk seed...")
results = await seeder.full_bulk_seed(start_year=2020) # Start from 2020 for faster testing
print(f"Bulk seed completed: {results}")
else:
print("Database contains data. Running incremental update...")
results = await seeder.incremental_update()
print(f"Incremental update completed: {results}")
finally:
db_session.close()
if __name__ == "__main__":
asyncio.run(main())
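A minimal usage sketch for the coordinator above, assuming `SessionLocal` from main.py is importable: rerun only the nomi-sec PoC phase against an already-seeded database by skipping NVD processing.

import asyncio
from main import SessionLocal
from bulk_seeder import BulkSeeder

async def poc_only_sync():
    db = SessionLocal()
    try:
        seeder = BulkSeeder(db)
        # skip_nvd=True leaves existing CVE rows untouched; only PoC sync and rule generation run
        results = await seeder.full_bulk_seed(skip_nvd=True)
        print(results['status'], results.get('nomi_sec_results'))
    finally:
        db.close()

asyncio.run(poc_only_sync())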

backend/enhanced_sigma_generator.py (new file, 438 lines)

@@ -0,0 +1,438 @@
"""
Enhanced SIGMA Rule Generator
Generates improved SIGMA rules using nomi-sec PoC data and traditional indicators
"""
import json
import logging
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from sqlalchemy.orm import Session
import re
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class EnhancedSigmaGenerator:
"""Enhanced SIGMA rule generator using nomi-sec PoC data"""
def __init__(self, db_session: Session):
self.db_session = db_session
async def generate_enhanced_rule(self, cve) -> dict:
"""Generate enhanced SIGMA rule for a CVE using PoC data"""
from main import SigmaRule, RuleTemplate
try:
# Get PoC data
poc_data = cve.poc_data or []
# Find the best quality PoC
best_poc = None
if poc_data:
best_poc = max(poc_data, key=lambda x: x.get('quality_analysis', {}).get('quality_score', 0))
# Select appropriate template based on PoC analysis
template = await self._select_template(cve, best_poc)
if not template:
logger.warning(f"No suitable template found for {cve.cve_id}")
return {'success': False, 'error': 'No suitable template'}
# Generate rule content
rule_content = await self._generate_rule_content(cve, template, poc_data)
# Calculate confidence level
confidence_level = self._calculate_confidence_level(cve, poc_data)
# Store or update SIGMA rule
existing_rule = self.db_session.query(SigmaRule).filter(
SigmaRule.cve_id == cve.cve_id
).first()
rule_data = {
'cve_id': cve.cve_id,
'rule_name': f"{cve.cve_id} Enhanced Detection",
'rule_content': rule_content,
'detection_type': template.template_name,
'log_source': self._extract_log_source(template.template_name),
'confidence_level': confidence_level,
'auto_generated': True,
'exploit_based': len(poc_data) > 0,
'poc_source': 'nomi_sec',
'poc_quality_score': best_poc.get('quality_analysis', {}).get('quality_score', 0) if best_poc else 0,
'nomi_sec_data': {
'total_pocs': len(poc_data),
'best_poc_quality': best_poc.get('quality_analysis', {}).get('quality_score', 0) if best_poc else 0,
'total_stars': sum(p.get('stargazers_count', 0) for p in poc_data),
'avg_stars': sum(p.get('stargazers_count', 0) for p in poc_data) / len(poc_data) if poc_data else 0
},
'github_repos': [p.get('html_url', '') for p in poc_data],
'exploit_indicators': json.dumps(self._combine_exploit_indicators(poc_data)),
'updated_at': datetime.utcnow()
}
if existing_rule:
# Update existing rule
for key, value in rule_data.items():
setattr(existing_rule, key, value)
logger.info(f"Updated SIGMA rule for {cve.cve_id}")
else:
# Create new rule
new_rule = SigmaRule(**rule_data)
self.db_session.add(new_rule)
logger.info(f"Created new SIGMA rule for {cve.cve_id}")
self.db_session.commit()
return {
'success': True,
'cve_id': cve.cve_id,
'template': template.template_name,
'confidence_level': confidence_level,
'poc_count': len(poc_data),
'quality_score': best_poc.get('quality_analysis', {}).get('quality_score', 0) if best_poc else 0
}
except Exception as e:
logger.error(f"Error generating enhanced rule for {cve.cve_id}: {e}")
return {'success': False, 'error': str(e)}
async def _select_template(self, cve, best_poc: Optional[dict]) -> Optional[object]:
"""Select the most appropriate template based on CVE and PoC analysis"""
from main import RuleTemplate
templates = self.db_session.query(RuleTemplate).all()
if not templates:
logger.warning("No rule templates found in database")
return None
# Score templates based on relevance
template_scores = {}
for template in templates:
score = 0
# Score based on PoC indicators (highest priority)
if best_poc:
indicators = best_poc.get('exploit_indicators', {})
score += self._score_template_poc_match(template, indicators)
# Score based on CVE description
score += self._score_template_cve_match(template, cve)
# Score based on affected products
if cve.affected_products:
score += self._score_template_product_match(template, cve.affected_products)
template_scores[template] = score
# Return template with highest score
if template_scores:
best_template = max(template_scores, key=template_scores.get)
logger.info(f"Selected template {best_template.template_name} with score {template_scores[best_template]}")
return best_template
return None
def _score_template_poc_match(self, template: object, indicators: dict) -> int:
"""Score template based on PoC indicators"""
score = 0
template_name = template.template_name.lower()
# Process-based templates
if 'process' in template_name or 'execution' in template_name:
if indicators.get('processes') or indicators.get('commands'):
score += 30
# Network-based templates
if 'network' in template_name or 'connection' in template_name:
if indicators.get('network') or indicators.get('urls'):
score += 30
# File-based templates
if 'file' in template_name or 'modification' in template_name:
if indicators.get('files'):
score += 30
# PowerShell templates
if 'powershell' in template_name:
processes = indicators.get('processes', [])
if any('powershell' in p.lower() for p in processes):
score += 35
return score
def _score_template_cve_match(self, template: object, cve) -> int:
"""Score template based on CVE description"""
score = 0
template_name = template.template_name.lower()
description = (cve.description or '').lower()
# Keyword matching
if 'remote' in description and 'execution' in description:
if 'process' in template_name or 'execution' in template_name:
score += 20
if 'powershell' in description:
if 'powershell' in template_name:
score += 25
if 'network' in description or 'http' in description:
if 'network' in template_name:
score += 20
if 'file' in description or 'upload' in description:
if 'file' in template_name:
score += 20
return score
def _score_template_product_match(self, template: object, affected_products: list) -> int:
"""Score template based on affected products"""
score = 0
if not template.applicable_product_patterns:
return 0
for pattern in template.applicable_product_patterns:
pattern_lower = pattern.lower()
for product in affected_products:
product_lower = product.lower()
if pattern_lower in product_lower:
score += 10
break
return score
async def _generate_rule_content(self, cve, template: object, poc_data: list) -> str:
"""Generate the actual SIGMA rule content"""
# Combine all exploit indicators
combined_indicators = self._combine_exploit_indicators(poc_data)
# Get base template content
rule_content = template.template_content
# Replace template placeholders
replacements = {
'{{CVE_ID}}': cve.cve_id,
'{{TITLE}}': f"{cve.cve_id} Enhanced Detection",
'{{DESCRIPTION}}': self._generate_description(cve, poc_data),
'{{LEVEL}}': self._calculate_confidence_level(cve, poc_data).lower(),
'{{REFERENCES}}': self._generate_references(cve, poc_data),
'{{TAGS}}': self._generate_tags(cve, poc_data),
'{{PROCESSES}}': self._format_indicators(combined_indicators.get('processes', [])),
'{{FILES}}': self._format_indicators(combined_indicators.get('files', [])),
'{{COMMANDS}}': self._format_indicators(combined_indicators.get('commands', [])),
'{{NETWORK}}': self._format_indicators(combined_indicators.get('network', [])),
'{{URLS}}': self._format_indicators(combined_indicators.get('urls', [])),
'{{REGISTRY}}': self._format_indicators(combined_indicators.get('registry', []))
}
# Apply replacements
for placeholder, value in replacements.items():
rule_content = rule_content.replace(placeholder, value)
# Add enhanced detection based on PoC quality
if poc_data:
rule_content = self._enhance_detection_logic(rule_content, combined_indicators, poc_data)
return rule_content
def _combine_exploit_indicators(self, poc_data: list) -> dict:
"""Combine exploit indicators from all PoCs"""
combined = {
'processes': [],
'files': [],
'commands': [],
'network': [],
'urls': [],
'registry': []
}
for poc in poc_data:
indicators = poc.get('exploit_indicators', {})
for key in combined.keys():
if key in indicators:
combined[key].extend(indicators[key])
# Deduplicate and filter
for key in combined.keys():
combined[key] = list(set(combined[key]))
# Remove empty and invalid entries
combined[key] = [item for item in combined[key] if item and len(item) > 2]
return combined
def _generate_description(self, cve, poc_data: list) -> str:
"""Generate enhanced rule description"""
base_desc = f"Detection for {cve.cve_id}"
if cve.description:
# Extract key terms from CVE description
desc_words = cve.description.lower().split()
key_terms = [word for word in desc_words if word in [
'remote', 'execution', 'injection', 'bypass', 'privilege', 'escalation',
'overflow', 'disclosure', 'traversal', 'deserialization'
]]
if key_terms:
base_desc += f" involving {', '.join(set(key_terms[:3]))}"
if poc_data:
total_pocs = len(poc_data)
total_stars = sum(p.get('stargazers_count', 0) for p in poc_data)
base_desc += f" [Enhanced with {total_pocs} PoC(s), {total_stars} stars]"
return base_desc
def _generate_references(self, cve, poc_data: list) -> str:
"""Generate references section"""
refs = []
# Add CVE reference
refs.append(f"https://nvd.nist.gov/vuln/detail/{cve.cve_id}")
# Add top PoC references (max 3)
if poc_data:
sorted_pocs = sorted(poc_data, key=lambda x: x.get('stargazers_count', 0), reverse=True)
for poc in sorted_pocs[:3]:
if poc.get('html_url'):
refs.append(poc['html_url'])
return '\n'.join(f" - {ref}" for ref in refs)
def _generate_tags(self, cve, poc_data: list) -> str:
"""Generate MITRE ATT&CK tags and other tags"""
tags = []
# CVE tag
tags.append(cve.cve_id.lower())
# Add technique tags based on indicators
combined_indicators = self._combine_exploit_indicators(poc_data)
if combined_indicators.get('processes'):
tags.append('attack.t1059') # Command and Scripting Interpreter
if combined_indicators.get('network'):
tags.append('attack.t1071') # Application Layer Protocol
if combined_indicators.get('files'):
tags.append('attack.t1105') # Ingress Tool Transfer
if any('powershell' in p.lower() for p in combined_indicators.get('processes', [])):
tags.append('attack.t1059.001') # PowerShell
# Add PoC quality tags
if poc_data:
tags.append('exploit.poc')
best_poc = max(poc_data, key=lambda x: x.get('quality_analysis', {}).get('quality_score', 0))
quality_tier = best_poc.get('quality_analysis', {}).get('quality_tier', 'poor')
tags.append(f'poc.quality.{quality_tier}')
return '\n'.join(f" - {tag}" for tag in tags)
def _format_indicators(self, indicators: list) -> str:
"""Format indicators for SIGMA rule"""
if not indicators:
return ''
# Limit indicators to avoid overly complex rules
limited_indicators = indicators[:10]
formatted = []
for indicator in limited_indicators:
# Escape special characters for SIGMA
escaped = indicator.replace('\\', '\\\\').replace('*', '\\*').replace('?', '\\?')
formatted.append(f' - "{escaped}"')
return '\n'.join(formatted)
def _enhance_detection_logic(self, rule_content: str, indicators: dict, poc_data: list) -> str:
"""Enhance detection logic based on PoC quality and indicators"""
# If we have high-quality PoCs, add additional detection conditions
best_poc = max(poc_data, key=lambda x: x.get('quality_analysis', {}).get('quality_score', 0))
quality_score = best_poc.get('quality_analysis', {}).get('quality_score', 0)
if quality_score > 60: # High quality PoC
# Add more specific detection conditions
if indicators.get('processes') and indicators.get('commands'):
additional_condition = """
process_and_command:
Image|contains: {{PROCESSES}}
CommandLine|contains: {{COMMANDS}}"""
# Insert before the condition line
rule_content = rule_content.replace(
'condition: selection',
additional_condition + '\n condition: selection or process_and_command'
)
return rule_content
def _calculate_confidence_level(self, cve, poc_data: list) -> str:
"""Calculate confidence level based on CVE and PoC data"""
score = 0
# CVSS score factor
if cve.cvss_score:
if cve.cvss_score >= 9.0:
score += 40
elif cve.cvss_score >= 7.0:
score += 30
elif cve.cvss_score >= 5.0:
score += 20
else:
score += 10
# PoC quality factor
if poc_data:
total_stars = sum(p.get('stargazers_count', 0) for p in poc_data)
poc_count = len(poc_data)
score += min(total_stars, 30) # Max 30 points for stars
score += min(poc_count * 5, 20) # Max 20 points for PoC count
# Quality tier bonus
best_poc = max(poc_data, key=lambda x: x.get('quality_analysis', {}).get('quality_score', 0))
quality_tier = best_poc.get('quality_analysis', {}).get('quality_tier', 'poor')
tier_bonus = {
'excellent': 20,
'good': 15,
'fair': 10,
'poor': 5,
'very_poor': 0
}
score += tier_bonus.get(quality_tier, 0)
# Determine confidence level
if score >= 80:
return 'HIGH'
elif score >= 60:
return 'MEDIUM'
elif score >= 40:
return 'LOW'
else:
return 'INFORMATIONAL'
def _extract_log_source(self, template_name: str) -> str:
"""Extract log source from template name"""
template_lower = template_name.lower()
if 'process' in template_lower or 'execution' in template_lower:
return 'process_creation'
elif 'network' in template_lower:
return 'network_connection'
elif 'file' in template_lower:
return 'file_event'
elif 'powershell' in template_lower:
return 'powershell'
elif 'registry' in template_lower:
return 'registry_event'
else:
return 'generic'
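The rule templates themselves live in the database (RuleTemplate.template_content) and are not part of this commit. The following is a hypothetical template body, sketched only to show the placeholders that _generate_rule_content() substitutes and the "condition: selection" line that _enhance_detection_logic() extends:

# Hypothetical template shape for illustration; real templates are seeded separately.
EXAMPLE_TEMPLATE = """
title: {{TITLE}}
description: {{DESCRIPTION}}
references:
{{REFERENCES}}
tags:
{{TAGS}}
logsource:
    category: process_creation
detection:
    selection:
        Image|contains:
{{PROCESSES}}
        CommandLine|contains:
{{COMMANDS}}
    condition: selection
level: {{LEVEL}}
"""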

backend/main.py

@@ -1,7 +1,7 @@
from fastapi import FastAPI, HTTPException, BackgroundTasks, Depends
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from sqlalchemy import create_engine, Column, String, Text, DECIMAL, TIMESTAMP, Boolean, ARRAY, Integer, JSON, func
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, Session
from sqlalchemy.dialects.postgresql import UUID
@@ -19,6 +1,16 @@ import base64
from github import Github
from urllib.parse import urlparse
import hashlib
import logging
import threading
# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Global job tracking
running_jobs = {}
job_cancellation_flags = {}
# Database setup
DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://cve_user:cve_password@localhost:5432/cve_sigma_db")
@@ -39,6 +49,13 @@ class CVE(Base):
modified_date = Column(TIMESTAMP)
affected_products = Column(ARRAY(String))
reference_urls = Column(ARRAY(String))
# Bulk processing fields
data_source = Column(String(20), default='nvd_api') # 'nvd_api', 'nvd_bulk', 'manual'
nvd_json_version = Column(String(10), default='2.0')
bulk_processed = Column(Boolean, default=False)
# nomi-sec PoC fields
poc_count = Column(Integer, default=0)
poc_data = Column(JSON) # Store nomi-sec PoC metadata
created_at = Column(TIMESTAMP, default=datetime.utcnow)
updated_at = Column(TIMESTAMP, default=datetime.utcnow)
@@ -56,6 +73,10 @@ class SigmaRule(Base):
exploit_based = Column(Boolean, default=False)
github_repos = Column(ARRAY(String))
exploit_indicators = Column(Text)  # JSON string of extracted indicators
# Enhanced fields for new data sources
poc_source = Column(String(20), default='github_search') # 'github_search', 'nomi_sec', 'manual'
poc_quality_score = Column(Integer, default=0) # Based on star count, activity, etc.
nomi_sec_data = Column(JSON) # Store nomi-sec PoC metadata
created_at = Column(TIMESTAMP, default=datetime.utcnow)
updated_at = Column(TIMESTAMP, default=datetime.utcnow)
@@ -69,6 +90,23 @@ class RuleTemplate(Base):
description = Column(Text)
created_at = Column(TIMESTAMP, default=datetime.utcnow)
class BulkProcessingJob(Base):
__tablename__ = "bulk_processing_jobs"
id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
job_type = Column(String(50), nullable=False) # 'nvd_bulk_seed', 'nomi_sec_sync', 'incremental_update'
status = Column(String(20), default='pending') # 'pending', 'running', 'completed', 'failed', 'cancelled'
year = Column(Integer) # For year-based processing
total_items = Column(Integer, default=0)
processed_items = Column(Integer, default=0)
failed_items = Column(Integer, default=0)
error_message = Column(Text)
job_metadata = Column(JSON) # Additional job-specific data
started_at = Column(TIMESTAMP)
completed_at = Column(TIMESTAMP)
cancelled_at = Column(TIMESTAMP)
created_at = Column(TIMESTAMP, default=datetime.utcnow)
# Pydantic models
class CVEResponse(BaseModel):
id: str
@@ -941,12 +979,341 @@ async def get_stats(db: Session = Depends(get_db)):
total_rules = db.query(SigmaRule).count()
recent_cves = db.query(CVE).filter(CVE.published_date >= datetime.utcnow() - timedelta(days=7)).count()
# Enhanced stats with bulk processing info
bulk_processed_cves = db.query(CVE).filter(CVE.bulk_processed == True).count()
cves_with_pocs = db.query(CVE).filter(CVE.poc_count > 0).count()
nomi_sec_rules = db.query(SigmaRule).filter(SigmaRule.poc_source == 'nomi_sec').count()
return {
"total_cves": total_cves,
"total_sigma_rules": total_rules,
"recent_cves_7_days": recent_cves,
"bulk_processed_cves": bulk_processed_cves,
"cves_with_pocs": cves_with_pocs,
"nomi_sec_rules": nomi_sec_rules,
"poc_coverage": (cves_with_pocs / total_cves * 100) if total_cves > 0 else 0,
"nomi_sec_coverage": (nomi_sec_rules / total_rules * 100) if total_rules > 0 else 0
}
# New bulk processing endpoints
@app.post("/api/bulk-seed")
async def start_bulk_seed(background_tasks: BackgroundTasks,
start_year: int = 2002,
end_year: Optional[int] = None,
skip_nvd: bool = False,
skip_nomi_sec: bool = False,
db: Session = Depends(get_db)):
"""Start bulk seeding process"""
async def bulk_seed_task():
try:
from bulk_seeder import BulkSeeder
seeder = BulkSeeder(db)
result = await seeder.full_bulk_seed(
start_year=start_year,
end_year=end_year,
skip_nvd=skip_nvd,
skip_nomi_sec=skip_nomi_sec
)
logger.info(f"Bulk seed completed: {result}")
except Exception as e:
logger.error(f"Bulk seed failed: {e}")
import traceback
traceback.print_exc()
background_tasks.add_task(bulk_seed_task)
return {
"message": "Bulk seeding process started",
"status": "started",
"start_year": start_year,
"end_year": end_year or datetime.now().year,
"skip_nvd": skip_nvd,
"skip_nomi_sec": skip_nomi_sec
}
@app.post("/api/incremental-update")
async def start_incremental_update(background_tasks: BackgroundTasks, db: Session = Depends(get_db)):
"""Start incremental update process"""
async def incremental_update_task():
try:
from bulk_seeder import BulkSeeder
seeder = BulkSeeder(db)
result = await seeder.incremental_update()
logger.info(f"Incremental update completed: {result}")
except Exception as e:
logger.error(f"Incremental update failed: {e}")
import traceback
traceback.print_exc()
background_tasks.add_task(incremental_update_task)
return {
"message": "Incremental update process started",
"status": "started"
}
@app.post("/api/sync-nomi-sec")
async def sync_nomi_sec(background_tasks: BackgroundTasks,
cve_id: Optional[str] = None,
batch_size: int = 50,
db: Session = Depends(get_db)):
"""Synchronize nomi-sec PoC data"""
# Create job record
job = BulkProcessingJob(
job_type='nomi_sec_sync',
status='pending',
job_metadata={
'cve_id': cve_id,
'batch_size': batch_size
}
)
db.add(job)
db.commit()
db.refresh(job)
job_id = str(job.id)
running_jobs[job_id] = job
job_cancellation_flags[job_id] = False
async def sync_task():
try:
job.status = 'running'
job.started_at = datetime.utcnow()
db.commit()
from nomi_sec_client import NomiSecClient
client = NomiSecClient(db)
if cve_id:
# Sync specific CVE
if job_cancellation_flags.get(job_id, False):
logger.info(f"Job {job_id} cancelled before starting")
return
result = await client.sync_cve_pocs(cve_id)
logger.info(f"Nomi-sec sync for {cve_id}: {result}")
else:
# Sync all CVEs with cancellation support
result = await client.bulk_sync_all_cves(
batch_size=batch_size,
cancellation_flag=lambda: job_cancellation_flags.get(job_id, False)
)
logger.info(f"Nomi-sec bulk sync completed: {result}")
# Update job status if not cancelled
if not job_cancellation_flags.get(job_id, False):
job.status = 'completed'
job.completed_at = datetime.utcnow()
db.commit()
except Exception as e:
if not job_cancellation_flags.get(job_id, False):
job.status = 'failed'
job.error_message = str(e)
job.completed_at = datetime.utcnow()
db.commit()
logger.error(f"Nomi-sec sync failed: {e}")
import traceback
traceback.print_exc()
finally:
# Clean up tracking
running_jobs.pop(job_id, None)
job_cancellation_flags.pop(job_id, None)
background_tasks.add_task(sync_task)
return {
"message": f"Nomi-sec sync started" + (f" for {cve_id}" if cve_id else " for all CVEs"),
"status": "started",
"job_id": job_id,
"cve_id": cve_id,
"batch_size": batch_size
}
@app.get("/api/bulk-jobs")
async def get_bulk_jobs(limit: int = 10, db: Session = Depends(get_db)):
"""Get bulk processing job status"""
jobs = db.query(BulkProcessingJob).order_by(
BulkProcessingJob.created_at.desc()
).limit(limit).all()
result = []
for job in jobs:
job_dict = {
'id': str(job.id),
'job_type': job.job_type,
'status': job.status,
'year': job.year,
'total_items': job.total_items,
'processed_items': job.processed_items,
'failed_items': job.failed_items,
'error_message': job.error_message,
'metadata': job.job_metadata,
'started_at': job.started_at,
'completed_at': job.completed_at,
'created_at': job.created_at
}
result.append(job_dict)
return result
@app.get("/api/bulk-status")
async def get_bulk_status(db: Session = Depends(get_db)):
"""Get comprehensive bulk processing status"""
try:
from bulk_seeder import BulkSeeder
seeder = BulkSeeder(db)
status = await seeder.get_seeding_status()
return status
except Exception as e:
logger.error(f"Error getting bulk status: {e}")
return {"error": str(e)}
@app.get("/api/poc-stats")
async def get_poc_stats(db: Session = Depends(get_db)):
"""Get PoC-related statistics"""
try:
from nomi_sec_client import NomiSecClient
client = NomiSecClient(db)
stats = await client.get_sync_status()
# Additional PoC statistics
high_quality_cves = db.query(CVE).filter(
CVE.poc_count > 0,
func.json_extract_path_text(CVE.poc_data, '0', 'quality_analysis', 'quality_score').cast(Integer) > 60
).count()
stats.update({
'high_quality_cves': high_quality_cves,
'avg_poc_count': db.query(func.avg(CVE.poc_count)).filter(CVE.poc_count > 0).scalar() or 0
})
return stats
except Exception as e:
logger.error(f"Error getting PoC stats: {e}")
return {"error": str(e)}
@app.post("/api/regenerate-rules")
async def regenerate_sigma_rules(background_tasks: BackgroundTasks,
force: bool = False,
db: Session = Depends(get_db)):
"""Regenerate SIGMA rules using enhanced nomi-sec data"""
async def regenerate_task():
try:
from enhanced_sigma_generator import EnhancedSigmaGenerator
generator = EnhancedSigmaGenerator(db)
# Get CVEs with PoC data
cves_with_pocs = db.query(CVE).filter(CVE.poc_count > 0).all()
rules_generated = 0
rules_updated = 0
for cve in cves_with_pocs:
# Check if we should regenerate
existing_rule = db.query(SigmaRule).filter(
SigmaRule.cve_id == cve.cve_id
).first()
if existing_rule and existing_rule.poc_source == 'nomi_sec' and not force:
continue
# Generate enhanced rule
result = await generator.generate_enhanced_rule(cve)
if result['success']:
if existing_rule:
rules_updated += 1
else:
rules_generated += 1
logger.info(f"Rule regeneration completed: {rules_generated} new, {rules_updated} updated")
except Exception as e:
logger.error(f"Rule regeneration failed: {e}")
import traceback
traceback.print_exc()
background_tasks.add_task(regenerate_task)
return {
"message": "SIGMA rule regeneration started",
"status": "started",
"force": force
}
@app.post("/api/cancel-job/{job_id}")
async def cancel_job(job_id: str, db: Session = Depends(get_db)):
"""Cancel a running job"""
try:
# Find the job in the database
job = db.query(BulkProcessingJob).filter(BulkProcessingJob.id == job_id).first()
if not job:
raise HTTPException(status_code=404, detail="Job not found")
if job.status not in ['pending', 'running']:
raise HTTPException(status_code=400, detail=f"Cannot cancel job with status: {job.status}")
# Set cancellation flag
job_cancellation_flags[job_id] = True
# Update job status
job.status = 'cancelled'
job.cancelled_at = datetime.utcnow()
job.error_message = "Job cancelled by user"
db.commit()
logger.info(f"Job {job_id} cancellation requested")
return {
"message": f"Job {job_id} cancellation requested",
"status": "cancelled",
"job_id": job_id
}
except HTTPException:
raise
except Exception as e:
logger.error(f"Error cancelling job {job_id}: {e}")
raise HTTPException(status_code=500, detail=str(e))
@app.get("/api/running-jobs")
async def get_running_jobs(db: Session = Depends(get_db)):
"""Get all currently running jobs"""
try:
jobs = db.query(BulkProcessingJob).filter(
BulkProcessingJob.status.in_(['pending', 'running'])
).order_by(BulkProcessingJob.created_at.desc()).all()
result = []
for job in jobs:
result.append({
'id': str(job.id),
'job_type': job.job_type,
'status': job.status,
'year': job.year,
'total_items': job.total_items,
'processed_items': job.processed_items,
'failed_items': job.failed_items,
'error_message': job.error_message,
'started_at': job.started_at,
'created_at': job.created_at,
'can_cancel': job.status in ['pending', 'running']
})
return result
except Exception as e:
logger.error(f"Error getting running jobs: {e}")
raise HTTPException(status_code=500, detail=str(e))
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8000)
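A minimal sketch of driving the new job endpoints from Python, assuming the API above is reachable on localhost:8000; the CVE id is only an example value and must already exist in the database:

import requests

BASE = "http://localhost:8000"

# Start a nomi-sec PoC sync for a single CVE and capture the job id it returns
job = requests.post(f"{BASE}/api/sync-nomi-sec", params={"cve_id": "CVE-2021-44228"}).json()
job_id = job["job_id"]

# Inspect currently running jobs, then request cancellation if needed
print(requests.get(f"{BASE}/api/running-jobs").json())
print(requests.post(f"{BASE}/api/cancel-job/{job_id}").json())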

backend/nomi_sec_client.py (new file, 477 lines)

@@ -0,0 +1,477 @@
"""
Nomi-sec PoC-in-GitHub Integration Client
Interfaces with the nomi-sec PoC-in-GitHub API for curated exploit data
"""
import aiohttp
import asyncio
import json
import logging
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Tuple
from sqlalchemy.orm import Session
from sqlalchemy import and_, or_
import time
import re
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class NomiSecClient:
"""Client for interacting with nomi-sec PoC-in-GitHub API"""
def __init__(self, db_session: Session):
self.db_session = db_session
self.base_url = "https://poc-in-github.motikan2010.net/api/v1"
self.rss_url = "https://poc-in-github.motikan2010.net/rss"
# Rate limiting
self.rate_limit_delay = 1.0 # 1 second between requests
self.last_request_time = 0
# Cache for recently fetched data
self.cache = {}
self.cache_ttl = 300 # 5 minutes
async def _make_request(self, session: aiohttp.ClientSession,
url: str, params: dict = None) -> Optional[dict]:
"""Make a rate-limited request to the API"""
try:
# Rate limiting
current_time = time.time()
time_since_last = current_time - self.last_request_time
if time_since_last < self.rate_limit_delay:
await asyncio.sleep(self.rate_limit_delay - time_since_last)
async with session.get(url, params=params, timeout=30) as response:
self.last_request_time = time.time()
if response.status == 200:
return await response.json()
else:
logger.warning(f"API request failed: {response.status} for {url}")
return None
except Exception as e:
logger.error(f"Error making request to {url}: {e}")
return None
async def get_pocs_for_cve(self, cve_id: str) -> List[dict]:
"""Get all PoC repositories for a specific CVE"""
cache_key = f"cve_{cve_id}"
# Check cache
if cache_key in self.cache:
cached_data, timestamp = self.cache[cache_key]
if time.time() - timestamp < self.cache_ttl:
return cached_data
async with aiohttp.ClientSession() as session:
params = {"cve_id": cve_id}
data = await self._make_request(session, self.base_url, params)
if data and "pocs" in data:
pocs = data["pocs"]
# Cache the result
self.cache[cache_key] = (pocs, time.time())
logger.info(f"Found {len(pocs)} PoCs for {cve_id}")
return pocs
else:
logger.info(f"No PoCs found for {cve_id}")
return []
async def get_recent_pocs(self, limit: int = 100) -> List[dict]:
"""Get recent PoCs from the API"""
async with aiohttp.ClientSession() as session:
params = {"limit": limit, "sort": "created_at"}
data = await self._make_request(session, self.base_url, params)
if data and "pocs" in data:
return data["pocs"]
else:
return []
async def get_high_quality_pocs(self, min_stars: int = 5, limit: int = 100) -> List[dict]:
"""Get high-quality PoCs sorted by star count"""
async with aiohttp.ClientSession() as session:
params = {"limit": limit, "sort": "stargazers_count"}
data = await self._make_request(session, self.base_url, params)
if data and "pocs" in data:
# Filter by star count
filtered_pocs = [
poc for poc in data["pocs"]
if int(poc.get("stargazers_count", "0")) >= min_stars
]
return filtered_pocs
else:
return []
async def search_pocs(self, query: str, limit: int = 50) -> List[dict]:
"""Search for PoCs using a query string"""
async with aiohttp.ClientSession() as session:
params = {"limit": limit, "q": query}
data = await self._make_request(session, self.base_url, params)
if data and "pocs" in data:
return data["pocs"]
else:
return []
def analyze_poc_quality(self, poc: dict) -> dict:
"""Analyze the quality of a PoC repository"""
quality_score = 0
factors = {}
# Star count factor (0-40 points)
stars = int(poc.get("stargazers_count", "0"))
star_score = min(stars * 2, 40) # 2 points per star, max 40
quality_score += star_score
factors["star_score"] = star_score
# Recency factor (0-20 points)
try:
updated_at = datetime.fromisoformat(poc.get("updated_at", "").replace('Z', '+00:00'))
days_old = (datetime.now(updated_at.tzinfo) - updated_at).days
recency_score = max(20 - (days_old // 30), 0) # Lose 1 point per month
quality_score += recency_score
factors["recency_score"] = recency_score
except Exception:
factors["recency_score"] = 0
# Description quality factor (0-15 points)
description = poc.get("description", "")
desc_score = 0
if description:
desc_score = min(len(description) // 10, 15) # 1 point per 10 chars, max 15
quality_score += desc_score
factors["description_score"] = desc_score
# Vulnerability description factor (0-15 points)
vuln_desc = poc.get("vuln_description", "")
vuln_score = 0
if vuln_desc:
vuln_score = min(len(vuln_desc) // 20, 15) # 1 point per 20 chars, max 15
quality_score += vuln_score
factors["vuln_description_score"] = vuln_score
# Repository name relevance factor (0-10 points)
repo_name = poc.get("name", "").lower()
cve_id = poc.get("cve_id", "").lower()
name_score = 0
if cve_id and cve_id.replace("-", "") in repo_name.replace("-", ""):
name_score = 10
elif any(keyword in repo_name for keyword in ["exploit", "poc", "cve", "vuln"]):
name_score = 5
quality_score += name_score
factors["name_relevance_score"] = name_score
return {
"quality_score": quality_score,
"factors": factors,
"quality_tier": self._get_quality_tier(quality_score)
}
def _get_quality_tier(self, score: int) -> str:
"""Get quality tier based on score"""
if score >= 80:
return "excellent"
elif score >= 60:
return "good"
elif score >= 40:
return "fair"
elif score >= 20:
return "poor"
else:
return "very_poor"
def extract_exploit_indicators(self, poc: dict) -> dict:
"""Extract exploit indicators from PoC metadata"""
indicators = {
"processes": [],
"files": [],
"network": [],
"registry": [],
"commands": [],
"urls": [],
"techniques": []
}
# Extract from description and vulnerability description
text_sources = [
poc.get("description", ""),
poc.get("vuln_description", ""),
poc.get("name", "")
]
full_text = " ".join(text_sources).lower()
# Process patterns
process_patterns = [
r'\b(cmd\.exe|powershell\.exe|bash|sh|python\.exe|java\.exe)\b',
r'\b(createprocess|shellexecute|system)\b',
r'\b(reverse.?shell|bind.?shell)\b'
]
for pattern in process_patterns:
matches = re.findall(pattern, full_text, re.IGNORECASE)
indicators["processes"].extend(matches)
# File patterns
file_patterns = [
r'\b([a-zA-Z]:\\[^\\]+\\[^\\]+\.[a-zA-Z0-9]+)\b', # Windows paths
r'\b(/[^/\s]+/[^/\s]+\.[a-zA-Z0-9]+)\b', # Unix paths
r'\b(\w+\.(exe|dll|bat|ps1|py|sh|jar))\b' # Common executable files
]
for pattern in file_patterns:
matches = re.findall(pattern, full_text, re.IGNORECASE)
if matches and isinstance(matches[0], tuple):
indicators["files"].extend([m[0] for m in matches])
else:
indicators["files"].extend(matches)
# Network patterns
network_patterns = [
r'\b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\b', # IP addresses
r'\b((?:\d{1,5})|(?:0x[a-fA-F0-9]{1,4}))\b', # Ports
r'\b(http[s]?://[^\s]+)\b' # URLs
]
for pattern in network_patterns:
matches = re.findall(pattern, full_text, re.IGNORECASE)
if pattern.startswith(r'\b(http'):
indicators["urls"].extend(matches)
else:
indicators["network"].extend(matches)
# Command patterns
command_patterns = [
r'\b(curl|wget|nc|netcat|ncat)\b',
r'\b(whoami|id|uname|systeminfo)\b',
r'\b(cat|type|more|less)\b'
]
for pattern in command_patterns:
matches = re.findall(pattern, full_text, re.IGNORECASE)
indicators["commands"].extend(matches)
# Clean up and deduplicate
for key in indicators:
indicators[key] = list(set(indicators[key]))
return indicators
async def sync_cve_pocs(self, cve_id: str) -> dict:
"""Synchronize PoC data for a specific CVE"""
from main import CVE, SigmaRule
# Get existing CVE
cve = self.db_session.query(CVE).filter(CVE.cve_id == cve_id).first()
if not cve:
logger.warning(f"CVE {cve_id} not found in database")
return {"error": "CVE not found"}
# Fetch PoCs from nomi-sec API
pocs = await self.get_pocs_for_cve(cve_id)
if not pocs:
logger.info(f"No PoCs found for {cve_id}")
return {"cve_id": cve_id, "pocs_found": 0}
# Analyze and store PoC data
poc_data = []
github_repos = []
total_quality_score = 0
for poc in pocs:
quality_analysis = self.analyze_poc_quality(poc)
exploit_indicators = self.extract_exploit_indicators(poc)
poc_entry = {
"id": poc.get("id"),
"name": poc.get("name"),
"owner": poc.get("owner"),
"full_name": poc.get("full_name"),
"html_url": poc.get("html_url"),
"description": poc.get("description"),
"stargazers_count": int(poc.get("stargazers_count", "0")),
"created_at": poc.get("created_at"),
"updated_at": poc.get("updated_at"),
"quality_analysis": quality_analysis,
"exploit_indicators": exploit_indicators
}
poc_data.append(poc_entry)
github_repos.append(poc.get("html_url", ""))
total_quality_score += quality_analysis["quality_score"]
# Update CVE with PoC data
cve.poc_count = len(pocs)
cve.poc_data = poc_data
cve.updated_at = datetime.utcnow()
# Update or create SIGMA rule with enhanced PoC data
sigma_rule = self.db_session.query(SigmaRule).filter(
SigmaRule.cve_id == cve_id
).first()
if sigma_rule:
sigma_rule.poc_source = 'nomi_sec'
sigma_rule.poc_quality_score = total_quality_score // len(pocs) if pocs else 0
sigma_rule.nomi_sec_data = {
"total_pocs": len(pocs),
"average_quality": total_quality_score // len(pocs) if pocs else 0,
"best_poc": max(poc_data, key=lambda x: x["quality_analysis"]["quality_score"]) if poc_data else None,
"total_stars": sum(p["stargazers_count"] for p in poc_data)
}
sigma_rule.github_repos = github_repos
sigma_rule.updated_at = datetime.utcnow()
# Extract best exploit indicators
best_indicators = {}
for poc in poc_data:
for key, values in poc["exploit_indicators"].items():
if key not in best_indicators:
best_indicators[key] = []
best_indicators[key].extend(values)
# Deduplicate and store
for key in best_indicators:
best_indicators[key] = list(set(best_indicators[key]))
sigma_rule.exploit_indicators = json.dumps(best_indicators)
self.db_session.commit()
logger.info(f"Synchronized {len(pocs)} PoCs for {cve_id}")
return {
"cve_id": cve_id,
"pocs_found": len(pocs),
"total_quality_score": total_quality_score,
"average_quality": total_quality_score // len(pocs) if pocs else 0,
"github_repos": github_repos
}
async def bulk_sync_all_cves(self, batch_size: int = 100, cancellation_flag: Optional[callable] = None) -> dict:
"""Synchronize PoC data for all CVEs in database"""
from main import CVE, BulkProcessingJob
# Create bulk processing job
job = BulkProcessingJob(
job_type='nomi_sec_sync',
status='running',
started_at=datetime.utcnow(),
job_metadata={'batch_size': batch_size}
)
self.db_session.add(job)
self.db_session.commit()
total_processed = 0
total_found = 0
results = []
try:
# Get all CVEs from database
cves = self.db_session.query(CVE).all()
job.total_items = len(cves)
self.db_session.commit()
# Process in batches
for i in range(0, len(cves), batch_size):
# Check for cancellation before each batch
if cancellation_flag and cancellation_flag():
logger.info("Bulk sync cancelled by user")
job.status = 'cancelled'
job.cancelled_at = datetime.utcnow()
job.error_message = "Job cancelled by user"
break
batch = cves[i:i + batch_size]
for cve in batch:
# Check for cancellation before each CVE
if cancellation_flag and cancellation_flag():
logger.info("Bulk sync cancelled by user")
job.status = 'cancelled'
job.cancelled_at = datetime.utcnow()
job.error_message = "Job cancelled by user"
break
try:
result = await self.sync_cve_pocs(cve.cve_id)
total_processed += 1
if result.get("pocs_found", 0) > 0:
total_found += result["pocs_found"]
results.append(result)
job.processed_items += 1
# Small delay to avoid overwhelming the API
await asyncio.sleep(0.5)
except Exception as e:
logger.error(f"Error syncing PoCs for {cve.cve_id}: {e}")
job.failed_items += 1
# Break out of outer loop if cancelled
if job.status == 'cancelled':
break
# Commit after each batch
self.db_session.commit()
logger.info(f"Processed batch {i//batch_size + 1}/{(len(cves) + batch_size - 1)//batch_size}")
# Update job status (only if not cancelled)
if job.status != 'cancelled':
job.status = 'completed'
job.completed_at = datetime.utcnow()
# Reassign (rather than mutate in place) so SQLAlchemy detects the JSON change
job.job_metadata = {
**(job.job_metadata or {}),
'total_processed': total_processed,
'total_pocs_found': total_found,
'cves_with_pocs': len(results)
}
except Exception as e:
job.status = 'failed'
job.error_message = str(e)
job.completed_at = datetime.utcnow()
logger.error(f"Bulk PoC sync job failed: {e}")
finally:
self.db_session.commit()
return {
'job_id': str(job.id),
'status': job.status,
'total_processed': total_processed,
'total_pocs_found': total_found,
'cves_with_pocs': len(results)
}
async def get_sync_status(self) -> dict:
"""Get synchronization status"""
from main import CVE, SigmaRule
# Count CVEs with PoC data
total_cves = self.db_session.query(CVE).count()
cves_with_pocs = self.db_session.query(CVE).filter(CVE.poc_count > 0).count()
# Count SIGMA rules with nomi-sec data
total_rules = self.db_session.query(SigmaRule).count()
rules_with_nomi_sec = self.db_session.query(SigmaRule).filter(
SigmaRule.poc_source == 'nomi_sec'
).count()
return {
'total_cves': total_cves,
'cves_with_pocs': cves_with_pocs,
'poc_coverage': (cves_with_pocs / total_cves * 100) if total_cves > 0 else 0,
'total_rules': total_rules,
'rules_with_nomi_sec': rules_with_nomi_sec,
'nomi_sec_coverage': (rules_with_nomi_sec / total_rules * 100) if total_rules > 0 else 0
}
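A minimal sketch of exercising the quality scoring and indicator extraction helpers directly. The record below is an assumed shape based only on the fields this client reads; the live API may return more. The database session is not touched by these two methods, so None suffices here:

from nomi_sec_client import NomiSecClient

example_poc = {
    "id": "123456789",
    "cve_id": "CVE-2021-44228",
    "name": "CVE-2021-44228-poc",
    "owner": "example-user",
    "full_name": "example-user/CVE-2021-44228-poc",
    "html_url": "https://github.com/example-user/CVE-2021-44228-poc",
    "description": "PoC that spawns a reverse shell via cmd.exe and powershell.exe",
    "vuln_description": "Remote code execution via a crafted JNDI lookup string",
    "stargazers_count": "42",
    "created_at": "2021-12-10T12:00:00Z",
    "updated_at": "2024-01-15T08:30:00Z",
}

client = NomiSecClient(db_session=None)  # session is unused by these helpers
print(client.analyze_poc_quality(example_poc))         # quality_score, factors, quality_tier
print(client.extract_exploit_indicators(example_poc))  # processes, files, network, commands, ...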

backend/nvd_bulk_processor.py (new file, 483 lines)

@@ -0,0 +1,483 @@
"""
NVD JSON Dataset Bulk Processor
Downloads and processes NVD JSON data feeds for comprehensive CVE seeding
"""
import requests
import json
import gzip
import zipfile
import os
import logging
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Tuple
from sqlalchemy.orm import Session
from sqlalchemy import and_, or_
import asyncio
import aiohttp
from pathlib import Path
import hashlib
import time
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class NVDBulkProcessor:
"""Handles bulk downloading and processing of NVD JSON data feeds"""
def __init__(self, db_session: Session, data_dir: str = "./nvd_data"):
self.db_session = db_session
self.data_dir = Path(data_dir)
self.data_dir.mkdir(exist_ok=True)
self.api_key = os.getenv("NVD_API_KEY")
# NVD JSON 1.1 data feed URLs
self.base_url = "https://nvd.nist.gov/feeds/json/cve/1.1"
self.feed_urls = {
"modified": f"{self.base_url}/nvdcve-1.1-modified.json.gz",
"recent": f"{self.base_url}/nvdcve-1.1-recent.json.gz"
}
# Rate limiting
self.rate_limit_delay = 0.6 # 600ms between requests
self.last_request_time = 0
def get_year_feed_url(self, year: int) -> str:
"""Get the URL for a specific year's CVE feed"""
return f"{self.base_url}/nvdcve-1.1-{year}.json.gz"
def get_meta_url(self, feed_url: str) -> str:
"""Get the metadata URL for a feed"""
return feed_url.replace(".json.gz", ".meta")
async def download_file(self, session: aiohttp.ClientSession, url: str,
destination: Path, check_meta: bool = True) -> bool:
"""Download a file with metadata checking"""
try:
# Check if we should download based on metadata
if check_meta:
meta_url = self.get_meta_url(url)
should_download = await self._should_download_file(session, meta_url, destination)
if not should_download:
logger.info(f"Skipping {url} - file is up to date")
return True
# Rate limiting
current_time = time.time()
time_since_last = current_time - self.last_request_time
if time_since_last < self.rate_limit_delay:
await asyncio.sleep(self.rate_limit_delay - time_since_last)
# Download the file
headers = {}
if self.api_key:
headers["apiKey"] = self.api_key
async with session.get(url, headers=headers, timeout=30) as response:
if response.status == 200:
content = await response.read()
destination.write_bytes(content)
logger.info(f"Downloaded {url} -> {destination}")
self.last_request_time = time.time()
return True
else:
logger.error(f"Failed to download {url}: HTTP {response.status}")
return False
except Exception as e:
logger.error(f"Error downloading {url}: {e}")
return False
async def _should_download_file(self, session: aiohttp.ClientSession,
meta_url: str, destination: Path) -> bool:
"""Check if file should be downloaded based on metadata"""
try:
# Download metadata
async with session.get(meta_url, timeout=10) as response:
if response.status != 200:
return True # Download if we can't get metadata
meta_content = await response.text()
# Parse metadata
meta_data = {}
for line in meta_content.strip().split('\n'):
if ':' in line:
key, value = line.split(':', 1)
meta_data[key.strip()] = value.strip()
# Check if local file exists and matches
if destination.exists():
local_size = destination.stat().st_size
remote_size = int(meta_data.get('size', 0))
remote_sha256 = meta_data.get('sha256', '')
if local_size == remote_size and remote_sha256:
# Verify SHA256 if available
local_sha256 = self._calculate_sha256(destination)
if local_sha256 == remote_sha256:
return False # File is up to date
return True # Download needed
except Exception as e:
logger.warning(f"Error checking metadata for {meta_url}: {e}")
return True # Download if metadata check fails
def _calculate_sha256(self, file_path: Path) -> str:
"""Calculate SHA256 hash of a file"""
sha256_hash = hashlib.sha256()
with open(file_path, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
sha256_hash.update(chunk)
return sha256_hash.hexdigest()
async def download_all_feeds(self, start_year: int = 2002,
end_year: Optional[int] = None) -> List[Path]:
"""Download all NVD JSON feeds"""
if end_year is None:
end_year = datetime.now().year
downloaded_files = []
async with aiohttp.ClientSession() as session:
# Download year-based feeds
for year in range(start_year, end_year + 1):
url = self.get_year_feed_url(year)
filename = f"nvdcve-1.1-{year}.json.gz"
destination = self.data_dir / filename
if await self.download_file(session, url, destination):
downloaded_files.append(destination)
# Download modified and recent feeds
for feed_name, url in self.feed_urls.items():
filename = f"nvdcve-1.1-{feed_name}.json.gz"
destination = self.data_dir / filename
if await self.download_file(session, url, destination):
downloaded_files.append(destination)
return downloaded_files
def extract_json_file(self, compressed_file: Path) -> Path:
"""Extract JSON from compressed file"""
json_file = compressed_file.with_suffix('.json')
try:
if compressed_file.suffix == '.gz':
with gzip.open(compressed_file, 'rt', encoding='utf-8') as f_in:
with open(json_file, 'w', encoding='utf-8') as f_out:
f_out.write(f_in.read())
elif compressed_file.suffix == '.zip':
with zipfile.ZipFile(compressed_file, 'r') as zip_ref:
zip_ref.extractall(self.data_dir)
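                # Note: the .zip branch extracts everything into data_dir and assumes the
                # archive contains a JSON file matching the name derived above
                # (the .zip filename with a .json suffix).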
else:
# File is already uncompressed
return compressed_file
logger.info(f"Extracted {compressed_file} -> {json_file}")
return json_file
except Exception as e:
logger.error(f"Error extracting {compressed_file}: {e}")
raise
def process_json_file(self, json_file: Path) -> Tuple[int, int]:
"""Process a single JSON file and return (processed, failed) counts"""
from main import CVE, BulkProcessingJob
processed_count = 0
failed_count = 0
try:
with open(json_file, 'r', encoding='utf-8') as f:
data = json.load(f)
cve_items = data.get('CVE_Items', [])
logger.info(f"Processing {len(cve_items)} CVEs from {json_file}")
for cve_item in cve_items:
try:
cve_data = self._extract_cve_data(cve_item)
if cve_data:
self._store_cve_data(cve_data)
processed_count += 1
else:
failed_count += 1
except Exception as e:
logger.error(f"Error processing CVE item: {e}")
failed_count += 1
# Commit changes
self.db_session.commit()
logger.info(f"Processed {processed_count} CVEs, failed: {failed_count}")
except Exception as e:
logger.error(f"Error processing {json_file}: {e}")
self.db_session.rollback()
raise
return processed_count, failed_count
def _extract_cve_data(self, cve_item: dict) -> Optional[dict]:
"""Extract CVE data from JSON item"""
try:
cve = cve_item.get('cve', {})
impact = cve_item.get('impact', {})
cve_id = cve.get('CVE_data_meta', {}).get('ID', '')
if not cve_id:
return None
# Description
description_data = cve.get('description', {}).get('description_data', [])
description = ''
if description_data:
description = description_data[0].get('value', '')
# CVSS Score
cvss_score = None
severity = None
if 'baseMetricV3' in impact:
cvss_v3 = impact['baseMetricV3'].get('cvssV3', {})
cvss_score = cvss_v3.get('baseScore')
severity = cvss_v3.get('baseSeverity', '').lower()
elif 'baseMetricV2' in impact:
cvss_v2 = impact['baseMetricV2'].get('cvssV2', {})
cvss_score = cvss_v2.get('baseScore')
severity = impact['baseMetricV2'].get('severity', '').lower()
# Dates
published_date = None
modified_date = None
if 'publishedDate' in cve_item:
published_date = datetime.fromisoformat(
cve_item['publishedDate'].replace('Z', '+00:00')
)
if 'lastModifiedDate' in cve_item:
modified_date = datetime.fromisoformat(
cve_item['lastModifiedDate'].replace('Z', '+00:00')
)
# Affected products (from CPE data)
affected_products = []
configurations = cve_item.get('configurations', {})
for node in configurations.get('nodes', []):
for cpe_match in node.get('cpe_match', []):
if cpe_match.get('vulnerable', False):
cpe_uri = cpe_match.get('cpe23Uri', '')
if cpe_uri:
affected_products.append(cpe_uri)
# Reference URLs
reference_urls = []
references = cve.get('references', {}).get('reference_data', [])
for ref in references:
url = ref.get('url', '')
if url:
reference_urls.append(url)
return {
'cve_id': cve_id,
'description': description,
'cvss_score': cvss_score,
'severity': severity,
'published_date': published_date,
'modified_date': modified_date,
'affected_products': affected_products,
'reference_urls': reference_urls,
'data_source': 'nvd_bulk',
'nvd_json_version': '1.1',
'bulk_processed': True
}
except Exception as e:
logger.error(f"Error extracting CVE data: {e}")
return None
def _store_cve_data(self, cve_data: dict):
"""Store CVE data in database"""
from main import CVE
# Check if CVE already exists
existing_cve = self.db_session.query(CVE).filter(
CVE.cve_id == cve_data['cve_id']
).first()
if existing_cve:
# Update existing CVE
for key, value in cve_data.items():
setattr(existing_cve, key, value)
existing_cve.updated_at = datetime.utcnow()
logger.debug(f"Updated CVE {cve_data['cve_id']}")
else:
# Create new CVE
new_cve = CVE(**cve_data)
self.db_session.add(new_cve)
logger.debug(f"Created new CVE {cve_data['cve_id']}")
async def bulk_seed_database(self, start_year: int = 2002,
end_year: Optional[int] = None) -> dict:
"""Perform complete bulk seeding of the database"""
from main import BulkProcessingJob
if end_year is None:
end_year = datetime.now().year
# Create bulk processing job
job = BulkProcessingJob(
job_type='nvd_bulk_seed',
status='running',
started_at=datetime.utcnow(),
job_metadata={
'start_year': start_year,
'end_year': end_year,
'total_years': end_year - start_year + 1
}
)
self.db_session.add(job)
self.db_session.commit()
total_processed = 0
total_failed = 0
results = []
try:
# Download all feeds
logger.info(f"Starting bulk seed from {start_year} to {end_year}")
downloaded_files = await self.download_all_feeds(start_year, end_year)
job.total_items = len(downloaded_files)
self.db_session.commit()
# Process each file
for file_path in downloaded_files:
try:
# Extract JSON file
json_file = self.extract_json_file(file_path)
# Process the JSON file
processed, failed = self.process_json_file(json_file)
total_processed += processed
total_failed += failed
job.processed_items += 1
results.append({
'file': file_path.name,
'processed': processed,
'failed': failed
})
# Clean up extracted file if it's different from original
if json_file != file_path:
json_file.unlink()
self.db_session.commit()
except Exception as e:
logger.error(f"Error processing {file_path}: {e}")
job.failed_items += 1
total_failed += 1
self.db_session.commit()
# Update job status
job.status = 'completed'
job.completed_at = datetime.utcnow()
job.job_metadata.update({
'total_processed': total_processed,
'total_failed': total_failed,
'results': results
})
except Exception as e:
job.status = 'failed'
job.error_message = str(e)
job.completed_at = datetime.utcnow()
logger.error(f"Bulk seed job failed: {e}")
finally:
self.db_session.commit()
return {
'job_id': str(job.id),
'status': job.status,
'total_processed': total_processed,
'total_failed': total_failed,
'results': results
}
async def incremental_update(self) -> dict:
"""Perform incremental update using modified and recent feeds"""
from main import BulkProcessingJob
# Create incremental update job
job = BulkProcessingJob(
job_type='incremental_update',
status='running',
started_at=datetime.utcnow(),
job_metadata={'feeds': ['modified', 'recent']}
)
self.db_session.add(job)
self.db_session.commit()
total_processed = 0
total_failed = 0
results = []
try:
# Download modified and recent feeds
async with aiohttp.ClientSession() as session:
for feed_name, url in self.feed_urls.items():
filename = f"nvdcve-1.1-{feed_name}.json.gz"
destination = self.data_dir / filename
if await self.download_file(session, url, destination):
try:
json_file = self.extract_json_file(destination)
processed, failed = self.process_json_file(json_file)
total_processed += processed
total_failed += failed
results.append({
'feed': feed_name,
'processed': processed,
'failed': failed
})
# Clean up
if json_file != destination:
json_file.unlink()
except Exception as e:
logger.error(f"Error processing {feed_name} feed: {e}")
total_failed += 1
job.status = 'completed'
job.completed_at = datetime.utcnow()
job.job_metadata.update({
'total_processed': total_processed,
'total_failed': total_failed,
'results': results
})
except Exception as e:
job.status = 'failed'
job.error_message = str(e)
job.completed_at = datetime.utcnow()
logger.error(f"Incremental update job failed: {e}")
finally:
self.db_session.commit()
return {
'job_id': str(job.id),
'status': job.status,
'total_processed': total_processed,
'total_failed': total_failed,
'results': results
}
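# Minimal usage sketch (illustrative only, not part of this module): the import paths,
# class name and session factory below are assumptions -- adjust them to whatever this
# project actually exposes.
#
#   import asyncio
#   from main import SessionLocal                      # assumed SQLAlchemy session factory
#   from nvd_bulk_processor import NVDBulkProcessor    # assumed name of the class above
#
#   async def run_once():
#       db = SessionLocal()
#       try:
#           processor = NVDBulkProcessor(db)
#           result = await processor.bulk_seed_database(start_year=2020)
#           print(result['status'], result['total_processed'], result['total_failed'])
#       finally:
#           db.close()
#
#   asyncio.run(run_once())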

View file

@@ -12,3 +12,5 @@ pygithub==2.1.1
gitpython==3.1.40
beautifulsoup4==4.12.2
lxml==4.9.3
aiohttp==3.9.1
aiofiles

View file

@@ -1,5 +1,3 @@
version: '3.8'
services:
  db:
    image: postgres:15
@@ -25,6 +23,7 @@ services:
    environment:
      DATABASE_URL: postgresql://cve_user:cve_password@db:5432/cve_sigma_db
      NVD_API_KEY: ${NVD_API_KEY:-}
      GITHUB_TOKEN: ${GITHUB_TOKEN}
    depends_on:
      db:
        condition: service_healthy

View file

@@ -15,6 +15,10 @@ function App() {
  const [activeTab, setActiveTab] = useState('dashboard');
  const [fetchingCves, setFetchingCves] = useState(false);
  const [testResult, setTestResult] = useState(null);
const [bulkJobs, setBulkJobs] = useState([]);
const [bulkStatus, setBulkStatus] = useState({});
const [pocStats, setPocStats] = useState({});
const [bulkProcessing, setBulkProcessing] = useState(false);
  useEffect(() => {
    fetchData();
@@ -23,15 +27,21 @@ function App() {
  const fetchData = async () => {
    try {
      setLoading(true);
      const [cvesRes, rulesRes, statsRes] = await Promise.all([
      const [cvesRes, rulesRes, statsRes, bulkJobsRes, bulkStatusRes, pocStatsRes] = await Promise.all([
        axios.get(`${API_BASE_URL}/api/cves`),
        axios.get(`${API_BASE_URL}/api/sigma-rules`),
        axios.get(`${API_BASE_URL}/api/stats`)
        axios.get(`${API_BASE_URL}/api/stats`),
        axios.get(`${API_BASE_URL}/api/bulk-jobs`),
        axios.get(`${API_BASE_URL}/api/bulk-status`),
        axios.get(`${API_BASE_URL}/api/poc-stats`)
      ]);
      setCves(cvesRes.data);
      setSigmaRules(rulesRes.data);
      setStats(statsRes.data);
      setBulkJobs(bulkJobsRes.data);
      setBulkStatus(bulkStatusRes.data);
      setPocStats(pocStatsRes.data);
    } catch (error) {
      console.error('Error fetching data:', error);
    } finally {
@@ -39,6 +49,20 @@ function App() {
    }
  };
const cancelJob = async (jobId) => {
try {
const response = await axios.post(`${API_BASE_URL}/api/cancel-job/${jobId}`);
console.log('Cancel job response:', response.data);
// Refresh data after cancelling
setTimeout(() => {
fetchData();
}, 1000);
} catch (error) {
console.error('Error cancelling job:', error);
alert('Failed to cancel job. Please try again.');
}
};
  const handleFetchCves = async () => {
    try {
      setFetchingCves(true);
@@ -73,6 +97,73 @@ function App() {
    }
  };
const startBulkSeed = async (startYear = 2020, endYear = null) => {
try {
setBulkProcessing(true);
const response = await axios.post(`${API_BASE_URL}/api/bulk-seed`, {
start_year: startYear,
end_year: endYear
});
console.log('Bulk seed response:', response.data);
// Refresh data after starting
setTimeout(() => {
fetchData();
}, 2000);
} catch (error) {
console.error('Error starting bulk seed:', error);
setBulkProcessing(false);
}
};
const startIncrementalUpdate = async () => {
try {
setBulkProcessing(true);
const response = await axios.post(`${API_BASE_URL}/api/incremental-update`);
console.log('Incremental update response:', response.data);
setTimeout(() => {
fetchData();
setBulkProcessing(false);
}, 2000);
} catch (error) {
console.error('Error starting incremental update:', error);
setBulkProcessing(false);
}
};
const syncNomiSec = async (cveId = null) => {
try {
setBulkProcessing(true);
const response = await axios.post(`${API_BASE_URL}/api/sync-nomi-sec`, {
cve_id: cveId
});
console.log('Nomi-sec sync response:', response.data);
setTimeout(() => {
fetchData();
setBulkProcessing(false);
}, 2000);
} catch (error) {
console.error('Error syncing nomi-sec:', error);
setBulkProcessing(false);
}
};
const regenerateRules = async (force = false) => {
try {
setBulkProcessing(true);
const response = await axios.post(`${API_BASE_URL}/api/regenerate-rules`, {
force: force
});
console.log('Rule regeneration response:', response.data);
setTimeout(() => {
fetchData();
setBulkProcessing(false);
}, 2000);
} catch (error) {
console.error('Error regenerating rules:', error);
setBulkProcessing(false);
}
};
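  // Each bulk action above follows the same pattern: POST to its backend endpoint, then
  // re-fetch dashboard data shortly afterwards; the heavy work continues server-side and
  // can be monitored from the Bulk Jobs tab.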
  const getSeverityColor = (severity) => {
    switch (severity?.toLowerCase()) {
      case 'critical': return 'bg-red-100 text-red-800';
@@ -93,18 +184,81 @@ function App() {
  const Dashboard = () => (
    <div className="space-y-6">
      <div className="grid grid-cols-1 md:grid-cols-3 gap-6">
      <div className="grid grid-cols-1 md:grid-cols-5 gap-6">
        <div className="bg-white p-6 rounded-lg shadow">
          <h3 className="text-lg font-medium text-gray-900">Total CVEs</h3>
          <p className="text-3xl font-bold text-blue-600">{stats.total_cves || 0}</p>
<p className="text-sm text-gray-500">Bulk: {stats.bulk_processed_cves || 0}</p>
        </div>
        <div className="bg-white p-6 rounded-lg shadow">
          <h3 className="text-lg font-medium text-gray-900">SIGMA Rules</h3>
          <p className="text-3xl font-bold text-green-600">{stats.total_sigma_rules || 0}</p>
<p className="text-sm text-gray-500">Nomi-sec: {stats.nomi_sec_rules || 0}</p>
</div>
<div className="bg-white p-6 rounded-lg shadow">
<h3 className="text-lg font-medium text-gray-900">CVEs with PoCs</h3>
<p className="text-3xl font-bold text-purple-600">{stats.cves_with_pocs || 0}</p>
<p className="text-sm text-gray-500">{(stats.poc_coverage || 0).toFixed(1)}% coverage</p>
        </div>
        <div className="bg-white p-6 rounded-lg shadow">
          <h3 className="text-lg font-medium text-gray-900">Recent CVEs (7d)</h3>
          <p className="text-3xl font-bold text-purple-600">{stats.recent_cves_7_days || 0}</p>
          <p className="text-3xl font-bold text-orange-600">{stats.recent_cves_7_days || 0}</p>
</div>
<div className="bg-white p-6 rounded-lg shadow">
<h3 className="text-lg font-medium text-gray-900">High Quality PoCs</h3>
<p className="text-3xl font-bold text-indigo-600">{pocStats.high_quality_cves || 0}</p>
<p className="text-sm text-gray-500">Avg: {(pocStats.avg_poc_count || 0).toFixed(1)}</p>
</div>
</div>
{/* Bulk Processing Controls */}
<div className="bg-white rounded-lg shadow p-6">
<h2 className="text-xl font-bold text-gray-900 mb-4">Bulk Processing</h2>
<div className="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-4 gap-4">
<button
onClick={() => startBulkSeed(2020)}
disabled={bulkProcessing}
className={`px-4 py-2 rounded-md text-white ${
bulkProcessing
? 'bg-gray-400 cursor-not-allowed'
: 'bg-blue-600 hover:bg-blue-700'
}`}
>
{bulkProcessing ? 'Processing...' : 'Bulk Seed (2020+)'}
</button>
<button
onClick={startIncrementalUpdate}
disabled={bulkProcessing}
className={`px-4 py-2 rounded-md text-white ${
bulkProcessing
? 'bg-gray-400 cursor-not-allowed'
: 'bg-green-600 hover:bg-green-700'
}`}
>
{bulkProcessing ? 'Processing...' : 'Incremental Update'}
</button>
<button
onClick={() => syncNomiSec()}
disabled={bulkProcessing}
className={`px-4 py-2 rounded-md text-white ${
bulkProcessing
? 'bg-gray-400 cursor-not-allowed'
: 'bg-purple-600 hover:bg-purple-700'
}`}
>
{bulkProcessing ? 'Processing...' : 'Sync nomi-sec PoCs'}
</button>
<button
onClick={() => regenerateRules()}
disabled={bulkProcessing}
className={`px-4 py-2 rounded-md text-white ${
bulkProcessing
? 'bg-gray-400 cursor-not-allowed'
: 'bg-indigo-600 hover:bg-indigo-700'
}`}
>
{bulkProcessing ? 'Processing...' : 'Regenerate Rules'}
</button>
      </div>
    </div>
@@ -522,6 +676,178 @@ function App() {
    );
  };
const BulkJobsList = () => (
<div className="space-y-6">
<div className="flex justify-between items-center">
<h1 className="text-2xl font-bold text-gray-900">Bulk Processing Jobs</h1>
<button
onClick={fetchData}
className="bg-blue-600 hover:bg-blue-700 text-white px-4 py-2 rounded-md text-sm"
>
Refresh
</button>
</div>
{/* Bulk Status Overview */}
<div className="bg-white rounded-lg shadow p-6">
<h2 className="text-lg font-bold text-gray-900 mb-4">System Status</h2>
{bulkStatus.database_stats && (
<div className="grid grid-cols-2 md:grid-cols-4 gap-4">
<div className="text-center">
<div className="text-2xl font-bold text-blue-600">{bulkStatus.database_stats.total_cves}</div>
<div className="text-sm text-gray-500">Total CVEs</div>
</div>
<div className="text-center">
<div className="text-2xl font-bold text-green-600">{bulkStatus.database_stats.bulk_processed_cves}</div>
<div className="text-sm text-gray-500">Bulk Processed</div>
</div>
<div className="text-center">
<div className="text-2xl font-bold text-purple-600">{bulkStatus.database_stats.cves_with_pocs}</div>
<div className="text-sm text-gray-500">With PoCs</div>
</div>
<div className="text-center">
<div className="text-2xl font-bold text-indigo-600">{bulkStatus.database_stats.nomi_sec_rules}</div>
<div className="text-sm text-gray-500">Enhanced Rules</div>
</div>
</div>
)}
</div>
{/* Running Jobs */}
{bulkJobs.some(job => job.status === 'running' || job.status === 'pending') && (
<div className="bg-white rounded-lg shadow">
<div className="px-6 py-4 border-b border-gray-200">
<h2 className="text-lg font-bold text-gray-900">Running Jobs</h2>
</div>
<div className="divide-y divide-gray-200">
{bulkJobs
.filter(job => job.status === 'running' || job.status === 'pending')
.map((job) => (
<div key={job.id} className="px-6 py-4 bg-blue-50">
<div className="flex items-center justify-between">
<div className="flex-1">
<div className="flex items-center space-x-3">
<h3 className="text-lg font-medium text-gray-900">{job.job_type}</h3>
<span className={`inline-flex px-2 py-1 text-xs font-semibold rounded-full ${
job.status === 'running' ? 'bg-blue-100 text-blue-800' :
'bg-gray-100 text-gray-800'
}`}>
{job.status}
</span>
</div>
<div className="mt-2 flex items-center space-x-6 text-sm text-gray-500">
<span>Started: {formatDate(job.started_at)}</span>
{job.year && <span>Year: {job.year}</span>}
</div>
{job.total_items > 0 && (
<div className="mt-2">
<div className="flex items-center space-x-4 text-sm text-gray-600">
<span>Progress: {job.processed_items}/{job.total_items}</span>
{job.failed_items > 0 && (
<span className="text-red-600">Failed: {job.failed_items}</span>
)}
</div>
<div className="mt-1 w-full bg-gray-200 rounded-full h-2">
<div
className="bg-blue-600 h-2 rounded-full"
style={{ width: `${(job.processed_items / job.total_items) * 100}%` }}
></div>
</div>
</div>
)}
</div>
<div className="flex-shrink-0 ml-4">
<button
onClick={() => cancelJob(job.id)}
className="bg-red-600 hover:bg-red-700 text-white px-3 py-1 rounded-md text-sm font-medium"
>
Cancel
</button>
</div>
</div>
</div>
))}
</div>
</div>
)}
{/* Recent Jobs */}
<div className="bg-white rounded-lg shadow">
<div className="px-6 py-4 border-b border-gray-200">
<h2 className="text-lg font-bold text-gray-900">Recent Jobs</h2>
</div>
<div className="divide-y divide-gray-200">
{bulkJobs.length === 0 ? (
<div className="px-6 py-8 text-center text-gray-500">
No bulk processing jobs found
</div>
) : (
bulkJobs.map((job) => (
<div key={job.id} className="px-6 py-4">
<div className="flex items-center justify-between">
<div className="flex-1">
<div className="flex items-center space-x-3">
<h3 className="text-lg font-medium text-gray-900">{job.job_type}</h3>
<span className={`inline-flex px-2 py-1 text-xs font-semibold rounded-full ${
job.status === 'completed' ? 'bg-green-100 text-green-800' :
job.status === 'running' ? 'bg-blue-100 text-blue-800' :
job.status === 'failed' ? 'bg-red-100 text-red-800' :
job.status === 'cancelled' ? 'bg-orange-100 text-orange-800' :
'bg-gray-100 text-gray-800'
}`}>
{job.status}
</span>
</div>
<div className="mt-2 flex items-center space-x-6 text-sm text-gray-500">
<span>Started: {formatDate(job.started_at)}</span>
{job.completed_at && (
<span>Completed: {formatDate(job.completed_at)}</span>
)}
{job.year && (
<span>Year: {job.year}</span>
)}
</div>
{job.total_items > 0 && (
<div className="mt-2">
<div className="flex items-center space-x-4 text-sm text-gray-600">
<span>Progress: {job.processed_items}/{job.total_items}</span>
{job.failed_items > 0 && (
<span className="text-red-600">Failed: {job.failed_items}</span>
)}
</div>
<div className="mt-1 w-full bg-gray-200 rounded-full h-2">
<div
className="bg-blue-600 h-2 rounded-full"
style={{ width: `${(job.processed_items / job.total_items) * 100}%` }}
></div>
</div>
</div>
)}
{job.error_message && (
<div className="mt-2 p-2 bg-red-50 border border-red-200 rounded text-sm text-red-700">
{job.error_message}
</div>
)}
</div>
<div className="flex-shrink-0 ml-4">
{(job.status === 'running' || job.status === 'pending') && (
<button
onClick={() => cancelJob(job.id)}
className="bg-red-600 hover:bg-red-700 text-white px-3 py-1 rounded-md text-sm font-medium"
>
Cancel
</button>
)}
</div>
</div>
</div>
))
)}
</div>
</div>
</div>
);
  if (loading) {
    return (
      <div className="min-h-screen bg-gray-100 flex items-center justify-center">
@@ -573,6 +899,16 @@ function App() {
              >
                SIGMA Rules
              </button>
<button
onClick={() => setActiveTab('bulk-jobs')}
className={`inline-flex items-center px-1 pt-1 border-b-2 text-sm font-medium ${
activeTab === 'bulk-jobs'
? 'border-blue-500 text-gray-900'
: 'border-transparent text-gray-500 hover:text-gray-700 hover:border-gray-300'
}`}
>
Bulk Jobs
</button>
            </div>
          </div>
        </div>
@@ -584,6 +920,7 @@ function App() {
          {activeTab === 'dashboard' && <Dashboard />}
          {activeTab === 'cves' && <CVEList />}
          {activeTab === 'rules' && <SigmaRulesList />}
          {activeTab === 'bulk-jobs' && <BulkJobsList />}
        </div>
      </main>

View file

@@ -13,6 +13,13 @@ CREATE TABLE cves (
    modified_date TIMESTAMP,
    affected_products TEXT[],
    reference_urls TEXT[],
-- Bulk processing fields
data_source VARCHAR(20) DEFAULT 'nvd_api',
nvd_json_version VARCHAR(10) DEFAULT '2.0',
bulk_processed BOOLEAN DEFAULT FALSE,
-- nomi-sec PoC fields
poc_count INTEGER DEFAULT 0,
poc_data JSON,
    created_at TIMESTAMP DEFAULT NOW(),
    updated_at TIMESTAMP DEFAULT NOW()
);
@@ -30,6 +37,10 @@ CREATE TABLE sigma_rules (
    exploit_based BOOLEAN DEFAULT FALSE,
    github_repos TEXT[],
    exploit_indicators TEXT,
-- Enhanced fields for new data sources
poc_source VARCHAR(20) DEFAULT 'github_search',
poc_quality_score INTEGER DEFAULT 0,
nomi_sec_data JSON,
    created_at TIMESTAMP DEFAULT NOW(),
    updated_at TIMESTAMP DEFAULT NOW()
);
@@ -44,6 +55,23 @@ CREATE TABLE rule_templates (
    created_at TIMESTAMP DEFAULT NOW()
);
-- Bulk processing jobs table
CREATE TABLE bulk_processing_jobs (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
job_type VARCHAR(50) NOT NULL,
status VARCHAR(20) DEFAULT 'pending',
year INTEGER,
total_items INTEGER DEFAULT 0,
processed_items INTEGER DEFAULT 0,
failed_items INTEGER DEFAULT 0,
error_message TEXT,
job_metadata JSON,
started_at TIMESTAMP,
completed_at TIMESTAMP,
cancelled_at TIMESTAMP,
created_at TIMESTAMP DEFAULT NOW()
);
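-- Illustrative query (not part of the migration): inspect recent bulk job progress.
-- SELECT job_type, status, processed_items, total_items, failed_items
-- FROM bulk_processing_jobs ORDER BY created_at DESC LIMIT 10;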
-- Insert some basic rule templates
INSERT INTO rule_templates (template_name, template_content, applicable_product_patterns, description) VALUES
(

0 start.sh Normal file → Executable file
View file