- Extract database models from monolithic main.py (2,373 lines) into organized modules
- Implement service layer pattern with dedicated business logic classes
- Split API endpoints into modular FastAPI routers by functionality
- Add centralized configuration management with environment variable handling
- Create proper separation of concerns across data, service, and presentation layers

**Architecture Changes:**
- models/: SQLAlchemy database models (CVE, SigmaRule, RuleTemplate, BulkProcessingJob)
- config/: Centralized settings and database configuration
- services/: Business logic (CVEService, SigmaRuleService, GitHubExploitAnalyzer)
- routers/: Modular API endpoints (cves, sigma_rules, bulk_operations, llm_operations)
- schemas/: Pydantic request/response models

**Key Improvements:**
- 95% reduction in main.py size (2,373 → 120 lines)
- Updated 15+ backend files with proper import structure
- Eliminated circular dependencies and tight coupling
- Enhanced testability with isolated service components
- Better code organization for team collaboration

**Backward Compatibility:**
- All API endpoints maintain the same URLs and behavior
- Zero breaking changes to existing functionality
- Database schema unchanged
- Environment variables preserved

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
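For illustration, here is a minimal sketch of what the slimmed-down main.py could look like after this refactor. The router module names (cves, sigma_rules, bulk_operations, llm_operations) come from the commit message above; the app title and `router` attribute names are assumptions, not taken from the actual codebase:

```python
# Hypothetical sketch of the post-refactor main.py; names marked below are assumptions.
from fastapi import FastAPI

from routers import bulk_operations, cves, llm_operations, sigma_rules

app = FastAPI(title="CVE-SIGMA Auto Generator")  # title is an assumption

# Each functional area lives in its own APIRouter module, so endpoint URLs
# and behavior stay exactly as they were in the monolithic main.py.
app.include_router(cves.router)
app.include_router(sigma_rules.router)
app.include_router(bulk_operations.router)
app.include_router(llm_operations.router)
```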
import os
import re
import sys
from typing import List, Optional

from github import Github

# Make the backend package root importable when this module is loaded
# directly from the services/ directory.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from config.settings import settings


class GitHubExploitAnalyzer:
    """Service for analyzing GitHub repositories for exploit code"""

    def __init__(self):
        self.github_token = settings.GITHUB_TOKEN
        self.github = Github(self.github_token) if self.github_token else None

    async def search_exploits_for_cve(self, cve_id: str) -> List[dict]:
        """Search GitHub for exploit code related to a CVE"""
        if not self.github:
            print(f"No GitHub token configured, skipping exploit search for {cve_id}")
            return []

        try:
            print(f"Searching GitHub for exploits for {cve_id}")

            # Search queries to find exploit code
            search_queries = [
                f"{cve_id} exploit",
                f"{cve_id} poc",
                f"{cve_id} vulnerability",
                f'"{cve_id}" exploit code',
                f"{cve_id.replace('-', '_')} exploit"
            ]

            exploits = []
            seen_repos = set()

            for query in search_queries[:2]:  # Limit to 2 queries to avoid rate limits
                try:
                    # Search repositories
                    repos = self.github.search_repositories(
                        query=query,
                        sort="updated",
                        order="desc"
                    )

                    # Get top 5 results per query
                    for repo in repos[:5]:
                        if repo.full_name in seen_repos:
                            continue
                        seen_repos.add(repo.full_name)

                        # Analyze repository
                        exploit_info = await self._analyze_repository(repo, cve_id)
                        if exploit_info:
                            exploits.append(exploit_info)

                        if len(exploits) >= settings.MAX_GITHUB_RESULTS:
                            break

                    if len(exploits) >= settings.MAX_GITHUB_RESULTS:
                        break

                except Exception as e:
                    print(f"Error searching GitHub with query '{query}': {str(e)}")
                    continue

            print(f"Found {len(exploits)} potential exploits for {cve_id}")
            return exploits

        except Exception as e:
            print(f"Error searching GitHub for {cve_id}: {str(e)}")
            return []

    async def _analyze_repository(self, repo, cve_id: str) -> Optional[dict]:
        """Analyze a GitHub repository for exploit code"""
        try:
            # Check if repo name or description mentions the CVE
            repo_text = f"{repo.name} {repo.description or ''}".lower()
            if cve_id.lower() not in repo_text and cve_id.replace('-', '_').lower() not in repo_text:
                return None

            # Get repository contents
            exploit_files = []
            indicators = {
                'processes': set(),
                'files': set(),
                'registry': set(),
                'network': set(),
                'commands': set(),
                'powershell': set(),
                'urls': set()
            }

            try:
                contents = repo.get_contents("")
                for content in contents[:20]:  # Limit files to analyze
                    if content.type == "file" and self._is_exploit_file(content.name):
                        file_analysis = await self._analyze_file_content(repo, content, cve_id)
                        if file_analysis:
                            exploit_files.append(file_analysis)
                            # Merge indicators
                            for key, values in file_analysis.get('indicators', {}).items():
                                if key in indicators:
                                    indicators[key].update(values)

            except Exception as e:
                print(f"Error analyzing repo contents for {repo.full_name}: {str(e)}")

            if not exploit_files:
                return None

            return {
                'repo_name': repo.full_name,
                'repo_url': repo.html_url,
                'description': repo.description,
                'language': repo.language,
                'stars': repo.stargazers_count,
                'updated': repo.updated_at.isoformat(),
                'files': exploit_files,
                'indicators': {k: list(v) for k, v in indicators.items()}
            }

        except Exception as e:
            print(f"Error analyzing repository {repo.full_name}: {str(e)}")
            return None

    def _is_exploit_file(self, filename: str) -> bool:
        """Check if a file is likely to contain exploit code"""
        exploit_extensions = ['.py', '.ps1', '.sh', '.c', '.cpp', '.js', '.rb', '.pl', '.php', '.java']
        exploit_names = ['exploit', 'poc', 'payload', 'shell', 'reverse', 'bind', 'attack']

        filename_lower = filename.lower()

        # Check extension
        if not any(filename_lower.endswith(ext) for ext in exploit_extensions):
            return False

        # Check filename for exploit-related terms
        return any(term in filename_lower for term in exploit_names) or 'cve' in filename_lower

    async def _analyze_file_content(self, repo, file_content, cve_id: str) -> Optional[dict]:
        """Analyze individual file content for exploit indicators"""
        try:
            if file_content.size > 100000:  # Skip files larger than 100KB
                return None

            # Decode file content
            content = file_content.decoded_content.decode('utf-8', errors='ignore')

            # Check if file actually mentions the CVE
            if cve_id.lower() not in content.lower() and cve_id.replace('-', '_').lower() not in content.lower():
                return None

            indicators = self._extract_indicators_from_code(content, file_content.name)

            if not any(indicators.values()):
                return None

            return {
                'filename': file_content.name,
                'path': file_content.path,
                'size': file_content.size,
                'indicators': indicators
            }

        except Exception as e:
            print(f"Error analyzing file {file_content.name}: {str(e)}")
            return None

    def _extract_indicators_from_code(self, content: str, filename: str) -> dict:
        """Extract security indicators from exploit code"""
        indicators = {
            'processes': set(),
            'files': set(),
            'registry': set(),
            'network': set(),
            'commands': set(),
            'powershell': set(),
            'urls': set()
        }

        # Process patterns
        process_patterns = [
            r'CreateProcess[AW]?\s*\(\s*["\']([^"\']+)["\']',
            r'ShellExecute[AW]?\s*\([^,]*,\s*["\']([^"\']+)["\']',
            r'system\s*\(\s*["\']([^"\']+)["\']',
            r'exec\s*\(\s*["\']([^"\']+)["\']',
            r'subprocess\.(?:call|run|Popen)\s*\(\s*["\']([^"\']+)["\']'
        ]

        # File patterns
        file_patterns = [
            r'(?:fopen|CreateFile|WriteFile|ReadFile)\s*\(\s*["\']([^"\']+\.[a-zA-Z0-9]+)["\']',
            r'(?:copy|move|del|rm)\s+["\']?([^\s"\']+\.[a-zA-Z0-9]+)["\']?',
            r'\\\\[^\\]+\\[^\\]+\\([^\\]+\.[a-zA-Z0-9]+)',
            r'[C-Z]:\\\\[^\\]+\\\\([^\\]+\.[a-zA-Z0-9]+)'
        ]

        # Registry patterns
        registry_patterns = [
            r'(?:RegOpenKey|RegSetValue|RegCreateKey)\s*\([^,]*,\s*["\']([^"\']+)["\']',
            r'HKEY_[A-Z_]+\\\\([^"\'\\]+)',
            r'reg\s+add\s+["\']?([^"\'\\]+\\\\[^"\']+)["\']?'
        ]

        # Network patterns
        network_patterns = [
            r'(?:connect|bind|listen)\s*\([^,]*,\s*(\d+)',
            r'socket\.connect\s*\(\s*\(["\']?([^"\']+)["\']?,\s*(\d+)\)',
            r'(?:http|https|ftp)://([^\s"\'<>]+)',
            r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+)'
        ]

        # PowerShell patterns
        powershell_patterns = [
            r'(?:powershell|pwsh)\s+(?:-[a-zA-Z]+\s+)*["\']?([^"\']+)["\']?',
            r'Invoke-(?:Expression|Command|WebRequest|RestMethod)\s+["\']?([^"\']+)["\']?',
            r'Start-Process\s+["\']?([^"\']+)["\']?',
            r'Get-Process\s+["\']?([^"\']+)["\']?'
        ]

        # Command patterns
        command_patterns = [
            r'(?:cmd|command)\s+(?:/[a-zA-Z]+\s+)*["\']?([^"\']+)["\']?',
            r'(?:ping|nslookup|netstat|tasklist|wmic)\s+([^\s"\']+)',
            r'(?:net|sc|schtasks)\s+[a-zA-Z]+\s+([^\s"\']+)'
        ]

        # Extract indicators using regex patterns
        patterns = {
            'processes': process_patterns,
            'files': file_patterns,
            'registry': registry_patterns,
            'powershell': powershell_patterns,
            'commands': command_patterns
        }

        for category, pattern_list in patterns.items():
            for pattern in pattern_list:
                matches = re.findall(pattern, content, re.IGNORECASE | re.MULTILINE)
                for match in matches:
                    if isinstance(match, tuple):
                        indicators[category].add(match[0])
                    else:
                        indicators[category].add(match)

        # Special handling for network indicators (host:port tuples)
        for pattern in network_patterns:
            matches = re.findall(pattern, content, re.IGNORECASE)
            for match in matches:
                if isinstance(match, tuple):
                    if len(match) >= 2:
                        indicators['network'].add(f"{match[0]}:{match[1]}")
                    else:
                        indicators['network'].add(match[0])
                else:
                    indicators['network'].add(match)

        # Convert sets to lists and filter out empty/invalid indicators
        cleaned_indicators = {}
        for key, values in indicators.items():
            cleaned_values = [v for v in values if v and len(v.strip()) > 2 and len(v) < 200]
            if cleaned_values:
                cleaned_indicators[key] = cleaned_values[:10]  # Limit to 10 per category

        return cleaned_indicators
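

# Illustrative usage sketch (an addition for documentation, not part of the
# original module): the analyzer is async, so callers await
# search_exploits_for_cve() from an event loop. Requires GITHUB_TOKEN to be
# set via config.settings; the CVE ID below is only an example.
if __name__ == "__main__":
    import asyncio

    async def _demo():
        analyzer = GitHubExploitAnalyzer()
        exploits = await analyzer.search_exploits_for_cve("CVE-2021-44228")
        for exploit in exploits:
            print(exploit['repo_name'], exploit['repo_url'])

    asyncio.run(_demo())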