auto_sigma_rule_generator/backend/services/github_service.py
bpmcdevitt a6fb367ed4 refactor: modularize backend architecture for improved maintainability
- Extract database models from monolithic main.py (2,373 lines) into organized modules
- Implement service layer pattern with dedicated business logic classes
- Split API endpoints into modular FastAPI routers by functionality
- Add centralized configuration management with environment variable handling
- Create proper separation of concerns across data, service, and presentation layers

**Architecture Changes:**
- models/: SQLAlchemy database models (CVE, SigmaRule, RuleTemplate, BulkProcessingJob)
- config/: Centralized settings and database configuration
- services/: Business logic (CVEService, SigmaRuleService, GitHubExploitAnalyzer)
- routers/: Modular API endpoints (cves, sigma_rules, bulk_operations, llm_operations)
- schemas/: Pydantic request/response models

**Key Improvements:**
- 95% reduction in main.py size (2,373 → 120 lines)
- Updated 15+ backend files with proper import structure
- Eliminated circular dependencies and tight coupling
- Enhanced testability with isolated service components
- Better code organization for team collaboration

**Backward Compatibility:**
- All API endpoints maintain same URLs and behavior
- Zero breaking changes to existing functionality
- Database schema unchanged
- Environment variables preserved

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-07-14 17:51:23 -05:00

268 lines
No EOL
11 KiB
Python

import re
import os
from typing import List, Optional
from github import Github
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from config.settings import settings
class GitHubExploitAnalyzer:
    """Service for analyzing GitHub repositories for exploit code.

    Given a CVE id, searches GitHub for repositories that reference it,
    picks out files that look like exploit code, and extracts security
    indicators (process names, file paths, registry keys, network
    endpoints, shell/PowerShell commands) with regex patterns.

    The analyzer is best-effort: every failure is printed and swallowed,
    and methods return [] / None rather than raising to the caller.
    """

    # Indicator categories tracked per repository and per file.
    # NOTE(review): 'urls' is declared but no pattern currently feeds it.
    _INDICATOR_KEYS = ('processes', 'files', 'registry', 'network',
                       'commands', 'powershell', 'urls')

    # Files larger than this many bytes are skipped during content analysis.
    _MAX_FILE_SIZE = 100000

    # Extensions and filename fragments that mark a file as likely exploit code.
    _EXPLOIT_EXTENSIONS = ('.py', '.ps1', '.sh', '.c', '.cpp', '.js',
                           '.rb', '.pl', '.php', '.java')
    _EXPLOIT_NAME_TERMS = ('exploit', 'poc', 'payload', 'shell',
                           'reverse', 'bind', 'attack')

    # Regex patterns per indicator category, compiled once at class creation
    # (previously rebuilt and recompiled on every file analyzed).
    _CATEGORY_PATTERNS = {
        'processes': [re.compile(p, re.IGNORECASE | re.MULTILINE) for p in (
            r'CreateProcess[AW]?\s*\(\s*["\']([^"\']+)["\']',
            r'ShellExecute[AW]?\s*\([^,]*,\s*["\']([^"\']+)["\']',
            r'system\s*\(\s*["\']([^"\']+)["\']',
            r'exec\s*\(\s*["\']([^"\']+)["\']',
            r'subprocess\.(?:call|run|Popen)\s*\(\s*["\']([^"\']+)["\']',
        )],
        'files': [re.compile(p, re.IGNORECASE | re.MULTILINE) for p in (
            r'(?:fopen|CreateFile|WriteFile|ReadFile)\s*\(\s*["\']([^"\']+\.[a-zA-Z0-9]+)["\']',
            r'(?:copy|move|del|rm)\s+["\']?([^\s"\']+\.[a-zA-Z0-9]+)["\']?',
            r'\\\\[^\\]+\\[^\\]+\\([^\\]+\.[a-zA-Z0-9]+)',
            r'[C-Z]:\\\\[^\\]+\\\\([^\\]+\.[a-zA-Z0-9]+)',
        )],
        'registry': [re.compile(p, re.IGNORECASE | re.MULTILINE) for p in (
            r'(?:RegOpenKey|RegSetValue|RegCreateKey)\s*\([^,]*,\s*["\']([^"\']+)["\']',
            r'HKEY_[A-Z_]+\\\\([^"\'\\]+)',
            r'reg\s+add\s+["\']?([^"\'\\]+\\\\[^"\']+)["\']?',
        )],
        'powershell': [re.compile(p, re.IGNORECASE | re.MULTILINE) for p in (
            r'(?:powershell|pwsh)\s+(?:-[a-zA-Z]+\s+)*["\']?([^"\']+)["\']?',
            r'Invoke-(?:Expression|Command|WebRequest|RestMethod)\s+["\']?([^"\']+)["\']?',
            r'Start-Process\s+["\']?([^"\']+)["\']?',
            r'Get-Process\s+["\']?([^"\']+)["\']?',
        )],
        'commands': [re.compile(p, re.IGNORECASE | re.MULTILINE) for p in (
            r'(?:cmd|command)\s+(?:/[a-zA-Z]+\s+)*["\']?([^"\']+)["\']?',
            r'(?:ping|nslookup|netstat|tasklist|wmic)\s+([^\s"\']+)',
            r'(?:net|sc|schtasks)\s+[a-zA-Z]+\s+([^\s"\']+)',
        )],
    }

    # Network patterns get special handling (host:port joining below) and,
    # matching the original behavior, are compiled with IGNORECASE only.
    _NETWORK_PATTERNS = [re.compile(p, re.IGNORECASE) for p in (
        r'(?:connect|bind|listen)\s*\([^,]*,\s*(\d+)',
        r'socket\.connect\s*\(\s*\(["\']?([^"\']+)["\']?,\s*(\d+)\)',
        r'(?:http|https|ftp)://([^\s"\'<>]+)',
        r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+)',
    )]

    def __init__(self):
        """Create a GitHub client if a token is configured; otherwise disabled."""
        self.github_token = settings.GITHUB_TOKEN
        # Without a token every search is a no-op (see search_exploits_for_cve).
        self.github = Github(self.github_token) if self.github_token else None

    async def search_exploits_for_cve(self, cve_id: str) -> List[dict]:
        """Search GitHub for exploit code related to *cve_id*.

        Returns a list of repository analysis dicts (see _analyze_repository),
        capped at settings.MAX_GITHUB_RESULTS.  Returns [] when no token is
        configured or on any error.
        """
        if not self.github:
            print(f"No GitHub token configured, skipping exploit search for {cve_id}")
            return []

        try:
            print(f"Searching GitHub for exploits for {cve_id}")

            # Search queries to find exploit code.
            search_queries = [
                f"{cve_id} exploit",
                f"{cve_id} poc",
                f"{cve_id} vulnerability",
                f'"{cve_id}" exploit code',
                f"{cve_id.replace('-', '_')} exploit"
            ]

            exploits: List[dict] = []
            seen_repos: set = set()

            # Only the first two queries are issued to avoid rate limits.
            for query in search_queries[:2]:
                try:
                    repos = self.github.search_repositories(
                        query=query,
                        sort="updated",
                        order="desc"
                    )

                    # Get top 5 results per query, skipping repos already seen.
                    for repo in repos[:5]:
                        if repo.full_name in seen_repos:
                            continue
                        seen_repos.add(repo.full_name)

                        exploit_info = await self._analyze_repository(repo, cve_id)
                        if exploit_info:
                            exploits.append(exploit_info)

                        if len(exploits) >= settings.MAX_GITHUB_RESULTS:
                            break

                    if len(exploits) >= settings.MAX_GITHUB_RESULTS:
                        break
                except Exception as e:
                    # Best-effort: one failing query must not abort the rest.
                    print(f"Error searching GitHub with query '{query}': {str(e)}")
                    continue

            print(f"Found {len(exploits)} potential exploits for {cve_id}")
            return exploits
        except Exception as e:
            print(f"Error searching GitHub for {cve_id}: {str(e)}")
            return []

    async def _analyze_repository(self, repo, cve_id: str) -> Optional[dict]:
        """Analyze a single GitHub repository for exploit code.

        Returns a summary dict (repo metadata, analyzed files, merged
        indicators) or None when the repo does not mention the CVE or no
        exploit-looking files with indicators were found.
        """
        try:
            # Require the CVE id (dash or underscore form) in the repo
            # name/description before spending API calls on file contents.
            repo_text = f"{repo.name} {repo.description or ''}".lower()
            if (cve_id.lower() not in repo_text
                    and cve_id.replace('-', '_').lower() not in repo_text):
                return None

            exploit_files = []
            indicators = {key: set() for key in self._INDICATOR_KEYS}

            try:
                contents = repo.get_contents("")
                for content in contents[:20]:  # cap files analyzed per repo
                    if content.type == "file" and self._is_exploit_file(content.name):
                        file_analysis = await self._analyze_file_content(repo, content, cve_id)
                        if file_analysis:
                            exploit_files.append(file_analysis)
                            # Merge per-file indicators into repo-level sets.
                            for key, values in file_analysis.get('indicators', {}).items():
                                if key in indicators:
                                    indicators[key].update(values)
            except Exception as e:
                print(f"Error analyzing repo contents for {repo.full_name}: {str(e)}")

            if not exploit_files:
                return None

            return {
                'repo_name': repo.full_name,
                'repo_url': repo.html_url,
                'description': repo.description,
                'language': repo.language,
                'stars': repo.stargazers_count,
                'updated': repo.updated_at.isoformat(),
                'files': exploit_files,
                'indicators': {k: list(v) for k, v in indicators.items()}
            }
        except Exception as e:
            print(f"Error analyzing repository {repo.full_name}: {str(e)}")
            return None

    def _is_exploit_file(self, filename: str) -> bool:
        """Return True when *filename* is likely to contain exploit code."""
        filename_lower = filename.lower()
        # Must have a code extension (str.endswith accepts a tuple)...
        if not filename_lower.endswith(self._EXPLOIT_EXTENSIONS):
            return False
        # ...and an exploit-related name or a CVE reference.
        return ('cve' in filename_lower
                or any(term in filename_lower for term in self._EXPLOIT_NAME_TERMS))

    async def _analyze_file_content(self, repo, file_content, cve_id: str) -> Optional[dict]:
        """Analyze one file's content for exploit indicators.

        Returns a dict (filename, path, size, indicators) or None when the
        file is too large, does not mention the CVE, or yields no indicators.
        """
        try:
            # Skip large files to bound download and regex cost.
            if file_content.size > self._MAX_FILE_SIZE:
                return None

            content = file_content.decoded_content.decode('utf-8', errors='ignore')

            # The file itself must mention the CVE, not just the repo.
            # Lowercase once instead of per-comparison.
            lowered = content.lower()
            if (cve_id.lower() not in lowered
                    and cve_id.replace('-', '_').lower() not in lowered):
                return None

            indicators = self._extract_indicators_from_code(content, file_content.name)
            if not any(indicators.values()):
                return None

            return {
                'filename': file_content.name,
                'path': file_content.path,
                'size': file_content.size,
                'indicators': indicators
            }
        except Exception as e:
            print(f"Error analyzing file {file_content.name}: {str(e)}")
            return None

    def _extract_indicators_from_code(self, content: str, filename: str) -> dict:
        """Extract security indicators from exploit source *content*.

        Returns a dict mapping category -> list of up to 10 indicator
        strings; categories with no valid indicators are omitted.
        *filename* is accepted for interface compatibility but unused.
        """
        indicators = {key: set() for key in self._INDICATOR_KEYS}

        for category, patterns in self._CATEGORY_PATTERNS.items():
            for pattern in patterns:
                for match in pattern.findall(content):
                    # findall yields a tuple when a pattern has several
                    # capture groups; keep the first group in that case.
                    indicators[category].add(match[0] if isinstance(match, tuple) else match)

        # Network indicators: join host/port group pairs as "host:port".
        for pattern in self._NETWORK_PATTERNS:
            for match in pattern.findall(content):
                if isinstance(match, tuple):
                    if len(match) >= 2:
                        indicators['network'].add(f"{match[0]}:{match[1]}")
                    else:
                        indicators['network'].add(match[0])
                else:
                    indicators['network'].add(match)

        # Drop empty/too-short/too-long values; cap each category at 10.
        # Sorting before truncation makes the output deterministic (the
        # previous set-derived list order was arbitrary).
        cleaned_indicators = {}
        for key, values in indicators.items():
            cleaned_values = [v for v in values if v and len(v.strip()) > 2 and len(v) < 200]
            if cleaned_values:
                cleaned_indicators[key] = sorted(cleaned_values)[:10]
        return cleaned_indicators