add poc analyzer code

2025-07-16 10:15:55 -05:00 · 2025-07-16 10:15:55 -05:00 · cf57944c7f
commit cf57944c7f
parent 06c4ed74b8
3 changed files with 1083 additions and 52 deletions
--- a/backend/enhanced_sigma_generator.py
+++ b/backend/enhanced_sigma_generator.py
@ -11,6 +11,7 @@ from sqlalchemy.orm import Session
 import re
 from llm_client import LLMClient
 from cve2capec_client import CVE2CAPECClient
 from poc_analyzer import PoCAnalyzer
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@ -23,6 +24,7 @@ class EnhancedSigmaGenerator:
        self.db_session = db_session
        self.llm_client = LLMClient(provider=llm_provider, model=llm_model)
        self.cve2capec_client = CVE2CAPECClient()
        self.poc_analyzer = PoCAnalyzer()
    async def generate_enhanced_rule(self, cve, use_llm: bool = True) -> dict:
        """Generate enhanced SIGMA rule for a CVE using PoC data"""
@ -134,10 +136,17 @@ class EnhancedSigmaGenerator:
                logger.warning(f"No PoC content available for {cve.cve_id}")
                return None
-            # Generate rule using LLM
+            # Analyze PoC content with the PoC analyzer
            logger.info(f"Analyzing PoC content for {cve.cve_id} with PoCAnalyzer")
            poc_analysis = self.poc_analyzer.analyze_poc(poc_content, cve.cve_id)
            # Enhance the PoC content with structured analysis
            enhanced_poc_content = self._format_poc_analysis_for_llm(poc_content, poc_analysis)
            # Generate rule using LLM with enhanced PoC content
            rule_content = await self.llm_client.generate_sigma_rule(
                cve_id=cve.cve_id,
-                poc_content=poc_content,
+                poc_content=enhanced_poc_content,
                cve_description=cve.description or "",
                existing_rule=None
            )
@ -234,6 +243,120 @@ class EnhancedSigmaGenerator:
            return None
    def _format_poc_analysis_for_llm(self, original_poc_content: str, poc_analysis: dict) -> str:
        """Combine raw PoC code and its structured analysis into one prompt section.

        Args:
            original_poc_content: Raw PoC source text. Only the first 2000
                characters are embedded in the result.
            poc_analysis: Analysis dictionary. The keys read here are
                'language', 'quality_score', 'mitre_techniques', 'behaviors',
                'processes', 'files', 'network', 'registry', 'commands',
                'total_indicators' and 'high_confidence_indicators'; every
                one of them falls back to a default when missing.

        Returns:
            A Markdown-style text block containing the truncated PoC code,
            the formatted indicator lists and the quality summary.
        """
        # Extract key findings from analysis
        language = poc_analysis.get('language', 'unknown')
        # quality_score must be dict-like: its .get() is called in the template below
        quality_score = poc_analysis.get('quality_score', {})
        mitre_techniques = poc_analysis.get('mitre_techniques', [])
        behaviors = poc_analysis.get('behaviors', [])
        # Extract indicators, one list per indicator category
        processes = poc_analysis.get('processes', [])
        files = poc_analysis.get('files', [])
        network = poc_analysis.get('network', [])
        registry = poc_analysis.get('registry', [])
        commands = poc_analysis.get('commands', [])
        # Build enhanced content for LLM; each list is rendered by the
        # matching _format_*_for_display helper on this class
        enhanced_content = f"""**ORIGINAL POC CODE:**
 {original_poc_content[:2000]}
 **STRUCTURED POC ANALYSIS:**
 **Language Detected:** {language}
 **Security Indicators Extracted:**
 **Process Execution Indicators:**
 {self._format_indicators_for_display(processes)}
 **File System Indicators:**
 {self._format_indicators_for_display(files)}
 **Network Communication Indicators:**
 {self._format_indicators_for_display(network)}
 **Registry Modification Indicators:**
 {self._format_indicators_for_display(registry)}
 **Command Execution Indicators:**
 {self._format_indicators_for_display(commands)}
 **MITRE ATT&CK Techniques Detected:**
 {self._format_mitre_techniques_for_display(mitre_techniques)}
 **Attack Behaviors Identified:**
 {self._format_behaviors_for_display(behaviors)}
 **Analysis Quality:**
 - Overall Score: {quality_score.get('overall_score', 0)}/1.0
 - Total Indicators: {poc_analysis.get('total_indicators', 0)}
 - High Confidence Indicators: {poc_analysis.get('high_confidence_indicators', 0)}
 - Recommendation: {quality_score.get('recommendation', 'Unknown')}
 **DETECTION GUIDANCE:**
 Use the above structured indicators to create specific SIGMA detection patterns. Focus on the high-confidence indicators and behaviors for the most accurate detection rules."""
        return enhanced_content
    def _format_indicators_for_display(self, indicators: list) -> str:
        """Format indicators for LLM display"""
        if not indicators:
            return "- None detected"
        formatted = []
        for indicator in indicators[:5]:  # Limit to top 5 indicators
            if isinstance(indicator, dict):
                value = indicator.get('value', str(indicator))
                confidence = indicator.get('confidence', 0)
                attack_technique = indicator.get('attack_technique')
                technique_info = f" (MITRE: {attack_technique})" if attack_technique else ""
                formatted.append(f"- {value} (confidence: {confidence:.2f}){technique_info}")
            else:
                formatted.append(f"- {indicator}")
        if len(indicators) > 5:
            formatted.append(f"- ... and {len(indicators) - 5} more indicators")
        return "\n".join(formatted)
    def _format_mitre_techniques_for_display(self, techniques: list) -> str:
        """Format MITRE ATT&CK techniques for display"""
        if not techniques:
            return "- None detected"
        formatted = []
        for technique in techniques:
            # Get technique name if available
            technique_name = self.cve2capec_client.get_technique_name(technique) if hasattr(self, 'cve2capec_client') else ""
            if technique_name:
                formatted.append(f"- {technique}: {technique_name}")
            else:
                formatted.append(f"- {technique}")
        return "\n".join(formatted)
    def _format_behaviors_for_display(self, behaviors: list) -> str:
        """Format attack behaviors for display"""
        if not behaviors:
            return "- None detected"
        formatted = []
        for behavior in behaviors:
            if isinstance(behavior, dict):
                behavior_type = behavior.get('behavior', 'unknown')
                confidence = behavior.get('confidence', 0)
                indicators = behavior.get('indicators', [])
                formatted.append(f"- {behavior_type.replace('_', ' ').title()} (confidence: {confidence:.2f})")
                if indicators:
                    formatted.append(f"  Indicators: {', '.join(indicators[:3])}")
            else:
                formatted.append(f"- {behavior}")
        return "\n".join(formatted)
    def _extract_log_source_from_content(self, rule_content: str) -> str:
        """Extract log source from the generated rule content"""
        try:
@ -467,7 +590,7 @@ class EnhancedSigmaGenerator:
                if poc.get('html_url'):
                    refs.append(poc['html_url'])
-        return '\\n'.join(f"    - {ref}" for ref in refs)
+        return '\n'.join(f"    - {ref}" for ref in refs)
    def _generate_tags(self, cve, poc_data: list) -> str:
        """Generate MITRE ATT&CK tags and other tags using CVE2CAPEC mappings"""
@ -488,21 +611,9 @@ class EnhancedSigmaGenerator:
                if attack_tag not in tags:
                    tags.append(attack_tag)
        else:
-            # Fallback to indicator-based technique detection
+            # No CVE2CAPEC mapping found - do not add fallback techniques
-            logger.info(f"No CVE2CAPEC mapping found for {cve.cve_id}, using indicator-based detection")
+            logger.warning(f"No CVE2CAPEC mapping found for {cve.cve_id}, no MITRE techniques will be added")
-            combined_indicators = self._combine_exploit_indicators(poc_data)
+            # Note: LLM will rely on the PoC analysis to determine appropriate techniques
            if combined_indicators.get('processes'):
                tags.append('attack.t1059')  # Command and Scripting Interpreter
            if combined_indicators.get('network'):
                tags.append('attack.t1071')  # Application Layer Protocol
            if combined_indicators.get('files'):
                tags.append('attack.t1105')  # Ingress Tool Transfer
            if any('powershell' in p.lower() for p in combined_indicators.get('processes', [])):
                tags.append('attack.t1059.001')  # PowerShell
        # Get CWE codes for additional context
        cwe_codes = self.cve2capec_client.get_cwe_for_cve(cve.cve_id)
@ -518,17 +629,18 @@ class EnhancedSigmaGenerator:
            quality_tier = best_poc.get('quality_analysis', {}).get('quality_tier', 'poor')
            tags.append(f'poc.quality.{quality_tier}')
-        # Return tags as a single line for first tag, then additional tags on new lines
+        # Return tags as YAML array format
        if not tags:
            return "unknown"
        if len(tags) == 1:
            return tags[0]
        else:
-            # First tag goes directly after the dash, rest are on new lines
+            # Format as proper YAML array
-            first_tag = tags[0]
+            formatted_tags = []
-            additional_tags = '\\n'.join(f"    - {tag}" for tag in tags[1:])
+            for tag in tags:
-            return f"{first_tag}\\n{additional_tags}"
+                formatted_tags.append(f"    - {tag}")
            return '\n'.join(formatted_tags)
    def _format_indicators(self, indicators: list) -> str:
        """Format indicators for SIGMA rule"""
@ -546,7 +658,7 @@ class EnhancedSigmaGenerator:
                escaped = cleaned.replace('\\\\', '\\\\\\\\').replace('*', '\\\\*').replace('?', '\\\\?')
                formatted.append(f'            - "{escaped}"')
-        return '\\n'.join(formatted) if formatted else '            - "*"  # No valid indicators'
+        return '\n'.join(formatted) if formatted else '            - "*"  # No valid indicators'
    def _enhance_detection_logic(self, rule_content: str, indicators: dict, poc_data: list) -> str:
        """Enhance detection logic based on PoC quality and indicators"""
@ -566,7 +678,7 @@ class EnhancedSigmaGenerator:
                # Insert before the condition line
                rule_content = rule_content.replace(
                    'condition: selection',
-                    additional_condition + '\\n    condition: selection or process_and_command'
+                    additional_condition + '\n    condition: selection or process_and_command'
                )
        return rule_content
--- a/backend/llm_client.py
+++ b/backend/llm_client.py
@ -278,13 +278,47 @@ class LLMClient:
 - status: experimental
 - description: Specific description based on CVE and PoC analysis
 - author: 'AI Generated'
- date: Current date (2025/01/14)
+- date: Current date (2025/01/16)
 - references: Include the EXACT CVE URL with the CVE ID provided by the user
 - tags: Relevant MITRE ATT&CK techniques based on PoC analysis
 - logsource: Appropriate category based on exploit type
 - detection: Specific indicators from PoC analysis (NOT generic examples)
 - condition: Logic connecting the detection selections
 **MITRE ATT&CK TAGS FORMAT REQUIREMENTS:**
 - Use ONLY the MITRE ATT&CK techniques provided in the "MITRE ATT&CK TECHNIQUE MAPPINGS" section above
 - Convert technique IDs to lowercase attack.t format (e.g., T1134 becomes attack.t1134)
 - Include specific sub-techniques when available (e.g., T1134.001 becomes attack.t1134.001)
 - DO NOT use generic techniques not listed in the mappings
 - DO NOT add additional techniques based on your training data
 **CRITICAL:** ONLY use the MITRE ATT&CK techniques explicitly provided in the technique mappings above. Do not add any other techniques.
 **COMPLETE SIGMA RULE EXAMPLE (TECHNIQUE TAGS MUST MATCH PROVIDED MAPPINGS):**
 ```yaml
 title: 'CVE-2024-XXXX Detection Rule'
 id: a1b2c3d4-e5f6-7890-abcd-ef1234567890
 status: experimental
 description: 'Detection for CVE-2024-XXXX vulnerability'
 author: 'AI Generated'
 date: 2025/01/16
 references:
  - https://nvd.nist.gov/vuln/detail/CVE-2024-XXXX
 tags:
  - attack.t1134      # Access Token Manipulation (example - use actual mappings)
  - attack.t1134.001  # Token Impersonation/Theft (example - use actual mappings)
 logsource:
  category: process_creation
  product: windows
 detection:
  selection:
    Image|contains: 'specific_indicator'
  condition: selection
 level: medium
 ```
 **IMPORTANT:** The tags section above is just an example format. You MUST use the exact techniques provided in the MITRE ATT&CK TECHNIQUE MAPPINGS section for the specific CVE you're analyzing.
 **CRITICAL ANTI-HALLUCINATION RULES:**
 1. You MUST use the EXACT CVE ID provided in the user input - NEVER generate a different CVE ID
 2. NEVER use example CVE IDs like CVE-2022-1234, CVE-2023-5678, or CVE-2024-1234
@ -323,7 +357,14 @@ Enhance this rule with PoC insights. Output only valid SIGMA YAML starting with
 **MITRE ATT&CK TECHNIQUE MAPPINGS FOR {cve_id}:**
 {chr(10).join(technique_details)}
-**IMPORTANT:** Use these exact MITRE ATT&CK techniques in your tags section. Convert them to lowercase attack.t format (e.g., T1059 becomes attack.t1059)."""
+**CRITICAL REQUIREMENT:** Use ONLY these exact MITRE ATT&CK techniques in your tags section. Convert them to lowercase attack.t format (e.g., T1134 becomes attack.t1134, T1134.001 becomes attack.t1134.001). 
 **ABSOLUTELY FORBIDDEN:**
 - Do not use T1059, T1071, T1105, T1055, T1068, T1140, T1036, T1112, T1547 or any other techniques not listed above
 - Do not add techniques based on PoC analysis if they're not in the provided mappings
 - Do not use generic techniques from your training data
 If no MITRE techniques are provided above, use only CVE and CWE tags."""
            if mitre_mappings['cwe_codes']:
                mitre_suggestions += f"""
@ -344,10 +385,25 @@ Enhance this rule with PoC insights. Output only valid SIGMA YAML starting with
 1. Use EXACTLY this CVE ID in the title: {{cve_id}}
 2. Use EXACTLY this CVE URL in references: https://nvd.nist.gov/vuln/detail/{{cve_id}}
 3. Analyze the CVE description to understand the vulnerability type
-4. Extract specific indicators from the PoC code (files, processes, commands, network patterns)
+4. If the PoC analysis above contains structured indicators, use those EXACT indicators in your detection rules
-5. Create detection logic based on the actual exploit behavior
+5. **USE ONLY THE MITRE ATT&CK TECHNIQUES LISTED IN THE MAPPINGS ABOVE** - Do not add any other techniques
-6. Use relevant logsource category (process_creation, file_event, network_connection, etc.)
+6. Choose the appropriate logsource category based on the primary indicator types (process_creation, file_event, network_connection, registry_event, etc.)
-7. Include the MITRE ATT&CK tags listed above in your tags section (convert to attack.t format)
+7. Convert the mapped MITRE techniques to lowercase attack.t format (T1134 → attack.t1134, T1134.001 → attack.t1134.001)
 **DETECTION PATTERN GUIDANCE:**
 - For Process Execution indicators: Use Image, CommandLine, or ProcessName fields
 - For File System indicators: Use TargetFilename, SourceFilename, or FilePath fields
 - For Network indicators: Use DestinationHostname, DestinationIp, or DestinationPort fields
 - For Registry indicators: Use TargetObject, Details, or EventType fields
 - For Command indicators: Use CommandLine or ProcessCommandLine fields
 **TAGS FORMATTING REQUIREMENTS:**
 - Use ONLY the MITRE ATT&CK techniques provided in the "MITRE ATT&CK TECHNIQUE MAPPINGS" section above
 - Convert to lowercase attack.t format: T1134 → attack.t1134, T1134.001 → attack.t1134.001
 - Include comments for clarity: attack.t1134  # Access Token Manipulation
 - Use specific sub-techniques when available
 - DO NOT add techniques not listed in the provided mappings
 - DO NOT use generic techniques from your training data
 **CRITICAL ANTI-HALLUCINATION REQUIREMENTS:**
 - THE CVE ID IS: {{cve_id}}
@ -355,6 +411,7 @@ Enhance this rule with PoC insights. Output only valid SIGMA YAML starting with
 - DO NOT generate a different CVE ID from your training data
 - You MUST use the exact CVE ID "{{cve_id}}" - this is the ONLY acceptable CVE ID for this rule
 - Base your analysis ONLY on the provided CVE description and PoC code above
 - If structured indicators are provided in the PoC analysis, use those exact values
 - Do not reference other vulnerabilities or exploits not mentioned in the provided content
 - NEVER use placeholder CVE IDs like CVE-YYYY-NNNN or CVE-2022-1234
@ -741,30 +798,36 @@ Output ONLY valid SIGMA YAML starting with 'title:' that includes the exact CVE
            stripped = line.strip()
            # Check for orphaned list items (lines starting with - but not part of an array)
            # But be more careful - don't remove items that are properly indented under a parent
            if (stripped.startswith('- ') and 
                i > 0 and 
                not lines[i-1].strip().endswith(':') and
                ':' not in stripped and
-                not stripped.startswith('- https://')):  # Don't remove reference URLs
+                not stripped.startswith('- https://') and  # Don't remove reference URLs
                not stripped.startswith('- attack.') and  # Don't remove MITRE ATT&CK tags
                not re.match(r'- [a-z0-9._-]+$', stripped)):  # Don't remove simple tags
-                # Check if this looks like a MITRE ATT&CK tag
+                # Check if this is properly indented under a parent (like tags:)
-                if re.match(r'- T\d{4}', stripped):
+                is_properly_indented = False
-                    # Try to find the tags section and add it there
+                current_indent = len(line) - len(line.lstrip())
-                    tags_line_found = False
+                
-                    for j in range(len(fixed_lines)-1, -1, -1):
+                # Look backwards to find a parent with less indentation
-                        if fixed_lines[j].strip().startswith('tags:'):
+                for j in range(i-1, -1, -1):
-                            # This is an orphaned tag, add it to the tags array
+                    prev_line = lines[j]
-                            fixed_lines.append(f"  {stripped}")
+                    prev_stripped = prev_line.strip()
-                            fixes_applied.append(f"Fixed orphaned MITRE tag: {stripped}")
+                    prev_indent = len(prev_line) - len(prev_line.lstrip())
-                            tags_line_found = True
+                    
                    if prev_stripped and prev_indent < current_indent:
                        # Found a parent with less indentation
                        if prev_stripped.endswith(':'):
                            is_properly_indented = True
                            break
                        else:
                            # This is likely orphaned
                            break
-                    if not tags_line_found:
+                if not is_properly_indented:
-                        # No tags section found, remove the orphaned item
+                    # This is truly orphaned, remove it
                        fixes_applied.append(f"Removed orphaned tag (no tags section): {stripped}")
                    continue
                else:
                    # Other orphaned list items, remove them
                    fixes_applied.append(f"Removed orphaned list item: {stripped}")
                    continue
@ -825,6 +888,15 @@ Output ONLY valid SIGMA YAML starting with 'title:' that includes the exact CVE
            except yaml.YAMLError as e2:
                logger.warning(f"YAML repair attempt failed: {e2}")
                # Try a more aggressive repair before falling back to minimal rule
                aggressive_repair = self._aggressive_yaml_repair(content)
                try:
                    yaml.safe_load(aggressive_repair)
                    fixes_applied.append("Applied aggressive YAML repair")
                    logger.info("Successfully repaired YAML with aggressive method")
                    return aggressive_repair
                except yaml.YAMLError as e3:
                    logger.warning(f"Aggressive repair also failed: {e3}")
                    # Last resort: try to build a minimal valid SIGMA rule
                    return self._build_minimal_valid_rule(content, fixes_applied)
@ -837,6 +909,8 @@ Output ONLY valid SIGMA YAML starting with 'title:' that includes the exact CVE
        expected_indent = 0
        in_detection = False
        detection_indent = 0
        in_tags = False
        tags_indent = 0
        for i, line in enumerate(lines):
            stripped = line.strip()
@ -847,6 +921,24 @@ Output ONLY valid SIGMA YAML starting with 'title:' that includes the exact CVE
                repaired_lines.append(line)
                continue
            # Track if we're in the tags section
            if stripped.startswith('tags:'):
                in_tags = True
                tags_indent = current_indent
                repaired_lines.append(line)
                continue
            elif in_tags and current_indent <= tags_indent and not stripped.startswith('-'):
                # We've left the tags section
                in_tags = False
            # Fix tags section indentation
            if in_tags and stripped.startswith('-'):
                # Ensure proper indentation for tag items
                if current_indent <= tags_indent:
                    corrected_line = ' ' * (tags_indent + 2) + stripped
                    repaired_lines.append(corrected_line)
                    continue
            # Track if we're in the detection section
            if stripped.startswith('detection:'):
                in_detection = True
@ -875,6 +967,21 @@ Output ONLY valid SIGMA YAML starting with 'title:' that includes the exact CVE
                            repaired_lines.append(corrected_line)
                            continue
            # Fix logsource section indentation
            if stripped.startswith('logsource:'):
                # Logsource should be at root level (no indentation)
                if current_indent > 0:
                    corrected_line = stripped
                    repaired_lines.append(corrected_line)
                    continue
            elif line.lstrip().startswith(('category:', 'product:', 'service:')) and i > 0:
                # These should be indented under logsource
                prev_line = lines[i-1].strip()
                if prev_line.startswith('logsource:') or any('logsource' in repaired_lines[j] for j in range(max(0, len(repaired_lines)-5), len(repaired_lines))):
                    corrected_line = '    ' + stripped
                    repaired_lines.append(corrected_line)
                    continue
            # Fix lines that start with wrong indentation
            if ':' in stripped and not stripped.startswith('-'):
                # This is a key-value pair
@ -891,6 +998,85 @@ Output ONLY valid SIGMA YAML starting with 'title:' that includes the exact CVE
        return '\n'.join(repaired_lines)
    def _aggressive_yaml_repair(self, content: str) -> str:
        """Aggressive YAML repair that reconstructs the document structure."""
        lines = content.split('\n')
        # Extract key components
        title = "Generated SIGMA Rule"
        rule_id = "00000000-0000-0000-0000-000000000000"
        description = "Generated detection rule"
        author = "AI Generated"
        date = "2025/01/16"
        references = []
        tags = []
        logsource_category = "process_creation"
        logsource_product = "windows"
        detection_rules = []
        condition = "selection"
        level = "medium"
        # Parse existing content
        for i, line in enumerate(lines):
            stripped = line.strip()
            if stripped.startswith('title:'):
                title = stripped.split(':', 1)[1].strip().strip('"\'')
            elif stripped.startswith('id:'):
                rule_id = stripped.split(':', 1)[1].strip().strip('"\'')
            elif stripped.startswith('description:'):
                description = stripped.split(':', 1)[1].strip().strip('"\'')
            elif stripped.startswith('author:'):
                author = stripped.split(':', 1)[1].strip().strip('"\'')
            elif stripped.startswith('date:'):
                date = stripped.split(':', 1)[1].strip().strip('"\'')
            elif stripped.startswith('level:'):
                level = stripped.split(':', 1)[1].strip().strip('"\'')
            elif stripped.startswith('condition:'):
                condition = stripped.split(':', 1)[1].strip().strip('"\'')
            elif stripped.startswith('- http'):
                references.append(stripped[2:].strip())
            elif stripped.startswith('- attack.') or stripped.startswith('- cve-') or stripped.startswith('- exploit.') or stripped.startswith('- poc.') or stripped.startswith('- cwe.'):
                tags.append(stripped[2:].strip())
            elif 'category:' in stripped:
                logsource_category = stripped.split(':', 1)[1].strip().strip('"\'')
            elif 'product:' in stripped:
                logsource_product = stripped.split(':', 1)[1].strip().strip('"\'')
        # Build a clean YAML structure
        yaml_content = f"""title: '{title}'
 id: {rule_id}
 status: experimental
 description: '{description}'
 author: '{author}'
 date: {date}
 references:"""
        if references:
            for ref in references:
                yaml_content += f"\n  - {ref}"
        else:
            yaml_content += "\n  - https://example.com"
        yaml_content += "\ntags:"
        if tags:
            for tag in tags:
                yaml_content += f"\n  - {tag}"
        else:
            yaml_content += "\n  - unknown"
        yaml_content += f"""
 logsource:
  category: {logsource_category}
  product: {logsource_product}
 detection:
  selection:
    Image: '*'
  condition: {condition}
 level: {level}"""
        return yaml_content
    def _build_minimal_valid_rule(self, content: str, fixes_applied: list) -> str:
        """Build a minimal valid SIGMA rule from the content."""
        lines = content.split('\n')
@ -915,7 +1101,7 @@ id: {rule_id}
 status: experimental
 description: '{description}'
 author: 'AI Generated'
-date: 2025/01/14
+date: 2025/01/16
 references:
  - https://example.com
 logsource:
--- a/backend/poc_analyzer.py
+++ b/backend/poc_analyzer.py
@ -0,0 +1,733 @@
 """
 Advanced PoC (Proof of Concept) analyzer for extracting security indicators
 from exploit code across multiple programming languages and attack vectors.
 """
 import re
 import base64
 import binascii
 from typing import Dict, List, Set, Optional, Tuple
 from dataclasses import dataclass
 from enum import Enum
 import logging
 logger = logging.getLogger(__name__)
 class AttackTechnique(Enum):
    """MITRE ATT&CK technique IDs that an extracted indicator can be tagged with.

    Each value is the technique (or sub-technique) ID string. The trailing
    comments give the ATT&CK technique names as a reading aid; verify them
    against the ATT&CK matrix before relying on them.
    """
    PROCESS_INJECTION = "T1055"  # Process Injection
    COMMAND_EXECUTION = "T1059"  # Command and Scripting Interpreter
    POWERSHELL = "T1059.001"  # PowerShell
    COMMAND_LINE = "T1059.003"  # Windows Command Shell
    # NOTE(review): T1105 is "Ingress Tool Transfer" in ATT&CK, which is
    # narrower than the member name FILE_CREATION suggests; confirm intent.
    FILE_CREATION = "T1105"
    REGISTRY_MODIFICATION = "T1112"  # Modify Registry
    NETWORK_CONNECTION = "T1071"  # Application Layer Protocol
    PRIVILEGE_ESCALATION = "T1068"  # Exploitation for Privilege Escalation
    DLL_INJECTION = "T1055.001"  # Dynamic-link Library Injection
    PROCESS_HOLLOWING = "T1055.012"  # Process Hollowing
    SERVICE_CREATION = "T1543.003"  # Windows Service
@dataclass
 class SecurityIndicator:
    """Represents a security indicator extracted from PoC code.

    Instances are created by the PoCAnalyzer extractor methods, one per regex
    match that clears the extractor's confidence threshold.
    """
    type: str  # process, file, network, registry, command
    value: str  # the captured text (process name, path, host, key, command line)
    confidence: float  # 0.0 to 1.0
    context: str  # surrounding code context
    attack_technique: Optional[AttackTechnique] = None  # ATT&CK technique tag, if one was assigned
    metadata: Optional[Dict] = None  # free-form extras; None unless a caller supplies a dict
 class PoCAnalyzer:
    """Advanced analyzer for extracting security indicators from PoC code."""
    def __init__(self):
        self.indicators: List[SecurityIndicator] = []
        self.language_patterns = self._initialize_language_patterns()
        self.attack_patterns = self._initialize_attack_patterns()
        self.false_positive_filters = self._initialize_fp_filters()
    def analyze_poc(self, poc_content: str, cve_id: str = None) -> Dict[str, any]:
        """
        Main analysis function that extracts all security indicators.
        Args:
            poc_content: The PoC source code
            cve_id: Optional CVE identifier for context
        Returns:
            Dictionary containing categorized indicators and analysis
        """
        self.indicators = []
        # Detect programming language
        language = self._detect_language(poc_content)
        # Extract indicators by category
        processes = self._extract_process_indicators(poc_content, language)
        files = self._extract_file_indicators(poc_content, language)
        network = self._extract_network_indicators(poc_content, language)
        registry = self._extract_registry_indicators(poc_content, language)
        commands = self._extract_command_indicators(poc_content, language)
        # Extract encoded/obfuscated content
        decoded_content = self._extract_encoded_content(poc_content)
        if decoded_content:
            # Recursively analyze decoded content
            for content in decoded_content:
                sub_analysis = self.analyze_poc(content)
                processes.extend(sub_analysis['processes'])
                files.extend(sub_analysis['files'])
                network.extend(sub_analysis['network'])
                registry.extend(sub_analysis['registry'])
                commands.extend(sub_analysis['commands'])
        # Behavioral analysis
        behaviors = self._analyze_attack_behaviors(poc_content, language)
        # MITRE ATT&CK technique mapping
        techniques = self._map_to_mitre_attack(
            processes + files + network + registry + commands
        )
        # Quality assessment
        analysis_quality = self._assess_analysis_quality(poc_content)
        return {
            'language': language,
            'processes': self._deduplicate_and_rank(processes),
            'files': self._deduplicate_and_rank(files),
            'network': self._deduplicate_and_rank(network),
            'registry': self._deduplicate_and_rank(registry),
            'commands': self._deduplicate_and_rank(commands),
            'behaviors': behaviors,
            'mitre_techniques': techniques,
            'quality_score': analysis_quality,
            'total_indicators': len(self.indicators),
            'high_confidence_indicators': len([i for i in self.indicators if i.confidence > 0.7])
        }
    def _detect_language(self, content: str) -> str:
        """Detect the primary programming language of the PoC."""
        language_indicators = {
            'powershell': [
                r'\$[a-zA-Z_][a-zA-Z0-9_]*', r'Get-\w+', r'Set-\w+', r'New-\w+',
                r'Invoke-\w+', r'Add-Type', r'\[System\.\w+\]'
            ],
            'python': [
                r'import\s+\w+', r'from\s+\w+\s+import', r'def\s+\w+\(',
                r'subprocess\.', r'os\.system', r'__name__\s*==\s*["\']__main__["\']'
            ],
            'bash': [
                r'#!/bin/bash', r'#!/bin/sh', r'\$\{[^}]+\}', r'chmod\s+\+x',
                r'wget\s+', r'curl\s+', r'echo\s+.*\|'
            ],
            'batch': [
                r'@echo\s+off', r'%[^%]+%', r'goto\s+\w+', r'if\s+exist',
                r'cmd\s*/c', r'start\s+'
            ],
            'c_cpp': [
                r'#include\s*<[^>]+>', r'int\s+main\s*\(', r'printf\s*\(',
                r'malloc\s*\(', r'free\s*\(', r'system\s*\('
            ],
            'csharp': [
                r'using\s+System', r'namespace\s+\w+', r'class\s+\w+',
                r'Process\.Start', r'Registry\.', r'new\s+ProcessStartInfo'
            ],
            'javascript': [
                r'function\s+\w+\s*\(', r'var\s+\w+\s*=', r'console\.log',
                r'require\s*\(', r'=>', r'new\s+XMLHttpRequest'
            ],
            'php': [
                r'<\?php', r'\$[a-zA-Z_][a-zA-Z0-9_]*', r'echo\s+',
                r'exec\s*\(', r'system\s*\(', r'shell_exec'
            ]
        }
        scores = {}
        content_lower = content.lower()
        for lang, patterns in language_indicators.items():
            score = 0
            for pattern in patterns:
                matches = len(re.findall(pattern, content, re.IGNORECASE | re.MULTILINE))
                score += matches
            scores[lang] = score
        if not scores or max(scores.values()) == 0:
            return 'unknown'
        return max(scores, key=scores.get)
    def _extract_process_indicators(self, content: str, language: str) -> List[SecurityIndicator]:
        """Extract process execution indicators."""
        indicators = []
        patterns = {
            'powershell': [
                r'Start-Process\s+["\']?([^"\';\s]+)',
                r'Invoke-Expression\s+["\']?([^"\';\s]+)',
                r'&\s+["\']?([^"\';\s]+\.exe)',
                r'\.\s+["\']?([^"\';\s]+\.exe)'
            ],
            'python': [
                r'subprocess\.call\(\s*["\']([^"\']+)',
                r'subprocess\.Popen\(\s*["\']([^"\']+)',
                r'os\.system\(\s*["\']([^"\']+)',
                r'os\.exec[vl]?p?\(\s*["\']([^"\']+)'
            ],
            'bash': [
                r'exec\s+([^;\s&|]+)',
                r'/bin/sh\s+-c\s+["\']([^"\']+)',
                r'system\(\s*["\']([^"\']+)'
            ],
            'batch': [
                r'start\s+["\']?([^"\';\s]+)',
                r'cmd\s*/c\s+["\']?([^"\']+)',
                r'call\s+["\']?([^"\';\s]+)'
            ],
            'c_cpp': [
                r'system\(\s*["\']([^"\']+)',
                r'execve?\(\s*["\']([^"\']+)',
                r'CreateProcess[AW]?\([^,]*["\']([^"\']+)'
            ],
            'csharp': [
                r'Process\.Start\(\s*["\']([^"\']+)',
                r'ProcessStartInfo.*FileName\s*=\s*["\']([^"\']+)',
                r'new\s+Process.*["\']([^"\']+)'
            ]
        }
        if language in patterns:
            for pattern in patterns[language]:
                matches = re.finditer(pattern, content, re.IGNORECASE | re.MULTILINE)
                for match in matches:
                    process_name = match.group(1)
                    context = self._get_context(content, match.start(), match.end())
                    confidence = self._calculate_confidence(process_name, 'process', context)
                    if confidence > 0.3:  # Filter low confidence matches
                        indicators.append(SecurityIndicator(
                            type='process',
                            value=process_name,
                            confidence=confidence,
                            context=context,
                            attack_technique=AttackTechnique.PROCESS_INJECTION if 'inject' in context.lower() else AttackTechnique.COMMAND_EXECUTION
                        ))
        return indicators
    def _extract_file_indicators(self, content: str, language: str) -> List[SecurityIndicator]:
        """Extract file system indicators."""
        indicators = []
        # File path patterns
        file_patterns = [
            r'["\']([a-zA-Z]:\\[^"\'<>|*?]+\.[a-zA-Z0-9]+)["\']',  # Windows paths
            r'["\']([/][^"\'<>|*?\s]+\.[a-zA-Z0-9]+)["\']',       # Unix paths
            r'["\'](\./[^"\'<>|*?\s]+\.[a-zA-Z0-9]+)["\']',       # Relative paths
            r'%TEMP%\\([^"\'<>|*?\s]+\.[a-zA-Z0-9]+)',            # Windows temp
            r'/tmp/([^"\'<>|*?\s]+\.[a-zA-Z0-9]+)',               # Unix temp
        ]
        # Language-specific file operations
        operation_patterns = {
            'powershell': [
                r'New-Item.*Path.*["\']([^"\']+)["\']',
                r'Out-File.*["\']([^"\']+)["\']',
                r'Set-Content.*["\']([^"\']+)["\']',
                r'\|\s*Out-File\s+["\']([^"\']+)["\']'
            ],
            'python': [
                r'open\(\s*["\']([^"\']+)["\']',
                r'with\s+open\(\s*["\']([^"\']+)["\']',
                r'shutil\.copy.*["\']([^"\']+)["\']'
            ],
            'bash': [
                r'touch\s+["\']?([^"\';\s]+)',
                r'cp\s+[^"\';\s]+\s+["\']?([^"\';\s]+)',
                r'mv\s+[^"\';\s]+\s+["\']?([^"\';\s]+)',
                r'echo.*>\s*["\']?([^"\';\s]+)'
            ],
            'c_cpp': [
                r'fopen\(\s*["\']([^"\']+)["\']',
                r'CreateFile[AW]?\([^,]*["\']([^"\']+)["\']',
                r'WriteFile.*["\']([^"\']+)["\']'
            ]
        }
        # Extract file paths
        for pattern in file_patterns:
            matches = re.finditer(pattern, content, re.IGNORECASE)
            for match in matches:
                file_path = match.group(1)
                context = self._get_context(content, match.start(), match.end())
                confidence = self._calculate_confidence(file_path, 'file', context)
                if confidence > 0.4:
                    indicators.append(SecurityIndicator(
                        type='file',
                        value=file_path,
                        confidence=confidence,
                        context=context,
                        attack_technique=AttackTechnique.FILE_CREATION
                    ))
        # Extract file operations
        if language in operation_patterns:
            for pattern in operation_patterns[language]:
                matches = re.finditer(pattern, content, re.IGNORECASE | re.MULTILINE)
                for match in matches:
                    file_path = match.group(1)
                    context = self._get_context(content, match.start(), match.end())
                    confidence = self._calculate_confidence(file_path, 'file', context)
                    if confidence > 0.4:
                        indicators.append(SecurityIndicator(
                            type='file',
                            value=file_path,
                            confidence=confidence,
                            context=context,
                            attack_technique=AttackTechnique.FILE_CREATION
                        ))
        return indicators
    def _extract_network_indicators(self, content: str, language: str) -> List[SecurityIndicator]:
        """Extract network communication indicators."""
        indicators = []
        # Network patterns
        network_patterns = [
            r'(?:http[s]?://)([^/\s"\']+)',                    # URLs
            r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})',         # IP addresses
            r':(\d{2,5})\b',                                  # Port numbers
            r'Host:\s*([^\s\r\n]+)',                          # HTTP Host headers
            r'User-Agent:\s*([^\r\n]+)',                      # User agents
        ]
        # Language-specific network operations
        operation_patterns = {
            'powershell': [
                r'Invoke-WebRequest.*Uri.*["\']([^"\']+)["\']',
                r'New-Object.*WebClient.*DownloadString.*["\']([^"\']+)["\']',
                r'System\.Net\.Sockets\.TcpClient.*(\d+)',
                r'Connect.*(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}).*(\d+)'
            ],
            'python': [
                r'requests\.get\(\s*["\']([^"\']+)["\']',
                r'urllib\.request\.urlopen\(\s*["\']([^"\']+)["\']',
                r'socket\.connect\(\s*\(["\']([^"\']+)["\'],\s*(\d+)',
                r'http\.client\.HTTPConnection\(\s*["\']([^"\']+)["\']'
            ],
            'bash': [
                r'wget\s+["\']?([^"\';\s]+)',
                r'curl\s+["\']?([^"\';\s]+)',
                r'nc\s+([^\s]+)\s+(\d+)',
                r'netcat\s+([^\s]+)\s+(\d+)'
            ],
            'c_cpp': [
                r'connect\([^,]*inet_addr\(["\']([^"\']+)["\']',
                r'gethostbyname\(["\']([^"\']+)["\']',
                r'socket\(.*SOCK_STREAM'
            ]
        }
        # Extract network indicators
        for pattern in network_patterns:
            matches = re.finditer(pattern, content, re.IGNORECASE)
            for match in matches:
                network_indicator = match.group(1) if len(match.groups()) > 0 else match.group(0)
                context = self._get_context(content, match.start(), match.end())
                confidence = self._calculate_confidence(network_indicator, 'network', context)
                if confidence > 0.3:
                    indicators.append(SecurityIndicator(
                        type='network',
                        value=network_indicator,
                        confidence=confidence,
                        context=context,
                        attack_technique=AttackTechnique.NETWORK_CONNECTION
                    ))
        return indicators
    def _extract_registry_indicators(self, content: str, language: str) -> List[SecurityIndicator]:
        """Extract Windows registry indicators."""
        indicators = []
        # Registry key patterns
        registry_patterns = [
            r'(HKEY_[A-Z_]+\\[^"\';\s\]]+)',
            r'(HKLM\\[^"\';\s\]]+)',
            r'(HKCU\\[^"\';\s\]]+)',
            r'(SOFTWARE\\[^"\';\s\]]+)',
            r'(SYSTEM\\[^"\';\s\]]+)'
        ]
        # Language-specific registry operations
        operation_patterns = {
            'powershell': [
                r'New-ItemProperty.*Path.*["\']([^"\']+)["\']',
                r'Set-ItemProperty.*Path.*["\']([^"\']+)["\']',
                r'Get-ItemProperty.*Path.*["\']([^"\']+)["\']',
                r'Remove-ItemProperty.*Path.*["\']([^"\']+)["\']'
            ],
            'batch': [
                r'reg\s+add\s+["\']?([^"\';\s]+)',
                r'reg\s+query\s+["\']?([^"\';\s]+)',
                r'reg\s+delete\s+["\']?([^"\';\s]+)'
            ],
            'c_cpp': [
                r'RegCreateKey[Ex]?[AW]?.*["\']([^"\']+)["\']',
                r'RegSetValue[Ex]?[AW]?.*["\']([^"\']+)["\']',
                r'RegOpenKey[Ex]?[AW]?.*["\']([^"\']+)["\']'
            ],
            'csharp': [
                r'Registry\.[^.]+\.OpenSubKey\(["\']([^"\']+)["\']',
                r'RegistryKey.*["\']([^"\']+)["\']'
            ]
        }
        # Extract registry keys
        for pattern in registry_patterns:
            matches = re.finditer(pattern, content, re.IGNORECASE)
            for match in matches:
                reg_key = match.group(1)
                context = self._get_context(content, match.start(), match.end())
                confidence = self._calculate_confidence(reg_key, 'registry', context)
                if confidence > 0.4:
                    indicators.append(SecurityIndicator(
                        type='registry',
                        value=reg_key,
                        confidence=confidence,
                        context=context,
                        attack_technique=AttackTechnique.REGISTRY_MODIFICATION
                    ))
        return indicators
    def _extract_command_indicators(self, content: str, language: str) -> List[SecurityIndicator]:
        """Extract command-line execution indicators."""
        indicators = []
        # Command patterns
        command_patterns = [
            r'(?:cmd|powershell|bash|sh)\s+[/-]c\s+["\']?([^"\';\n]+)',
            r'(?:system|exec|shell_exec)\(\s*["\']([^"\']+)["\']',
            r'[`]([^`]+)[`]',  # Backticks
            r'\$\(([^)]+)\)',  # Command substitution
        ]
        for pattern in command_patterns:
            matches = re.finditer(pattern, content, re.IGNORECASE | re.MULTILINE)
            for match in matches:
                command = match.group(1)
                context = self._get_context(content, match.start(), match.end())
                confidence = self._calculate_confidence(command, 'command', context)
                if confidence > 0.4:
                    # Determine attack technique based on command content
                    technique = AttackTechnique.COMMAND_EXECUTION
                    if 'powershell' in command.lower():
                        technique = AttackTechnique.POWERSHELL
                    elif any(word in command.lower() for word in ['cmd', 'bat', 'com']):
                        technique = AttackTechnique.COMMAND_LINE
                    indicators.append(SecurityIndicator(
                        type='command',
                        value=command,
                        confidence=confidence,
                        context=context,
                        attack_technique=technique
                    ))
        return indicators
    def _extract_encoded_content(self, content: str) -> List[str]:
        """Extract and decode obfuscated/encoded content."""
        decoded_content = []
        # Base64 patterns
        base64_patterns = [
            r'["\']([A-Za-z0-9+/]{20,}={0,2})["\']',  # Base64 strings
            r'FromBase64String\(["\']([^"\']+)["\']',   # PowerShell
            r'base64\.b64decode\(["\']([^"\']+)["\']',  # Python
        ]
        for pattern in base64_patterns:
            matches = re.finditer(pattern, content, re.IGNORECASE)
            for match in matches:
                try:
                    encoded_str = match.group(1)
                    if len(encoded_str) > 20:  # Only decode substantial content
                        decoded = base64.b64decode(encoded_str + '===').decode('utf-8', errors='ignore')
                        if decoded and len(decoded) > 10:
                            decoded_content.append(decoded)
                except:
                    continue
        # Hex patterns
        hex_patterns = [
            r'0x([0-9a-fA-F]{20,})',
            r'["\']([0-9a-fA-F]{20,})["\']'
        ]
        for pattern in hex_patterns:
            matches = re.finditer(pattern, content)
            for match in matches:
                try:
                    hex_str = match.group(1)
                    if len(hex_str) % 2 == 0 and len(hex_str) > 20:
                        decoded = binascii.unhexlify(hex_str).decode('utf-8', errors='ignore')
                        if decoded and len(decoded) > 10:
                            decoded_content.append(decoded)
                except:
                    continue
        return decoded_content
    def _calculate_confidence(self, indicator: str, indicator_type: str, context: str) -> float:
        """Calculate confidence score for an indicator."""
        confidence = 0.5  # Base confidence
        # Length and complexity scoring
        if len(indicator) > 5:
            confidence += 0.1
        if len(indicator) > 20:
            confidence += 0.1
        # Context-based scoring
        high_confidence_keywords = [
            'exploit', 'payload', 'shell', 'inject', 'execute', 'run',
            'attack', 'malware', 'backdoor', 'trojan', 'virus'
        ]
        context_lower = context.lower()
        for keyword in high_confidence_keywords:
            if keyword in context_lower:
                confidence += 0.1
                break
        # Type-specific scoring
        if indicator_type == 'process':
            if indicator.endswith('.exe') or indicator.endswith('.dll'):
                confidence += 0.2
            if any(word in indicator.lower() for word in ['cmd', 'powershell', 'bash', 'sh']):
                confidence += 0.1
        elif indicator_type == 'file':
            if any(ext in indicator.lower() for ext in ['.exe', '.dll', '.bat', '.ps1', '.sh']):
                confidence += 0.2
            if any(path in indicator.lower() for path in ['temp', 'tmp', 'appdata']):
                confidence += 0.1
        elif indicator_type == 'network':
            if re.match(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', indicator):
                confidence += 0.2
            if any(tld in indicator.lower() for tld in ['.com', '.net', '.org', '.ru', '.cn']):
                confidence += 0.1
        # Apply false positive filters
        if self._is_false_positive(indicator, indicator_type):
            confidence *= 0.3
        return min(confidence, 1.0)
    def _is_false_positive(self, indicator: str, indicator_type: str) -> bool:
        """Check if indicator is likely a false positive."""
        if indicator_type in self.false_positive_filters:
            fp_patterns = self.false_positive_filters[indicator_type]
            for pattern in fp_patterns:
                if re.search(pattern, indicator, re.IGNORECASE):
                    return True
        return False
    def _get_context(self, content: str, start: int, end: int, window: int = 100) -> str:
        """Get context around a match."""
        context_start = max(0, start - window)
        context_end = min(len(content), end + window)
        return content[context_start:context_end].strip()
    def _deduplicate_and_rank(self, indicators: List[SecurityIndicator]) -> List[Dict]:
        """Remove duplicates and rank indicators by confidence."""
        # Deduplicate by value
        seen = set()
        unique_indicators = []
        for indicator in sorted(indicators, key=lambda x: x.confidence, reverse=True):
            if indicator.value not in seen:
                seen.add(indicator.value)
                unique_indicators.append(indicator)
        # Convert to dict format and return top indicators
        return [
            {
                'value': ind.value,
                'confidence': round(ind.confidence, 2),
                'context': ind.context[:200] + '...' if len(ind.context) > 200 else ind.context,
                'attack_technique': ind.attack_technique.value if ind.attack_technique else None
            }
            for ind in unique_indicators[:10]  # Top 10 indicators
        ]
    def _analyze_attack_behaviors(self, content: str, language: str) -> List[Dict]:
        """Analyze attack behaviors and patterns."""
        behaviors = []
        behavior_patterns = {
            'persistence': [
                r'(?:startup|autorun|registry.*run)',
                r'(?:scheduled.*task|cron|at\s+\d)',
                r'(?:service.*create|sc.*create)'
            ],
            'defense_evasion': [
                r'(?:disable.*antivirus|kill.*av)',
                r'(?:encode|encrypt|obfuscat)',
                r'(?:hide|stealth|invisible)'
            ],
            'credential_access': [
                r'(?:password|credential|token)',
                r'(?:keylog|steal.*key)',
                r'(?:mimikatz|lsass)'
            ],
            'lateral_movement': [
                r'(?:psexec|wmi.*exec|remote.*exec)',
                r'(?:net\s+use|mount|smb)',
                r'(?:ssh|rdp|vnc)'
            ],
            'exfiltration': [
                r'(?:upload|ftp|http.*post)',
                r'(?:compress|zip|archive)',
                r'(?:steal|exfil|extract)'
            ]
        }
        content_lower = content.lower()
        for behavior, patterns in behavior_patterns.items():
            score = 0
            matches = []
            for pattern in patterns:
                pattern_matches = re.findall(pattern, content_lower)
                if pattern_matches:
                    score += len(pattern_matches)
                    matches.extend(pattern_matches)
            if score > 0:
                behaviors.append({
                    'behavior': behavior,
                    'confidence': min(score * 0.2, 1.0),
                    'indicators': matches[:5]  # Top 5 matches
                })
        return sorted(behaviors, key=lambda x: x['confidence'], reverse=True)
    def _map_to_mitre_attack(self, indicators: List[SecurityIndicator]) -> List[str]:
        """Map indicators to MITRE ATT&CK techniques."""
        techniques = set()
        for indicator in indicators:
            if indicator.attack_technique:
                techniques.add(indicator.attack_technique.value)
        return sorted(list(techniques))
    def _assess_analysis_quality(self, content: str) -> Dict[str, any]:
        """Assess the quality and completeness of the analysis."""
        # Content metrics
        lines = len(content.split('\n'))
        chars = len(content)
        # Indicator density
        total_indicators = len(self.indicators)
        high_conf_indicators = len([i for i in self.indicators if i.confidence > 0.7])
        # Calculate quality score
        content_score = min(lines / 50, 1.0) * 0.3  # More lines = better
        indicator_score = min(total_indicators / 20, 1.0) * 0.4  # More indicators = better
        confidence_score = (high_conf_indicators / max(total_indicators, 1)) * 0.3  # Higher confidence = better
        overall_score = content_score + indicator_score + confidence_score
        return {
            'overall_score': round(overall_score, 2),
            'content_lines': lines,
            'content_chars': chars,
            'total_indicators': total_indicators,
            'high_confidence_indicators': high_conf_indicators,
            'recommendation': self._get_quality_recommendation(overall_score)
        }
    def _get_quality_recommendation(self, score: float) -> str:
        """Get recommendation based on quality score."""
        if score >= 0.8:
            return "High quality PoC with excellent indicator extraction"
        elif score >= 0.6:
            return "Good quality PoC with adequate indicators"
        elif score >= 0.4:
            return "Moderate quality PoC, may need additional analysis"
        else:
            return "Low quality PoC, limited indicators extracted"
    def _initialize_language_patterns(self) -> Dict:
        """Initialize language-specific patterns."""
        return {
            # Patterns for different languages will be expanded
        }
    def _initialize_attack_patterns(self) -> Dict:
        """Initialize attack pattern recognition."""
        return {
            # Attack patterns will be expanded
        }
    def _initialize_fp_filters(self) -> Dict:
        """Initialize false positive filters."""
        return {
            'process': [
                r'^(explorer|notepad|calc|windir|system32)\.exe$',
                r'^[a-z]$',  # Single characters
                r'^\d+$'     # Pure numbers
            ],
            'file': [
                r'^[a-z]$',
                r'^\d+$',
                r'^(con|aux|prn|nul)$'
            ],
            'network': [
                r'^(localhost|127\.0\.0\.1|0\.0\.0\.0)$',
                r'^\d{1,2}$',  # Port numbers without context
                r'^(example\.com|test\.com|localhost)$'
            ]
        }
 # Example usage: run the analyzer on a small embedded sample and print the report.
 if __name__ == "__main__":
    # Example PoC content: an encoded command, a dropped file and a socket connection.
    demo_source = """
    import subprocess
    import base64
    # CVE-2024-1234 exploit
    payload = base64.b64decode("Y21kIC9jIGVjaG8gSGVsbG8gV29ybGQ=")
    subprocess.call("powershell.exe -enc " + payload.decode(), shell=True)
    # Create persistence
    with open("C:\\temp\\malware.exe", "wb") as f:
        f.write(malicious_bytes)
    # Network connection
    import socket
    s = socket.socket()
    s.connect(("192.168.1.100", 4444))
    """
    report = PoCAnalyzer().analyze_poc(demo_source, "CVE-2024-1234")
    print(f"Analysis result: {report}")