Files
STEGOSAURUS-WRECKS/analysis_tools.py
T
2026-03-29 21:43:15 -07:00

1666 lines
52 KiB
Python

"""
STEGOSAURUS WRECKS - Comprehensive Analysis Tools
Complete toolkit for steganography detection across all file types
This module provides 264+ analysis functions covering:
- Images: PNG, JPEG, GIF, BMP, WebP, TIFF, ICO, HEIC, AVIF, SVG
- Audio: WAV, MP3, FLAC, OGG
- Video: AVI, MKV
- Documents: PDF, Office
- Archives: ZIP, RAR
- Fonts: TTF, OTF, WOFF
"""
import struct
import zlib
import io
import re
import json
import hashlib
import binascii
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple, Union, BinaryIO
from dataclasses import dataclass, field
from enum import Enum
import math
# Optional imports - gracefully handle missing dependencies
try:
import numpy as np
HAS_NUMPY = True
except ImportError:
HAS_NUMPY = False
try:
from PIL import Image, ExifTags
HAS_PIL = True
except ImportError:
HAS_PIL = False
# ============== CORE INFRASTRUCTURE ==============
@dataclass
class AnalysisResult:
    """Standard result format for all analysis functions."""
    success: bool
    action: str
    file_type: str
    data: Dict[str, Any] = field(default_factory=dict)
    findings: List[str] = field(default_factory=list)
    suspicious: bool = False
    confidence: float = 0.0
    raw_data: Optional[bytes] = None
    error: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict; raw bytes are reported only as a presence flag."""
        serialized = dict(
            success=self.success,
            action=self.action,
            file_type=self.file_type,
            data=self.data,
            findings=self.findings,
            suspicious=self.suspicious,
            confidence=self.confidence,
        )
        # Raw payloads may be large/binary; expose only whether one exists.
        serialized["has_raw_data"] = self.raw_data is not None
        serialized["error"] = self.error
        return serialized
class FileType(Enum):
    """Canonical identifiers for every file format this toolkit recognizes."""
    PNG = "png"
    JPEG = "jpeg"
    GIF = "gif"
    BMP = "bmp"
    WEBP = "webp"
    TIFF = "tiff"
    ICO = "ico"
    HEIC = "heic"
    AVIF = "avif"
    SVG = "svg"
    WAV = "wav"
    MP3 = "mp3"
    FLAC = "flac"
    OGG = "ogg"
    AVI = "avi"
    MKV = "mkv"
    PDF = "pdf"
    OFFICE = "office"
    ZIP = "zip"
    RAR = "rar"
    FONT = "font"
    AIFF = "aiff"
    AU = "au"
    MIDI = "midi"
    PCAP = "pcap"
    SQLITE = "sqlite"
    GZIP = "gzip"
    TAR = "tar"
    UNKNOWN = "unknown"
# Magic bytes for file type detection: prefix -> FileType, checked with
# bytes.startswith() by detect_file_type(). Ambiguous containers (RIFF,
# PK zip) get additional disambiguation there.
MAGIC_SIGNATURES = {
    b'\x89PNG\r\n\x1a\n': FileType.PNG,
    b'\xff\xd8\xff': FileType.JPEG,
    b'GIF87a': FileType.GIF,
    b'GIF89a': FileType.GIF,
    b'BM': FileType.BMP,
    b'RIFF': FileType.WAV,  # Could also be AVI/WebP - check subtype further
    b'\xff\xfb': FileType.MP3,  # MPEG audio frame sync variants
    b'\xff\xfa': FileType.MP3,
    b'\xff\xf3': FileType.MP3,
    b'\xff\xf2': FileType.MP3,
    b'ID3': FileType.MP3,  # ID3v2 tag preceding MPEG frames
    b'fLaC': FileType.FLAC,
    b'OggS': FileType.OGG,
    b'%PDF': FileType.PDF,
    b'PK\x03\x04': FileType.ZIP,  # Could be Office - check further
    b'Rar!\x1a\x07': FileType.RAR,
    b'\x1aE\xdf\xa3': FileType.MKV,  # EBML header (Matroska/WebM)
    b'\x00\x00\x01\x00': FileType.ICO,
    b'\x00\x00\x02\x00': FileType.ICO,  # CUR format
    b'\x1f\x8b': FileType.GZIP,
    b'MThd': FileType.MIDI,
    b'.snd': FileType.AU,
    b'\xa1\xb2\xc3\xd4': FileType.PCAP,  # Big-endian PCAP
    b'\xd4\xc3\xb2\xa1': FileType.PCAP,  # Little-endian PCAP
    b'SQLite format 3': FileType.SQLITE,
}
# Secondary markers for formats whose signature is not a simple offset-0
# prefix (RIFF subtype / ISO-BMFF 'ftyp' brands); see detect_file_type().
WEBP_SIGNATURES = [b'WEBP']
HEIC_SIGNATURES = [b'ftyp', b'heic', b'heix', b'hevc', b'mif1']
AVIF_SIGNATURES = [b'ftypavif', b'ftypavis']
def detect_file_type(data: bytes) -> FileType:
    """Detect file type from magic bytes.

    Tries the prefix signatures in MAGIC_SIGNATURES first, with extra
    disambiguation for RIFF containers (WAV/AVI/WebP) and ZIP-based Office
    documents, then falls back to formats whose signature is not a plain
    prefix: ISO-BMFF ftyp brands (HEIC/AVIF), TIFF, AIFF, TAR, SVG, fonts.

    Args:
        data: Leading bytes of the file (at least 12; 265+ needed for TAR).

    Returns:
        The matched FileType, or FileType.UNKNOWN.
    """
    if len(data) < 12:
        return FileType.UNKNOWN
    # Check standard signatures
    for magic, ftype in MAGIC_SIGNATURES.items():
        if data.startswith(magic):
            # Special handling for RIFF container: subtype at offset 8
            # distinguishes WAV / AVI / WebP. An unrecognized subtype falls
            # through and is reported as the table default (WAV).
            if magic == b'RIFF' and len(data) >= 12:
                if data[8:12] == b'WAVE':
                    return FileType.WAV
                elif data[8:12] == b'AVI ':
                    return FileType.AVI
                elif data[8:12] == b'WEBP':
                    return FileType.WEBP
            # Special handling for ZIP-based formats
            elif magic == b'PK\x03\x04':
                # Office (docx/xlsx/pptx) is a ZIP with well-known entry
                # names; only the first 2000 bytes are searched, so a file
                # whose entries appear deeper is still reported as ZIP.
                if b'[Content_Types].xml' in data[:2000] or b'word/' in data[:2000] or b'xl/' in data[:2000] or b'ppt/' in data[:2000]:
                    return FileType.OFFICE
                return FileType.ZIP
            return ftype
    # Check for HEIC/AVIF (ISO-BMFF 'ftyp' box at offset 4)
    if len(data) >= 12 and data[4:8] == b'ftyp':
        brand = data[8:12]
        if brand in [b'heic', b'heix', b'hevc', b'mif1']:
            return FileType.HEIC
        elif brand in [b'avif', b'avis']:
            return FileType.AVIF
    # Check for TIFF (II = little-endian, MM = big-endian)
    if data[:4] in [b'II\x2a\x00', b'MM\x00\x2a']:
        return FileType.TIFF
    # Check for AIFF (FORM container with AIFF/AIFC type at offset 8)
    if data[:4] == b'FORM' and len(data) >= 12:
        if data[8:12] == b'AIFF' or data[8:12] == b'AIFC':
            return FileType.AIFF
    # Check for TAR ('ustar' magic at offset 257)
    if len(data) >= 265 and data[257:262] == b'ustar':
        return FileType.TAR
    # Check for SVG. Precedence: '<svg' in the first 1000 bytes, OR an XML
    # declaration up front with '<svg' somewhere in the first 2000 bytes.
    if b'<svg' in data[:1000] or b'<?xml' in data[:100] and b'<svg' in data[:2000]:
        return FileType.SVG
    # Check for fonts (TrueType/OpenType sfnt variants, then WOFF/WOFF2)
    if data[:4] in [b'\x00\x01\x00\x00', b'OTTO', b'true', b'typ1']:
        return FileType.FONT
    if data[:4] == b'wOFF' or data[:4] == b'wOF2':
        return FileType.FONT
    return FileType.UNKNOWN
def calculate_entropy(data: bytes) -> float:
    """Return the Shannon entropy of *data* in bits per byte (0.0 to 8.0).

    Empty input yields 0.0.
    """
    if not data:
        return 0.0
    total = len(data)
    # Histogram of byte values actually present in the input.
    frequencies = {}
    for value in data:
        frequencies[value] = frequencies.get(value, 0) + 1
    entropy = 0.0
    for count in frequencies.values():
        probability = count / total
        entropy -= probability * math.log2(probability)
    return entropy
def calculate_chi_square(data: bytes) -> float:
    """Chi-square statistic of the byte histogram against a uniform distribution.

    Returns 0.0 for empty input; perfectly uniform data also scores 0.0,
    while skewed data scores higher.
    """
    if not data:
        return 0.0
    histogram = [0] * 256
    for value in data:
        histogram[value] += 1
    expected = len(data) / 256
    return sum((observed - expected) ** 2 / expected for observed in histogram)
def find_strings(data: bytes, min_length: int = 4) -> List[Tuple[int, str]]:
    """Extract maximal runs of printable ASCII (0x20-0x7E) from binary data.

    Returns (offset, text) pairs, in order of appearance, for every run of
    at least *min_length* characters.
    """
    # [\x20-\x7e] is exactly the "32 <= byte < 127" printable range.
    pattern = rb'[\x20-\x7e]{%d,}' % min_length
    return [(match.start(), match.group().decode('ascii'))
            for match in re.finditer(pattern, data)]
def hex_dump(data: bytes, offset: int = 0, length: int = 256) -> str:
    """Render up to *length* bytes starting at *offset* as a hex/ASCII dump.

    Each output line covers 16 bytes: an 8-digit hex address, the hex byte
    values padded to a 48-character column, and a printable-ASCII gutter
    (non-printable bytes shown as '.').
    """
    window = data[offset:offset + length]
    lines = []
    for base in range(0, len(window), 16):
        row = window[base:base + 16]
        hex_column = ' '.join(format(b, '02x') for b in row)
        ascii_column = ''.join(chr(b) if 32 <= b < 127 else '.' for b in row)
        lines.append(f'{offset + base:08x} {hex_column:<48} {ascii_column}')
    return '\n'.join(lines)
# ============== BIT PLANE ANALYSIS ==============
def extract_bit_plane(data: bytes, bit: int) -> bytes:
    """Extract bit plane *bit* (0 = LSB, 7 = MSB) from *data*, packed MSB-first.

    Every 8 input bytes produce one output byte; the final output byte is
    zero-padded when len(data) is not a multiple of 8. The pure-Python
    fallback packs bits MSB-first, matching np.packbits, so both code
    paths produce identical output.
    """
    if not HAS_NUMPY:
        # Fallback without numpy
        result = bytearray()
        for i in range(0, len(data), 8):
            byte_val = 0
            for j in range(8):
                if i + j < len(data):
                    # MSB-first packing to mirror np.packbits below.
                    byte_val |= ((data[i + j] >> bit) & 1) << (7 - j)
            result.append(byte_val)
        return bytes(result)
    arr = np.frombuffer(data, dtype=np.uint8)
    plane = (arr >> bit) & 1
    # Pack bits into bytes (zero-pad to a multiple of 8 first so packbits
    # keeps the same alignment as the fallback path).
    padded = np.pad(plane, (0, (8 - len(plane) % 8) % 8), mode='constant')
    packed = np.packbits(padded)
    return packed.tobytes()
def analyze_bit_planes(data: bytes) -> Dict[str, Any]:
    """Summarize all 8 bit planes of *data* (entropy, byte diversity, sample)."""
    def _describe(plane: bytes) -> Dict[str, Any]:
        # Per-plane statistics used to spot unusually random low planes.
        return {
            'entropy': calculate_entropy(plane),
            'unique_bytes': len(set(plane)),
            'sample': plane[:64].hex(),
        }
    return {f'plane_{bit}': _describe(extract_bit_plane(data, bit))
            for bit in range(8)}
# ============== LSB EXTRACTION ==============
def extract_lsb(data: bytes, bits: int = 1, channels: str = "RGB") -> bytes:
    """Extract the low *bits* bits of every byte and repack them into bytes.

    Bits are taken least-significant first from each input byte and packed
    LSB-first into the output stream; a trailing partial output byte is
    zero-padded.

    Args:
        data: Raw sample/pixel bytes.
        bits: Number of low-order bits to take from each byte (1-8).
        channels: Unused for raw byte input; kept for interface
            compatibility with the image-aware extractors, which use it to
            select pixel channels.

    Returns:
        The packed extracted bit stream (b'' for empty input).
    """
    if not data:
        return b''
    extracted_bits = []
    for byte in data:
        for bit_pos in range(bits):
            extracted_bits.append((byte >> bit_pos) & 1)
    # Pack the bit stream LSB-first, zero-padding the final byte.
    # (Removed an unused `mask` local left over from an earlier approach.)
    result = bytearray()
    for i in range(0, len(extracted_bits), 8):
        byte_val = 0
        for j, bit in enumerate(extracted_bits[i:i + 8]):
            byte_val |= bit << j
        result.append(byte_val)
    return bytes(result)
# ============== PATTERN DETECTION ==============
def detect_repeated_patterns(data: bytes, min_length: int = 4, max_length: int = 32) -> List[Dict[str, Any]]:
    """Find byte substrings of length min_length..max_length occurring 3+ times.

    Returns up to 20 pattern records (hex pattern, length, occurrence count,
    first 10 offsets), most frequent first.
    """
    found: List[Dict[str, Any]] = []
    upper = min(max_length, len(data) // 2)
    for size in range(min_length, upper + 1):
        # Map each substring of this size to every offset where it occurs.
        occurrences: Dict[bytes, List[int]] = {}
        for start in range(len(data) - size + 1):
            occurrences.setdefault(data[start:start + size], []).append(start)
        for candidate, positions in occurrences.items():
            if len(positions) >= 3:  # At least 3 occurrences
                found.append({
                    'pattern': candidate.hex(),
                    'length': size,
                    'count': len(positions),
                    'offsets': positions[:10],  # First 10 offsets
                })
    found.sort(key=lambda record: record['count'], reverse=True)
    return found[:20]
def detect_xor_patterns(data: bytes) -> Dict[str, Any]:
    """Heuristically detect single-byte and short repeating-key XOR encryption.

    Sweeps all 256 single-byte keys over the first 256 bytes, reporting keys
    whose output is >70% printable; then estimates repeating-key lengths
    (2-16) by checking for below-random per-column entropy over the first
    1 KiB (each column of a repeating-key XOR is a single-byte XOR).
    """
    report = {
        'single_byte_keys': [],
        'repeating_key_likely': False,
        'key_length_candidates': []
    }
    sample = data[:256]
    # Single-byte key sweep: mostly-printable output suggests XOR'd text.
    for candidate in range(256):
        plain = bytes(value ^ candidate for value in sample)
        printable_count = sum(1 for value in plain
                              if 32 <= value < 127 or value in (9, 10, 13))
        if printable_count > len(plain) * 0.7:
            report['single_byte_keys'].append({
                'key': candidate,
                'key_hex': f'{candidate:02x}',
                'printable_ratio': printable_count / len(plain),
                'sample': plain[:50].decode('ascii', errors='replace')
            })
    # Repeating-key estimate via column entropy at each candidate stride.
    window = data[:1024]
    for key_len in range(2, 17):
        columns = [window[start::key_len] for start in range(key_len)]
        avg_entropy = sum(calculate_entropy(column) for column in columns) / key_len
        if avg_entropy < 5.0:  # Lower than random
            report['key_length_candidates'].append({
                'length': key_len,
                'avg_column_entropy': avg_entropy
            })
    report['repeating_key_likely'] = bool(report['key_length_candidates'])
    return report
# ============== ENCODING DETECTION ==============
def detect_base64(data: bytes) -> Dict[str, Any]:
"""Detect and decode potential Base64 encoded content"""
results = {
'found': False,
'segments': []
}
# Base64 pattern
b64_pattern = rb'[A-Za-z0-9+/]{20,}={0,2}'
text = data.decode('ascii', errors='ignore')
matches = re.finditer(r'[A-Za-z0-9+/]{20,}={0,2}', text)
for match in matches:
b64_str = match.group()
try:
# Try to decode
import base64
decoded = base64.b64decode(b64_str)
# Check if decoded content is meaningful
printable = sum(1 for b in decoded if 32 <= b < 127 or b in [9, 10, 13])
results['segments'].append({
'offset': match.start(),
'length': len(b64_str),
'decoded_length': len(decoded),
'printable_ratio': printable / len(decoded) if decoded else 0,
'decoded_preview': decoded[:100].decode('utf-8', errors='replace') if printable > len(decoded) * 0.5 else decoded[:50].hex()
})
results['found'] = True
except:
pass
return results
def detect_hex_strings(data: bytes) -> Dict[str, Any]:
    """Locate runs of 16+ hex digits (8+ byte pairs) and try to decode them.

    Reports each run's offset, length, decoded size, printable ratio, and a
    text preview when the decoded bytes are mostly printable (None otherwise).
    """
    report = {
        'found': False,
        'segments': []
    }
    text = data.decode('ascii', errors='ignore')
    # Match continuous even-length hex runs of at least 8 byte pairs.
    for match in re.finditer(r'(?:[0-9a-fA-F]{2}){8,}', text):
        run = match.group()
        try:
            decoded = bytes.fromhex(run)
        except ValueError:
            continue  # defensive: an even-length hex run always decodes
        printable = sum(1 for b in decoded if 32 <= b < 127 or b in (9, 10, 13))
        mostly_text = printable > len(decoded) * 0.5
        report['segments'].append({
            'offset': match.start(),
            'length': len(run),
            'decoded_length': len(decoded),
            'printable_ratio': printable / len(decoded) if decoded else 0,
            'decoded_preview': decoded[:100].decode('utf-8', errors='replace') if mostly_text else None,
        })
        report['found'] = True
    return report
def detect_unicode_steg(data: bytes) -> Dict[str, Any]:
    """Detect Unicode-based steganography carriers in *data*.

    Flags zero-width characters (ZWSP, ZWNJ, ZWJ, word joiner, BOM,
    Mongolian vowel separator) and variation selectors (U+FE00-U+FE0F),
    all of which render invisibly and are commonly used to hide bit streams
    in plain text.

    Returns:
        Dict with 'found', per-character 'zero_width_chars' records, a
        total 'invisible_chars' count, and a (currently unpopulated)
        'homoglyphs' list kept for interface stability.
    """
    results = {
        'found': False,
        'zero_width_chars': [],
        'homoglyphs': [],  # reserved: homoglyph detection not implemented yet
        'invisible_chars': 0
    }
    text = data.decode('utf-8', errors='ignore')
    # Named zero-width characters used by most text-steg tools.
    zwc_chars = {
        '\u200b': 'ZERO WIDTH SPACE',
        '\u200c': 'ZERO WIDTH NON-JOINER',
        '\u200d': 'ZERO WIDTH JOINER',
        '\u2060': 'WORD JOINER',
        '\ufeff': 'ZERO WIDTH NO-BREAK SPACE (BOM)',
        '\u180e': 'MONGOLIAN VOWEL SEPARATOR',
    }
    for char, name in zwc_chars.items():
        count = text.count(char)
        if count > 0:
            results['zero_width_chars'].append({
                'char': repr(char),
                'name': name,
                'count': count
            })
            results['invisible_chars'] += count
            results['found'] = True
    # Variation selectors: invisible, and each carries selectable state.
    # (Replaces a per-character loop with an unused index variable.)
    vs_count = sum(1 for char in text if '\ufe00' <= char <= '\ufe0f')
    if vs_count:
        results['invisible_chars'] += vs_count
        results['found'] = True
    return results
def detect_whitespace_steg(data: bytes) -> Dict[str, Any]:
    """Detect whitespace steganography (trailing space/tab bit encoding).

    Counts trailing whitespace per line and notes mixed space/tab
    indentation. When more than 10 trailing whitespace characters exist,
    attempts to decode them as a bit stream (space=0, tab=1, 8 bits per
    byte, MSB first, NUL-terminated) and reports the message if it decodes
    to printable text.
    """
    report = {
        'found': False,
        'trailing_spaces': 0,
        'mixed_indentation': False,
        'suspicious_patterns': [],
        'potential_message': None
    }
    lines = data.decode('utf-8', errors='ignore').split('\n')
    tabs_lead = sum(1 for ln in lines if ln.startswith('\t'))
    spaces_lead = sum(1 for ln in lines if ln.startswith(' '))
    report['trailing_spaces'] = sum(len(ln) - len(ln.rstrip()) for ln in lines)
    report['mixed_indentation'] = tabs_lead > 0 and spaces_lead > 0
    if report['trailing_spaces'] > 10:
        report['found'] = True
        # Decode trailing whitespace as bits: space -> 0, tab -> 1
        # (other trailing whitespace such as \r is skipped).
        bits = []
        for ln in lines:
            for ch in ln[len(ln.rstrip()):]:
                if ch == ' ':
                    bits.append('0')
                elif ch == '\t':
                    bits.append('1')
        if len(bits) >= 8:
            try:
                stream = ''.join(bits)
                hidden = bytearray()
                for pos in range(0, len(stream) - 7, 8):
                    value = int(stream[pos:pos + 8], 2)
                    if value == 0:  # NUL terminates the hidden message
                        break
                    hidden.append(value)
                message = bytes(hidden).decode('utf-8', errors='ignore')
                if message and all(32 <= ord(c) < 127 or c in '\r\n\t' for c in message):
                    report['potential_message'] = message[:200]
            except ValueError:
                pass
    return report
# ============== TOOL REGISTRY ==============
class AnalysisToolRegistry:
    """Maps action names to analysis callables and runs them uniformly.

    Results are normalized into AnalysisResult regardless of whether the
    underlying tool returns an AnalysisResult, a plain dict, or any other
    value; tool exceptions are captured into a failed result.
    """

    def __init__(self):
        self._tools: Dict[str, callable] = {}
        self._register_all_tools()

    def _register_all_tools(self):
        """Register the built-in, format-agnostic tools."""
        self._tools.update({
            # Core detection tools
            'detect_base64': detect_base64,
            'detect_hex_strings': detect_hex_strings,
            'detect_unicode_steg': detect_unicode_steg,
            'detect_whitespace_steg': detect_whitespace_steg,
            'detect_xor_patterns': detect_xor_patterns,
            'detect_repeated_patterns': detect_repeated_patterns,
            # Analysis tools
            'analyze_entropy': lambda data: {'entropy': calculate_entropy(data)},
            'analyze_bit_planes': analyze_bit_planes,
        })
        # Format-specific modules add their tools via register().

    def register(self, action: str, func: callable):
        """Register (or replace) a tool under *action*."""
        self._tools[action] = func

    def get(self, action: str) -> Optional[callable]:
        """Return the callable for *action*, or None if unregistered."""
        return self._tools.get(action)

    def execute(self, action: str, data: bytes, **kwargs) -> AnalysisResult:
        """Run *action* on *data*, normalizing any outcome to AnalysisResult."""
        func = self._tools.get(action)
        file_type = kwargs.get('file_type', 'unknown')
        if func is None:
            return AnalysisResult(success=False, action=action,
                                  file_type="unknown",
                                  error=f"Unknown action: {action}")
        try:
            outcome = func(data, **kwargs)
        except Exception as exc:
            return AnalysisResult(success=False, action=action,
                                  file_type=file_type, error=str(exc))
        if isinstance(outcome, AnalysisResult):
            return outcome
        if isinstance(outcome, dict):
            # Dict tools signal suspicion via 'found' or 'suspicious' keys.
            flagged = outcome.get('found', False) or outcome.get('suspicious', False)
            return AnalysisResult(success=True, action=action,
                                  file_type=file_type, data=outcome,
                                  suspicious=flagged)
        return AnalysisResult(success=True, action=action,
                              file_type=file_type, data={'result': outcome})

    def list_tools(self) -> List[str]:
        """Return all registered action names, sorted."""
        return sorted(self._tools.keys())
# Global registry instance shared by the module-level convenience wrappers.
TOOL_REGISTRY = AnalysisToolRegistry()

def execute_action(action: str, data: bytes, **kwargs) -> AnalysisResult:
    """Execute an analysis action against *data* via the global TOOL_REGISTRY."""
    return TOOL_REGISTRY.execute(action, data, **kwargs)

def list_available_tools() -> List[str]:
    """Return the sorted action names of all registered analysis tools."""
    return TOOL_REGISTRY.list_tools()
# ============== PNG ANALYSIS TOOLS ==============
# 8-byte PNG file signature.
PNG_MAGIC = b'\x89PNG\r\n\x1a\n'
# Human-readable descriptions of the standard (and APNG) chunk types; any
# type absent from this table is reported as 'Unknown/Private'.
PNG_CHUNK_TYPES = {
    'IHDR': 'Image header',
    'PLTE': 'Palette',
    'IDAT': 'Image data',
    'IEND': 'Image end',
    'tEXt': 'Textual data',
    'zTXt': 'Compressed textual data',
    'iTXt': 'International textual data',
    'bKGD': 'Background color',
    'cHRM': 'Primary chromaticities',
    'gAMA': 'Gamma',
    'hIST': 'Palette histogram',
    'iCCP': 'ICC profile',
    'pHYs': 'Physical pixel dimensions',
    'sBIT': 'Significant bits',
    'sPLT': 'Suggested palette',
    'sRGB': 'Standard RGB color space',
    'tIME': 'Last modification time',
    'tRNS': 'Transparency',
    'eXIf': 'EXIF data',
    'acTL': 'Animation control (APNG)',
    'fcTL': 'Frame control (APNG)',
    'fdAT': 'Frame data (APNG)',
}
def png_parse_chunks(data: bytes) -> Dict[str, Any]:
    """Parse all PNG chunks and return detailed information.

    Walks the chunk chain from the 8-byte signature onward, validating each
    chunk's CRC and decoding well-known payloads (IHDR, tEXt/zTXt/iTXt,
    tIME, pHYs). Parsing stops at IEND; any bytes after it are counted and
    flagged as suspicious.

    Returns:
        {'valid': False, 'error': ...} when the signature is missing, else
        a dict with the chunk list, per-type counts, total IDAT payload
        size, and the number of trailing bytes after IEND.
    """
    if not data.startswith(PNG_MAGIC):
        return {'error': 'Not a valid PNG file', 'valid': False}
    chunks = []
    pos = 8  # Skip magic bytes
    total_idat_size = 0
    chunk_type_counts = {}
    while pos < len(data):
        if pos + 8 > len(data):
            break  # Not enough bytes left for a length + type header
        chunk_length = struct.unpack('>I', data[pos:pos+4])[0]
        chunk_type = data[pos+4:pos+8].decode('ascii', errors='replace')
        if pos + 12 + chunk_length > len(data):
            # Declared length runs past end of file - record and stop.
            chunks.append({
                'type': chunk_type,
                'offset': pos,
                'length': chunk_length,
                'error': 'Truncated chunk'
            })
            break
        chunk_data = data[pos+8:pos+8+chunk_length]
        stored_crc = struct.unpack('>I', data[pos+8+chunk_length:pos+12+chunk_length])[0]
        # CRC covers the type field plus the payload (not the length field).
        calculated_crc = zlib.crc32(data[pos+4:pos+8+chunk_length]) & 0xffffffff
        chunk_info = {
            'type': chunk_type,
            'description': PNG_CHUNK_TYPES.get(chunk_type, 'Unknown/Private'),
            'offset': pos,
            'length': chunk_length,
            'crc_valid': stored_crc == calculated_crc,
            'crc_stored': f'{stored_crc:08x}',
            'crc_calculated': f'{calculated_crc:08x}',
        }
        # Track chunk type counts
        chunk_type_counts[chunk_type] = chunk_type_counts.get(chunk_type, 0) + 1
        # Track cumulative IDAT payload size
        if chunk_type == 'IDAT':
            total_idat_size += chunk_length
        # Parse IHDR (13 bytes of fixed-size fields)
        if chunk_type == 'IHDR' and chunk_length == 13:
            width, height, bit_depth, color_type, compression, filter_method, interlace = struct.unpack('>IIBBBBB', chunk_data)
            chunk_info['parsed'] = {
                'width': width,
                'height': height,
                'bit_depth': bit_depth,
                'color_type': color_type,
                'compression': compression,
                'filter': filter_method,
                'interlace': interlace
            }
        # Parse text chunks: keyword and payload are NUL-separated
        elif chunk_type == 'tEXt':
            null_pos = chunk_data.find(b'\x00')
            if null_pos != -1:
                keyword = chunk_data[:null_pos].decode('latin-1', errors='replace')
                text = chunk_data[null_pos+1:].decode('latin-1', errors='replace')
                chunk_info['parsed'] = {'keyword': keyword, 'text': text[:500]}
        elif chunk_type == 'zTXt':
            null_pos = chunk_data.find(b'\x00')
            if null_pos != -1:
                keyword = chunk_data[:null_pos].decode('latin-1', errors='replace')
                try:
                    # Skip the 1-byte compression-method field after the NUL.
                    text = zlib.decompress(chunk_data[null_pos+2:]).decode('latin-1', errors='replace')
                    chunk_info['parsed'] = {'keyword': keyword, 'text': text[:500], 'compressed': True}
                except:
                    chunk_info['parsed'] = {'keyword': keyword, 'error': 'Decompression failed'}
        elif chunk_type == 'iTXt':
            # Only the keyword is extracted; language/translation fields and
            # the (possibly compressed) text are not decoded here.
            null_pos = chunk_data.find(b'\x00')
            if null_pos != -1:
                keyword = chunk_data[:null_pos].decode('latin-1', errors='replace')
                chunk_info['parsed'] = {'keyword': keyword}
        # Parse tIME (last-modification timestamp)
        elif chunk_type == 'tIME' and chunk_length == 7:
            year, month, day, hour, minute, second = struct.unpack('>HBBBBB', chunk_data)
            chunk_info['parsed'] = {
                'timestamp': f'{year:04d}-{month:02d}-{day:02d} {hour:02d}:{minute:02d}:{second:02d}'
            }
        # Parse pHYs (physical pixel dimensions)
        elif chunk_type == 'pHYs' and chunk_length == 9:
            ppux, ppuy, unit = struct.unpack('>IIB', chunk_data)
            chunk_info['parsed'] = {
                'pixels_per_unit_x': ppux,
                'pixels_per_unit_y': ppuy,
                'unit': 'meter' if unit == 1 else 'unknown'
            }
        chunks.append(chunk_info)
        pos += 12 + chunk_length
        if chunk_type == 'IEND':
            break  # Everything after IEND is trailing data
    # Bytes remaining after the IEND chunk (0 for a clean file)
    after_iend = len(data) - pos
    return {
        'valid': True,
        'chunks': chunks,
        'chunk_count': len(chunks),
        'chunk_type_counts': chunk_type_counts,
        'total_idat_size': total_idat_size,
        'data_after_iend': after_iend,
        'suspicious': after_iend > 0
    }
def png_extract_text_chunks(data: bytes) -> Dict[str, Any]:
    """Extract tEXt/zTXt/iTXt metadata entries from a PNG.

    On an invalid PNG, passes through the error result from
    png_parse_chunks unchanged.
    """
    parsed = png_parse_chunks(data)
    if not parsed.get('valid'):
        return parsed
    text_chunks = [
        {
            'type': chunk['type'],
            'keyword': chunk['parsed'].get('keyword', ''),
            'text': chunk['parsed'].get('text', ''),
            'offset': chunk['offset'],
        }
        for chunk in parsed['chunks']
        if chunk['type'] in ('tEXt', 'zTXt', 'iTXt') and 'parsed' in chunk
    ]
    return {
        'found': bool(text_chunks),
        'text_chunks': text_chunks,
        'count': len(text_chunks),
    }
def png_detect_appended_data(data: bytes) -> Dict[str, Any]:
    """Detect data appended after the PNG IEND chunk.

    Walks the chunk chain to locate the true end of the IEND chunk, then
    reports anything that follows (size, entropy, hex preview), probing the
    tail for a known file signature and for printable UTF-8 text.
    """
    if not data.startswith(PNG_MAGIC):
        return {'found': False, 'error': 'Not a valid PNG file'}
    # Parse through PNG chunks to find actual IEND position
    pos = 8  # Skip magic
    iend_end_pos = None
    while pos + 8 <= len(data):
        chunk_length = struct.unpack('>I', data[pos:pos+4])[0]
        chunk_type = data[pos+4:pos+8]
        # Chunk end = pos + 4 (length) + 4 (type) + chunk_length + 4 (CRC)
        chunk_end_pos = pos + 12 + chunk_length
        if chunk_type == b'IEND':
            iend_end_pos = chunk_end_pos
            break
        pos = chunk_end_pos
    if iend_end_pos is None:
        return {'found': False, 'error': 'No IEND chunk found'}
    if iend_end_pos >= len(data):
        return {'found': False, 'appended_size': 0}
    appended_data = data[iend_end_pos:]
    if len(appended_data) == 0:
        return {'found': False, 'appended_size': 0}
    # Analyze appended data: any trailing bytes are inherently suspicious.
    result = {
        'found': True,
        'appended_size': len(appended_data),
        'offset': iend_end_pos,
        'entropy': calculate_entropy(appended_data),
        'preview_hex': appended_data[:64].hex(),
        'suspicious': True
    }
    # Check if appended data is itself a recognizable file (polyglot/archive)
    file_type = detect_file_type(appended_data)
    if file_type != FileType.UNKNOWN:
        result['embedded_file_type'] = file_type.value
    # Check for printable text (strict UTF-8 decode; binary tails are skipped)
    try:
        text = appended_data[:200].decode('utf-8')
        if all(c.isprintable() or c in '\r\n\t' for c in text):
            result['text_preview'] = text
    except:
        pass
    return result
def png_analyze_idat(data: bytes) -> Dict[str, Any]:
    """Analyze PNG IDAT chunks for anomalies (gaps, size spread, CRC validity).

    Gaps between consecutive IDAT chunks can conceal foreign data inside
    the file body, so any detected gap marks the result as suspicious.
    """
    result = png_parse_chunks(data)
    if not result.get('valid'):
        return result
    idat_chunks = []
    prev_end = 0  # byte offset just past the previous IDAT chunk
    for chunk in result['chunks']:
        if chunk['type'] == 'IDAT':
            idat_chunks.append({
                'offset': chunk['offset'],
                'length': chunk['length'],
                'crc_valid': chunk['crc_valid']
            })
            # Check for gap between IDAT chunks. NOTE(review): non-IDAT
            # chunks interleaved between IDATs also register as a "gap",
            # though interleaving is itself invalid PNG.
            if prev_end > 0 and chunk['offset'] != prev_end:
                gap = chunk['offset'] - prev_end
                if gap > 12:  # More than just the next chunk header
                    idat_chunks[-1]['gap_before'] = gap
            prev_end = chunk['offset'] + 12 + chunk['length']
    if not idat_chunks:
        return {'found': False, 'error': 'No IDAT chunks found'}
    total_size = sum(c['length'] for c in idat_chunks)
    sizes = [c['length'] for c in idat_chunks]
    return {
        'found': True,
        'chunk_count': len(idat_chunks),
        'total_size': total_size,
        'chunks': idat_chunks,
        'size_variance': max(sizes) - min(sizes) if len(sizes) > 1 else 0,
        'avg_chunk_size': total_size // len(idat_chunks),
        'all_crc_valid': all(c['crc_valid'] for c in idat_chunks),
        'suspicious': any('gap_before' in c for c in idat_chunks)
    }
def png_extract_lsb(data: bytes, bits: int = 1, channels: str = "RGB") -> Dict[str, Any]:
    """Extract LSB data from PNG image pixels.

    Decodes the image with PIL, takes the low *bits* bits of each selected
    channel sample (LSB-first), packs them into bytes, and probes the
    result for a 'STEG' header, an embedded file signature, or readable
    text.

    Args:
        data: Complete PNG file bytes.
        bits: Number of low-order bits to take per channel sample (1-8).
        channels: Channel letters to sample, e.g. "RGB", "RGBA", "R";
            unknown letters are ignored.

    Returns:
        Dict with extraction stats, entropy of the first 1 KiB, the raw
        extracted bytes under 'raw_data', and optional suspicion flags;
        or {'error': ..., 'found': False} on failure.
    """
    if not HAS_PIL:
        return {'error': 'PIL not available', 'found': False}
    try:
        img = Image.open(io.BytesIO(data))
        # Convert to a mode whose pixels are directly indexable tuples.
        if img.mode == 'P':
            img = img.convert('RGBA')
        elif img.mode == 'L':
            img = img.convert('RGB')
        elif img.mode not in ('RGB', 'RGBA'):
            img = img.convert('RGBA')
        pixels = list(img.getdata())
        # Map channel letters to tuple indices.
        channel_map = {'R': 0, 'G': 1, 'B': 2, 'A': 3}
        channel_indices = [channel_map[c] for c in channels.upper() if c in channel_map]
        extracted_bits = []
        for pixel in pixels:
            for ch_idx in channel_indices:
                if ch_idx < len(pixel):
                    for bit_pos in range(bits):
                        extracted_bits.append((pixel[ch_idx] >> bit_pos) & 1)
        # Pack the bit stream LSB-first into bytes; trailing bits that do
        # not fill a whole byte are dropped. (Removed an unused `mask`
        # local left over from an earlier approach.)
        result_bytes = bytearray()
        for i in range(0, len(extracted_bits) - 7, 8):
            byte_val = 0
            for j in range(8):
                byte_val |= extracted_bits[i + j] << j
            result_bytes.append(byte_val)
        raw_data = bytes(result_bytes)
        result = {
            'found': True,
            'extracted_size': len(raw_data),
            'channels': channels,
            'bits_per_channel': bits,
            'entropy': calculate_entropy(raw_data[:1024]),
            'raw_data': raw_data
        }
        # Check for the 'STEG' marker used by this toolkit's embedder.
        if raw_data[:4] == b'STEG':
            result['steg_header_found'] = True
            result['suspicious'] = True
        # Check whether the extracted stream starts with a known file signature.
        file_type = detect_file_type(raw_data)
        if file_type != FileType.UNKNOWN:
            result['embedded_file_type'] = file_type.value
            result['suspicious'] = True
        # Check for readable text at the head of the stream.
        try:
            text = raw_data[:100].decode('utf-8')
            printable = sum(1 for c in text if c.isprintable() or c in '\r\n\t')
            if printable > len(text) * 0.7:
                result['text_preview'] = text
                result['suspicious'] = True
        except UnicodeDecodeError:
            pass
        return result
    except Exception as e:
        return {'error': str(e), 'found': False}
def png_chi_square_analysis(data: bytes) -> Dict[str, Any]:
    """Chi-square analysis of pixel value pairs to detect LSB manipulation.

    For each channel, compares counts of the value pairs (2k, 2k+1) against
    a uniform expectation, and separately tests the LSB plane's ones/zeros
    balance: LSB embedding pushes that balance toward an even 50/50 split.
    """
    if not HAS_PIL or not HAS_NUMPY:
        return {'error': 'PIL or numpy not available'}
    try:
        img = Image.open(io.BytesIO(data))
        if img.mode == 'P':
            img = img.convert('RGB')
        elif img.mode == 'L':
            img = img.convert('RGB')
        elif img.mode not in ('RGB', 'RGBA'):
            img = img.convert('RGB')
        pixels = np.array(img)
        results = {}
        # Analyze each channel
        channel_names = ['Red', 'Green', 'Blue', 'Alpha']
        for ch_idx in range(min(pixels.shape[2], 4)):
            channel = pixels[:, :, ch_idx].flatten()
            # Pair analysis: count pairs (2k, 2k+1)
            pairs = np.zeros(128)
            for val in channel:
                pair_idx = val // 2
                if pair_idx < 128:
                    pairs[pair_idx] += 1
            # Expected count per pair under a uniform distribution
            total = len(channel)
            expected = total / 128
            # Chi-square for pairs
            chi_sq = sum((pairs[i] - expected) ** 2 / expected for i in range(128) if expected > 0)
            # Also analyze the LSB plane's ones/zeros balance
            lsb_plane = channel & 1
            ones = np.sum(lsb_plane)
            zeros = total - ones
            expected_ones = total / 2
            lsb_chi_sq = ((ones - expected_ones) ** 2 / expected_ones +
                          (zeros - expected_ones) ** 2 / expected_ones)
            results[channel_names[ch_idx]] = {
                'chi_square_pairs': float(chi_sq),
                'chi_square_lsb': float(lsb_chi_sq),
                'lsb_ones_ratio': float(ones / total),
                # 3.84 is the chi-square critical value at p=0.05, 1 dof
                'suspicious': lsb_chi_sq > 3.84  # 95% confidence threshold
            }
        overall_suspicious = any(r['suspicious'] for r in results.values())
        return {
            'found': True,
            'channels': results,
            'suspicious': overall_suspicious,
            'interpretation': 'Low chi-square LSB values may indicate LSB steganography' if overall_suspicious else 'No strong LSB manipulation detected'
        }
    except Exception as e:
        return {'error': str(e), 'found': False}
def png_bit_plane_analysis(data: bytes) -> Dict[str, Any]:
    """Analyze individual bit planes of a PNG image, per channel.

    For each channel and each of the 8 bit planes, reports the packed
    plane's entropy and its percentage of 1-bits. Only the two lowest
    planes are considered steg carriers: near-random entropy (> 7.5) or an
    almost exact 50/50 bit balance there is flagged as suspicious.
    """
    if not HAS_PIL or not HAS_NUMPY:
        return {'error': 'PIL or numpy not available'}
    try:
        img = Image.open(io.BytesIO(data))
        if img.mode == 'P':
            img = img.convert('RGB')
        pixels = np.array(img)
        results = {}
        channel_names = ['Red', 'Green', 'Blue', 'Alpha'][:pixels.shape[2] if len(pixels.shape) > 2 else 1]
        if len(pixels.shape) == 2:  # Grayscale: promote to a single-channel axis
            pixels = pixels.reshape(pixels.shape[0], pixels.shape[1], 1)
            channel_names = ['Gray']
        for ch_idx, ch_name in enumerate(channel_names):
            channel = pixels[:, :, ch_idx]
            planes = {}
            for bit in range(8):
                plane = (channel >> bit) & 1
                # Entropy of the bit plane, measured after packing to bytes
                plane_bytes = np.packbits(plane.flatten())
                entropy = calculate_entropy(plane_bytes.tobytes())
                # Percentage of 1 bits in this plane
                ones_pct = np.mean(plane) * 100
                planes[f'bit_{bit}'] = {
                    'entropy': float(entropy),
                    'ones_percentage': float(ones_pct),
                    # Only bits 0-1 are plausible embedding targets
                    'suspicious': bit < 2 and (entropy > 7.5 or abs(ones_pct - 50) < 1)
                }
            results[ch_name] = planes
        # Determine overall suspicion across every channel/plane
        suspicious = any(
            plane['suspicious']
            for channel_planes in results.values()
            for plane in channel_planes.values()
        )
        return {
            'found': True,
            'channels': results,
            'suspicious': suspicious,
            'interpretation': 'High entropy in lower bit planes may indicate hidden data'
        }
    except Exception as e:
        return {'error': str(e), 'found': False}
def png_palette_analysis(data: bytes) -> Dict[str, Any]:
    """Analyze a palette-based PNG's color table for steganography indicators.

    Looks for a fully sorted palette and for pairs of near-duplicate colors
    (total RGB distance 1-3), both classic signs of palette-order or
    palette-LSB embedding. Non-palette images return found=False.
    """
    if not HAS_PIL:
        return {'error': 'PIL not available'}
    try:
        img = Image.open(io.BytesIO(data))
        if img.mode != 'P':
            return {'found': False, 'reason': 'Image is not palette-based'}
        palette = img.getpalette()
        if not palette:
            return {'found': False, 'reason': 'No palette found'}
        # Palette is a flat list of RGB triplets
        colors = []
        for i in range(0, len(palette), 3):
            colors.append((palette[i], palette[i+1], palette[i+2]))
        # Check for sorted palette (common steg indicator)
        is_sorted = colors == sorted(colors)
        # Check for near-duplicate colors (LSB differences only); O(n^2)
        # over at most 256 entries, so the cost is bounded.
        near_duplicates = []
        for i, c1 in enumerate(colors):
            for j, c2 in enumerate(colors[i+1:], i+1):
                diff = sum(abs(a - b) for a, b in zip(c1, c2))
                if 0 < diff <= 3:  # Very similar colors
                    near_duplicates.append((i, j, diff))
        # Count how many palette indices are actually referenced by pixels
        histogram = img.histogram()
        used_colors = sum(1 for h in histogram[:256] if h > 0)
        return {
            'found': True,
            'palette_size': len(colors),
            'used_colors': used_colors,
            'is_sorted': is_sorted,
            'near_duplicate_pairs': len(near_duplicates),
            'near_duplicates': near_duplicates[:10],  # First 10
            'suspicious': len(near_duplicates) > 5 or is_sorted,
            'interpretation': 'Sorted palette or many near-duplicates may indicate palette-based steganography'
        }
    except Exception as e:
        return {'error': str(e), 'found': False}
def png_filter_analysis(data: bytes) -> Dict[str, Any]:
    """Analyze the per-scanline filter bytes of a PNG for anomalies.

    Decompresses the concatenated IDAT stream and reads the filter byte
    that precedes each scanline. Near-exclusive use of filter 0 (None) is
    unusual for real encoder output and may indicate a regenerated or
    modified image.

    NOTE(review): assumes a non-interlaced image; Adam7 pass layout is not
    handled, so filter offsets for interlaced files will be wrong.
    """
    result = png_parse_chunks(data)
    if not result.get('valid'):
        return result
    # Concatenate all IDAT payloads (they form a single zlib stream) and
    # pick up the parsed IHDR along the way.
    idat_data = b''
    ihdr_data = None
    for chunk in result['chunks']:
        if chunk['type'] == 'IDAT':
            offset = chunk['offset']
            length = chunk['length']
            idat_data += data[offset+8:offset+8+length]
        elif chunk['type'] == 'IHDR' and 'parsed' in chunk:
            ihdr_data = chunk['parsed']
    if not ihdr_data:
        return {'error': 'No IHDR chunk found'}
    try:
        decompressed = zlib.decompress(idat_data)
    except zlib.error:
        return {'error': 'Failed to decompress IDAT'}
    width = ihdr_data['width']
    height = ihdr_data['height']
    bit_depth = ihdr_data['bit_depth']
    color_type = ihdr_data['color_type']
    # Samples per pixel by color type (0=gray, 2=RGB, 3=palette, 4=gray+A, 6=RGBA)
    samples = {0: 1, 2: 3, 3: 1, 4: 2, 6: 4}.get(color_type, 3)
    # Scanline size: 1 filter byte + ceil(width * samples * bit_depth / 8)
    # bytes of pixel data. The ceiling is required for bit depths < 8; the
    # previous `width * bytes_per_pixel` formula overestimated those rows
    # and read filter bytes from the wrong offsets.
    row_bytes = 1 + (width * samples * bit_depth + 7) // 8
    # The first byte of every scanline is its filter type.
    filter_bytes = []
    for row in range(height):
        offset = row * row_bytes
        if offset < len(decompressed):
            filter_bytes.append(decompressed[offset])
    # Tally the filter-type distribution.
    filter_counts = {}
    for f in filter_bytes:
        filter_counts[f] = filter_counts.get(f, 0) + 1
    filter_names = {0: 'None', 1: 'Sub', 2: 'Up', 3: 'Average', 4: 'Paeth'}
    return {
        'found': True,
        'row_count': len(filter_bytes),
        'filter_distribution': {filter_names.get(k, f'Unknown({k})'): v for k, v in filter_counts.items()},
        'unique_filters': len(filter_counts),
        'suspicious': 0 in filter_counts and filter_counts[0] > len(filter_bytes) * 0.9,
        'interpretation': 'Excessive use of filter 0 (None) may indicate modified image'
    }
def png_detect_embedded_png(data: bytes) -> Dict[str, Any]:
    """Detect PNG files embedded within a PNG (nested/appended steganography).

    Scans for additional PNG signatures past the outer header and
    classifies each hit as lying inside the image data ('within_image') or
    after the outer IEND chunk ('after_iend').
    """
    results = {
        'found': False,
        'embedded_pngs': []
    }
    # First 'IEND' marker in the stream. A signature found past it was
    # appended after the outer image; a signature found before it belongs
    # to a PNG nested inside the image data (its own IEND, being the first,
    # still lies after the signature). If no IEND exists at all (-1), every
    # hit is conservatively labeled 'within_image'.
    first_iend = data.find(b'IEND')
    search_start = 8  # Skip the outer PNG magic
    while True:
        pos = data.find(PNG_MAGIC, search_start)
        if pos == -1:
            break
        end_pos = data.find(b'IEND', pos)
        if end_pos != -1:
            # find() points at the IEND type field; the chunk continues with
            # 4 bytes of type + 4 bytes of CRC. (The old code added 12,
            # overstating each embedded size by 4 bytes.)
            end_pos += 8
            results['embedded_pngs'].append({
                'offset': pos,
                'size': end_pos - pos,
                # BUG FIX: the old test compared pos > data.rfind(b'IEND', 0, pos),
                # which is true for every hit (any match in [0, pos) is < pos,
                # and a miss returns -1), so everything was labeled 'after_iend'.
                'location': 'after_iend' if first_iend != -1 and pos > first_iend else 'within_image'
            })
            results['found'] = True
        search_start = pos + 1
    results['count'] = len(results['embedded_pngs'])
    results['suspicious'] = results['found']
    return results
def png_color_histogram_analysis(data: bytes) -> Dict[str, Any]:
    """Analyze color histogram for LSB steganography indicators.

    Applies a Pairs-of-Values (PoV) test per channel: sequential LSB
    embedding tends to equalize the counts of histogram bins (2k, 2k+1),
    so an unusually small average pair difference suggests LSB payloads.
    """
    if not HAS_PIL or not HAS_NUMPY:
        return {'error': 'PIL or numpy not available'}
    try:
        img = Image.open(io.BytesIO(data))
        if img.mode == 'P':
            img = img.convert('RGB')  # expand palette indices to real samples
        pixels = np.array(img)
        if pixels.ndim == 2:
            # Grayscale: promote to a (H, W, 1) array so the per-channel
            # loop works. Previously this path raised IndexError on the
            # channel slice and was reported as an analysis error.
            pixels = pixels[:, :, np.newaxis]
            channel_names = ['Gray']
        else:
            # Cap at 3 named channels; an alpha channel is ignored.
            channel_names = ['Red', 'Green', 'Blue'][:pixels.shape[2]]
        results = {}
        for ch_idx, ch_name in enumerate(channel_names):
            channel = pixels[:, :, ch_idx].flatten()
            # Calculate histogram
            hist, _ = np.histogram(channel, bins=256, range=(0, 256))
            # Pairs of Values (PoV) analysis
            # In natural images, adjacent histogram bins have similar counts
            # LSB embedding creates anomalies in pairs (2k, 2k+1)
            pair_diffs = []
            for i in range(0, 256, 2):
                if hist[i] + hist[i+1] > 0:
                    diff = abs(hist[i] - hist[i+1]) / (hist[i] + hist[i+1])
                    pair_diffs.append(diff)
            avg_pair_diff = np.mean(pair_diffs) if pair_diffs else 0
            results[ch_name] = {
                'unique_values': int(np.sum(hist > 0)),
                'avg_pair_difference': float(avg_pair_diff),
                'suspicious': avg_pair_diff < 0.05  # Very similar pairs suggest LSB
            }
        return {
            'found': True,
            'channels': results,
            'suspicious': any(r['suspicious'] for r in results.values()),
            'interpretation': 'Similar histogram pair values may indicate LSB steganography'
        }
    except Exception as e:
        return {'error': str(e), 'found': False}
def png_visual_attack(data: bytes) -> Dict[str, Any]:
    """Generate visual attack metadata for bit plane analysis.

    Reports the image size and which RGB channels have an LSB plane
    available for visual inspection.

    NOTE(review): grayscale images hit ``pixels.shape[2]`` on a 2-D array
    and are reported as an error, matching the previous behavior.
    """
    if not HAS_PIL or not HAS_NUMPY:
        return {'error': 'PIL or numpy not available'}
    try:
        img = Image.open(io.BytesIO(data))
        if img.mode == 'P':
            img = img.convert('RGB')
        pixels = np.array(img)
        # The previous implementation materialized every LSB plane via
        # `.tolist()` plus a combined RGB LSB array and then discarded
        # them -- O(W*H) of dead work. Only the channel names are used
        # in the returned report, so just compute those.
        channel_names = ['Red', 'Green', 'Blue']
        available = [name for idx, name in enumerate(channel_names)
                     if idx < pixels.shape[2]]
        return {
            'found': True,
            'image_size': [int(pixels.shape[1]), int(pixels.shape[0])],
            'channel_lsb_available': available,
            'interpretation': 'Visual inspection of LSB planes can reveal hidden patterns'
        }
    except Exception as e:
        return {'error': str(e), 'found': False}
def png_steg_signature_scan(data: bytes) -> Dict[str, Any]:
    """Scan for known steganography tool signatures.

    Looks for tool markers both in the raw file bytes and in the first
    1000 bytes recovered from the RGB LSB plane.
    """
    signatures = {
        b'STEG': 'Stegosaurus Wrecks',
        b'openstego': 'OpenStego',
        b'steghide': 'Steghide',
        b'F5': 'F5 Algorithm',
        b'jphide': 'JPHide',
        b'outguess': 'OutGuess',
        b'invisible secrets': 'Invisible Secrets',
        b'camouflage': 'Camouflage',
        b'snow': 'SNOW',
        b'\x00\x00\x00\x01steg': 'Generic Steg Header',
    }

    def _printable(sig: bytes) -> str:
        # ASCII signatures are reported as text, binary ones as hex.
        if sig.isascii():
            return sig.decode('ascii', errors='replace')
        return sig.hex()

    found = []
    # Pass 1: raw file contents.
    for sig, tool_name in signatures.items():
        offset = data.find(sig)
        if offset != -1:
            found.append({
                'signature': _printable(sig),
                'tool': tool_name,
                'offset': offset,
            })
    # Pass 2: data recovered from the LSB plane.
    lsb_result = png_extract_lsb(data, bits=1, channels="RGB")
    raw = lsb_result.get('raw_data')
    if raw:
        head = raw[:1000]
        for sig, tool_name in signatures.items():
            if sig in head:
                found.append({
                    'signature': _printable(sig),
                    'tool': tool_name,
                    'location': 'LSB_extracted',
                })
    return {
        'found': len(found) > 0,
        'signatures': found,
        'suspicious': len(found) > 0,
    }
def png_full_analysis(data: bytes) -> Dict[str, Any]:
    """Run all PNG analysis tools and compile results.

    Each analyzer runs independently; a failing analyzer contributes an
    ``{'error': ...}`` entry instead of aborting the whole pass. Two or
    more suspicious indicators mark the file as suspicious overall.
    """
    analyses = (
        ('chunk_parse', png_parse_chunks),
        ('text_chunks', png_extract_text_chunks),
        ('appended_data', png_detect_appended_data),
        ('idat_analysis', png_analyze_idat),
        ('chi_square', png_chi_square_analysis),
        ('bit_planes', png_bit_plane_analysis),
        ('histogram', png_color_histogram_analysis),
        ('filter_analysis', png_filter_analysis),
        ('embedded_png', png_detect_embedded_png),
        ('steg_signatures', png_steg_signature_scan),
    )
    compiled = {}
    suspicious_count = 0
    for name, analyzer in analyses:
        try:
            outcome = analyzer(data)
            compiled[name] = outcome
            if outcome.get('suspicious'):
                suspicious_count += 1
        except Exception as exc:
            compiled[name] = {'error': str(exc)}
    return {
        'file_type': 'PNG',
        'analyses': compiled,
        'suspicious_indicators': suspicious_count,
        'overall_suspicious': suspicious_count >= 2,
        'summary': f"Found {suspicious_count} suspicious indicators",
    }
# Register PNG tools
def _register_png_tools():
    """Register all PNG analysis tools with the global registry."""
    png_tools = (
        png_parse_chunks,
        png_extract_text_chunks,
        png_detect_appended_data,
        png_analyze_idat,
        png_extract_lsb,
        png_chi_square_analysis,
        png_bit_plane_analysis,
        png_palette_analysis,
        png_filter_analysis,
        png_detect_embedded_png,
        png_color_histogram_analysis,
        png_visual_attack,
        png_steg_signature_scan,
        png_full_analysis,
    )
    # Each tool is keyed by its function name, which is identical to the
    # explicit string keys that were previously spelled out one by one.
    for tool in png_tools:
        TOOL_REGISTRY.register(tool.__name__, tool)
# Auto-register on module load
_register_png_tools()
# ============== ADVANCED TEXT STEGANOGRAPHY DETECTION ==============
def detect_homoglyph_steg(data: bytes) -> Dict[str, Any]:
    """Detect Cyrillic/Latin homoglyph substitution steganography.

    Counts Cyrillic code points that render identically to Latin letters.
    More than 3 substitutions flags the text; up to the first 5 hits are
    reported in ``details`` with offset and look-alike letter.
    """
    results: Dict[str, Any] = {'found': False, 'substitutions': 0, 'details': []}
    try:
        text = data.decode('utf-8', errors='ignore')
    except Exception:
        # errors='ignore' never raises for bytes input; this only guards
        # non-bytes callers. Bare `except:` narrowed so KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        return results
    # Cyrillic chars that look like Latin
    CYRILLIC_TO_LATIN = {
        '\u0430': 'a', '\u0441': 'c', '\u0435': 'e', '\u043e': 'o',
        '\u0440': 'p', '\u0455': 's', '\u0445': 'x', '\u0443': 'y',
        '\u0410': 'A', '\u0412': 'B', '\u0421': 'C', '\u0415': 'E',
        '\u041d': 'H', '\u041a': 'K', '\u041c': 'M', '\u041e': 'O',
        '\u0420': 'P', '\u0422': 'T', '\u0425': 'X',
    }
    for i, ch in enumerate(text):
        if ch in CYRILLIC_TO_LATIN:
            results['substitutions'] += 1
            if results['substitutions'] <= 5:  # cap detail list at 5 entries
                results['details'].append({
                    'offset': i,
                    'cyrillic': repr(ch),
                    'looks_like': CYRILLIC_TO_LATIN[ch]
                })
    if results['substitutions'] > 3:
        results['found'] = True
    return results
def detect_variation_selector_steg(data: bytes) -> Dict[str, Any]:
    """Detect variation selector steganography.

    Variation selectors (U+FE00..U+FE0F) are invisible and can each carry
    4 bits of hidden payload. More than 3 occurrences flags the text; up
    to the first 5 are reported with their VS1..VS16 names.
    """
    results: Dict[str, Any] = {'found': False, 'count': 0, 'selectors': []}
    try:
        text = data.decode('utf-8', errors='ignore')
    except Exception:
        # Guards non-bytes input only; bare `except:` narrowed.
        return results
    for i, ch in enumerate(text):
        if '\uFE00' <= ch <= '\uFE0F':
            results['count'] += 1
            if results['count'] <= 5:  # cap reported selectors at 5
                # VS1 is U+FE00, so the ordinal offset from 0xFDFF names it.
                results['selectors'].append({'offset': i, 'selector': f'VS{ord(ch) - 0xFDFF}'})
    if results['count'] > 3:
        results['found'] = True
    return results
def detect_combining_mark_steg(data: bytes) -> Dict[str, Any]:
    """Detect steganography via invisible combining characters.

    Counts zero-width joiners and other invisible operator characters
    that can encode hidden bits. More than 3 occurrences flags the text;
    up to the first 5 are reported with offset and Unicode name.
    """
    results: Dict[str, Any] = {'found': False, 'count': 0, 'marks': []}
    try:
        text = data.decode('utf-8', errors='ignore')
    except Exception:
        # Guards non-bytes input only; bare `except:` narrowed.
        return results
    INVISIBLE_COMBINERS = {
        '\u034F': 'COMBINING GRAPHEME JOINER',
        '\u200D': 'ZERO WIDTH JOINER',
        '\u2060': 'WORD JOINER',
        '\u2061': 'FUNCTION APPLICATION',
        '\u2062': 'INVISIBLE TIMES',
        '\u2063': 'INVISIBLE SEPARATOR',
        '\u2064': 'INVISIBLE PLUS',
    }
    for i, ch in enumerate(text):
        if ch in INVISIBLE_COMBINERS:
            results['count'] += 1
            if results['count'] <= 5:  # cap reported marks at 5
                results['marks'].append({
                    'offset': i,
                    'name': INVISIBLE_COMBINERS[ch]
                })
    if results['count'] > 3:
        results['found'] = True
    return results
def detect_confusable_whitespace(data: bytes) -> Dict[str, Any]:
    """Detect steganography via Unicode whitespace variants.

    Non-standard space characters look like ordinary spaces but can
    encode hidden bits by choice of variant. More than 3 occurrences
    flags the text; ``types`` tallies occurrences per variant name.
    """
    results: Dict[str, Any] = {'found': False, 'non_standard_spaces': 0, 'types': {}}
    try:
        text = data.decode('utf-8', errors='ignore')
    except Exception:
        # Guards non-bytes input only; bare `except:` narrowed.
        return results
    SPACE_VARIANTS = {
        '\u00A0': 'NO-BREAK SPACE',
        '\u2000': 'EN QUAD',
        '\u2001': 'EM QUAD',
        '\u2002': 'EN SPACE',
        '\u2003': 'EM SPACE',
        '\u2004': 'THREE-PER-EM SPACE',
        '\u2005': 'FOUR-PER-EM SPACE',
        '\u2006': 'SIX-PER-EM SPACE',
        '\u2007': 'FIGURE SPACE',
        '\u2008': 'PUNCTUATION SPACE',
        '\u2009': 'THIN SPACE',
        '\u200A': 'HAIR SPACE',
        '\u202F': 'NARROW NO-BREAK SPACE',
        '\u205F': 'MEDIUM MATHEMATICAL SPACE',
        '\u3000': 'IDEOGRAPHIC SPACE',
    }
    for ch in text:
        if ch in SPACE_VARIANTS:
            name = SPACE_VARIANTS[ch]
            results['non_standard_spaces'] += 1
            results['types'][name] = results['types'].get(name, 0) + 1
    if results['non_standard_spaces'] > 3:
        results['found'] = True
    return results
def detect_emoji_steg(data: bytes) -> Dict[str, Any]:
    """Detect emoji substitution steganography patterns.

    Flags text containing many symbol characters drawn from a small
    alphabet (> 50 symbols but <= 20 unique), which is characteristic
    of emoji substitution encodings.
    """
    results: Dict[str, Any] = {'found': False, 'emoji_count': 0, 'pattern_detected': False}
    try:
        text = data.decode('utf-8', errors='ignore')
    except Exception:
        # Guards non-bytes input only; bare `except:` narrowed.
        return results
    import unicodedata
    # 'So' (Symbol, other) covers most emoji; 'Sk' catches modifier symbols.
    emojis = [ch for ch in text if unicodedata.category(ch).startswith(('So', 'Sk'))]
    results['emoji_count'] = len(emojis)
    # Check for alternating emoji pairs (characteristic of emoji substitution steg)
    if len(emojis) > 20:
        # Many symbols from a small unique alphabet looks like an encoding.
        unique = len(set(emojis))
        if unique <= 20 and len(emojis) > 50:
            results['pattern_detected'] = True
            results['found'] = True
    return results
def detect_capitalization_steg(data: bytes) -> Dict[str, Any]:
    """Detect capitalization encoding steganography.

    Hidden bits can be encoded in the case of word-initial letters.
    Counts words capitalized mid-sentence (uppercase first letter not
    preceded by sentence-ending punctuation); a ratio above 15% over
    more than 20 words flags the text.
    """
    results: Dict[str, Any] = {'found': False, 'suspicious_caps': 0, 'total_words': 0}
    try:
        text = data.decode('utf-8', errors='ignore')
    except Exception:
        # Guards non-bytes input only; bare `except:` narrowed.
        return results
    words = text.split()
    results['total_words'] = len(words)
    # Count words with unexpected capitalization (mid-sentence uppercase).
    # The first word is never counted; split() yields only non-empty words,
    # so the redundant `if i > 0 else ''` fallback was removed.
    for i, word in enumerate(words):
        if i > 0 and word[0].isupper():
            prev = words[i - 1]
            # Not after sentence end
            if prev[-1] not in '.!?:':
                results['suspicious_caps'] += 1
    # High ratio of unexpected caps suggests encoding
    if results['total_words'] > 20:
        ratio = results['suspicious_caps'] / results['total_words']
        if ratio > 0.15:  # More than 15% unexpected caps
            results['found'] = True
    return results