#!/usr/bin/env python3
"""
Enhanced Chunking Configuration

Configuration management for enhanced AI analysis system.

Author: Senior Engineer (20+ years experience)
Version: 1.0.0
"""

import os
from typing import Dict, Any
# Default configuration for enhanced chunking.
# Every value can be preset via an ENHANCED_* environment variable; the
# getenv defaults below are the built-in fallbacks.  Resolution happens once
# at import time.  NOTE(review): get_enhanced_config() applies a second
# override pass with key names derived from the dict keys — confirm which
# set of variable names deployments actually use.
DEFAULT_ENHANCED_CONFIG = {
    # Chunking parameters
    "max_tokens_per_chunk": int(os.getenv('ENHANCED_MAX_TOKENS_PER_CHUNK', 4000)),
    "overlap_lines": int(os.getenv('ENHANCED_OVERLAP_LINES', 5)),
    "min_chunk_size": int(os.getenv('ENHANCED_MIN_CHUNK_SIZE', 100)),

    # Processing parameters — boolean toggles parsed from 'true'/'false'
    # strings (any value other than 'true', case-insensitive, means False).
    "preserve_imports": os.getenv('ENHANCED_PRESERVE_IMPORTS', 'true').lower() == 'true',
    "preserve_comments": os.getenv('ENHANCED_PRESERVE_COMMENTS', 'true').lower() == 'true',
    "enable_context_sharing": os.getenv('ENHANCED_CONTEXT_SHARING', 'true').lower() == 'true',
    "enable_memory_integration": os.getenv('ENHANCED_MEMORY_INTEGRATION', 'true').lower() == 'true',

    # Rate limiting for enhanced processing
    "enhanced_rate_limit": int(os.getenv('ENHANCED_RATE_LIMIT', 60)),  # requests per minute
    "batch_delay": float(os.getenv('ENHANCED_BATCH_DELAY', 0.1)),  # seconds between batches

    # File size thresholds (line counts; consumed by FILE_SIZE_CATEGORIES)
    "small_file_threshold": int(os.getenv('ENHANCED_SMALL_FILE_THRESHOLD', 200)),  # lines
    "medium_file_threshold": int(os.getenv('ENHANCED_MEDIUM_FILE_THRESHOLD', 500)),  # lines
    "large_file_threshold": int(os.getenv('ENHANCED_LARGE_FILE_THRESHOLD', 1000)),  # lines

    # Processing delays (seconds), applied per size category
    "small_file_delay": float(os.getenv('ENHANCED_SMALL_FILE_DELAY', 0.05)),
    "medium_file_delay": float(os.getenv('ENHANCED_MEDIUM_FILE_DELAY', 0.1)),
    "large_file_delay": float(os.getenv('ENHANCED_LARGE_FILE_DELAY', 0.2)),

    # Memory and caching
    "chunk_cache_ttl": int(os.getenv('ENHANCED_CHUNK_CACHE_TTL', 3600)),  # seconds
    "enable_chunk_caching": os.getenv('ENHANCED_CHUNK_CACHING', 'true').lower() == 'true',

    # Feature flags
    "enable_enhanced_processing": os.getenv('ENHANCED_PROCESSING_ENABLED', 'true').lower() == 'true',
    "enable_batch_processing": os.getenv('ENHANCED_BATCH_PROCESSING', 'true').lower() == 'true',
    "enable_smart_chunking": os.getenv('ENHANCED_SMART_CHUNKING', 'true').lower() == 'true',

    # Fallback behavior
    "fallback_on_error": os.getenv('ENHANCED_FALLBACK_ON_ERROR', 'true').lower() == 'true',
    "log_enhanced_processing": os.getenv('ENHANCED_LOGGING', 'true').lower() == 'true',
}
|
|
|
|
# Language-specific chunking patterns.
# Each entry maps a construct name to a line-anchored regex used to locate
# chunk boundaries.  These are per-line heuristics, not full parsers.
# Fixes over the previous revision (all backward-compatible — every line
# that matched before still matches):
#   * java: 'class'/'interface' now accept access/abstract/final modifiers
#     (top-level Java classes are normally declared `public class ...`).
#   * cpp: 'include' now matches both `#include <...>` and `#include "..."`.
#   * javascript/typescript: `export` prefix accepted on class/interface/type.
#   * rust: `pub` prefix accepted on fn/struct.
LANGUAGE_CHUNKING_PATTERNS = {
    'python': {
        'function': r'^def\s+\w+',
        'class': r'^class\s+\w+',
        'import': r'^(import|from)\s+',
        'comment': r'^\s*#',
        # Heuristic: only matches docstrings that open and close on one line.
        'docstring': r'^\s*""".*"""',
        'async_function': r'^async\s+def\s+\w+'
    },
    'javascript': {
        'function': r'^(function\s+\w+|const\s+\w+\s*=\s*(async\s+)?\(|export\s+(function|const))',
        'class': r'^(export\s+)?class\s+\w+',
        'import': r'^(import|const\s+\w+\s*=\s*require)',
        'comment': r'^\s*//',
        'jsdoc': r'^\s*/\*\*',
        'arrow_function': r'^\s*\w+\s*=\s*\([^)]*\)\s*=>'
    },
    'typescript': {
        'function': r'^(function\s+\w+|const\s+\w+\s*=\s*(async\s+)?\(|export\s+(function|const))',
        'class': r'^(export\s+)?class\s+\w+',
        'interface': r'^(export\s+)?interface\s+\w+',
        'type': r'^(export\s+)?type\s+\w+',
        'import': r'^(import|const\s+\w+\s*=\s*require)',
        'comment': r'^\s*//',
        'jsdoc': r'^\s*/\*\*',
        'arrow_function': r'^\s*\w+\s*=\s*\([^)]*\)\s*=>'
    },
    'java': {
        'function': r'^\s*(public|private|protected)?\s*(static\s+)?\w+\s+\w+\s*\(',
        'class': r'^(public\s+|protected\s+|private\s+|abstract\s+|final\s+)*class\s+\w+',
        'interface': r'^(public\s+)?interface\s+\w+',
        'import': r'^import\s+',
        'comment': r'^\s*//',
        'javadoc': r'^\s*/\*\*',
        'annotation': r'^@\w+'
    },
    'cpp': {
        'function': r'^\w+\s+\w+\s*\(',
        'class': r'^class\s+\w+',
        'include': r'^#include\s*[<"]',
        'comment': r'^\s*//',
        'block_comment': r'^\s*/\*',
        'namespace': r'^namespace\s+\w+'
    },
    'go': {
        'function': r'^func\s+\w+',
        'struct': r'^type\s+\w+\s+struct',
        'import': r'^import\s+',
        'comment': r'^\s*//',
        'package': r'^package\s+\w+'
    },
    'rust': {
        'function': r'^(pub\s+)?fn\s+\w+',
        'struct': r'^(pub\s+)?struct\s+\w+',
        'impl': r'^impl\s+\w+',
        'use': r'^use\s+',
        'comment': r'^\s*//',
        'module': r'^mod\s+\w+'
    }
}
|
|
|
|
# File size categories for processing optimization.
# Ordered smallest to largest; get_file_size_category() relies on this
# ordering, classifying a file into the first category whose 'max_lines'
# it does not exceed.  Thresholds and delays come from the defaults above.
FILE_SIZE_CATEGORIES = {
    'small': {
        'max_lines': DEFAULT_ENHANCED_CONFIG['small_file_threshold'],
        'processing_delay': DEFAULT_ENHANCED_CONFIG['small_file_delay'],
        'chunking_strategy': 'single_chunk'
    },
    'medium': {
        'max_lines': DEFAULT_ENHANCED_CONFIG['medium_file_threshold'],
        'processing_delay': DEFAULT_ENHANCED_CONFIG['medium_file_delay'],
        'chunking_strategy': 'basic_chunking'
    },
    'large': {
        'max_lines': DEFAULT_ENHANCED_CONFIG['large_file_threshold'],
        'processing_delay': DEFAULT_ENHANCED_CONFIG['large_file_delay'],
        'chunking_strategy': 'intelligent_chunking'
    },
    'huge': {
        # Unbounded: everything above the 'large' threshold lands here,
        # with double the large-file delay.
        'max_lines': float('inf'),
        'processing_delay': DEFAULT_ENHANCED_CONFIG['large_file_delay'] * 2,
        'chunking_strategy': 'advanced_chunking'
    }
}
|
|
|
|
# API optimization settings (not environment-overridable, unlike
# DEFAULT_ENHANCED_CONFIG).
API_OPTIMIZATION_CONFIG = {
    'max_concurrent_requests': 3,          # in-flight API calls at once
    'request_timeout': 30.0,               # seconds per request
    'retry_attempts': 2,                   # retries after the first failure
    'retry_delay': 1.0,                    # seconds between retries
    'circuit_breaker_threshold': 5,        # failures before the breaker opens
    'circuit_breaker_timeout': 60.0        # seconds the breaker stays open
}
|
|
|
|
# Memory system integration settings, consumed by the memory subsystem
# elsewhere in the project (not referenced in this module).
MEMORY_INTEGRATION_CONFIG = {
    'enable_episodic_memory': True,
    'enable_persistent_memory': True,
    'enable_working_memory': True,
    'memory_retention_days': 30,
    # NOTE(review): presumably a 0..1 similarity cutoff for memory recall —
    # confirm the metric (cosine?) against the memory subsystem.
    'similarity_threshold': 0.7,
    'context_window_size': 5
}
|
|
|
|
def get_enhanced_config() -> Dict[str, Any]:
    """Return the enhanced configuration with environment overrides applied.

    Starts from a copy of ``DEFAULT_ENHANCED_CONFIG``; for every key, an
    environment variable named ``ENHANCED_<KEY_UPPERCASED>`` overrides the
    default, coerced to the default value's type.  Malformed numeric values
    are ignored (the default is kept) so a bad environment variable cannot
    crash configuration loading.

    Returns:
        A fresh dict; mutating it does not affect the module defaults.

    NOTE(review): the names derived here (e.g. ``ENHANCED_ENHANCED_RATE_LIMIT``
    for key ``enhanced_rate_limit``) do not always match the variables read
    when ``DEFAULT_ENHANCED_CONFIG`` was built (e.g. ``ENHANCED_RATE_LIMIT``)
    — confirm which naming scheme deployments rely on.
    """
    config = DEFAULT_ENHANCED_CONFIG.copy()

    for key, value in config.items():
        env_key = f"ENHANCED_{key.upper()}"
        if env_key not in os.environ:
            continue
        raw = os.environ[env_key]
        # bool must be tested before int: bool is a subclass of int in Python.
        if isinstance(value, bool):
            config[key] = raw.lower() == 'true'
        elif isinstance(value, int):
            try:
                config[key] = int(raw)
            except ValueError:
                pass  # keep the default rather than crash on a bad value
        elif isinstance(value, float):
            try:
                config[key] = float(raw)
            except ValueError:
                pass  # keep the default rather than crash on a bad value
        else:
            config[key] = raw

    return config
|
|
|
|
def get_language_patterns(language: str) -> Dict[str, str]:
    """Look up the chunking regexes for *language* (case-insensitive).

    Unknown languages fall back to the Python pattern set.
    """
    key = language.lower()
    if key in LANGUAGE_CHUNKING_PATTERNS:
        return LANGUAGE_CHUNKING_PATTERNS[key]
    return LANGUAGE_CHUNKING_PATTERNS['python']
|
|
|
|
def get_file_size_category(file_size: int) -> str:
    """Classify a file's line count into a processing-size bucket.

    Returns the first category in FILE_SIZE_CATEGORIES whose 'max_lines'
    is not exceeded; anything larger than 'large' is 'huge'.
    """
    for bucket in ('small', 'medium', 'large'):
        if file_size <= FILE_SIZE_CATEGORIES[bucket]['max_lines']:
            return bucket
    return 'huge'
|
|
|
|
def get_processing_strategy(file_size: int, language: str) -> Dict[str, Any]:
    """Build a per-file processing plan from the file's size bucket.

    The plan is a copy of the matching FILE_SIZE_CATEGORIES entry with the
    file's language and line count recorded alongside it.
    """
    bucket = get_file_size_category(file_size)
    plan = dict(FILE_SIZE_CATEGORIES[bucket])
    plan['language'] = language
    plan['file_size'] = file_size
    return plan
|
|
|
|
# Validation functions
def validate_enhanced_config(config: Dict[str, Any]) -> bool:
    """Return True when *config* carries all required numeric settings.

    Each required key must be present and hold a strictly positive int or
    float.  Booleans are rejected explicitly — ``bool`` is a subclass of
    ``int`` in Python, so ``True`` would otherwise slip through the numeric
    check as the value 1.
    """
    required_keys = (
        'max_tokens_per_chunk',
        'overlap_lines',
        'min_chunk_size',
        'enhanced_rate_limit',
        'batch_delay',
    )

    for key in required_keys:
        if key not in config:
            return False
        value = config[key]
        # Reject bool before the numeric check: isinstance(True, int) is True.
        if isinstance(value, bool):
            return False
        if not isinstance(value, (int, float)) or value <= 0:
            return False

    return True
|
|
|
|
def get_optimized_config_for_repo(file_count: int, avg_file_size: int) -> Dict[str, Any]:
    """Return the enhanced config tuned to a repository's characteristics.

    Args:
        file_count: Number of files to analyze.  Many files (> 20) halve the
            batch delay (floored at 0.05s); very few (< 5) double it
            (capped at 0.5s).
        avg_file_size: Average file length in lines.  Large files (> 1000)
            scale chunks up 1.5x (capped at 6000 tokens); small files
            (< 200) scale them down 0.7x (floored at 2000 tokens).

    Returns:
        A fresh config dict from get_enhanced_config() with the two
        settings adjusted.
    """
    config = get_enhanced_config()

    # Adjust batch pacing based on repository size.
    if file_count > 20:
        config['batch_delay'] = max(0.05, config['batch_delay'] * 0.5)
    elif file_count < 5:
        config['batch_delay'] = min(0.5, config['batch_delay'] * 2)

    # Scale chunk size with average file size.  Cast back to int: the
    # 1.5x / 0.7x scaling would otherwise silently turn this int setting
    # into a float (e.g. min(6000, 4000 * 1.5) == 6000.0).
    if avg_file_size > 1000:
        config['max_tokens_per_chunk'] = min(6000, int(config['max_tokens_per_chunk'] * 1.5))
    elif avg_file_size < 200:
        config['max_tokens_per_chunk'] = max(2000, int(config['max_tokens_per_chunk'] * 0.7))

    return config
|