#!/usr/bin/env python3
"""
Enhanced Chunking Configuration

Configuration management for enhanced AI analysis system.

Author: Senior Engineer (20+ years experience)
Version: 1.0.0
"""

import os
from typing import Any, Dict

# Default configuration for enhanced chunking
DEFAULT_ENHANCED_CONFIG = {
    # Chunking parameters
    "max_tokens_per_chunk": int(os.getenv('ENHANCED_MAX_TOKENS_PER_CHUNK', 4000)),
    "overlap_lines": int(os.getenv('ENHANCED_OVERLAP_LINES', 5)),
    "min_chunk_size": int(os.getenv('ENHANCED_MIN_CHUNK_SIZE', 100)),

    # Processing parameters
    "preserve_imports": os.getenv('ENHANCED_PRESERVE_IMPORTS', 'true').lower() == 'true',
    "preserve_comments": os.getenv('ENHANCED_PRESERVE_COMMENTS', 'true').lower() == 'true',
    "enable_context_sharing": os.getenv('ENHANCED_CONTEXT_SHARING', 'true').lower() == 'true',
    "enable_memory_integration": os.getenv('ENHANCED_MEMORY_INTEGRATION', 'true').lower() == 'true',

    # Rate limiting for enhanced processing
    "enhanced_rate_limit": int(os.getenv('ENHANCED_RATE_LIMIT', 60)),  # requests per minute
    "batch_delay": float(os.getenv('ENHANCED_BATCH_DELAY', 0.1)),  # seconds between batches

    # File size thresholds (lines)
    "small_file_threshold": int(os.getenv('ENHANCED_SMALL_FILE_THRESHOLD', 200)),
    "medium_file_threshold": int(os.getenv('ENHANCED_MEDIUM_FILE_THRESHOLD', 500)),
    "large_file_threshold": int(os.getenv('ENHANCED_LARGE_FILE_THRESHOLD', 1000)),

    # Processing delays (seconds)
    "small_file_delay": float(os.getenv('ENHANCED_SMALL_FILE_DELAY', 0.05)),
    "medium_file_delay": float(os.getenv('ENHANCED_MEDIUM_FILE_DELAY', 0.1)),
    "large_file_delay": float(os.getenv('ENHANCED_LARGE_FILE_DELAY', 0.2)),

    # Memory and caching
    "chunk_cache_ttl": int(os.getenv('ENHANCED_CHUNK_CACHE_TTL', 3600)),  # seconds
    "enable_chunk_caching": os.getenv('ENHANCED_CHUNK_CACHING', 'true').lower() == 'true',

    # Feature flags
    "enable_enhanced_processing": os.getenv('ENHANCED_PROCESSING_ENABLED', 'true').lower() == 'true',
    "enable_batch_processing": os.getenv('ENHANCED_BATCH_PROCESSING', 'true').lower() == 'true',
    "enable_smart_chunking": os.getenv('ENHANCED_SMART_CHUNKING', 'true').lower() == 'true',

    # Fallback behavior
    "fallback_on_error": os.getenv('ENHANCED_FALLBACK_ON_ERROR', 'true').lower() == 'true',
    "log_enhanced_processing": os.getenv('ENHANCED_LOGGING', 'true').lower() == 'true',
}

# Language-specific chunking patterns
LANGUAGE_CHUNKING_PATTERNS = {
    'python': {
        'function': r'^def\s+\w+',
        'class': r'^class\s+\w+',
        'import': r'^(import|from)\s+',
        'comment': r'^\s*#',
        'docstring': r'^\s*""".*"""',
        'async_function': r'^async\s+def\s+\w+',
    },
    'javascript': {
        'function': r'^(function\s+\w+|const\s+\w+\s*=\s*(async\s+)?\(|export\s+(function|const))',
        'class': r'^class\s+\w+',
        'import': r'^(import|const\s+\w+\s*=\s*require)',
        'comment': r'^\s*//',
        'jsdoc': r'^\s*/\*\*',
        'arrow_function': r'^\s*\w+\s*=\s*\([^)]*\)\s*=>',
    },
    'typescript': {
        'function': r'^(function\s+\w+|const\s+\w+\s*=\s*(async\s+)?\(|export\s+(function|const))',
        'class': r'^class\s+\w+',
        'interface': r'^interface\s+\w+',
        'type': r'^type\s+\w+',
        'import': r'^(import|const\s+\w+\s*=\s*require)',
        'comment': r'^\s*//',
        'jsdoc': r'^\s*/\*\*',
        'arrow_function': r'^\s*\w+\s*=\s*\([^)]*\)\s*=>',
    },
    'java': {
        'function': r'^\s*(public|private|protected)?\s*(static\s+)?\w+\s+\w+\s*\(',
        'class': r'^class\s+\w+',
        'interface': r'^interface\s+\w+',
        'import': r'^import\s+',
        'comment': r'^\s*//',
        'javadoc': r'^\s*/\*\*',
        'annotation': r'^@\w+',
    },
    'cpp': {
        'function': r'^\w+\s+\w+\s*\(',
        'class': r'^class\s+\w+',
        'include': r'^#include\s*<',
        'comment': r'^\s*//',
        'block_comment': r'^\s*/\*',
        'namespace': r'^namespace\s+\w+',
    },
    'go': {
        'function': r'^func\s+\w+',
        'struct': r'^type\s+\w+\s+struct',
        'import': r'^import\s+',
        'comment': r'^\s*//',
        'package': r'^package\s+\w+',
    },
    'rust': {
        'function': r'^fn\s+\w+',
        'struct': r'^struct\s+\w+',
        'impl': r'^impl\s+\w+',
        'use': r'^use\s+',
        'comment': r'^\s*//',
        'module': r'^mod\s+\w+',
    },
}
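# Usage sketch (illustrative only, not part of the public API): classifying a
# single source line against the Python patterns above. `sample` is a made-up
# input chosen for the example.
#
#   import re
#   sample = "async def fetch_chunks(repo):"
#   kinds = [kind for kind, pattern in LANGUAGE_CHUNKING_PATTERNS['python'].items()
#            if re.match(pattern, sample)]
#   # kinds == ['async_function']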
# File size categories for processing optimization
FILE_SIZE_CATEGORIES = {
    'small': {
        'max_lines': DEFAULT_ENHANCED_CONFIG['small_file_threshold'],
        'processing_delay': DEFAULT_ENHANCED_CONFIG['small_file_delay'],
        'chunking_strategy': 'single_chunk',
    },
    'medium': {
        'max_lines': DEFAULT_ENHANCED_CONFIG['medium_file_threshold'],
        'processing_delay': DEFAULT_ENHANCED_CONFIG['medium_file_delay'],
        'chunking_strategy': 'basic_chunking',
    },
    'large': {
        'max_lines': DEFAULT_ENHANCED_CONFIG['large_file_threshold'],
        'processing_delay': DEFAULT_ENHANCED_CONFIG['large_file_delay'],
        'chunking_strategy': 'intelligent_chunking',
    },
    'huge': {
        'max_lines': float('inf'),
        'processing_delay': DEFAULT_ENHANCED_CONFIG['large_file_delay'] * 2,
        'chunking_strategy': 'advanced_chunking',
    },
}

# API optimization settings
API_OPTIMIZATION_CONFIG = {
    'max_concurrent_requests': 3,
    'request_timeout': 30.0,  # seconds
    'retry_attempts': 2,
    'retry_delay': 1.0,  # seconds
    'circuit_breaker_threshold': 5,
    'circuit_breaker_timeout': 60.0,  # seconds
}

# Memory system integration
MEMORY_INTEGRATION_CONFIG = {
    'enable_episodic_memory': True,
    'enable_persistent_memory': True,
    'enable_working_memory': True,
    'memory_retention_days': 30,
    'similarity_threshold': 0.7,
    'context_window_size': 5,
}


def get_enhanced_config() -> Dict[str, Any]:
    """Get enhanced configuration with environment variable overrides.

    Overrides are read as ``ENHANCED_<KEY>``. Note that several defaults
    above are read under shorter names at import time (e.g.
    ``ENHANCED_RATE_LIMIT`` for ``enhanced_rate_limit``); the canonical
    ``ENHANCED_<KEY>`` form applied here takes precedence.
    """
    config = DEFAULT_ENHANCED_CONFIG.copy()

    # Override with environment variables if present
    for key, value in config.items():
        env_key = f"ENHANCED_{key.upper()}"
        if env_key in os.environ:
            # bool must be checked before int: bool is a subclass of int.
            if isinstance(value, bool):
                config[key] = os.environ[env_key].lower() == 'true'
            elif isinstance(value, int):
                config[key] = int(os.environ[env_key])
            elif isinstance(value, float):
                config[key] = float(os.environ[env_key])
            else:
                config[key] = os.environ[env_key]

    return config


def get_language_patterns(language: str) -> Dict[str, str]:
    """Get chunking patterns for a specific language (Python as fallback)."""
    return LANGUAGE_CHUNKING_PATTERNS.get(language.lower(),
                                          LANGUAGE_CHUNKING_PATTERNS['python'])


def get_file_size_category(file_size: int) -> str:
    """Determine file size category; ``file_size`` is a line count."""
    if file_size <= FILE_SIZE_CATEGORIES['small']['max_lines']:
        return 'small'
    elif file_size <= FILE_SIZE_CATEGORIES['medium']['max_lines']:
        return 'medium'
    elif file_size <= FILE_SIZE_CATEGORIES['large']['max_lines']:
        return 'large'
    else:
        return 'huge'


def get_processing_strategy(file_size: int, language: str) -> Dict[str, Any]:
    """Get processing strategy for a file based on size and language."""
    category = get_file_size_category(file_size)
    strategy = FILE_SIZE_CATEGORIES[category].copy()
    strategy['language'] = language
    strategy['file_size'] = file_size
    return strategy


# Validation functions
def validate_enhanced_config(config: Dict[str, Any]) -> bool:
    """Validate that the core numeric settings are present and positive."""
    required_keys = [
        'max_tokens_per_chunk', 'overlap_lines', 'min_chunk_size',
        'enhanced_rate_limit', 'batch_delay',
    ]
    for key in required_keys:
        if key not in config:
            return False
        if not isinstance(config[key], (int, float)) or config[key] <= 0:
            return False
    return True
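# Usage sketch (hypothetical values): pick a strategy for a 750-line Go file
# and sanity-check the resolved configuration. With the default thresholds,
# 750 lines falls in the 'large' category.
#
#   strategy = get_processing_strategy(file_size=750, language='go')
#   # strategy['chunking_strategy'] == 'intelligent_chunking'
#   # strategy['processing_delay'] == 0.2
#   assert validate_enhanced_config(get_enhanced_config())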
def get_optimized_config_for_repo(file_count: int,
                                  avg_file_size: int) -> Dict[str, Any]:
    """Get optimized configuration based on repository characteristics."""
    config = get_enhanced_config()

    # Adjust batch pacing based on file count: large repos get shorter
    # delays, very small repos can afford longer ones.
    if file_count > 20:
        config['batch_delay'] = max(0.05, config['batch_delay'] * 0.5)
    elif file_count < 5:
        config['batch_delay'] = min(0.5, config['batch_delay'] * 2)

    # Adjust chunking based on average file size (in lines), keeping the
    # token budget an int after scaling.
    if avg_file_size > 1000:
        config['max_tokens_per_chunk'] = int(min(6000, config['max_tokens_per_chunk'] * 1.5))
    elif avg_file_size < 200:
        config['max_tokens_per_chunk'] = int(max(2000, config['max_tokens_per_chunk'] * 0.7))

    return config
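
# Minimal smoke test (illustrative): run the module directly to print the
# resolved values. Output assumes no conflicting ENHANCED_* variables are set.
if __name__ == "__main__":
    cfg = get_enhanced_config()
    print(f"config valid: {validate_enhanced_config(cfg)}")
    print(f"category for 350 lines: {get_file_size_category(350)}")
    tuned = get_optimized_config_for_repo(file_count=25, avg_file_size=1200)
    print(f"batch_delay={tuned['batch_delay']}, "
          f"max_tokens_per_chunk={tuned['max_tokens_per_chunk']}")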