# Source: codenuk_backend_mine/services/ai-analysis-service/enhanced_config.py
# Snapshot: 2025-10-24 13:02:49 +05:30 — 238 lines, 8.7 KiB, Python
#!/usr/bin/env python3
"""
Enhanced Chunking Configuration
Configuration management for enhanced AI analysis system.
Author: Senior Engineer (20+ years experience)
Version: 1.0.0
"""
import os
from typing import Dict, Any
# Default configuration for enhanced chunking.
# Every value can be overridden via the environment variable named in its
# os.getenv() call; values are resolved once, at module import time.
# get_enhanced_config() applies a second, call-time override pass.
DEFAULT_ENHANCED_CONFIG = {
# Chunking parameters
"max_tokens_per_chunk": int(os.getenv('ENHANCED_MAX_TOKENS_PER_CHUNK', 4000)),  # upper token bound per chunk
"overlap_lines": int(os.getenv('ENHANCED_OVERLAP_LINES', 5)),  # lines repeated between adjacent chunks
"min_chunk_size": int(os.getenv('ENHANCED_MIN_CHUNK_SIZE', 100)),  # presumably lines — TODO confirm unit
# Processing parameters (booleans parsed from 'true'/'false' strings; anything
# other than the literal string 'true' — case-insensitive — yields False)
"preserve_imports": os.getenv('ENHANCED_PRESERVE_IMPORTS', 'true').lower() == 'true',
"preserve_comments": os.getenv('ENHANCED_PRESERVE_COMMENTS', 'true').lower() == 'true',
"enable_context_sharing": os.getenv('ENHANCED_CONTEXT_SHARING', 'true').lower() == 'true',
"enable_memory_integration": os.getenv('ENHANCED_MEMORY_INTEGRATION', 'true').lower() == 'true',
# Rate limiting for enhanced processing
"enhanced_rate_limit": int(os.getenv('ENHANCED_RATE_LIMIT', 60)), # requests per minute
"batch_delay": float(os.getenv('ENHANCED_BATCH_DELAY', 0.1)), # seconds between batches
# File size thresholds (lines) — consumed by FILE_SIZE_CATEGORIES below
"small_file_threshold": int(os.getenv('ENHANCED_SMALL_FILE_THRESHOLD', 200)), # lines
"medium_file_threshold": int(os.getenv('ENHANCED_MEDIUM_FILE_THRESHOLD', 500)), # lines
"large_file_threshold": int(os.getenv('ENHANCED_LARGE_FILE_THRESHOLD', 1000)), # lines
# Processing delays (seconds), one per file-size category
"small_file_delay": float(os.getenv('ENHANCED_SMALL_FILE_DELAY', 0.05)),
"medium_file_delay": float(os.getenv('ENHANCED_MEDIUM_FILE_DELAY', 0.1)),
"large_file_delay": float(os.getenv('ENHANCED_LARGE_FILE_DELAY', 0.2)),
# Memory and caching
"chunk_cache_ttl": int(os.getenv('ENHANCED_CHUNK_CACHE_TTL', 3600)), # seconds
"enable_chunk_caching": os.getenv('ENHANCED_CHUNK_CACHING', 'true').lower() == 'true',
# Feature flags
"enable_enhanced_processing": os.getenv('ENHANCED_PROCESSING_ENABLED', 'true').lower() == 'true',
"enable_batch_processing": os.getenv('ENHANCED_BATCH_PROCESSING', 'true').lower() == 'true',
"enable_smart_chunking": os.getenv('ENHANCED_SMART_CHUNKING', 'true').lower() == 'true',
# Fallback behavior
"fallback_on_error": os.getenv('ENHANCED_FALLBACK_ON_ERROR', 'true').lower() == 'true',  # presumably falls back to a basic pipeline — confirm against consumer
"log_enhanced_processing": os.getenv('ENHANCED_LOGGING', 'true').lower() == 'true',
}
# Language-specific chunking patterns.
# Per-language regexes (all anchored at ^, i.e. intended for line-by-line
# matching) used to locate structural boundaries when splitting source files.
LANGUAGE_CHUNKING_PATTERNS = {
'python': {
'function': r'^def\s+\w+',  # top-level defs only — indented methods won't match
'class': r'^class\s+\w+',
'import': r'^(import|from)\s+',
'comment': r'^\s*#',
'docstring': r'^\s*""".*"""',  # NOTE(review): matches single-line docstrings only; multi-line ones are missed — confirm intended
'async_function': r'^async\s+def\s+\w+'
},
'javascript': {
'function': r'^(function\s+\w+|const\s+\w+\s*=\s*(async\s+)?\(|export\s+(function|const))',
'class': r'^class\s+\w+',
'import': r'^(import|const\s+\w+\s*=\s*require)',  # covers ESM import and CommonJS require
'comment': r'^\s*//',
'jsdoc': r'^\s*/\*\*',
'arrow_function': r'^\s*\w+\s*=\s*\([^)]*\)\s*=>'
},
'typescript': {
# Superset of the javascript patterns plus interface/type declarations.
'function': r'^(function\s+\w+|const\s+\w+\s*=\s*(async\s+)?\(|export\s+(function|const))',
'class': r'^class\s+\w+',
'interface': r'^interface\s+\w+',
'type': r'^type\s+\w+',
'import': r'^(import|const\s+\w+\s*=\s*require)',
'comment': r'^\s*//',
'jsdoc': r'^\s*/\*\*',
'arrow_function': r'^\s*\w+\s*=\s*\([^)]*\)\s*=>'
},
'java': {
'function': r'^\s*(public|private|protected)?\s*(static\s+)?\w+\s+\w+\s*\(',
'class': r'^class\s+\w+',  # NOTE(review): misses modifiers ("public class Foo") — confirm intended
'interface': r'^interface\s+\w+',
'import': r'^import\s+',
'comment': r'^\s*//',
'javadoc': r'^\s*/\*\*',
'annotation': r'^@\w+'
},
'cpp': {
'function': r'^\w+\s+\w+\s*\(',  # crude: return-type + name; misses pointers/templates
'class': r'^class\s+\w+',
'include': r'^#include\s*<',  # NOTE(review): misses quoted includes (#include "...") — confirm intended
'comment': r'^\s*//',
'block_comment': r'^\s*/\*',
'namespace': r'^namespace\s+\w+'
},
'go': {
'function': r'^func\s+\w+',  # NOTE(review): methods "func (r T) Name" won't match — confirm intended
'struct': r'^type\s+\w+\s+struct',
'import': r'^import\s+',
'comment': r'^\s*//',
'package': r'^package\s+\w+'
},
'rust': {
'function': r'^fn\s+\w+',  # top-level fns; "pub fn" / indented fns won't match
'struct': r'^struct\s+\w+',
'impl': r'^impl\s+\w+',
'use': r'^use\s+',
'comment': r'^\s*//',
'module': r'^mod\s+\w+'
}
}
# File size categories for processing optimization.
# Thresholds and delays are snapshot from DEFAULT_ENHANCED_CONFIG at import
# time, so later call-time env overrides do NOT update this table.
FILE_SIZE_CATEGORIES = {
'small': {
'max_lines': DEFAULT_ENHANCED_CONFIG['small_file_threshold'],  # default 200 lines
'processing_delay': DEFAULT_ENHANCED_CONFIG['small_file_delay'],
'chunking_strategy': 'single_chunk'
},
'medium': {
'max_lines': DEFAULT_ENHANCED_CONFIG['medium_file_threshold'],  # default 500 lines
'processing_delay': DEFAULT_ENHANCED_CONFIG['medium_file_delay'],
'chunking_strategy': 'basic_chunking'
},
'large': {
'max_lines': DEFAULT_ENHANCED_CONFIG['large_file_threshold'],  # default 1000 lines
'processing_delay': DEFAULT_ENHANCED_CONFIG['large_file_delay'],
'chunking_strategy': 'intelligent_chunking'
},
'huge': {
'max_lines': float('inf'),  # catch-all for anything above 'large'
'processing_delay': DEFAULT_ENHANCED_CONFIG['large_file_delay'] * 2,
'chunking_strategy': 'advanced_chunking'
}
}
# API optimization settings (static — not environment-overridable).
API_OPTIMIZATION_CONFIG = {
'max_concurrent_requests': 3,  # parallel in-flight requests
'request_timeout': 30.0,  # seconds
'retry_attempts': 2,
'retry_delay': 1.0,  # seconds between retries
'circuit_breaker_threshold': 5,  # presumably consecutive failures before opening — confirm against consumer
'circuit_breaker_timeout': 60.0  # presumably seconds the breaker stays open — confirm against consumer
}
# Memory system integration (static — not environment-overridable).
MEMORY_INTEGRATION_CONFIG = {
'enable_episodic_memory': True,
'enable_persistent_memory': True,
'enable_working_memory': True,
'memory_retention_days': 30,
'similarity_threshold': 0.7,  # assumed 0..1 similarity score — TODO confirm scale
'context_window_size': 5  # assumed number of context items, not tokens — TODO confirm
}
def get_enhanced_config() -> Dict[str, Any]:
    """Return a fresh copy of the enhanced config with env-var overrides applied.

    Each key maps to the variable ``ENHANCED_<KEY>``; keys that already start
    with ``enhanced_`` map to ``<KEY>`` directly. The override is parsed to
    match the type of the default value (bool / int / float / str).

    Returns:
        Dict[str, Any]: a new dict each call — callers may mutate it safely.

    Raises:
        ValueError: if a numeric override cannot be parsed by int()/float().
    """
    config = DEFAULT_ENHANCED_CONFIG.copy()
    for key, value in config.items():
        # Bug fix: keys such as "enhanced_rate_limit" used to be looked up as
        # "ENHANCED_ENHANCED_RATE_LIMIT", so the documented ENHANCED_RATE_LIMIT
        # variable was never honored here. Avoid doubling the prefix.
        if key.startswith('enhanced_'):
            env_key = key.upper()
        else:
            env_key = f"ENHANCED_{key.upper()}"
        if env_key in os.environ:
            raw = os.environ[env_key]
            # bool must be tested before int: bool is a subclass of int.
            if isinstance(value, bool):
                config[key] = raw.lower() == 'true'
            elif isinstance(value, int):
                config[key] = int(raw)
            elif isinstance(value, float):
                config[key] = float(raw)
            else:
                config[key] = raw
    return config
def get_language_patterns(language: str) -> Dict[str, str]:
    """Return the per-language chunking regexes for *language*.

    The lookup is case-insensitive; unknown languages fall back to the
    Python pattern set.
    """
    normalized = language.lower()
    if normalized in LANGUAGE_CHUNKING_PATTERNS:
        return LANGUAGE_CHUNKING_PATTERNS[normalized]
    return LANGUAGE_CHUNKING_PATTERNS['python']
def get_file_size_category(file_size: int) -> str:
    """Classify a file (size in lines) into small/medium/large/huge.

    Boundaries are inclusive: a file exactly at a category's max_lines
    belongs to that category.
    """
    for category in ('small', 'medium', 'large'):
        if file_size <= FILE_SIZE_CATEGORIES[category]['max_lines']:
            return category
    # Larger than every bounded category.
    return 'huge'
def get_processing_strategy(file_size: int, language: str) -> Dict[str, Any]:
    """Build the processing strategy for a file from its size and language.

    Returns a shallow copy of the matching FILE_SIZE_CATEGORIES entry,
    augmented with the file's language and size.
    """
    category = get_file_size_category(file_size)
    strategy = dict(FILE_SIZE_CATEGORIES[category])
    strategy.update(language=language, file_size=file_size)
    return strategy
# Validation functions
def validate_enhanced_config(config: Dict[str, Any]) -> bool:
    """Check that the required numeric settings are present and positive.

    Returns True only when every required key exists, is an int or float,
    and is strictly greater than zero.
    """
    required = (
        'max_tokens_per_chunk',
        'overlap_lines',
        'min_chunk_size',
        'enhanced_rate_limit',
        'batch_delay',
    )
    return all(
        key in config
        and isinstance(config[key], (int, float))
        and config[key] > 0
        for key in required
    )
def get_optimized_config_for_repo(file_count: int, avg_file_size: int) -> Dict[str, Any]:
    """Get optimized configuration based on repository characteristics.

    Args:
        file_count: number of files in the repository.
        avg_file_size: average file size in lines.

    Returns:
        Dict[str, Any]: a fresh config dict (safe to mutate) with batch delay
        and chunk size tuned to the repository's shape.
    """
    config = get_enhanced_config()
    # Many files: halve the inter-batch delay (floored at 0.05s) to bound
    # total runtime; very few files: relax it (capped at 0.5s).
    if file_count > 20:
        config['batch_delay'] = max(0.05, config['batch_delay'] * 0.5)
    elif file_count < 5:
        config['batch_delay'] = min(0.5, config['batch_delay'] * 2)
    # Scale chunk size with average file length. Bug fix: the float multiply
    # used to leak a float (e.g. 6000.0) into this int-typed setting; cast
    # back to int so downstream type checks keep working.
    if avg_file_size > 1000:
        config['max_tokens_per_chunk'] = min(6000, int(config['max_tokens_per_chunk'] * 1.5))
    elif avg_file_size < 200:
        config['max_tokens_per_chunk'] = max(2000, int(config['max_tokens_per_chunk'] * 0.7))
    return config