#!/usr/bin/env python3
"""
Enhanced Chunking Configuration

Configuration management for enhanced AI analysis system.

Author: Senior Engineer (20+ years experience)
Version: 1.0.0
"""

import os
from typing import Dict, Any
# Default configuration for enhanced chunking.
# Every value can be preset via an ENHANCED_* environment variable; the
# getenv defaults below are the built-in fallbacks.  Resolution happens once
# at import time.  NOTE(review): get_enhanced_config() applies a second
# override pass with key names derived from the dict keys — confirm which
# set of variable names deployments actually use.
DEFAULT_ENHANCED_CONFIG = {
    # Chunking parameters
    "max_tokens_per_chunk": int(os.getenv('ENHANCED_MAX_TOKENS_PER_CHUNK', 4000)),
    "overlap_lines": int(os.getenv('ENHANCED_OVERLAP_LINES', 5)),
    "min_chunk_size": int(os.getenv('ENHANCED_MIN_CHUNK_SIZE', 100)),

    # Processing parameters — boolean toggles parsed from 'true'/'false'
    # strings (any value other than 'true', case-insensitive, means False).
    "preserve_imports": os.getenv('ENHANCED_PRESERVE_IMPORTS', 'true').lower() == 'true',
    "preserve_comments": os.getenv('ENHANCED_PRESERVE_COMMENTS', 'true').lower() == 'true',
    "enable_context_sharing": os.getenv('ENHANCED_CONTEXT_SHARING', 'true').lower() == 'true',
    "enable_memory_integration": os.getenv('ENHANCED_MEMORY_INTEGRATION', 'true').lower() == 'true',

    # Rate limiting for enhanced processing
    "enhanced_rate_limit": int(os.getenv('ENHANCED_RATE_LIMIT', 60)),  # requests per minute
    "batch_delay": float(os.getenv('ENHANCED_BATCH_DELAY', 0.1)),  # seconds between batches

    # File size thresholds (line counts; consumed by FILE_SIZE_CATEGORIES)
    "small_file_threshold": int(os.getenv('ENHANCED_SMALL_FILE_THRESHOLD', 200)),  # lines
    "medium_file_threshold": int(os.getenv('ENHANCED_MEDIUM_FILE_THRESHOLD', 500)),  # lines
    "large_file_threshold": int(os.getenv('ENHANCED_LARGE_FILE_THRESHOLD', 1000)),  # lines

    # Processing delays (seconds), applied per size category
    "small_file_delay": float(os.getenv('ENHANCED_SMALL_FILE_DELAY', 0.05)),
    "medium_file_delay": float(os.getenv('ENHANCED_MEDIUM_FILE_DELAY', 0.1)),
    "large_file_delay": float(os.getenv('ENHANCED_LARGE_FILE_DELAY', 0.2)),

    # Memory and caching
    "chunk_cache_ttl": int(os.getenv('ENHANCED_CHUNK_CACHE_TTL', 3600)),  # seconds
    "enable_chunk_caching": os.getenv('ENHANCED_CHUNK_CACHING', 'true').lower() == 'true',

    # Feature flags
    "enable_enhanced_processing": os.getenv('ENHANCED_PROCESSING_ENABLED', 'true').lower() == 'true',
    "enable_batch_processing": os.getenv('ENHANCED_BATCH_PROCESSING', 'true').lower() == 'true',
    "enable_smart_chunking": os.getenv('ENHANCED_SMART_CHUNKING', 'true').lower() == 'true',

    # Fallback behavior
    "fallback_on_error": os.getenv('ENHANCED_FALLBACK_ON_ERROR', 'true').lower() == 'true',
    "log_enhanced_processing": os.getenv('ENHANCED_LOGGING', 'true').lower() == 'true',
}
|
|
|
|
# Language-specific chunking patterns.
# Each entry maps a construct name to a line-anchored regex used to locate
# chunk boundaries.  These are per-line heuristics, not full parsers.
# Fixes over the previous revision (all backward-compatible — every line
# that matched before still matches):
#   * java: 'class'/'interface' now accept access/abstract/final modifiers
#     (top-level Java classes are normally declared `public class ...`).
#   * cpp: 'include' now matches both `#include <...>` and `#include "..."`.
#   * javascript/typescript: `export` prefix accepted on class/interface/type.
#   * rust: `pub` prefix accepted on fn/struct.
LANGUAGE_CHUNKING_PATTERNS = {
    'python': {
        'function': r'^def\s+\w+',
        'class': r'^class\s+\w+',
        'import': r'^(import|from)\s+',
        'comment': r'^\s*#',
        # Heuristic: only matches docstrings that open and close on one line.
        'docstring': r'^\s*""".*"""',
        'async_function': r'^async\s+def\s+\w+'
    },
    'javascript': {
        'function': r'^(function\s+\w+|const\s+\w+\s*=\s*(async\s+)?\(|export\s+(function|const))',
        'class': r'^(export\s+)?class\s+\w+',
        'import': r'^(import|const\s+\w+\s*=\s*require)',
        'comment': r'^\s*//',
        'jsdoc': r'^\s*/\*\*',
        'arrow_function': r'^\s*\w+\s*=\s*\([^)]*\)\s*=>'
    },
    'typescript': {
        'function': r'^(function\s+\w+|const\s+\w+\s*=\s*(async\s+)?\(|export\s+(function|const))',
        'class': r'^(export\s+)?class\s+\w+',
        'interface': r'^(export\s+)?interface\s+\w+',
        'type': r'^(export\s+)?type\s+\w+',
        'import': r'^(import|const\s+\w+\s*=\s*require)',
        'comment': r'^\s*//',
        'jsdoc': r'^\s*/\*\*',
        'arrow_function': r'^\s*\w+\s*=\s*\([^)]*\)\s*=>'
    },
    'java': {
        'function': r'^\s*(public|private|protected)?\s*(static\s+)?\w+\s+\w+\s*\(',
        'class': r'^(public\s+|protected\s+|private\s+|abstract\s+|final\s+)*class\s+\w+',
        'interface': r'^(public\s+)?interface\s+\w+',
        'import': r'^import\s+',
        'comment': r'^\s*//',
        'javadoc': r'^\s*/\*\*',
        'annotation': r'^@\w+'
    },
    'cpp': {
        'function': r'^\w+\s+\w+\s*\(',
        'class': r'^class\s+\w+',
        'include': r'^#include\s*[<"]',
        'comment': r'^\s*//',
        'block_comment': r'^\s*/\*',
        'namespace': r'^namespace\s+\w+'
    },
    'go': {
        'function': r'^func\s+\w+',
        'struct': r'^type\s+\w+\s+struct',
        'import': r'^import\s+',
        'comment': r'^\s*//',
        'package': r'^package\s+\w+'
    },
    'rust': {
        'function': r'^(pub\s+)?fn\s+\w+',
        'struct': r'^(pub\s+)?struct\s+\w+',
        'impl': r'^impl\s+\w+',
        'use': r'^use\s+',
        'comment': r'^\s*//',
        'module': r'^mod\s+\w+'
    }
}
|
|
|
|
# File size categories for processing optimization.
# Ordered smallest to largest; get_file_size_category() relies on this
# ordering, classifying a file into the first category whose 'max_lines'
# it does not exceed.  Thresholds and delays come from the defaults above.
FILE_SIZE_CATEGORIES = {
    'small': {
        'max_lines': DEFAULT_ENHANCED_CONFIG['small_file_threshold'],
        'processing_delay': DEFAULT_ENHANCED_CONFIG['small_file_delay'],
        'chunking_strategy': 'single_chunk'
    },
    'medium': {
        'max_lines': DEFAULT_ENHANCED_CONFIG['medium_file_threshold'],
        'processing_delay': DEFAULT_ENHANCED_CONFIG['medium_file_delay'],
        'chunking_strategy': 'basic_chunking'
    },
    'large': {
        'max_lines': DEFAULT_ENHANCED_CONFIG['large_file_threshold'],
        'processing_delay': DEFAULT_ENHANCED_CONFIG['large_file_delay'],
        'chunking_strategy': 'intelligent_chunking'
    },
    'huge': {
        # Unbounded: everything above the 'large' threshold lands here,
        # with double the large-file delay.
        'max_lines': float('inf'),
        'processing_delay': DEFAULT_ENHANCED_CONFIG['large_file_delay'] * 2,
        'chunking_strategy': 'advanced_chunking'
    }
}
|
|
|
|
# API optimization settings (not environment-overridable, unlike
# DEFAULT_ENHANCED_CONFIG).
API_OPTIMIZATION_CONFIG = {
    'max_concurrent_requests': 3,          # in-flight API calls at once
    'request_timeout': 30.0,               # seconds per request
    'retry_attempts': 2,                   # retries after the first failure
    'retry_delay': 1.0,                    # seconds between retries
    'circuit_breaker_threshold': 5,        # failures before the breaker opens
    'circuit_breaker_timeout': 60.0        # seconds the breaker stays open
}
|
|
|
|
# Memory system integration settings, consumed by the memory subsystem
# elsewhere in the project (not referenced in this module).
MEMORY_INTEGRATION_CONFIG = {
    'enable_episodic_memory': True,
    'enable_persistent_memory': True,
    'enable_working_memory': True,
    'memory_retention_days': 30,
    # NOTE(review): presumably a 0..1 similarity cutoff for memory recall —
    # confirm the metric (cosine?) against the memory subsystem.
    'similarity_threshold': 0.7,
    'context_window_size': 5
}
|
|
|
|
def get_enhanced_config() -> Dict[str, Any]:
    """Return the enhanced configuration with environment overrides applied.

    Starts from a copy of ``DEFAULT_ENHANCED_CONFIG``; for every key, an
    environment variable named ``ENHANCED_<KEY_UPPERCASED>`` overrides the
    default, coerced to the default value's type.  Malformed numeric values
    are ignored (the default is kept) so a bad environment variable cannot
    crash configuration loading.

    Returns:
        A fresh dict; mutating it does not affect the module defaults.

    NOTE(review): the names derived here (e.g. ``ENHANCED_ENHANCED_RATE_LIMIT``
    for key ``enhanced_rate_limit``) do not always match the variables read
    when ``DEFAULT_ENHANCED_CONFIG`` was built (e.g. ``ENHANCED_RATE_LIMIT``)
    — confirm which naming scheme deployments rely on.
    """
    config = DEFAULT_ENHANCED_CONFIG.copy()

    for key, value in config.items():
        env_key = f"ENHANCED_{key.upper()}"
        if env_key not in os.environ:
            continue
        raw = os.environ[env_key]
        # bool must be tested before int: bool is a subclass of int in Python.
        if isinstance(value, bool):
            config[key] = raw.lower() == 'true'
        elif isinstance(value, int):
            try:
                config[key] = int(raw)
            except ValueError:
                pass  # keep the default rather than crash on a bad value
        elif isinstance(value, float):
            try:
                config[key] = float(raw)
            except ValueError:
                pass  # keep the default rather than crash on a bad value
        else:
            config[key] = raw

    return config
|
|
|
|
def get_language_patterns(language: str) -> Dict[str, str]:
    """Look up the chunking regexes for *language* (case-insensitive).

    Unknown languages fall back to the Python pattern set.
    """
    key = language.lower()
    if key in LANGUAGE_CHUNKING_PATTERNS:
        return LANGUAGE_CHUNKING_PATTERNS[key]
    return LANGUAGE_CHUNKING_PATTERNS['python']
|
|
|
|
def get_file_size_category(file_size: int) -> str:
    """Classify a file's line count into a processing-size bucket.

    Returns the first category in FILE_SIZE_CATEGORIES whose 'max_lines'
    is not exceeded; anything larger than 'large' is 'huge'.
    """
    for bucket in ('small', 'medium', 'large'):
        if file_size <= FILE_SIZE_CATEGORIES[bucket]['max_lines']:
            return bucket
    return 'huge'
|
|
|
|
def get_processing_strategy(file_size: int, language: str) -> Dict[str, Any]:
    """Build a per-file processing plan from the file's size bucket.

    The plan is a copy of the matching FILE_SIZE_CATEGORIES entry with the
    file's language and line count recorded alongside it.
    """
    bucket = get_file_size_category(file_size)
    plan = dict(FILE_SIZE_CATEGORIES[bucket])
    plan['language'] = language
    plan['file_size'] = file_size
    return plan
|
|
|
|
# Validation functions
def validate_enhanced_config(config: Dict[str, Any]) -> bool:
    """Return True when *config* carries all required numeric settings.

    Each required key must be present and hold a strictly positive int or
    float.  Booleans are rejected explicitly — ``bool`` is a subclass of
    ``int`` in Python, so ``True`` would otherwise slip through the numeric
    check as the value 1.
    """
    required_keys = (
        'max_tokens_per_chunk',
        'overlap_lines',
        'min_chunk_size',
        'enhanced_rate_limit',
        'batch_delay',
    )

    for key in required_keys:
        if key not in config:
            return False
        value = config[key]
        # Reject bool before the numeric check: isinstance(True, int) is True.
        if isinstance(value, bool):
            return False
        if not isinstance(value, (int, float)) or value <= 0:
            return False

    return True
|
|
|
|
def get_optimized_config_for_repo(file_count: int, avg_file_size: int) -> Dict[str, Any]:
    """Return the enhanced config tuned to a repository's characteristics.

    Args:
        file_count: Number of files to analyze.  Many files (> 20) halve the
            batch delay (floored at 0.05s); very few (< 5) double it
            (capped at 0.5s).
        avg_file_size: Average file length in lines.  Large files (> 1000)
            scale chunks up 1.5x (capped at 6000 tokens); small files
            (< 200) scale them down 0.7x (floored at 2000 tokens).

    Returns:
        A fresh config dict from get_enhanced_config() with the two
        settings adjusted.
    """
    config = get_enhanced_config()

    # Adjust batch pacing based on repository size.
    if file_count > 20:
        config['batch_delay'] = max(0.05, config['batch_delay'] * 0.5)
    elif file_count < 5:
        config['batch_delay'] = min(0.5, config['batch_delay'] * 2)

    # Scale chunk size with average file size.  Cast back to int: the
    # 1.5x / 0.7x scaling would otherwise silently turn this int setting
    # into a float (e.g. min(6000, 4000 * 1.5) == 6000.0).
    if avg_file_size > 1000:
        config['max_tokens_per_chunk'] = min(6000, int(config['max_tokens_per_chunk'] * 1.5))
    elif avg_file_size < 200:
        config['max_tokens_per_chunk'] = max(2000, int(config['max_tokens_per_chunk'] * 0.7))

    return config
|