codenuk_backend_mine/services/ai-analysis-service/enhanced_analyzer.py

#!/usr/bin/env python3
"""
Enhanced Analyzer Integration
Seamlessly integrates enhanced chunking with existing AI Analysis Service.
Author: Senior Engineer (20+ years experience)
Version: 1.0.0
"""
import asyncio
import logging
from typing import Dict, List, Any, Optional
from pathlib import Path
# Import existing classes (maintain compatibility)
from ai_analyze import EnhancedGitHubAnalyzer, FileAnalysis, RepositoryAnalysis
from enhanced_chunking import EnhancedFileProcessor, ENHANCED_CHUNKING_CONFIG


class EnhancedGitHubAnalyzerV2(EnhancedGitHubAnalyzer):
    """
    Enhanced version of GitHubAnalyzer with intelligent chunking.
    Maintains 100% backward compatibility while adding enhanced capabilities.
    """

    def __init__(self, api_key: str, memory_config: Dict[str, Any]):
        # Initialize parent class
        super().__init__(api_key, memory_config)

        # Add enhanced processing capability
        self.enhanced_processor = EnhancedFileProcessor(self.client, self.memory_manager)
        self.enhanced_enabled = True  # Feature flag for easy toggling

        # Configuration
        self.chunking_config = ENHANCED_CHUNKING_CONFIG
        self.logger = logging.getLogger(__name__)
        self.logger.info("Enhanced GitHub Analyzer V2 initialized with chunking capabilities")

    async def analyze_file_with_memory_enhanced(self, file_path: Path, content: str, repo_id: str) -> FileAnalysis:
        """
        Enhanced version of analyze_file_with_memory with intelligent chunking.
        Maintains exact same interface and return type for backward compatibility.
        """
        try:
            if not self.enhanced_enabled:
                print(f"🔍 [DEBUG] Enhanced disabled, using original method for {file_path}")
                return await super().analyze_file_with_memory(file_path, content, repo_id)

            print(f"🔍 [DEBUG] Starting enhanced processing for {file_path}")

            # Use enhanced processing
            enhanced_result = await self.enhanced_processor.process_file_enhanced(
                str(file_path), content, repo_id
            )
            print(f"🔍 [DEBUG] Enhanced processing completed for {file_path}")

            # Convert to FileAnalysis object (maintain compatibility)
            return self._convert_to_file_analysis(enhanced_result, file_path)
        except Exception as e:
            print(f"🔍 [DEBUG] Enhanced analysis failed for {file_path}: {e}")
            self.logger.error(f"Enhanced analysis failed for {file_path}, falling back to original: {e}")
            # Fallback to original method
            return await super().analyze_file_with_memory(file_path, content, repo_id)

    def _convert_to_file_analysis(self, enhanced_result: Dict[str, Any], file_path: Path) -> FileAnalysis:
        """Convert enhanced analysis result to FileAnalysis object for compatibility."""
        return FileAnalysis(
            path=str(file_path),
            language=enhanced_result.get('language', 'Unknown'),
            lines_of_code=enhanced_result.get('lines_of_code', 0),
            complexity_score=enhanced_result.get('complexity_score', 5.0),
            issues_found=enhanced_result.get('issues_found', []),
            recommendations=enhanced_result.get('recommendations', []),
            detailed_analysis=enhanced_result.get('detailed_analysis', ''),
            severity_score=enhanced_result.get('severity_score', 5.0)
        )

    async def analyze_repository_with_memory_enhanced(self, repo_path: str) -> RepositoryAnalysis:
        """
        Enhanced repository analysis with intelligent chunking and batch processing.
        Maintains exact same interface and return type for backward compatibility.
        """
        try:
            if not self.enhanced_enabled:
                # Fallback to original method
                return await super().analyze_repository_with_memory(repo_path)

            # Use enhanced processing with batch optimization
            return await self._analyze_repository_enhanced(repo_path)
        except Exception as e:
            self.logger.error(f"Enhanced repository analysis failed, falling back to original: {e}")
            # Fallback to original method
            return await super().analyze_repository_with_memory(repo_path)

    async def _analyze_repository_enhanced(self, repo_path: str) -> RepositoryAnalysis:
        """Enhanced repository analysis with batch processing and chunking."""
        # Generate repo ID and check cache
        repo_id = self.calculate_repo_id(repo_path)

        # Check working memory for recent analysis
        cached_analysis = await self.memory_manager.get_working_memory(f"repo_analysis:{repo_id}")
        if cached_analysis:
            self.logger.info("Using cached repository analysis from memory")
            return RepositoryAnalysis(**cached_analysis)

        # Clone/access repository
        actual_repo_path = self.clone_repository(repo_path)

        # Get analysis context from memory
        context_memories = await self.get_analysis_context(repo_path, "", repo_id)

        # Scan files with enhanced processing
        files_to_analyze = self.scan_repository(actual_repo_path)
        if not files_to_analyze:
            raise Exception("No files found to analyze")

        self.logger.info(f"Starting enhanced analysis of {len(files_to_analyze)} files...")

        # Process files with batch optimization
        file_analyses = await self._process_files_with_batching(files_to_analyze, repo_id)

        # Repository-level analysis with enhanced context
        architecture_assessment, security_assessment = await self.analyze_repository_overview_with_memory(
            actual_repo_path, file_analyses, context_memories, repo_id
        )

        # Calculate overall quality score safely
        if file_analyses:
            valid_scores = [fa.severity_score for fa in file_analyses if fa.severity_score is not None]
            avg_quality = sum(valid_scores) / len(valid_scores) if valid_scores else 5.0
        else:
            avg_quality = 5.0

        # Generate statistics safely
        from collections import Counter
        if file_analyses:
            language_list = [fa.language for fa in file_analyses if fa.language is not None]
            languages = dict(Counter(language_list))
            total_lines = sum(fa.lines_of_code for fa in file_analyses if fa.lines_of_code is not None)
        else:
            languages = {}
            total_lines = 0

        # Create repository analysis
        repo_analysis = RepositoryAnalysis(
            repo_path=repo_path,
            total_files=len(file_analyses),
            total_lines=total_lines,
            languages=languages,
            architecture_assessment=architecture_assessment,
            security_assessment=security_assessment,
            code_quality_score=avg_quality,
            file_analyses=file_analyses,
            executive_summary="",
            high_quality_files=[]
        )

        # Generate executive summary with enhanced context
        repo_analysis.executive_summary = await self.generate_executive_summary_with_memory(
            repo_analysis, context_memories
        )

        # Store analysis in episodic memory
        total_issues = sum(
            len(fa.issues_found) if isinstance(fa.issues_found, (list, tuple)) else 0
            for fa in file_analyses
        )
        await self.memory_manager.store_episodic_memory(
            self.session_id, "Enhanced automated repository analysis",
            f"Analyzed {repo_analysis.total_files} files with enhanced chunking, found {total_issues} issues",
            repo_id,
            {
                'repo_path': repo_path,
                'quality_score': avg_quality,
                'total_issues': total_issues,
                'analysis_type': 'enhanced_automated_comprehensive',
                'chunking_enabled': True
            }
        )

        # Cache analysis in working memory
        await self.memory_manager.store_working_memory(
            f"repo_analysis:{repo_id}",
            self._repo_analysis_to_dict(repo_analysis),
            ttl=7200  # 2 hours
        )

        return repo_analysis

    async def _process_files_with_batching(self, files_to_analyze: List[tuple], repo_id: str) -> List[FileAnalysis]:
        """Process files grouped by size, pacing requests to balance throughput and API usage."""
        file_analyses = []
        processed_files = 0

        # Group files by line count so each size tier gets an appropriate pacing delay
        small_files = []
        medium_files = []
        large_files = []
        for file_path, content in files_to_analyze:
            line_count = len(content.split('\n'))
            if line_count < 200:
                small_files.append((file_path, content))
            elif line_count < 500:
                medium_files.append((file_path, content))
            else:
                large_files.append((file_path, content))

        # Process small files (short pacing delay)
        if small_files:
            self.logger.info(f"Processing {len(small_files)} small files...")
            for file_path, content in small_files:
                try:
                    analysis = await self.analyze_file_with_memory_enhanced(
                        Path(file_path), content, repo_id
                    )
                    file_analyses.append(analysis)
                    processed_files += 1
                    await asyncio.sleep(0.05)  # Small delay
                except Exception as e:
                    self.logger.error(f"Error analyzing small file {file_path}: {e}")
                    continue

        # Process medium files (moderate pacing delay)
        if medium_files:
            self.logger.info(f"Processing {len(medium_files)} medium files...")
            for file_path, content in medium_files:
                try:
                    analysis = await self.analyze_file_with_memory_enhanced(
                        Path(file_path), content, repo_id
                    )
                    file_analyses.append(analysis)
                    processed_files += 1
                    await asyncio.sleep(0.1)  # Medium delay
                except Exception as e:
                    self.logger.error(f"Error analyzing medium file {file_path}: {e}")
                    continue

        # Process large files with enhanced chunking (longer pacing delay)
        if large_files:
            self.logger.info(f"Processing {len(large_files)} large files with enhanced chunking...")
            for file_path, content in large_files:
                try:
                    analysis = await self.analyze_file_with_memory_enhanced(
                        Path(file_path), content, repo_id
                    )
                    file_analyses.append(analysis)
                    processed_files += 1
                    await asyncio.sleep(0.2)  # Longer delay for large files
                except Exception as e:
                    self.logger.error(f"Error analyzing large file {file_path}: {e}")
                    continue

        self.logger.info(f"Enhanced processing completed: {processed_files}/{len(files_to_analyze)} files processed")
        return file_analyses

    def _repo_analysis_to_dict(self, repo_analysis: RepositoryAnalysis) -> Dict[str, Any]:
        """Convert RepositoryAnalysis to dictionary for caching."""
        return {
            'repo_path': repo_analysis.repo_path,
            'total_files': repo_analysis.total_files,
            'total_lines': repo_analysis.total_lines,
            'languages': repo_analysis.languages,
            'architecture_assessment': repo_analysis.architecture_assessment,
            'security_assessment': repo_analysis.security_assessment,
            'code_quality_score': repo_analysis.code_quality_score,
            'file_analyses': [
                {
                    'path': fa.path,
                    'language': fa.language,
                    'lines_of_code': fa.lines_of_code,
                    'complexity_score': fa.complexity_score,
                    'issues_found': fa.issues_found,
                    'recommendations': fa.recommendations,
                    'detailed_analysis': fa.detailed_analysis,
                    'severity_score': fa.severity_score
                } for fa in repo_analysis.file_analyses
            ],
            'executive_summary': repo_analysis.executive_summary,
            # Include high_quality_files so RepositoryAnalysis(**cached_analysis) can be
            # reconstructed from the working-memory cache without a missing-argument error.
            'high_quality_files': repo_analysis.high_quality_files
        }

    def enable_enhanced_processing(self, enabled: bool = True):
        """Enable or disable enhanced processing (feature flag)."""
        self.enhanced_enabled = enabled
        self.logger.info(f"Enhanced processing {'enabled' if enabled else 'disabled'}")

    def get_processing_stats(self) -> Dict[str, Any]:
        """Get statistics about enhanced processing."""
        return {
            'enhanced_enabled': self.enhanced_enabled,
            'chunking_config': self.chunking_config,
            'memory_stats': {}
        }


# Factory function for easy integration
def create_enhanced_analyzer(api_key: str, memory_config: Dict[str, Any]) -> EnhancedGitHubAnalyzerV2:
    """
    Factory function to create enhanced analyzer.
    Drop-in replacement for existing EnhancedGitHubAnalyzer.
    """
    return EnhancedGitHubAnalyzerV2(api_key, memory_config)


# Backward compatibility alias
EnhancedGitHubAnalyzer = EnhancedGitHubAnalyzerV2
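

# --- Example usage (illustrative sketch) ----------------------------------------------------
# A minimal way to exercise the V2 analyzer end to end when this module is run directly.
# The environment variable name and memory_config contents below are placeholders, not part
# of this service's contract: supply whatever api_key and memory settings ai_analyze expects,
# and point at a real repository path or URL.
if __name__ == "__main__":
    import os

    async def _demo() -> None:
        analyzer = create_enhanced_analyzer(
            api_key=os.environ.get("AI_ANALYSIS_API_KEY", ""),  # assumed env var name
            memory_config={},  # fill in the memory-manager settings your deployment uses
        )
        # Feature flag: passing False routes every call through the original analyzer paths.
        analyzer.enable_enhanced_processing(True)
        report = await analyzer.analyze_repository_with_memory_enhanced("https://github.com/org/repo")
        print(report.executive_summary)

    asyncio.run(_demo())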