#!/usr/bin/env python3
"""
Enhanced Analyzer Integration

Seamlessly integrates enhanced chunking with the existing AI Analysis Service.

Author: Senior Engineer (20+ years experience)
Version: 1.0.0
"""

import asyncio
import logging
from collections import Counter
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple

# Import existing classes (maintain compatibility)
from ai_analyze import EnhancedGitHubAnalyzer, FileAnalysis, RepositoryAnalysis
from enhanced_chunking import EnhancedFileProcessor, ENHANCED_CHUNKING_CONFIG


class EnhancedGitHubAnalyzerV2(EnhancedGitHubAnalyzer):
    """
    Enhanced version of GitHubAnalyzer with intelligent chunking.
    Maintains 100% backward compatibility while adding enhanced capabilities.
    """

    def __init__(self, api_key: str, memory_config: Dict[str, Any]):
        # Initialize the parent class
        super().__init__(api_key, memory_config)
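        # NOTE: self.client and self.memory_manager are assumed to be set by the
        # parent EnhancedGitHubAnalyzer constructor; they are reused below.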
        # Add enhanced processing capability
        self.enhanced_processor = EnhancedFileProcessor(self.client, self.memory_manager)
        self.enhanced_enabled = True  # Feature flag for easy toggling

        # Configuration
        self.chunking_config = ENHANCED_CHUNKING_CONFIG
        self.logger = logging.getLogger(__name__)

        print(f"🔍 [DEBUG] EnhancedGitHubAnalyzerV2 initialized - class: {self.__class__.__name__}")
        self.logger.info("Enhanced GitHub Analyzer V2 initialized with chunking capabilities")

    async def analyze_file_with_memory_enhanced(self, file_path: Path, content: str, repo_id: str) -> FileAnalysis:
        """
        Enhanced version of analyze_file_with_memory with intelligent chunking.
        Maintains the same interface and return type for backward compatibility.
        """
        try:
            if not self.enhanced_enabled:
                print(f"🔍 [DEBUG] Enhanced disabled, using original method for {file_path}")
                return await super().analyze_file_with_memory(file_path, content, repo_id)

            print(f"🔍 [DEBUG] Starting enhanced processing for {file_path}")
            # Use enhanced processing
            enhanced_result = await self.enhanced_processor.process_file_enhanced(
                str(file_path), content, repo_id
            )
            print(f"🔍 [DEBUG] Enhanced processing completed for {file_path}")

            # Convert to a FileAnalysis object (maintain compatibility)
            return self._convert_to_file_analysis(enhanced_result, file_path)

        except Exception as e:
            print(f"🔍 [DEBUG] Enhanced analysis failed for {file_path}: {e}")
            self.logger.error(f"Enhanced analysis failed for {file_path}, falling back to original: {e}")
            # Fall back to the original method
            return await super().analyze_file_with_memory(file_path, content, repo_id)

    async def analyze_file_with_memory(self, file_path: Path, content: str, repo_id: str) -> FileAnalysis:
        """Wrapper method to maintain compatibility with server calls."""
        return await self.analyze_file_with_memory_enhanced(file_path, content, repo_id)

    async def analyze_repository_overview_with_memory(self, repo_path: str, file_analyses: List[FileAnalysis],
                                                      context_memories: Dict, repo_id: str) -> Tuple[str, str]:
        """Wrapper method to maintain compatibility with server calls."""
        return await super().analyze_repository_overview_with_memory(repo_path, file_analyses, context_memories, repo_id)

    def create_pdf_report(self, analysis: RepositoryAnalysis, output_path: str, progress_mgr=None):
        """Wrapper method to maintain compatibility with server calls."""
        return super().create_pdf_report(analysis, output_path, progress_mgr)

    def _convert_to_file_analysis(self, enhanced_result: Dict[str, Any], file_path: Path) -> FileAnalysis:
        """Convert an enhanced analysis result to a FileAnalysis object for compatibility."""
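        # The fallbacks below appear to act as neutral defaults when the enhanced
        # result omits a field: 5.0 as a midpoint score, empty collections otherwise.
        # Adjust if your scoring scale differs (assumption, not a confirmed contract).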
        return FileAnalysis(
            path=str(file_path),
            language=enhanced_result.get('language', 'Unknown'),
            lines_of_code=enhanced_result.get('lines_of_code', 0),
            complexity_score=enhanced_result.get('complexity_score', 5.0),
            issues_found=enhanced_result.get('issues_found', []),
            recommendations=enhanced_result.get('recommendations', []),
            detailed_analysis=enhanced_result.get('detailed_analysis', ''),
            severity_score=enhanced_result.get('severity_score', 5.0)
        )

    async def analyze_repository_with_memory_enhanced(self, repo_path: str) -> RepositoryAnalysis:
        """
        Enhanced repository analysis with intelligent chunking and batch processing.
        Maintains the same interface and return type for backward compatibility.
        """
        try:
            if not self.enhanced_enabled:
                # Fall back to the original method
                return await super().analyze_repository_with_memory(repo_path)

            # Use enhanced processing with batch optimization
            return await self._analyze_repository_enhanced(repo_path)

        except Exception as e:
            self.logger.error(f"Enhanced repository analysis failed, falling back to original: {e}")
            # Fall back to the original method
            return await super().analyze_repository_with_memory(repo_path)

    async def _analyze_repository_enhanced(self, repo_path: str) -> RepositoryAnalysis:
        """Enhanced repository analysis with batch processing and chunking."""

        # Generate repo ID and check cache
        repo_id = self.calculate_repo_id(repo_path)

        # Check working memory for a recent analysis
        cached_analysis = await self.memory_manager.get_working_memory(f"repo_analysis:{repo_id}")
        if cached_analysis:
            self.logger.info("Using cached repository analysis from memory")
            # Rehydrate nested file entries so the cache-hit path returns the same
            # types as a fresh analysis (they are persisted as plain dicts)
            cached_analysis['file_analyses'] = [
                FileAnalysis(**fa) if isinstance(fa, dict) else fa
                for fa in cached_analysis.get('file_analyses', [])
            ]
            return RepositoryAnalysis(**cached_analysis)

        # Clone/access repository
        actual_repo_path = self.clone_repository(repo_path)

        # Get analysis context from memory
        context_memories = await self.get_analysis_context(repo_path, "", repo_id)

        # Scan files with enhanced processing
        files_to_analyze = self.scan_repository(actual_repo_path)

        if not files_to_analyze:
            raise Exception("No files found to analyze")

        self.logger.info(f"Starting enhanced analysis of {len(files_to_analyze)} files...")

        # Process files with batch optimization
        file_analyses = await self._process_files_with_batching(files_to_analyze, repo_id)

        # Repository-level analysis with enhanced context
        architecture_assessment, security_assessment = await self.analyze_repository_overview_with_memory(
            actual_repo_path, file_analyses, context_memories, repo_id
        )

        # Calculate the overall quality score safely (empty list yields the 5.0 default)
        valid_scores = [fa.severity_score for fa in file_analyses if fa.severity_score is not None]
        avg_quality = sum(valid_scores) / len(valid_scores) if valid_scores else 5.0

        # Generate statistics safely
        language_list = [fa.language for fa in file_analyses if fa.language is not None]
        languages = dict(Counter(language_list))
        total_lines = sum(fa.lines_of_code for fa in file_analyses if fa.lines_of_code is not None)

        # Count issues once; reused in the episodic summary and its metadata
        total_issues = sum(
            len(fa.issues_found) if isinstance(fa.issues_found, (list, tuple)) else 0
            for fa in file_analyses
        )

        # Create repository analysis
        repo_analysis = RepositoryAnalysis(
            repo_path=repo_path,
            total_files=len(file_analyses),
            total_lines=total_lines,
            languages=languages,
            architecture_assessment=architecture_assessment,
            security_assessment=security_assessment,
            code_quality_score=avg_quality,
            file_analyses=file_analyses,
            executive_summary="",
            high_quality_files=[]
        )

        # Generate executive summary with enhanced context
        repo_analysis.executive_summary = await self.generate_executive_summary_with_memory(
            repo_analysis, context_memories
        )

        # Store analysis in episodic memory
        await self.memory_manager.store_episodic_memory(
            self.session_id, "Enhanced automated repository analysis",
            f"Analyzed {repo_analysis.total_files} files with enhanced chunking, found {total_issues} issues",
            repo_id,
            {
                'repo_path': repo_path,
                'quality_score': avg_quality,
                'total_issues': total_issues,
                'analysis_type': 'enhanced_automated_comprehensive',
                'chunking_enabled': True
            }
        )

        # Cache analysis in working memory
        await self.memory_manager.store_working_memory(
            f"repo_analysis:{repo_id}",
            self._repo_analysis_to_dict(repo_analysis),
            ttl=7200  # 2 hours
        )

        return repo_analysis

    async def _process_files_with_batching(self, files_to_analyze: List[tuple], repo_id: str) -> List[FileAnalysis]:
        """Process files with intelligent batching to optimize API usage."""

        file_analyses = []
        processed_files = 0

        # Group files by size (line count) for optimal batching
        small_files = []
        medium_files = []
        large_files = []
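        # The 200/500-line cut-offs below are heuristics; tune them to your
        # model's context window and rate limits (no specific limits are assumed).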
        for file_path, content in files_to_analyze:
            line_count = len(content.split('\n'))
            if line_count < 200:
                small_files.append((file_path, content))
            elif line_count < 500:
                medium_files.append((file_path, content))
            else:
                large_files.append((file_path, content))

        # Each tier shares the same per-file flow; only the pacing delay and log
        # label differ (large files are routed through enhanced chunking)
        for tier_files, label, delay in (
            (small_files, "small", 0.05),
            (medium_files, "medium", 0.1),
            (large_files, "large", 0.2),
        ):
            if not tier_files:
                continue
            self.logger.info(f"Processing {len(tier_files)} {label} files...")
            for file_path, content in tier_files:
                try:
                    analysis = await self.analyze_file_with_memory_enhanced(
                        Path(file_path), content, repo_id
                    )
                    file_analyses.append(analysis)
                    processed_files += 1
                    await asyncio.sleep(delay)  # Pace API calls; longer for larger files
                except Exception as e:
                    self.logger.error(f"Error analyzing {label} file {file_path}: {e}")
                    continue

        self.logger.info(f"Enhanced processing completed: {processed_files}/{len(files_to_analyze)} files processed")
        return file_analyses

    def _repo_analysis_to_dict(self, repo_analysis: RepositoryAnalysis) -> Dict[str, Any]:
        """Convert RepositoryAnalysis to a dictionary for caching."""
        return {
            'repo_path': repo_analysis.repo_path,
            'total_files': repo_analysis.total_files,
            'total_lines': repo_analysis.total_lines,
            'languages': repo_analysis.languages,
            'architecture_assessment': repo_analysis.architecture_assessment,
            'security_assessment': repo_analysis.security_assessment,
            'code_quality_score': repo_analysis.code_quality_score,
            'file_analyses': [
                {
                    'path': fa.path,
                    'language': fa.language,
                    'lines_of_code': fa.lines_of_code,
                    'complexity_score': fa.complexity_score,
                    'issues_found': fa.issues_found,
                    'recommendations': fa.recommendations,
                    'detailed_analysis': fa.detailed_analysis,
                    'severity_score': fa.severity_score
                } for fa in repo_analysis.file_analyses
            ],
            'executive_summary': repo_analysis.executive_summary,
            # Persisted so RepositoryAnalysis(**cached) can rehydrate on the
            # cache-hit path without a missing-argument error
            'high_quality_files': repo_analysis.high_quality_files
        }
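    # If RepositoryAnalysis and FileAnalysis are plain dataclasses, dataclasses.asdict()
    # would likely produce the same nested structure; the explicit mapping above keeps
    # the cached shape stable and documents exactly what gets persisted.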

    def enable_enhanced_processing(self, enabled: bool = True):
        """Enable or disable enhanced processing (feature flag)."""
        self.enhanced_enabled = enabled
        self.logger.info(f"Enhanced processing {'enabled' if enabled else 'disabled'}")

    def get_processing_stats(self) -> Dict[str, Any]:
        """Get statistics about enhanced processing."""
        return {
            'enhanced_enabled': self.enhanced_enabled,
            'chunking_config': self.chunking_config,
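            # NOTE: empty placeholder; wire this up to the memory manager's
            # usage counters if it exposes them (no such API is assumed here)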
            'memory_stats': {}
        }


# Factory function for easy integration
def create_enhanced_analyzer(api_key: str, memory_config: Dict[str, Any]) -> EnhancedGitHubAnalyzerV2:
    """
    Factory function to create the enhanced analyzer.
    Drop-in replacement for the existing EnhancedGitHubAnalyzer.
    """
    return EnhancedGitHubAnalyzerV2(api_key, memory_config)


# Backward compatibility alias (intentionally shadows the imported parent name
# so existing imports from this module pick up the V2 class)
EnhancedGitHubAnalyzer = EnhancedGitHubAnalyzerV2
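

# Example usage: a minimal sketch, not part of the service wiring. It assumes
# an API key in a hypothetical "API_KEY" environment variable and a Redis-style
# memory_config; both are placeholders, not a confirmed schema.
if __name__ == "__main__":
    import os

    async def _demo():
        # Hypothetical config shape; substitute whatever fields the
        # ai_analyze memory manager actually expects
        memory_config = {"host": "localhost", "port": 6379}
        analyzer = create_enhanced_analyzer(os.environ["API_KEY"], memory_config)

        # The feature flag lets you A/B the enhanced path against the original
        analyzer.enable_enhanced_processing(True)
        print(analyzer.get_processing_stats())

        # Full run (requires real credentials and a reachable repository):
        # analysis = await analyzer.analyze_repository_with_memory_enhanced("https://github.com/org/repo")

    asyncio.run(_demo())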