#!/usr/bin/env python3
"""
Enhanced Analyzer Integration

Seamlessly integrates enhanced chunking with the existing AI Analysis Service.

Author: Senior Engineer (20+ years experience)
Version: 1.0.0
"""

import asyncio
import logging
from collections import Counter
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple

# Import existing classes (maintain compatibility)
from ai_analyze import EnhancedGitHubAnalyzer, FileAnalysis, RepositoryAnalysis
from enhanced_chunking import EnhancedFileProcessor, ENHANCED_CHUNKING_CONFIG


class EnhancedGitHubAnalyzerV2(EnhancedGitHubAnalyzer):
    """
    Enhanced version of GitHubAnalyzer with intelligent chunking.
    Maintains 100% backward compatibility while adding enhanced capabilities.
    """

    def __init__(self, api_key: str, memory_config: Dict[str, Any]):
        # Initialize the parent class
        super().__init__(api_key, memory_config)
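        # NOTE: self.client and self.memory_manager are assumed to be set by the
        # parent EnhancedGitHubAnalyzer constructor; they are reused below.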
        # Add enhanced processing capability
        self.enhanced_processor = EnhancedFileProcessor(self.client, self.memory_manager)
        self.enhanced_enabled = True  # Feature flag for easy toggling

        # Configuration
        self.chunking_config = ENHANCED_CHUNKING_CONFIG
        self.logger = logging.getLogger(__name__)

        print(f"🔍 [DEBUG] EnhancedGitHubAnalyzerV2 initialized - class: {self.__class__.__name__}")
        self.logger.info("Enhanced GitHub Analyzer V2 initialized with chunking capabilities")

    async def analyze_file_with_memory_enhanced(self, file_path: Path, content: str, repo_id: str) -> FileAnalysis:
        """
        Enhanced version of analyze_file_with_memory with intelligent chunking.
        Maintains the same interface and return type for backward compatibility.
        """
        try:
            if not self.enhanced_enabled:
                print(f"🔍 [DEBUG] Enhanced disabled, using original method for {file_path}")
                return await super().analyze_file_with_memory(file_path, content, repo_id)

            print(f"🔍 [DEBUG] Starting enhanced processing for {file_path}")
            # Use enhanced processing
            enhanced_result = await self.enhanced_processor.process_file_enhanced(
                str(file_path), content, repo_id
            )
            print(f"🔍 [DEBUG] Enhanced processing completed for {file_path}")

            # Convert to a FileAnalysis object (maintain compatibility)
            return self._convert_to_file_analysis(enhanced_result, file_path)

        except Exception as e:
            print(f"🔍 [DEBUG] Enhanced analysis failed for {file_path}: {e}")
            self.logger.error(f"Enhanced analysis failed for {file_path}, falling back to original: {e}")
            # Fall back to the original method
            return await super().analyze_file_with_memory(file_path, content, repo_id)

    async def analyze_file_with_memory(self, file_path: Path, content: str, repo_id: str) -> FileAnalysis:
        """Wrapper method to maintain compatibility with server calls."""
        return await self.analyze_file_with_memory_enhanced(file_path, content, repo_id)

    async def analyze_repository_overview_with_memory(self, repo_path: str, file_analyses: List[FileAnalysis],
                                                      context_memories: Dict, repo_id: str) -> Tuple[str, str]:
        """Wrapper method to maintain compatibility with server calls."""
        return await super().analyze_repository_overview_with_memory(repo_path, file_analyses, context_memories, repo_id)

    def create_pdf_report(self, analysis: RepositoryAnalysis, output_path: str, progress_mgr=None):
        """Wrapper method to maintain compatibility with server calls."""
        return super().create_pdf_report(analysis, output_path, progress_mgr)

    def _convert_to_file_analysis(self, enhanced_result: Dict[str, Any], file_path: Path) -> FileAnalysis:
        """Convert an enhanced analysis result to a FileAnalysis object for compatibility."""
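        # The fallbacks below appear to act as neutral defaults when the enhanced
        # result omits a field: 5.0 as a midpoint score, empty collections otherwise.
        # Adjust if your scoring scale differs (assumption, not a confirmed contract).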
        return FileAnalysis(
            path=str(file_path),
            language=enhanced_result.get('language', 'Unknown'),
            lines_of_code=enhanced_result.get('lines_of_code', 0),
            complexity_score=enhanced_result.get('complexity_score', 5.0),
            issues_found=enhanced_result.get('issues_found', []),
            recommendations=enhanced_result.get('recommendations', []),
            detailed_analysis=enhanced_result.get('detailed_analysis', ''),
            severity_score=enhanced_result.get('severity_score', 5.0)
        )

    async def analyze_repository_with_memory_enhanced(self, repo_path: str) -> RepositoryAnalysis:
        """
        Enhanced repository analysis with intelligent chunking and batch processing.
        Maintains the same interface and return type for backward compatibility.
        """
        try:
            if not self.enhanced_enabled:
                # Fall back to the original method
                return await super().analyze_repository_with_memory(repo_path)

            # Use enhanced processing with batch optimization
            return await self._analyze_repository_enhanced(repo_path)

        except Exception as e:
            self.logger.error(f"Enhanced repository analysis failed, falling back to original: {e}")
            # Fall back to the original method
            return await super().analyze_repository_with_memory(repo_path)

    async def _analyze_repository_enhanced(self, repo_path: str) -> RepositoryAnalysis:
        """Enhanced repository analysis with batch processing and chunking."""

        # Generate repo ID and check cache
        repo_id = self.calculate_repo_id(repo_path)

        # Check working memory for a recent analysis
        cached_analysis = await self.memory_manager.get_working_memory(f"repo_analysis:{repo_id}")
        if cached_analysis:
            self.logger.info("Using cached repository analysis from memory")
            # Rehydrate nested file entries so the cache-hit path returns the same
            # types as a fresh analysis (they are persisted as plain dicts)
            cached_analysis['file_analyses'] = [
                FileAnalysis(**fa) if isinstance(fa, dict) else fa
                for fa in cached_analysis.get('file_analyses', [])
            ]
            return RepositoryAnalysis(**cached_analysis)

        # Clone/access repository
        actual_repo_path = self.clone_repository(repo_path)

        # Get analysis context from memory
        context_memories = await self.get_analysis_context(repo_path, "", repo_id)

        # Scan files with enhanced processing
        files_to_analyze = self.scan_repository(actual_repo_path)

        if not files_to_analyze:
            raise Exception("No files found to analyze")

        self.logger.info(f"Starting enhanced analysis of {len(files_to_analyze)} files...")

        # Process files with batch optimization
        file_analyses = await self._process_files_with_batching(files_to_analyze, repo_id)

        # Repository-level analysis with enhanced context
        architecture_assessment, security_assessment = await self.analyze_repository_overview_with_memory(
            actual_repo_path, file_analyses, context_memories, repo_id
        )

        # Calculate the overall quality score safely (empty list yields the 5.0 default)
        valid_scores = [fa.severity_score for fa in file_analyses if fa.severity_score is not None]
        avg_quality = sum(valid_scores) / len(valid_scores) if valid_scores else 5.0

        # Generate statistics safely
        language_list = [fa.language for fa in file_analyses if fa.language is not None]
        languages = dict(Counter(language_list))
        total_lines = sum(fa.lines_of_code for fa in file_analyses if fa.lines_of_code is not None)

        # Count issues once; reused in the episodic summary and its metadata
        total_issues = sum(
            len(fa.issues_found) if isinstance(fa.issues_found, (list, tuple)) else 0
            for fa in file_analyses
        )

        # Create repository analysis
        repo_analysis = RepositoryAnalysis(
            repo_path=repo_path,
            total_files=len(file_analyses),
            total_lines=total_lines,
            languages=languages,
            architecture_assessment=architecture_assessment,
            security_assessment=security_assessment,
            code_quality_score=avg_quality,
            file_analyses=file_analyses,
            executive_summary="",
            high_quality_files=[]
        )

        # Generate executive summary with enhanced context
        repo_analysis.executive_summary = await self.generate_executive_summary_with_memory(
            repo_analysis, context_memories
        )

        # Store analysis in episodic memory
        await self.memory_manager.store_episodic_memory(
            self.session_id, "Enhanced automated repository analysis",
            f"Analyzed {repo_analysis.total_files} files with enhanced chunking, found {total_issues} issues",
            repo_id,
            {
                'repo_path': repo_path,
                'quality_score': avg_quality,
                'total_issues': total_issues,
                'analysis_type': 'enhanced_automated_comprehensive',
                'chunking_enabled': True
            }
        )

        # Cache analysis in working memory
        await self.memory_manager.store_working_memory(
            f"repo_analysis:{repo_id}",
            self._repo_analysis_to_dict(repo_analysis),
            ttl=7200  # 2 hours
        )

        return repo_analysis

    async def _process_files_with_batching(self, files_to_analyze: List[tuple], repo_id: str) -> List[FileAnalysis]:
        """Process files with intelligent batching to optimize API usage."""

        file_analyses = []
        processed_files = 0

        # Group files by size (line count) for optimal batching
        small_files = []
        medium_files = []
        large_files = []
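        # The 200/500-line cut-offs below are heuristics; tune them to your
        # model's context window and rate limits (no specific limits are assumed).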
        for file_path, content in files_to_analyze:
            line_count = len(content.split('\n'))
            if line_count < 200:
                small_files.append((file_path, content))
            elif line_count < 500:
                medium_files.append((file_path, content))
            else:
                large_files.append((file_path, content))

        # Each tier shares the same per-file flow; only the pacing delay and log
        # label differ (large files are routed through enhanced chunking)
        for tier_files, label, delay in (
            (small_files, "small", 0.05),
            (medium_files, "medium", 0.1),
            (large_files, "large", 0.2),
        ):
            if not tier_files:
                continue
            self.logger.info(f"Processing {len(tier_files)} {label} files...")
            for file_path, content in tier_files:
                try:
                    analysis = await self.analyze_file_with_memory_enhanced(
                        Path(file_path), content, repo_id
                    )
                    file_analyses.append(analysis)
                    processed_files += 1
                    await asyncio.sleep(delay)  # Pace API calls; longer for larger files
                except Exception as e:
                    self.logger.error(f"Error analyzing {label} file {file_path}: {e}")
                    continue

        self.logger.info(f"Enhanced processing completed: {processed_files}/{len(files_to_analyze)} files processed")
        return file_analyses

    def _repo_analysis_to_dict(self, repo_analysis: RepositoryAnalysis) -> Dict[str, Any]:
        """Convert RepositoryAnalysis to a dictionary for caching."""
        return {
            'repo_path': repo_analysis.repo_path,
            'total_files': repo_analysis.total_files,
            'total_lines': repo_analysis.total_lines,
            'languages': repo_analysis.languages,
            'architecture_assessment': repo_analysis.architecture_assessment,
            'security_assessment': repo_analysis.security_assessment,
            'code_quality_score': repo_analysis.code_quality_score,
            'file_analyses': [
                {
                    'path': fa.path,
                    'language': fa.language,
                    'lines_of_code': fa.lines_of_code,
                    'complexity_score': fa.complexity_score,
                    'issues_found': fa.issues_found,
                    'recommendations': fa.recommendations,
                    'detailed_analysis': fa.detailed_analysis,
                    'severity_score': fa.severity_score
                } for fa in repo_analysis.file_analyses
            ],
            'executive_summary': repo_analysis.executive_summary,
            # Persisted so RepositoryAnalysis(**cached) can rehydrate on the
            # cache-hit path without a missing-argument error
            'high_quality_files': repo_analysis.high_quality_files
        }
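    # If RepositoryAnalysis and FileAnalysis are plain dataclasses, dataclasses.asdict()
    # would likely produce the same nested structure; the explicit mapping above keeps
    # the cached shape stable and documents exactly what gets persisted.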

    def enable_enhanced_processing(self, enabled: bool = True):
        """Enable or disable enhanced processing (feature flag)."""
        self.enhanced_enabled = enabled
        self.logger.info(f"Enhanced processing {'enabled' if enabled else 'disabled'}")

    def get_processing_stats(self) -> Dict[str, Any]:
        """Get statistics about enhanced processing."""
        return {
            'enhanced_enabled': self.enhanced_enabled,
            'chunking_config': self.chunking_config,
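            # NOTE: empty placeholder; wire this up to the memory manager's
            # usage counters if it exposes them (no such API is assumed here)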
            'memory_stats': {}
        }


# Factory function for easy integration
def create_enhanced_analyzer(api_key: str, memory_config: Dict[str, Any]) -> EnhancedGitHubAnalyzerV2:
    """
    Factory function to create the enhanced analyzer.
    Drop-in replacement for the existing EnhancedGitHubAnalyzer.
    """
    return EnhancedGitHubAnalyzerV2(api_key, memory_config)


# Backward compatibility alias (intentionally shadows the imported parent name
# so existing imports from this module pick up the V2 class)
EnhancedGitHubAnalyzer = EnhancedGitHubAnalyzerV2
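

# Example usage: a minimal sketch, not part of the service wiring. It assumes
# an API key in a hypothetical "API_KEY" environment variable and a Redis-style
# memory_config; both are placeholders, not a confirmed schema.
if __name__ == "__main__":
    import os

    async def _demo():
        # Hypothetical config shape; substitute whatever fields the
        # ai_analyze memory manager actually expects
        memory_config = {"host": "localhost", "port": 6379}
        analyzer = create_enhanced_analyzer(os.environ["API_KEY"], memory_config)

        # The feature flag lets you A/B the enhanced path against the original
        analyzer.enable_enhanced_processing(True)
        print(analyzer.get_processing_stats())

        # Full run (requires real credentials and a reachable repository):
        # analysis = await analyzer.analyze_repository_with_memory_enhanced("https://github.com/org/repo")

    asyncio.run(_demo())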