#!/usr/bin/env python3
"""
Enhanced Chunking System for AI Analysis Service
Implements intelligent file chunking with zero disruption to existing flows.
Author: Senior Engineer (20+ years experience)
Version: 1.0.0
"""
import re
import os
import json
import hashlib
import asyncio
from typing import Dict, List, Optional, Tuple, Any
from dataclasses import dataclass
from pathlib import Path
import logging
@dataclass
class ChunkInfo:
"""Information about a file chunk."""
chunk_id: int
content: str
start_line: int
end_line: int
    chunk_type: str  # e.g. 'complete', 'import', 'function', 'class', 'main'
context: str
is_complete: bool
tokens_estimate: int
language: str = "Unknown" # Programming language of the chunk
@dataclass
class ChunkAnalysis:
"""Analysis result for a single chunk."""
chunk_id: int
issues_found: List[str]
recommendations: List[str]
    severity_score: float  # 1-10 score where 10 indicates the highest quality (see analysis prompts)
detailed_analysis: str
chunk_type: str
context: str
@dataclass
class FileChunkingResult:
"""Result of chunking a file."""
file_path: str
language: str
total_chunks: int
chunks: List[ChunkInfo]
is_chunked: bool
original_tokens: int
chunked_tokens: int
savings_percentage: float
class IntelligentChunker:
"""
Intelligent file chunking system that breaks large files into semantic chunks
while preserving context and relationships.
"""
def __init__(self, max_tokens_per_chunk: int = 4000, overlap_lines: int = 5):
self.max_tokens = max_tokens_per_chunk
self.overlap_lines = overlap_lines
self.logger = logging.getLogger(__name__)
# Language-specific patterns for intelligent chunking
self.language_patterns = {
'python': {
'function': r'^def\s+\w+',
'class': r'^class\s+\w+',
'import': r'^(import|from)\s+',
'comment': r'^\s*#',
'docstring': r'^\s*""".*"""'
},
'javascript': {
'function': r'^(function\s+\w+|const\s+\w+\s*=\s*(async\s+)?\(|export\s+(function|const))',
                'class': r'^(export\s+)?(default\s+)?class\s+\w+',
'import': r'^(import|const\s+\w+\s*=\s*require)',
'comment': r'^\s*//',
'jsdoc': r'^\s*/\*\*'
},
'typescript': {
'function': r'^(function\s+\w+|const\s+\w+\s*=\s*(async\s+)?\(|export\s+(function|const))',
                'class': r'^(export\s+)?(abstract\s+)?class\s+\w+',
'interface': r'^interface\s+\w+',
'import': r'^(import|const\s+\w+\s*=\s*require)',
'comment': r'^\s*//',
'jsdoc': r'^\s*/\*\*'
},
'java': {
'function': r'^\s*(public|private|protected)?\s*(static\s+)?\w+\s+\w+\s*\(',
                'class': r'^\s*(public\s+|protected\s+|private\s+|abstract\s+|final\s+)*class\s+\w+',
'import': r'^import\s+',
'comment': r'^\s*//',
'javadoc': r'^\s*/\*\*'
},
'cpp': {
'function': r'^\w+\s+\w+\s*\(',
'class': r'^class\s+\w+',
'include': r'^#include\s*<',
'comment': r'^\s*//',
'block_comment': r'^\s*/\*'
}
}
def estimate_tokens(self, text: str) -> int:
"""Estimate token count for text (rough approximation)."""
return len(text) // 4
def detect_language(self, file_path: str) -> str:
"""Detect programming language from file extension."""
ext = Path(file_path).suffix.lower()
language_map = {
'.py': 'python',
'.js': 'javascript',
'.ts': 'typescript',
'.tsx': 'typescript',
'.jsx': 'javascript',
'.java': 'java',
'.cpp': 'cpp',
'.c': 'cpp',
'.cs': 'csharp',
'.go': 'go',
'.rs': 'rust',
'.php': 'php',
'.rb': 'ruby'
}
return language_map.get(ext, 'unknown')
def chunk_file(self, file_path: str, content: str) -> FileChunkingResult:
"""
Intelligently chunk a file based on its programming language and structure.
"""
language = self.detect_language(file_path)
lines = content.split('\n')
original_tokens = self.estimate_tokens(content)
# If file is small enough, don't chunk
if original_tokens <= self.max_tokens:
return FileChunkingResult(
file_path=file_path,
language=language,
total_chunks=1,
chunks=[ChunkInfo(
chunk_id=0,
content=content,
start_line=0,
end_line=len(lines),
chunk_type='complete',
context='',
is_complete=True,
tokens_estimate=original_tokens,
language=language
)],
is_chunked=False,
original_tokens=original_tokens,
chunked_tokens=original_tokens,
savings_percentage=0.0
)
# Chunk the file intelligently
chunks = self._chunk_by_language(content, language, file_path)
# Calculate savings
chunked_tokens = sum(chunk.tokens_estimate for chunk in chunks)
savings = max(0, (original_tokens - chunked_tokens) / original_tokens * 100)
return FileChunkingResult(
file_path=file_path,
language=language,
total_chunks=len(chunks),
chunks=chunks,
is_chunked=True,
original_tokens=original_tokens,
chunked_tokens=chunked_tokens,
savings_percentage=savings
)
def _chunk_by_language(self, content: str, language: str, file_path: str) -> List[ChunkInfo]:
"""Chunk file based on language-specific patterns."""
lines = content.split('\n')
patterns = self.language_patterns.get(language, self.language_patterns['python'])
chunks = []
current_chunk = []
current_tokens = 0
chunk_id = 0
start_line = 0
# Extract imports and global declarations first
imports, main_content = self._extract_imports(lines, patterns)
if imports:
chunks.append(ChunkInfo(
chunk_id=chunk_id,
content='\n'.join(imports),
start_line=0,
end_line=len(imports),
chunk_type='import',
context='File imports and global declarations',
is_complete=True,
tokens_estimate=self.estimate_tokens('\n'.join(imports)),
language=language
))
chunk_id += 1
# Process main content
for i, line in enumerate(main_content):
current_chunk.append(line)
current_tokens += self.estimate_tokens(line)
# Check if we should create a chunk
should_chunk = (
current_tokens >= self.max_tokens or
self._is_logical_boundary(line, patterns) or
i == len(main_content) - 1
)
if should_chunk and current_chunk:
# Determine chunk type
chunk_type = self._determine_chunk_type(current_chunk, patterns)
context = self._generate_context(current_chunk, chunk_type, language)
chunks.append(ChunkInfo(
chunk_id=chunk_id,
content='\n'.join(current_chunk),
start_line=start_line,
end_line=start_line + len(current_chunk),
chunk_type=chunk_type,
context=context,
is_complete=False,
tokens_estimate=current_tokens,
language=language
))
                # Prepare for the next chunk, carrying a few overlapping lines for context.
                # Compute the advance before reassigning current_chunk so start_line moves forward.
                chunk_length = len(current_chunk)
                overlap = current_chunk[-self.overlap_lines:] if chunk_length > self.overlap_lines else []
                start_line += chunk_length - len(overlap)
                current_chunk = list(overlap)
                current_tokens = self.estimate_tokens('\n'.join(overlap))
                chunk_id += 1
return chunks
def _extract_imports(self, lines: List[str], patterns: Dict[str, str]) -> Tuple[List[str], List[str]]:
"""Extract import statements and return them separately."""
imports = []
main_content = []
for line in lines:
if re.match(patterns.get('import', r'^(import|from)'), line.strip()):
imports.append(line)
else:
main_content.append(line)
return imports, main_content
def _is_logical_boundary(self, line: str, patterns: Dict[str, str]) -> bool:
"""Check if line represents a logical boundary for chunking."""
line_stripped = line.strip()
# Function/class definitions
if (re.match(patterns.get('function', r'^def\s+'), line_stripped) or
re.match(patterns.get('class', r'^class\s+'), line_stripped)):
return True
# Major comments or documentation
if (re.match(patterns.get('comment', r'^\s*#'), line_stripped) and
len(line_stripped) > 50): # Significant comment
return True
return False
def _determine_chunk_type(self, chunk_lines: List[str], patterns: Dict[str, str]) -> str:
"""Determine the type of chunk based on its content."""
content = '\n'.join(chunk_lines)
if re.search(patterns.get('function', r'^def\s+'), content, re.MULTILINE):
return 'function'
elif re.search(patterns.get('class', r'^class\s+'), content, re.MULTILINE):
return 'class'
elif re.search(patterns.get('import', r'^(import|from)'), content, re.MULTILINE):
return 'import'
else:
return 'main'
def _generate_context(self, chunk_lines: List[str], chunk_type: str, language: str) -> str:
"""Generate contextual information for a chunk."""
if chunk_type == 'import':
return f"Import statements and global declarations for {language} file"
elif chunk_type == 'function':
return f"Function definitions and related code in {language}"
elif chunk_type == 'class':
return f"Class definitions and methods in {language}"
else:
return f"Main logic and implementation code in {language}"
class ChunkAnalyzer:
"""
Analyzes individual chunks with context awareness and combines results.
"""
def __init__(self, claude_client, memory_manager):
self.claude_client = claude_client
self.memory_manager = memory_manager
self.logger = logging.getLogger(__name__)
async def analyze_chunks(self, file_path: str, chunks: List[ChunkInfo], repo_id: str) -> List[ChunkAnalysis]:
"""Analyze all chunks of a file with context awareness."""
if len(chunks) == 1 and chunks[0].is_complete:
# Single chunk - use existing analysis
return await self._analyze_single_chunk(file_path, chunks[0], repo_id)
# Multiple chunks - analyze with context
chunk_analyses = []
for i, chunk in enumerate(chunks):
try:
analysis = await self._analyze_chunk_with_context(
file_path, chunk, i, len(chunks), repo_id
)
chunk_analyses.append(analysis)
# Small delay to respect rate limits
await asyncio.sleep(0.1)
except Exception as e:
self.logger.error(f"Error analyzing chunk {i} of {file_path}: {e}")
# Create fallback analysis
chunk_analyses.append(ChunkAnalysis(
chunk_id=chunk.chunk_id,
issues_found=[f"Analysis failed: {str(e)}"],
recommendations=["Review this section manually"],
severity_score=5.0,
detailed_analysis=f"Analysis failed due to error: {str(e)}",
chunk_type=chunk.chunk_type,
context=chunk.context
))
return chunk_analyses
async def _analyze_single_chunk(self, file_path: str, chunk: ChunkInfo, repo_id: str) -> List[ChunkAnalysis]:
"""Analyze a single complete chunk using existing logic."""
try:
# Use the existing analysis logic but optimized for single chunk
analysis_prompt = f"""
Analyze this code file for quality, security, and best practices.
File: {file_path}
Language: {chunk.language}
Code:
{chunk.content}
Provide a comprehensive analysis focusing on:
1. Code quality and maintainability
2. Security vulnerabilities
3. Performance issues
4. Best practices adherence
5. Specific recommendations for improvement
Format your response as JSON with these fields:
- issues_found: List of specific issues
- recommendations: List of improvement suggestions
- severity_score: Number from 1-10 (10 being best quality)
- detailed_analysis: Comprehensive analysis text
"""
# Make API call to Claude using the anthropic client
response = self.claude_client.messages.create(
model=os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest"),
max_tokens=2048,
messages=[{
"role": "user",
"content": analysis_prompt
}]
)
# Parse response and create analysis
response_text = response.content[0].text if response.content else ""
analysis_data = self._parse_analysis_response(response_text)
return [ChunkAnalysis(
chunk_id=chunk.chunk_id,
issues_found=analysis_data.get('issues_found', []),
recommendations=analysis_data.get('recommendations', []),
severity_score=analysis_data.get('severity_score', 5.0),
detailed_analysis=analysis_data.get('detailed_analysis', 'Analysis completed'),
chunk_type=chunk.chunk_type,
context=chunk.context
)]
except Exception as e:
self.logger.error(f"Error analyzing single chunk for {file_path}: {e}")
return [ChunkAnalysis(
chunk_id=chunk.chunk_id,
issues_found=[f"Analysis failed: {str(e)}"],
recommendations=["Review this section manually"],
severity_score=5.0,
detailed_analysis=f"Analysis failed due to error: {str(e)}",
chunk_type=chunk.chunk_type,
context=chunk.context
)]
def _parse_analysis_response(self, response: str) -> Dict[str, Any]:
"""Parse Claude's analysis response into structured data."""
try:
# Clean the response by removing invalid control characters
cleaned_response = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]', '', response)
# Try to extract JSON from response
if '{' in cleaned_response and '}' in cleaned_response:
start = cleaned_response.find('{')
end = cleaned_response.rfind('}') + 1
                json_str = cleaned_response[start:end]
                # Try the extracted JSON as-is first: well-formed, pretty-printed JSON is already valid
                try:
                    return json.loads(json_str)
                except json.JSONDecodeError:
                    pass
                # Escape raw control characters (e.g. unescaped newlines inside string values) and retry
                escaped = json_str.replace('\n', '\\n').replace('\r', '\\r').replace('\t', '\\t')
                try:
                    return json.loads(escaped)
                except json.JSONDecodeError:
                    pass
                # Repair common structural issues and retry once more
                json_str = re.sub(r',\s*}', '}', json_str)  # Remove trailing commas in objects
                json_str = re.sub(r',\s*]', ']', json_str)  # Remove trailing commas in arrays
                json_str = re.sub(r'(\w+):', r'"\1":', json_str)  # Quote unquoted keys
                try:
                    return json.loads(json_str)
                except json.JSONDecodeError:
                    # If still failing, create a structured response from the text
                    return self._create_fallback_response(cleaned_response)
else:
# Fallback parsing
return self._create_fallback_response(cleaned_response)
except Exception as e:
self.logger.error(f"Error parsing analysis response: {e}")
return self._create_fallback_response(response)
def _create_fallback_response(self, response_text: str) -> Dict[str, Any]:
"""Create a structured response when JSON parsing fails."""
# Extract basic information from the text response
issues = []
recommendations = []
# Look for common patterns in the response
if 'error' in response_text.lower() or 'issue' in response_text.lower():
issues.append('Code issues detected (parsing failed)')
if 'improve' in response_text.lower() or 'recommend' in response_text.lower():
recommendations.append('Code improvements suggested (parsing failed)')
if not issues:
issues.append('Analysis completed (detailed parsing unavailable)')
if not recommendations:
recommendations.append('Review code manually')
return {
'issues_found': issues,
'recommendations': recommendations,
'severity_score': 3.0, # Medium severity for fallback
'detailed_analysis': response_text[:500] + '...' if len(response_text) > 500 else response_text
}
async def _analyze_chunk_with_context(self, file_path: str, chunk: ChunkInfo,
chunk_index: int, total_chunks: int, repo_id: str) -> ChunkAnalysis:
"""Analyze a single chunk with file and repository context."""
# Get relevant context from memory system
context_memories = await self._get_chunk_context(file_path, chunk, repo_id)
# Build enhanced prompt with context
prompt = self._build_chunk_analysis_prompt(
file_path, chunk, chunk_index, total_chunks, context_memories
)
try:
# Rate limiting
await asyncio.sleep(0.1) # Small delay between requests
# Send to Claude API
message = self.claude_client.messages.create(
model=os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest"),
max_tokens=2048,
temperature=0.1,
messages=[{"role": "user", "content": prompt}]
)
            analysis_text = message.content[0].text.strip() if message.content else ""
# Parse the analysis
return self._parse_chunk_analysis(analysis_text, chunk)
except Exception as e:
self.logger.error(f"Claude API error for chunk {chunk_index}: {e}")
raise
async def _get_chunk_context(self, file_path: str, chunk: ChunkInfo, repo_id: str) -> Dict[str, Any]:
"""Get relevant context for chunk analysis."""
context = {
'similar_code': [],
'repository_patterns': [],
'best_practices': []
}
try:
# Search for similar code patterns
similar_code = await self.memory_manager.search_similar_code(
f"{chunk.chunk_type} {chunk.context}", repo_id, limit=3
)
context['similar_code'] = similar_code
# Get relevant best practices
best_practices = await self.memory_manager.retrieve_persistent_memories(
f"{chunk.chunk_type} best practices", limit=5
)
context['best_practices'] = best_practices
except Exception as e:
self.logger.warning(f"Could not retrieve context for chunk: {e}")
return context
def _build_chunk_analysis_prompt(self, file_path: str, chunk: ChunkInfo,
chunk_index: int, total_chunks: int,
context_memories: Dict[str, Any]) -> str:
"""Build comprehensive analysis prompt for a chunk."""
# Build context information
context_info = ""
if context_memories['similar_code']:
context_info += "\nSimilar code patterns found in repository:\n"
for similar in context_memories['similar_code'][:2]:
context_info += f"- {similar.get('file_path', 'Unknown')}: {len(similar.get('analysis_data', {}).get('issues_found', []))} issues\n"
if context_memories['best_practices']:
context_info += "\nRelevant best practices:\n"
for practice in context_memories['best_practices'][:3]:
context_info += f"- {practice['content'][:100]}...\n"
prompt = f"""
You are a senior software engineer analyzing chunk {chunk_index + 1} of {total_chunks} from file: {file_path}
CHUNK INFORMATION:
- Chunk Type: {chunk.chunk_type}
- Context: {chunk.context}
- Lines: {chunk.start_line}-{chunk.end_line}
- Estimated Tokens: {chunk.tokens_estimate}
{context_info}
CHUNK CODE:
```{self._detect_language_from_path(file_path)}
{chunk.content}
```
Provide a focused analysis of this specific chunk, considering:
1. How it fits into the overall file structure
2. Specific issues within this chunk
3. Recommendations for this chunk
4. Code quality assessment, expressed as a score in the form "N/10" (1-10 scale)
5. Security concerns specific to this chunk
6. Performance implications
Focus on actionable insights for this specific code section.
"""
return prompt
def _detect_language_from_path(self, file_path: str) -> str:
"""Detect language from file path."""
ext = Path(file_path).suffix.lower()
lang_map = {
'.py': 'python',
'.js': 'javascript',
'.ts': 'typescript',
'.tsx': 'typescript',
'.jsx': 'javascript',
'.java': 'java',
'.cpp': 'cpp',
'.c': 'cpp'
}
return lang_map.get(ext, 'text')
def _parse_chunk_analysis(self, analysis_text: str, chunk: ChunkInfo) -> ChunkAnalysis:
"""Parse Claude's analysis response for a chunk."""
# Extract severity score
severity_match = re.search(r'(\d+(?:\.\d+)?)/10', analysis_text)
severity_score = float(severity_match.group(1)) if severity_match else 5.0
# Extract issues and recommendations
issues = self._extract_issues_from_analysis(analysis_text)
recommendations = self._extract_recommendations_from_analysis(analysis_text)
return ChunkAnalysis(
chunk_id=chunk.chunk_id,
issues_found=issues,
recommendations=recommendations,
severity_score=severity_score,
detailed_analysis=analysis_text,
chunk_type=chunk.chunk_type,
context=chunk.context
)
def _extract_issues_from_analysis(self, analysis_text: str) -> List[str]:
"""Extract issues from analysis text."""
issues = []
lines = analysis_text.split('\n')
issue_keywords = ['issue', 'problem', 'bug', 'vulnerability', 'error', 'warning', 'concern']
for line in lines:
line_lower = line.lower().strip()
if any(keyword in line_lower for keyword in issue_keywords):
if line.strip() and not line.strip().startswith('#'):
issues.append(line.strip())
return issues[:10] # Limit to top 10 issues
def _extract_recommendations_from_analysis(self, analysis_text: str) -> List[str]:
"""Extract recommendations from analysis text."""
recommendations = []
lines = analysis_text.split('\n')
rec_keywords = ['recommend', 'suggest', 'should', 'consider', 'improve']
for line in lines:
line_lower = line.lower().strip()
if any(keyword in line_lower for keyword in rec_keywords):
if line.strip() and not line.strip().startswith('#'):
recommendations.append(line.strip())
return recommendations[:10] # Limit to top 10 recommendations
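# Note on dependencies (inferred from the calls above): ChunkAnalyzer expects
# `claude_client` to expose the synchronous Anthropic SDK interface
# `messages.create(model=..., max_tokens=..., messages=[...])`, and
# `memory_manager` to provide async `search_similar_code(query, repo_id, limit)`,
# `retrieve_persistent_memories(query, limit)`, and (in EnhancedFileProcessor
# below) `store_code_analysis(repo_id, file_path, data)`. Any object satisfying
# these signatures can be injected, e.g. a hypothetical stub for testing:
#
#     class _StubMemoryManager:
#         async def search_similar_code(self, query, repo_id, limit=3):
#             return []
#         async def retrieve_persistent_memories(self, query, limit=5):
#             return []
#         async def store_code_analysis(self, repo_id, file_path, data):
#             pass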
class ChunkResultCombiner:
"""
Combines analysis results from multiple chunks into a comprehensive file analysis.
"""
def __init__(self):
self.logger = logging.getLogger(__name__)
def combine_chunk_analyses(self, file_path: str, language: str,
chunk_analyses: List[ChunkAnalysis],
chunking_result: FileChunkingResult) -> Dict[str, Any]:
"""Combine multiple chunk analyses into a single file analysis."""
if not chunk_analyses:
return self._create_fallback_analysis(file_path, language)
# Combine all issues and recommendations
all_issues = []
all_recommendations = []
for analysis in chunk_analyses:
all_issues.extend(analysis.issues_found)
all_recommendations.extend(analysis.recommendations)
# Calculate overall severity score
severity_scores = [a.severity_score for a in chunk_analyses if a.severity_score > 0]
overall_severity = sum(severity_scores) / len(severity_scores) if severity_scores else 5.0
# Create comprehensive analysis
detailed_analysis = self._create_comprehensive_analysis(chunk_analyses, chunking_result)
# Calculate statistics
total_lines = sum(chunk.end_line - chunk.start_line for chunk in chunking_result.chunks)
return {
"path": file_path,
"language": language,
"lines_of_code": total_lines,
"complexity_score": self._calculate_complexity_score(chunk_analyses),
"issues_found": all_issues,
"recommendations": all_recommendations,
"detailed_analysis": detailed_analysis,
"severity_score": overall_severity,
"chunking_info": {
"total_chunks": len(chunk_analyses),
"chunked": chunking_result.is_chunked,
"savings_percentage": chunking_result.savings_percentage,
"original_tokens": chunking_result.original_tokens,
"chunked_tokens": chunking_result.chunked_tokens
}
}
def _create_fallback_analysis(self, file_path: str, language: str) -> Dict[str, Any]:
"""Create fallback analysis when chunk analysis fails."""
return {
"path": file_path,
"language": language,
"lines_of_code": 0,
"complexity_score": 5.0,
"issues_found": ["Analysis failed - manual review recommended"],
"recommendations": ["Review file manually due to analysis failure"],
"detailed_analysis": "Analysis could not be completed due to processing errors.",
"severity_score": 5.0,
"chunking_info": {
"total_chunks": 0,
"chunked": False,
"savings_percentage": 0.0,
"original_tokens": 0,
"chunked_tokens": 0
}
}
def _create_comprehensive_analysis(self, chunk_analyses: List[ChunkAnalysis],
chunking_result: FileChunkingResult) -> str:
"""Create comprehensive analysis from chunk analyses."""
analysis_parts = []
# File overview
analysis_parts.append(f"File Analysis Summary:")
analysis_parts.append(f"- Total chunks analyzed: {len(chunk_analyses)}")
analysis_parts.append(f"- Chunking efficiency: {chunking_result.savings_percentage:.1f}% token savings")
# Chunk-specific findings
for i, analysis in enumerate(chunk_analyses):
if analysis.issues_found or analysis.recommendations:
analysis_parts.append(f"\nChunk {i+1} ({analysis.chunk_type}):")
if analysis.issues_found:
if isinstance(analysis.issues_found, (list, tuple)):
analysis_parts.append(f" Issues: {len(analysis.issues_found)} found")
else:
analysis_parts.append(f" Issues: 0 found")
if analysis.recommendations:
if isinstance(analysis.recommendations, (list, tuple)):
analysis_parts.append(f" Recommendations: {len(analysis.recommendations)} provided")
else:
analysis_parts.append(f" Recommendations: 0 provided")
# Overall assessment - calculate safely
if chunk_analyses and len(chunk_analyses) > 0:
valid_scores = [a.severity_score for a in chunk_analyses if a.severity_score is not None]
avg_severity = sum(valid_scores) / len(valid_scores) if valid_scores else 5.0
else:
avg_severity = 5.0
analysis_parts.append(f"\nOverall Assessment:")
analysis_parts.append(f"- Average quality score: {avg_severity:.1f}/10")
analysis_parts.append(f"- Total issues found: {sum(len(a.issues_found) if isinstance(a.issues_found, (list, tuple)) else 0 for a in chunk_analyses)}")
analysis_parts.append(f"- Total recommendations: {sum(len(a.recommendations) if isinstance(a.recommendations, (list, tuple)) else 0 for a in chunk_analyses)}")
return '\n'.join(analysis_parts)
def _calculate_complexity_score(self, chunk_analyses: List[ChunkAnalysis]) -> float:
"""Calculate complexity score based on chunk analyses."""
if not chunk_analyses:
return 5.0
# Simple complexity calculation based on issues and severity
total_issues = sum(len(a.issues_found) if isinstance(a.issues_found, (list, tuple)) else 0 for a in chunk_analyses)
# Calculate average severity safely
if chunk_analyses and len(chunk_analyses) > 0:
valid_scores = [a.severity_score for a in chunk_analyses if a.severity_score is not None]
avg_severity = sum(valid_scores) / len(valid_scores) if valid_scores else 5.0
else:
avg_severity = 5.0
# Higher complexity = more issues + lower quality
complexity = min(10.0, (total_issues * 0.5) + (10 - avg_severity))
return complexity
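# Combiner usage sketch (illustrative): the combiner is pure Python and can be
# tested in isolation with hand-built dataclass instances, e.g.:
#
#     combiner = ChunkResultCombiner()
#     merged = combiner.combine_chunk_analyses(
#         "example.py", "python",
#         chunk_analyses=[...],             # List[ChunkAnalysis] from ChunkAnalyzer
#         chunking_result=chunking_result,  # FileChunkingResult from IntelligentChunker
#     )
#     # "merged" is a plain dict with the path/language/issues_found/... keys built above.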
class EnhancedFileProcessor:
"""
Main processor that integrates chunking with existing analysis flow.
Maintains backward compatibility while adding enhanced capabilities.
"""
def __init__(self, claude_client, memory_manager):
self.claude_client = claude_client
self.memory_manager = memory_manager
        # Defaults taken from ENHANCED_CHUNKING_CONFIG (defined at module bottom; same values as the constructor defaults)
        self.chunker = IntelligentChunker(
            max_tokens_per_chunk=ENHANCED_CHUNKING_CONFIG["max_tokens_per_chunk"],
            overlap_lines=ENHANCED_CHUNKING_CONFIG["overlap_lines"]
        )
self.analyzer = ChunkAnalyzer(claude_client, memory_manager)
self.combiner = ChunkResultCombiner()
self.logger = logging.getLogger(__name__)
async def process_file_enhanced(self, file_path: str, content: str, repo_id: str) -> Dict[str, Any]:
"""
Process a file with enhanced chunking while maintaining compatibility.
This method can be used as a drop-in replacement for existing analysis.
"""
try:
# Step 1: Chunk the file
chunking_result = self.chunker.chunk_file(file_path, content)
# Step 2: Analyze chunks
chunk_analyses = await self.analyzer.analyze_chunks(
file_path, chunking_result.chunks, repo_id
)
# Step 3: Combine results
file_analysis = self.combiner.combine_chunk_analyses(
file_path, chunking_result.language, chunk_analyses, chunking_result
)
# Step 4: Store in memory system (compatible with existing)
await self._store_enhanced_analysis(repo_id, file_path, file_analysis, chunking_result)
return file_analysis
except Exception as e:
self.logger.error(f"Enhanced processing failed for {file_path}: {e}")
# Fallback to basic analysis
return await self._fallback_analysis(file_path, content, repo_id)
async def _store_enhanced_analysis(self, repo_id: str, file_path: str,
file_analysis: Dict[str, Any],
chunking_result: FileChunkingResult):
"""Store enhanced analysis in memory system."""
try:
# Store file-level analysis (compatible with existing system)
await self.memory_manager.store_code_analysis(repo_id, file_path, file_analysis)
# Store chunking metadata for future reference
chunking_metadata = {
'chunked': chunking_result.is_chunked,
'total_chunks': chunking_result.total_chunks,
'savings_percentage': chunking_result.savings_percentage,
'original_tokens': chunking_result.original_tokens,
'chunked_tokens': chunking_result.chunked_tokens
}
# Store additional metadata (non-breaking)
enhanced_data = {**file_analysis, 'chunking_metadata': chunking_metadata}
await self.memory_manager.store_code_analysis(repo_id, f"{file_path}_enhanced", enhanced_data)
except Exception as e:
self.logger.warning(f"Could not store enhanced analysis: {e}")
async def _fallback_analysis(self, file_path: str, content: str, repo_id: str) -> Dict[str, Any]:
"""Fallback to basic analysis if enhanced processing fails."""
return {
"path": file_path,
"language": self.chunker.detect_language(file_path),
"lines_of_code": len(content.split('\n')),
"complexity_score": 5.0,
"issues_found": ["Enhanced analysis failed - using fallback"],
"recommendations": ["Review file manually"],
"detailed_analysis": "Enhanced analysis could not be completed. Basic fallback analysis used.",
"severity_score": 5.0,
"chunking_info": {
"total_chunks": 1,
"chunked": False,
"savings_percentage": 0.0,
"original_tokens": self.chunker.estimate_tokens(content),
"chunked_tokens": self.chunker.estimate_tokens(content)
}
}
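# End-to-end usage sketch (assumes an Anthropic client and a memory manager with
# the methods noted above; names below are illustrative, not a documented API):
#
#     import anthropic
#
#     async def analyze_one_file(memory_manager, path: str, source: str):
#         client = anthropic.Anthropic()  # reads ANTHROPIC_API_KEY from the environment
#         processor = EnhancedFileProcessor(client, memory_manager)
#         return await processor.process_file_enhanced(path, source, repo_id="demo-repo")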
# Configuration for enhanced chunking
ENHANCED_CHUNKING_CONFIG = {
"max_tokens_per_chunk": 4000,
"overlap_lines": 5,
"min_chunk_size": 100,
"preserve_imports": True,
"preserve_comments": True,
"enable_context_sharing": True,
"enable_memory_integration": True
}
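

# ---------------------------------------------------------------------------
# Minimal self-contained demo (sketch): exercises only IntelligentChunker, which
# needs no API access or memory manager. The synthetic input below is
# illustrative and chosen to be large enough to trigger token-threshold chunking.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    # No defs/classes in the sample, so splitting happens purely on the token threshold
    sample_source = "\n".join(
        f"value_{i} = {i} * {i}  # simple assignment" for i in range(1200)
    )
    demo_chunker = IntelligentChunker(
        max_tokens_per_chunk=ENHANCED_CHUNKING_CONFIG["max_tokens_per_chunk"],
        overlap_lines=ENHANCED_CHUNKING_CONFIG["overlap_lines"],
    )
    demo_result = demo_chunker.chunk_file("sample.py", sample_source)
    print(
        f"chunked={demo_result.is_chunked} "
        f"chunks={demo_result.total_chunks} "
        f"savings={demo_result.savings_percentage:.1f}%"
    )
    for c in demo_result.chunks[:5]:
        print(f"  chunk {c.chunk_id}: type={c.chunk_type}, ~{c.tokens_estimate} tokens")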