#!/usr/bin/env python3
"""
Enhanced Chunking System for AI Analysis Service

Implements intelligent file chunking with zero disruption to existing flows.

Author: Senior Engineer (20+ years experience)
Version: 1.0.0
"""

import re
import hashlib
import asyncio
from typing import Dict, List, Optional, Tuple, Any
from dataclasses import dataclass
from pathlib import Path
import logging


@dataclass
class ChunkInfo:
    """Information about a file chunk."""
    chunk_id: int
    content: str
    start_line: int
    end_line: int
    chunk_type: str  # 'function', 'class', 'import', 'main', 'utility'
    context: str
    is_complete: bool
    tokens_estimate: int
    language: str = "Unknown"  # Programming language of the chunk


@dataclass
class ChunkAnalysis:
    """Analysis result for a single chunk."""
    chunk_id: int
    issues_found: List[str]
    recommendations: List[str]
    severity_score: float
    detailed_analysis: str
    chunk_type: str
    context: str


@dataclass
class FileChunkingResult:
    """Result of chunking a file."""
    file_path: str
    language: str
    total_chunks: int
    chunks: List[ChunkInfo]
    is_chunked: bool
    original_tokens: int
    chunked_tokens: int
    savings_percentage: float


class IntelligentChunker:
    """
    Intelligent file chunking system that breaks large files into semantic chunks
    while preserving context and relationships.
    """

    def __init__(self, max_tokens_per_chunk: int = 4000, overlap_lines: int = 5):
        self.max_tokens = max_tokens_per_chunk
        self.overlap_lines = overlap_lines
        self.logger = logging.getLogger(__name__)

        # Language-specific patterns for intelligent chunking
        self.language_patterns = {
            'python': {
                'function': r'^def\s+\w+',
                'class': r'^class\s+\w+',
                'import': r'^(import|from)\s+',
                'comment': r'^\s*#',
                'docstring': r'^\s*""".*"""'
            },
            'javascript': {
                'function': r'^(function\s+\w+|const\s+\w+\s*=\s*(async\s+)?\(|export\s+(function|const))',
                'class': r'^class\s+\w+',
                'import': r'^(import|const\s+\w+\s*=\s*require)',
                'comment': r'^\s*//',
                'jsdoc': r'^\s*/\*\*'
            },
            'typescript': {
                'function': r'^(function\s+\w+|const\s+\w+\s*=\s*(async\s+)?\(|export\s+(function|const))',
                'class': r'^class\s+\w+',
                'interface': r'^interface\s+\w+',
                'import': r'^(import|const\s+\w+\s*=\s*require)',
                'comment': r'^\s*//',
                'jsdoc': r'^\s*/\*\*'
            },
            'java': {
                'function': r'^\s*(public|private|protected)?\s*(static\s+)?\w+\s+\w+\s*\(',
                'class': r'^class\s+\w+',
                'import': r'^import\s+',
                'comment': r'^\s*//',
                'javadoc': r'^\s*/\*\*'
            },
            'cpp': {
                'function': r'^\w+\s+\w+\s*\(',
                'class': r'^class\s+\w+',
                'include': r'^#include\s*<',
                'comment': r'^\s*//',
                'block_comment': r'^\s*/\*'
            }
        }

    def estimate_tokens(self, text: str) -> int:
        """Estimate token count for text (rough approximation: ~4 characters per token)."""
        return len(text) // 4

    def detect_language(self, file_path: str) -> str:
        """Detect programming language from file extension."""
        ext = Path(file_path).suffix.lower()
        language_map = {
            '.py': 'python',
            '.js': 'javascript',
            '.ts': 'typescript',
            '.tsx': 'typescript',
            '.jsx': 'javascript',
            '.java': 'java',
            '.cpp': 'cpp',
            '.c': 'cpp',
            '.cs': 'csharp',
            '.go': 'go',
            '.rs': 'rust',
            '.php': 'php',
            '.rb': 'ruby'
        }
        return language_map.get(ext, 'unknown')

    def chunk_file(self, file_path: str, content: str) -> FileChunkingResult:
        """
        Intelligently chunk a file based on its programming language and structure.
        """
        language = self.detect_language(file_path)
        lines = content.split('\n')
        original_tokens = self.estimate_tokens(content)

        # If the file is small enough, don't chunk it
        if original_tokens <= self.max_tokens:
            return FileChunkingResult(
                file_path=file_path,
                language=language,
                total_chunks=1,
                chunks=[ChunkInfo(
                    chunk_id=0,
                    content=content,
                    start_line=0,
                    end_line=len(lines),
                    chunk_type='complete',
                    context='',
                    is_complete=True,
                    tokens_estimate=original_tokens,
                    language=language
                )],
                is_chunked=False,
                original_tokens=original_tokens,
                chunked_tokens=original_tokens,
                savings_percentage=0.0
            )

        # Chunk the file intelligently
        chunks = self._chunk_by_language(content, language, file_path)

        # Calculate savings (the overlap carried between chunks can reduce this to zero)
        chunked_tokens = sum(chunk.tokens_estimate for chunk in chunks)
        savings = max(0.0, (original_tokens - chunked_tokens) / original_tokens * 100)

        return FileChunkingResult(
            file_path=file_path,
            language=language,
            total_chunks=len(chunks),
            chunks=chunks,
            is_chunked=True,
            original_tokens=original_tokens,
            chunked_tokens=chunked_tokens,
            savings_percentage=savings
        )
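
    # Example: with the default max_tokens_per_chunk=4000 and the ~4 characters/token
    # estimate above, files up to roughly 16,000 characters come back as a single
    # 'complete' chunk; larger files go through _chunk_by_language below.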

    def _chunk_by_language(self, content: str, language: str, file_path: str) -> List[ChunkInfo]:
        """Chunk file based on language-specific patterns."""
        lines = content.split('\n')
        patterns = self.language_patterns.get(language, self.language_patterns['python'])

        chunks = []
        current_chunk = []
        current_tokens = 0
        chunk_id = 0
        start_line = 0

        # Extract imports and global declarations first
        imports, main_content = self._extract_imports(lines, patterns)
        if imports:
            chunks.append(ChunkInfo(
                chunk_id=chunk_id,
                content='\n'.join(imports),
                start_line=0,
                end_line=len(imports),
                chunk_type='import',
                context='File imports and global declarations',
                is_complete=True,
                tokens_estimate=self.estimate_tokens('\n'.join(imports)),
                language=language
            ))
            chunk_id += 1

        # Process main content. Note: start_line/end_line below are relative to
        # main_content (the file with import lines removed), not the original file.
        for i, line in enumerate(main_content):
            current_chunk.append(line)
            current_tokens += self.estimate_tokens(line)

            # Check if we should create a chunk
            should_chunk = (
                current_tokens >= self.max_tokens or
                self._is_logical_boundary(line, patterns) or
                i == len(main_content) - 1
            )

            if should_chunk and current_chunk:
                # Determine chunk type
                chunk_type = self._determine_chunk_type(current_chunk, patterns)
                context = self._generate_context(current_chunk, chunk_type, language)

                chunks.append(ChunkInfo(
                    chunk_id=chunk_id,
                    content='\n'.join(current_chunk),
                    start_line=start_line,
                    end_line=start_line + len(current_chunk),
                    chunk_type=chunk_type,
                    context=context,
                    is_complete=False,
                    tokens_estimate=current_tokens,
                    language=language
                ))

                # Prepare for next chunk with overlap. Advance start_line before
                # resetting current_chunk so line tracking stays correct.
                overlap = current_chunk[-self.overlap_lines:] if len(current_chunk) > self.overlap_lines else []
                start_line += len(current_chunk) - len(overlap)
                current_chunk = overlap
                current_tokens = self.estimate_tokens('\n'.join(overlap))
                chunk_id += 1

        return chunks

    def _extract_imports(self, lines: List[str], patterns: Dict[str, str]) -> Tuple[List[str], List[str]]:
        """Extract import statements and return them separately."""
        imports = []
        main_content = []

        for line in lines:
            if re.match(patterns.get('import', r'^(import|from)'), line.strip()):
                imports.append(line)
            else:
                main_content.append(line)

        return imports, main_content

    def _is_logical_boundary(self, line: str, patterns: Dict[str, str]) -> bool:
        """Check if a line represents a logical boundary for chunking."""
        line_stripped = line.strip()

        # Function/class definitions
        if (re.match(patterns.get('function', r'^def\s+'), line_stripped) or
                re.match(patterns.get('class', r'^class\s+'), line_stripped)):
            return True

        # Major comments or documentation
        if (re.match(patterns.get('comment', r'^\s*#'), line_stripped) and
                len(line_stripped) > 50):  # Significant comment
            return True

        return False

    def _determine_chunk_type(self, chunk_lines: List[str], patterns: Dict[str, str]) -> str:
        """Determine the type of chunk based on its content."""
        content = '\n'.join(chunk_lines)

        if re.search(patterns.get('function', r'^def\s+'), content, re.MULTILINE):
            return 'function'
        elif re.search(patterns.get('class', r'^class\s+'), content, re.MULTILINE):
            return 'class'
        elif re.search(patterns.get('import', r'^(import|from)'), content, re.MULTILINE):
            return 'import'
        else:
            return 'main'

    def _generate_context(self, chunk_lines: List[str], chunk_type: str, language: str) -> str:
        """Generate contextual information for a chunk."""
        if chunk_type == 'import':
            return f"Import statements and global declarations for {language} file"
        elif chunk_type == 'function':
            return f"Function definitions and related code in {language}"
        elif chunk_type == 'class':
            return f"Class definitions and methods in {language}"
        else:
            return f"Main logic and implementation code in {language}"


class ChunkAnalyzer:
    """
    Analyzes individual chunks with context awareness.
    """

    def __init__(self, claude_client, memory_manager):
        self.claude_client = claude_client
        self.memory_manager = memory_manager
        self.logger = logging.getLogger(__name__)

    async def analyze_chunks(self, file_path: str, chunks: List[ChunkInfo], repo_id: str) -> List[ChunkAnalysis]:
        """Analyze all chunks of a file with context awareness."""
        if len(chunks) == 1 and chunks[0].is_complete:
            # Single chunk - use existing analysis
            return await self._analyze_single_chunk(file_path, chunks[0], repo_id)

        # Multiple chunks - analyze with context
        chunk_analyses = []

        for i, chunk in enumerate(chunks):
            try:
                analysis = await self._analyze_chunk_with_context(
                    file_path, chunk, i, len(chunks), repo_id
                )
                chunk_analyses.append(analysis)

                # Small delay to respect rate limits
                await asyncio.sleep(0.1)

            except Exception as e:
                self.logger.error(f"Error analyzing chunk {i} of {file_path}: {e}")
                # Create a fallback analysis so one failed chunk does not fail the whole file
                chunk_analyses.append(ChunkAnalysis(
                    chunk_id=chunk.chunk_id,
                    issues_found=[f"Analysis failed: {str(e)}"],
                    recommendations=["Review this section manually"],
                    severity_score=5.0,
                    detailed_analysis=f"Analysis failed due to error: {str(e)}",
                    chunk_type=chunk.chunk_type,
                    context=chunk.context
                ))

        return chunk_analyses

    async def _analyze_single_chunk(self, file_path: str, chunk: ChunkInfo, repo_id: str) -> List[ChunkAnalysis]:
        """Analyze a single complete chunk using existing logic."""
        try:
            # Use the existing analysis logic but optimized for single chunk
            analysis_prompt = f"""
Analyze this code file for quality, security, and best practices.

File: {file_path}
Language: {chunk.language}

Code:
{chunk.content}

Provide a comprehensive analysis focusing on:
1. Code quality and maintainability
2. Security vulnerabilities
3. Performance issues
4. Best practices adherence
5. Specific recommendations for improvement

Format your response as JSON with these fields:
- issues_found: List of specific issues
- recommendations: List of improvement suggestions
- severity_score: Number from 1-10 (10 being best quality)
- detailed_analysis: Comprehensive analysis text
"""

            # Make API call to Claude using the anthropic client
            response = self.claude_client.messages.create(
                model="claude-3-5-sonnet-20241022",
                max_tokens=2048,
                messages=[{
                    "role": "user",
                    "content": analysis_prompt
                }]
            )

            # Parse response and create analysis
            response_text = response.content[0].text if response.content else ""
            analysis_data = self._parse_analysis_response(response_text)

            return [ChunkAnalysis(
                chunk_id=chunk.chunk_id,
                issues_found=analysis_data.get('issues_found', []),
                recommendations=analysis_data.get('recommendations', []),
                severity_score=analysis_data.get('severity_score', 5.0),
                detailed_analysis=analysis_data.get('detailed_analysis', 'Analysis completed'),
                chunk_type=chunk.chunk_type,
                context=chunk.context
            )]

        except Exception as e:
            self.logger.error(f"Error analyzing single chunk for {file_path}: {e}")
            return [ChunkAnalysis(
                chunk_id=chunk.chunk_id,
                issues_found=[f"Analysis failed: {str(e)}"],
                recommendations=["Review this section manually"],
                severity_score=5.0,
                detailed_analysis=f"Analysis failed due to error: {str(e)}",
                chunk_type=chunk.chunk_type,
                context=chunk.context
            )]

    def _parse_analysis_response(self, response: str) -> Dict[str, Any]:
        """Parse Claude's analysis response into structured data."""
        try:
            import json
            # Try to extract JSON from response
            if '{' in response and '}' in response:
                start = response.find('{')
                end = response.rfind('}') + 1
                json_str = response[start:end]
                return json.loads(json_str)
            else:
                # Fallback parsing
                return {
                    'issues_found': ['Unable to parse specific issues'],
                    'recommendations': ['Review code manually'],
                    'severity_score': 5.0,
                    'detailed_analysis': response
                }
        except Exception as e:
            self.logger.error(f"Error parsing analysis response: {e}")
            return {
                'issues_found': ['Analysis parsing failed'],
                'recommendations': ['Review code manually'],
                'severity_score': 5.0,
                'detailed_analysis': response
            }

    async def _analyze_chunk_with_context(self, file_path: str, chunk: ChunkInfo,
                                          chunk_index: int, total_chunks: int, repo_id: str) -> ChunkAnalysis:
        """Analyze a single chunk with file and repository context."""

        # Get relevant context from memory system
        context_memories = await self._get_chunk_context(file_path, chunk, repo_id)

        # Build enhanced prompt with context
        prompt = self._build_chunk_analysis_prompt(
            file_path, chunk, chunk_index, total_chunks, context_memories
        )

        try:
            # Rate limiting
            await asyncio.sleep(0.1)  # Small delay between requests

            # Send to Claude API
            message = self.claude_client.messages.create(
                model="claude-3-5-sonnet-20241022",
                max_tokens=2048,
                temperature=0.1,
                messages=[{"role": "user", "content": prompt}]
            )

            analysis_text = message.content[0].text.strip()

            # Parse the analysis
            return self._parse_chunk_analysis(analysis_text, chunk)

        except Exception as e:
            self.logger.error(f"Claude API error for chunk {chunk_index}: {e}")
            raise

    async def _get_chunk_context(self, file_path: str, chunk: ChunkInfo, repo_id: str) -> Dict[str, Any]:
        """Get relevant context for chunk analysis."""
        context = {
            'similar_code': [],
            'repository_patterns': [],
            'best_practices': []
        }

        try:
            # Search for similar code patterns
            similar_code = await self.memory_manager.search_similar_code(
                f"{chunk.chunk_type} {chunk.context}", repo_id, limit=3
            )
            context['similar_code'] = similar_code

            # Get relevant best practices
            best_practices = await self.memory_manager.retrieve_persistent_memories(
                f"{chunk.chunk_type} best practices", limit=5
            )
            context['best_practices'] = best_practices

        except Exception as e:
            self.logger.warning(f"Could not retrieve context for chunk: {e}")

        return context

    def _build_chunk_analysis_prompt(self, file_path: str, chunk: ChunkInfo,
                                     chunk_index: int, total_chunks: int,
                                     context_memories: Dict[str, Any]) -> str:
        """Build comprehensive analysis prompt for a chunk."""

        # Build context information
        context_info = ""
        if context_memories['similar_code']:
            context_info += "\nSimilar code patterns found in repository:\n"
            for similar in context_memories['similar_code'][:2]:
                context_info += f"- {similar.get('file_path', 'Unknown')}: {len(similar.get('analysis_data', {}).get('issues_found', []))} issues\n"

        if context_memories['best_practices']:
            context_info += "\nRelevant best practices:\n"
            for practice in context_memories['best_practices'][:3]:
                context_info += f"- {practice['content'][:100]}...\n"

        prompt = f"""
You are a senior software engineer analyzing chunk {chunk_index + 1} of {total_chunks} from file: {file_path}

CHUNK INFORMATION:
- Chunk Type: {chunk.chunk_type}
- Context: {chunk.context}
- Lines: {chunk.start_line}-{chunk.end_line}
- Estimated Tokens: {chunk.tokens_estimate}

{context_info}

CHUNK CODE:
```{self._detect_language_from_path(file_path)}
{chunk.content}
```

Provide a focused analysis of this specific chunk, considering:
1. How it fits into the overall file structure
2. Specific issues within this chunk
3. Recommendations for this chunk
4. Code quality assessment (1-10 scale)
5. Security concerns specific to this chunk
6. Performance implications

Focus on actionable insights for this specific code section.
"""
        return prompt

    def _detect_language_from_path(self, file_path: str) -> str:
        """Detect language from the file extension (used for the prompt's code fence)."""
        ext = Path(file_path).suffix.lower()
        lang_map = {
            '.py': 'python',
            '.js': 'javascript',
            '.ts': 'typescript',
            '.tsx': 'typescript',
            '.jsx': 'javascript',
            '.java': 'java',
            '.cpp': 'cpp',
            '.c': 'cpp'
        }
        return lang_map.get(ext, 'text')

    def _parse_chunk_analysis(self, analysis_text: str, chunk: ChunkInfo) -> ChunkAnalysis:
        """Parse Claude's analysis response for a chunk."""

        # Extract a severity score of the form "N/10" if present
        severity_match = re.search(r'(\d+(?:\.\d+)?)/10', analysis_text)
        severity_score = float(severity_match.group(1)) if severity_match else 5.0

        # Extract issues and recommendations
        issues = self._extract_issues_from_analysis(analysis_text)
        recommendations = self._extract_recommendations_from_analysis(analysis_text)

        return ChunkAnalysis(
            chunk_id=chunk.chunk_id,
            issues_found=issues,
            recommendations=recommendations,
            severity_score=severity_score,
            detailed_analysis=analysis_text,
            chunk_type=chunk.chunk_type,
            context=chunk.context
        )

    def _extract_issues_from_analysis(self, analysis_text: str) -> List[str]:
        """Extract issues from analysis text."""
        issues = []
        lines = analysis_text.split('\n')

        issue_keywords = ['issue', 'problem', 'bug', 'vulnerability', 'error', 'warning', 'concern']

        for line in lines:
            line_lower = line.lower().strip()
            if any(keyword in line_lower for keyword in issue_keywords):
                if line.strip() and not line.strip().startswith('#'):
                    issues.append(line.strip())

        return issues[:10]  # Limit to top 10 issues

    def _extract_recommendations_from_analysis(self, analysis_text: str) -> List[str]:
        """Extract recommendations from analysis text."""
        recommendations = []
        lines = analysis_text.split('\n')

        rec_keywords = ['recommend', 'suggest', 'should', 'consider', 'improve']

        for line in lines:
            line_lower = line.lower().strip()
            if any(keyword in line_lower for keyword in rec_keywords):
                if line.strip() and not line.strip().startswith('#'):
                    recommendations.append(line.strip())

        return recommendations[:10]  # Limit to top 10 recommendations
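

# Sketch of driving ChunkAnalyzer directly (assumed names, not defined here: an
# Anthropic-style `claude_client` and a `memory_manager` exposing the async methods
# used above):
#
#     analyzer = ChunkAnalyzer(claude_client, memory_manager)
#     chunk_analyses = await analyzer.analyze_chunks(path, chunking_result.chunks, repo_id)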


class ChunkResultCombiner:
    """
    Combines analysis results from multiple chunks into a comprehensive file analysis.
    """

    def __init__(self):
        self.logger = logging.getLogger(__name__)

    def combine_chunk_analyses(self, file_path: str, language: str,
                               chunk_analyses: List[ChunkAnalysis],
                               chunking_result: FileChunkingResult) -> Dict[str, Any]:
        """Combine multiple chunk analyses into a single file analysis."""

        if not chunk_analyses:
            return self._create_fallback_analysis(file_path, language)

        # Combine all issues and recommendations
        all_issues = []
        all_recommendations = []

        for analysis in chunk_analyses:
            all_issues.extend(analysis.issues_found)
            all_recommendations.extend(analysis.recommendations)

        # Calculate overall severity score
        severity_scores = [a.severity_score for a in chunk_analyses if a.severity_score > 0]
        overall_severity = sum(severity_scores) / len(severity_scores) if severity_scores else 5.0

        # Create comprehensive analysis
        detailed_analysis = self._create_comprehensive_analysis(chunk_analyses, chunking_result)

        # Calculate statistics
        total_lines = sum(chunk.end_line - chunk.start_line for chunk in chunking_result.chunks)

        return {
            "path": file_path,
            "language": language,
            "lines_of_code": total_lines,
            "complexity_score": self._calculate_complexity_score(chunk_analyses),
            "issues_found": all_issues,
            "recommendations": all_recommendations,
            "detailed_analysis": detailed_analysis,
            "severity_score": overall_severity,
            "chunking_info": {
                "total_chunks": len(chunk_analyses),
                "chunked": chunking_result.is_chunked,
                "savings_percentage": chunking_result.savings_percentage,
                "original_tokens": chunking_result.original_tokens,
                "chunked_tokens": chunking_result.chunked_tokens
            }
        }

    def _create_fallback_analysis(self, file_path: str, language: str) -> Dict[str, Any]:
        """Create fallback analysis when chunk analysis fails."""
        return {
            "path": file_path,
            "language": language,
            "lines_of_code": 0,
            "complexity_score": 5.0,
            "issues_found": ["Analysis failed - manual review recommended"],
            "recommendations": ["Review file manually due to analysis failure"],
            "detailed_analysis": "Analysis could not be completed due to processing errors.",
            "severity_score": 5.0,
            "chunking_info": {
                "total_chunks": 0,
                "chunked": False,
                "savings_percentage": 0.0,
                "original_tokens": 0,
                "chunked_tokens": 0
            }
        }

    def _create_comprehensive_analysis(self, chunk_analyses: List[ChunkAnalysis],
                                       chunking_result: FileChunkingResult) -> str:
        """Create a comprehensive analysis summary from chunk analyses."""

        analysis_parts = []

        # File overview
        analysis_parts.append("File Analysis Summary:")
        analysis_parts.append(f"- Total chunks analyzed: {len(chunk_analyses)}")
        analysis_parts.append(f"- Chunking efficiency: {chunking_result.savings_percentage:.1f}% token savings")

        # Chunk-specific findings
        for i, analysis in enumerate(chunk_analyses):
            if analysis.issues_found or analysis.recommendations:
                analysis_parts.append(f"\nChunk {i+1} ({analysis.chunk_type}):")
                if analysis.issues_found:
                    issue_count = len(analysis.issues_found) if isinstance(analysis.issues_found, (list, tuple)) else 0
                    analysis_parts.append(f"  Issues: {issue_count} found")
                if analysis.recommendations:
                    rec_count = len(analysis.recommendations) if isinstance(analysis.recommendations, (list, tuple)) else 0
                    analysis_parts.append(f"  Recommendations: {rec_count} provided")

        # Overall assessment - calculated defensively against missing scores
        valid_scores = [a.severity_score for a in chunk_analyses if a.severity_score is not None]
        avg_severity = sum(valid_scores) / len(valid_scores) if valid_scores else 5.0

        analysis_parts.append("\nOverall Assessment:")
        analysis_parts.append(f"- Average quality score: {avg_severity:.1f}/10")
        analysis_parts.append(f"- Total issues found: {sum(len(a.issues_found) if isinstance(a.issues_found, (list, tuple)) else 0 for a in chunk_analyses)}")
        analysis_parts.append(f"- Total recommendations: {sum(len(a.recommendations) if isinstance(a.recommendations, (list, tuple)) else 0 for a in chunk_analyses)}")

        return '\n'.join(analysis_parts)

    def _calculate_complexity_score(self, chunk_analyses: List[ChunkAnalysis]) -> float:
        """Calculate a complexity score based on chunk analyses."""
        if not chunk_analyses:
            return 5.0

        # Simple complexity heuristic based on issue count and average quality
        total_issues = sum(len(a.issues_found) if isinstance(a.issues_found, (list, tuple)) else 0 for a in chunk_analyses)
        valid_scores = [a.severity_score for a in chunk_analyses if a.severity_score is not None]
        avg_severity = sum(valid_scores) / len(valid_scores) if valid_scores else 5.0

        # Higher complexity = more issues + lower quality, capped at 10
        complexity = min(10.0, (total_issues * 0.5) + (10 - avg_severity))
        return complexity


class EnhancedFileProcessor:
    """
    Main processor that integrates chunking with the existing analysis flow.
    Maintains backward compatibility while adding enhanced capabilities.
    """

    def __init__(self, claude_client, memory_manager):
        self.claude_client = claude_client
        self.memory_manager = memory_manager
        self.chunker = IntelligentChunker()
        self.analyzer = ChunkAnalyzer(claude_client, memory_manager)
        self.combiner = ChunkResultCombiner()
        self.logger = logging.getLogger(__name__)

    async def process_file_enhanced(self, file_path: str, content: str, repo_id: str) -> Dict[str, Any]:
        """
        Process a file with enhanced chunking while maintaining compatibility.
        This method can be used as a drop-in replacement for existing analysis.
        """
        try:
            # Step 1: Chunk the file
            chunking_result = self.chunker.chunk_file(file_path, content)

            # Step 2: Analyze chunks
            chunk_analyses = await self.analyzer.analyze_chunks(
                file_path, chunking_result.chunks, repo_id
            )

            # Step 3: Combine results
            file_analysis = self.combiner.combine_chunk_analyses(
                file_path, chunking_result.language, chunk_analyses, chunking_result
            )

            # Step 4: Store in memory system (compatible with existing)
            await self._store_enhanced_analysis(repo_id, file_path, file_analysis, chunking_result)

            return file_analysis

        except Exception as e:
            self.logger.error(f"Enhanced processing failed for {file_path}: {e}")
            # Fallback to basic analysis
            return await self._fallback_analysis(file_path, content, repo_id)

    async def _store_enhanced_analysis(self, repo_id: str, file_path: str,
                                       file_analysis: Dict[str, Any],
                                       chunking_result: FileChunkingResult):
        """Store enhanced analysis in memory system."""
        try:
            # Store file-level analysis (compatible with existing system)
            await self.memory_manager.store_code_analysis(repo_id, file_path, file_analysis)

            # Store chunking metadata for future reference
            chunking_metadata = {
                'chunked': chunking_result.is_chunked,
                'total_chunks': chunking_result.total_chunks,
                'savings_percentage': chunking_result.savings_percentage,
                'original_tokens': chunking_result.original_tokens,
                'chunked_tokens': chunking_result.chunked_tokens
            }

            # Store additional metadata under a separate key (non-breaking)
            enhanced_data = {**file_analysis, 'chunking_metadata': chunking_metadata}
            await self.memory_manager.store_code_analysis(repo_id, f"{file_path}_enhanced", enhanced_data)

        except Exception as e:
            self.logger.warning(f"Could not store enhanced analysis: {e}")

    async def _fallback_analysis(self, file_path: str, content: str, repo_id: str) -> Dict[str, Any]:
        """Fall back to basic analysis if enhanced processing fails."""
        return {
            "path": file_path,
            "language": self.chunker.detect_language(file_path),
            "lines_of_code": len(content.split('\n')),
            "complexity_score": 5.0,
            "issues_found": ["Enhanced analysis failed - using fallback"],
            "recommendations": ["Review file manually"],
            "detailed_analysis": "Enhanced analysis could not be completed. Basic fallback analysis used.",
            "severity_score": 5.0,
            "chunking_info": {
                "total_chunks": 1,
                "chunked": False,
                "savings_percentage": 0.0,
                "original_tokens": self.chunker.estimate_tokens(content),
                "chunked_tokens": self.chunker.estimate_tokens(content)
            }
        }
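

# Integration sketch (assumed names, not part of this module: `claude_client` is an
# Anthropic-style messages client and `memory_manager` exposes the async store/search
# methods called above):
#
#     processor = EnhancedFileProcessor(claude_client, memory_manager)
#     analysis = await processor.process_file_enhanced("src/app.py", source_text, repo_id)
#     print(analysis["severity_score"], analysis["chunking_info"]["total_chunks"])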


# Configuration for enhanced chunking
ENHANCED_CHUNKING_CONFIG = {
    "max_tokens_per_chunk": 4000,
    "overlap_lines": 5,
    "min_chunk_size": 100,
    "preserve_imports": True,
    "preserve_comments": True,
    "enable_context_sharing": True,
    "enable_memory_integration": True
}
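

# Minimal, self-contained demo (a sketch, no API calls): chunk this module's own source
# and print a summary. Only max_tokens_per_chunk and overlap_lines from the config above
# are consumed by IntelligentChunker; the remaining keys are not read by this module.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    demo_chunker = IntelligentChunker(
        max_tokens_per_chunk=ENHANCED_CHUNKING_CONFIG["max_tokens_per_chunk"],
        overlap_lines=ENHANCED_CHUNKING_CONFIG["overlap_lines"],
    )
    demo_source = Path(__file__).read_text(encoding="utf-8")
    demo_result = demo_chunker.chunk_file(__file__, demo_source)

    print(f"Language: {demo_result.language}")
    print(f"Chunked: {demo_result.is_chunked} ({demo_result.total_chunks} chunks, "
          f"{demo_result.savings_percentage:.1f}% token savings)")
    for chunk in demo_result.chunks:
        print(f"  chunk {chunk.chunk_id}: {chunk.chunk_type}, "
              f"lines {chunk.start_line}-{chunk.end_line}, ~{chunk.tokens_estimate} tokens")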