#!/usr/bin/env python3 """ AI Analysis Service HTTP Server Provides REST API endpoints for repository analysis. """ import os import asyncio import json import tempfile import shutil import time import hashlib import traceback from pathlib import Path from typing import Dict, Any, Optional, List from datetime import datetime from fastapi import FastAPI, HTTPException, BackgroundTasks from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import FileResponse, JSONResponse from pydantic import BaseModel import uvicorn import httpx import redis # Import the AI analysis components # Note: ai-analyze.py has a hyphen, so we need to handle the import specially import sys import importlib.util # Load the ai-analyze.py module spec = importlib.util.spec_from_file_location("ai_analyze", "ai-analyze.py") ai_analyze_module = importlib.util.module_from_spec(spec) sys.modules["ai_analyze"] = ai_analyze_module spec.loader.exec_module(ai_analyze_module) # Now import the classes from ai_analyze import EnhancedGitHubAnalyzer, get_memory_config # Import enhanced analyzer (backward compatible) try: from enhanced_analyzer import EnhancedGitHubAnalyzerV2, create_enhanced_analyzer ENHANCED_ANALYZER_AVAILABLE = True except ImportError as e: print(f"Enhanced analyzer not available: {e}") ENHANCED_ANALYZER_AVAILABLE = False app = FastAPI( title="AI Analysis Service", description="AI-powered repository analysis with memory system", version="1.0.0" ) # CORS middleware app.add_middleware( CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"], ) # Global analyzer instance analyzer = None # Rate limiter for Claude API class ClaudeRateLimiter: def __init__(self, requests_per_minute: int = 90): self.requests_per_minute = requests_per_minute self.requests = [] self.lock = asyncio.Lock() async def wait_if_needed(self): """Wait if rate limit would be exceeded.""" async with self.lock: now = time.time() # Remove requests older than 1 minute self.requests = [req_time for req_time in self.requests if now - req_time < 60] if len(self.requests) >= self.requests_per_minute: sleep_time = 60 - (now - self.requests[0]) if sleep_time > 0: await asyncio.sleep(sleep_time) self.requests.append(now) # Git Integration Service Client class GitIntegrationClient: def __init__(self): self.base_url = os.getenv('GIT_INTEGRATION_SERVICE_URL', 'http://git-integration:8012') self.timeout = 30.0 async def get_repository_info(self, repository_id: str, user_id: str) -> Dict[str, Any]: """Get repository information from git-integration service.""" try: async with httpx.AsyncClient(timeout=self.timeout) as client: # Get repository info from the diffs endpoint response = await client.get( f"{self.base_url}/api/diffs/repositories", headers={'x-user-id': user_id} ) if response.status_code == 200: data = response.json() if data.get('success') and 'data' in data: repositories = data['data'].get('repositories', []) for repo in repositories: if repo.get('id') == repository_id: return { 'id': repo.get('id'), 'name': repo.get('repository_name'), 'owner': repo.get('owner_name'), 'provider': repo.get('provider_name', 'github'), 'local_path': f"/tmp/attached-repos/{repo.get('owner_name')}__{repo.get('repository_name')}__main", 'repository_url': f"https://github.com/{repo.get('owner_name')}/{repo.get('repository_name')}" } raise Exception(f"Repository {repository_id} not found") else: raise Exception(f"Invalid response format: {data}") else: raise Exception(f"Failed to get repository info: {response.text}") except Exception as e: raise Exception(f"Git-integration service communication failed: {e}") # Analysis Cache class AnalysisCache: def __init__(self): try: self.redis = redis.Redis( host=os.getenv('REDIS_HOST', 'redis'), port=int(os.getenv('REDIS_PORT', 6379)), password=os.getenv('REDIS_PASSWORD', ''), decode_responses=True ) self.cache_ttl = 86400 # 24 hours except Exception as e: print(f"Warning: Redis connection failed: {e}") self.redis = None async def get_cached_analysis(self, file_hash: str) -> Optional[Dict[str, Any]]: """Get cached analysis result.""" if not self.redis: return None try: cache_key = f"analysis:{file_hash}" cached_data = self.redis.get(cache_key) return json.loads(cached_data) if cached_data else None except Exception: return None async def cache_analysis(self, file_hash: str, result: Dict[str, Any]): """Cache analysis result.""" if not self.redis: return try: cache_key = f"analysis:{file_hash}" self.redis.setex(cache_key, self.cache_ttl, json.dumps(result)) except Exception as e: print(f"Warning: Failed to cache analysis: {e}") # Content Optimizer class ContentOptimizer: @staticmethod def optimize_content_for_claude(content: str, max_tokens: int = 8000) -> str: """Optimize file content for Claude API limits.""" if content is None: return "" if len(content) > max_tokens * 4: # Rough token estimation # Extract important lines lines = content.split('\n') important_lines = [] for line in lines: # Keep imports, function definitions, class definitions if (line.strip().startswith(('import ', 'from ', 'def ', 'class ', 'export ', 'const ', 'let ', 'var ')) or line.strip().startswith(('function ', 'class ', 'interface ', 'type '))): important_lines.append(line) # Limit to 200 lines important_lines = important_lines[:200] optimized_content = '\n'.join(important_lines) optimized_content += f"\n\n... [Content truncated for analysis - {len(content)} chars total]" return optimized_content return content # Sanitizers to ensure JSON-serializable, primitive types def sanitize_analysis_result(analysis): """Ensure analysis object only contains JSON-serializable types.""" try: print(f"🔍 Sanitizing analysis object...") # Sanitize repo_path try: if hasattr(analysis, 'repo_path'): analysis.repo_path = str(analysis.repo_path) if analysis.repo_path else "" except Exception as e: print(f"⚠️ Error sanitizing repo_path: {e}") analysis.repo_path = "" # Sanitize file_analyses list try: if hasattr(analysis, 'file_analyses') and analysis.file_analyses: print(f"🔍 Sanitizing {len(analysis.file_analyses)} file analyses...") for idx, fa in enumerate(analysis.file_analyses): try: # Path to string if hasattr(fa, 'path'): fa.path = str(fa.path) # issues_found to list of strings if hasattr(fa, 'issues_found'): issues = fa.issues_found if isinstance(issues, str): fa.issues_found = [issues] elif isinstance(issues, (list, tuple)): fa.issues_found = [str(x) for x in issues] else: fa.issues_found = [] else: fa.issues_found = [] # recommendations to list of strings if hasattr(fa, 'recommendations'): recs = fa.recommendations if isinstance(recs, str): fa.recommendations = [recs] elif isinstance(recs, (list, tuple)): fa.recommendations = [str(x) for x in recs] else: fa.recommendations = [] else: fa.recommendations = [] except Exception as fa_err: print(f"⚠️ Error sanitizing file[{idx}]: {fa_err}") # Ensure fields exist even if there's an error if not hasattr(fa, 'path'): fa.path = "" if not hasattr(fa, 'issues_found'): fa.issues_found = [] if not hasattr(fa, 'recommendations'): fa.recommendations = [] except Exception as files_err: print(f"⚠️ Error iterating file_analyses: {files_err}") print(f"✅ Analysis object sanitized successfully") return analysis except Exception as e: print(f"❌ Critical sanitization error: {e}") import traceback traceback.print_exc() return analysis # Global instances rate_limiter = ClaudeRateLimiter() git_client = GitIntegrationClient() analysis_cache = AnalysisCache() content_optimizer = ContentOptimizer() class AnalysisRequest(BaseModel): repo_path: str output_format: str = "pdf" # pdf, json max_files: int = 50 class RepositoryAnalysisRequest(BaseModel): repository_id: str user_id: str output_format: str = "pdf" # pdf, json max_files: int = 0 # 0 = unlimited files analysis_type: str = "full" # fast, basic, full class AnalysisResponse(BaseModel): success: bool message: str analysis_id: Optional[str] = None report_path: Optional[str] = None stats: Optional[Dict[str, Any]] = None @app.on_event("startup") async def startup_event(): """Initialize the analyzer on startup.""" global analyzer try: # Load environment variables from dotenv import load_dotenv load_dotenv() # Get API key api_key = os.getenv('ANTHROPIC_API_KEY') if not api_key: raise Exception("ANTHROPIC_API_KEY not found in environment") # Initialize analyzer with enhanced capabilities if available config = get_memory_config() # Add performance optimization settings to config config.update({ 'max_workers': 50, # Increased parallel processing workers 'batch_size': 200, # Increased batch processing size 'cache_ttl': 3600, # Cache TTL (1 hour) 'max_file_size': 0, # No file size limit (0 = unlimited) 'analysis_timeout': 1800, # 30 minute timeout for large repositories 'fast_mode': False, # Disable fast mode to use full AI analysis 'redis_host': 'pipeline_redis', # Use Docker service name for Redis 'redis_port': 6379, # Use standard Redis port 'redis_password': 'redis_secure_2024', 'mongodb_url': 'mongodb://pipeline_admin:mongo_secure_2024@pipeline_mongodb:27017/', 'postgres_host': 'pipeline_postgres', 'postgres_password': 'secure_pipeline_2024' }) if ENHANCED_ANALYZER_AVAILABLE: print("✅ Using Enhanced Analyzer with intelligent chunking and parallel processing") analyzer = create_enhanced_analyzer(api_key, config) else: print("✅ Using Standard Analyzer with performance optimizations") analyzer = EnhancedGitHubAnalyzer(api_key, config) print("✅ AI Analysis Service initialized successfully") except Exception as e: print(f"❌ Failed to initialize AI Analysis Service: {e}") raise @app.get("/health") async def health_check(): """Health check endpoint.""" return { "status": "healthy", "service": "ai-analysis-service", "timestamp": datetime.now().isoformat(), "version": "1.0.0" } @app.post("/analyze") async def analyze_repository(request: AnalysisRequest, background_tasks: BackgroundTasks): """Analyze a repository using direct file path.""" try: if not analyzer: raise HTTPException(status_code=500, detail="Analyzer not initialized") # Generate unique analysis ID analysis_id = f"analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}" # Create temporary directory for this analysis temp_dir = tempfile.mkdtemp(prefix=f"ai_analysis_{analysis_id}_") try: # Run analysis analysis = await analyzer.analyze_repository_with_memory( request.repo_path ) # Ensure fields are JSON-safe and types are normalized analysis = sanitize_analysis_result(analysis) # DEBUG: Log field types print(f"DEBUG: repo_path type: {type(analysis.repo_path)}") if analysis.file_analyses: for i, fa in enumerate(analysis.file_analyses[:3]): # Check first 3 print(f"DEBUG FA[{i}]: path type={type(fa.path)}, issues_found type={type(fa.issues_found)}, recommendations type={type(fa.recommendations)}") if fa.issues_found: print(f" issues_found[0] type: {type(fa.issues_found[0])}") if fa.recommendations: print(f" recommendations[0] type: {type(fa.recommendations[0])}") # Generate report if request.output_format == "pdf": report_path = f"reports/{analysis_id}_analysis.pdf" try: analyzer.create_pdf_report(analysis, report_path) except Exception as pdf_err: print(f"⚠️ PDF generation failed: {pdf_err}, falling back to JSON") report_path = f"reports/{analysis_id}_analysis.json" with open(report_path, 'w') as f: json.dump({ "repo_path": str(analysis.repo_path), "total_files": analysis.total_files, "total_lines": analysis.total_lines, "languages": analysis.languages, "code_quality_score": analysis.code_quality_score, "architecture_assessment": analysis.architecture_assessment, "security_assessment": analysis.security_assessment, "executive_summary": analysis.executive_summary, "file_analyses": [ { "path": str(fa.path), "language": fa.language, "lines_of_code": fa.lines_of_code, "severity_score": fa.severity_score, "issues_found": [str(issue) for issue in fa.issues_found] if isinstance(fa.issues_found, (list, tuple)) else [], "recommendations": [str(rec) for rec in fa.recommendations] if isinstance(fa.recommendations, (list, tuple)) else [] } for fa in analysis.file_analyses ] }, f, indent=2) # Calculate stats - ensure all fields are properly typed stats = { "total_files": analysis.total_files, "total_lines": analysis.total_lines, "languages": analysis.languages, "code_quality_score": analysis.code_quality_score, "high_quality_files": len([fa for fa in analysis.file_analyses if fa.severity_score >= 8]), "medium_quality_files": len([fa for fa in analysis.file_analyses if 5 <= fa.severity_score < 8]), "low_quality_files": len([fa for fa in analysis.file_analyses if fa.severity_score < 5]), "total_issues": sum(len(fa.issues_found) if isinstance(fa.issues_found, (list, tuple)) else 0 for fa in analysis.file_analyses) } # Pre-sanitize all file analyses before stats calculation if hasattr(analysis, 'file_analyses'): for fa in analysis.file_analyses: # Force issues_found to be a list if not isinstance(fa.issues_found, list): if isinstance(fa.issues_found, tuple): fa.issues_found = list(fa.issues_found) else: fa.issues_found = [] # Force recommendations to be a list if not isinstance(fa.recommendations, list): if isinstance(fa.recommendations, tuple): fa.recommendations = list(fa.recommendations) else: fa.recommendations = [] # Now calculate stats safely stats = { "total_files": analysis.total_files, "total_lines": analysis.total_lines, "languages": analysis.languages, "code_quality_score": analysis.code_quality_score, "high_quality_files": len([fa for fa in analysis.file_analyses if fa.severity_score >= 8]), "medium_quality_files": len([fa for fa in analysis.file_analyses if 5 <= fa.severity_score < 8]), "low_quality_files": len([fa for fa in analysis.file_analyses if fa.severity_score < 5]), "total_issues": sum(len(fa.issues_found) for fa in analysis.file_analyses) } # Use dictionary instead of Pydantic model to avoid serialization issues return { "success": True, "message": "Analysis completed successfully", "analysis_id": analysis_id, "report_path": report_path, "stats": stats } finally: # Cleanup temporary directory if os.path.exists(temp_dir): shutil.rmtree(temp_dir) except Exception as e: return AnalysisResponse( success=False, message=f"Analysis failed: {str(e)}", analysis_id=None, report_path=None, stats=None ) @app.post("/analyze-repository") async def analyze_repository_by_id(request: RepositoryAnalysisRequest, background_tasks: BackgroundTasks): """Analyze a repository by ID using git-integration service.""" global os, shutil, tempfile, json # Ensure we're using the module-level imports, not shadowed local variables try: print(f"🔍 [DEBUG] Analysis request received: {request}") if not analyzer: raise HTTPException(status_code=500, detail="Analyzer not initialized") # Get repository information from git-integration service try: repo_info = await git_client.get_repository_info(request.repository_id, request.user_id) local_path = repo_info.get('local_path') # Keep for compatibility but don't check file system # Note: We no longer check local_path existence since we use API approach except Exception as e: raise HTTPException( status_code=500, detail=f"Failed to get repository info: {str(e)}" ) # Generate unique analysis ID analysis_id = f"repo_analysis_{request.repository_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}" # Create temporary directory for this analysis temp_dir = tempfile.mkdtemp(prefix=f"ai_analysis_{analysis_id}_") try: # Check if fast mode is enabled if request.analysis_type == "fast" or request.analysis_type == "basic": # Run fast analysis with timeout analysis = await analyze_repository_fast( local_path, request.repository_id, request.user_id, request.max_files ) else: # Run full analysis with rate limiting and caching analysis = await analyze_repository_with_optimizations( local_path, request.repository_id, request.user_id, request.max_files ) # Normalize types before serialization/PDF analysis = sanitize_analysis_result(analysis) # DEBUG: Log field types print(f"DEBUG: repo_path type: {type(analysis.repo_path)}") if analysis.file_analyses: for i, fa in enumerate(analysis.file_analyses[:3]): # Check first 3 print(f"DEBUG FA[{i}]: path type={type(fa.path)}, issues_found type={type(fa.issues_found)}, recommendations type={type(fa.recommendations)}") if fa.issues_found: print(f" issues_found[0] type: {type(fa.issues_found[0])}") if fa.recommendations: print(f" recommendations[0] type: {type(fa.recommendations[0])}") try: # Generate report if request.output_format == "pdf": report_path = f"reports/{analysis_id}_analysis.pdf" try: analyzer.create_pdf_report(analysis, report_path) except Exception as pdf_err: print(f"⚠️ PDF generation failed: {pdf_err}, falling back to JSON") report_path = f"reports/{analysis_id}_analysis.json" with open(report_path, 'w') as f: json.dump({ "repository_id": request.repository_id, "repo_path": str(analysis.repo_path), "total_files": analysis.total_files, "total_lines": analysis.total_lines, "languages": analysis.languages, "code_quality_score": analysis.code_quality_score, "architecture_assessment": analysis.architecture_assessment, "security_assessment": analysis.security_assessment, "executive_summary": analysis.executive_summary, "file_analyses": [ { "path": str(fa.path), "language": fa.language, "lines_of_code": fa.lines_of_code, "severity_score": fa.severity_score, "issues_found": [str(issue) for issue in fa.issues_found] if isinstance(fa.issues_found, (list, tuple)) else [], "recommendations": [str(rec) for rec in fa.recommendations] if isinstance(fa.recommendations, (list, tuple)) else [] } for fa in analysis.file_analyses ] }, f, indent=2) else: report_path = f"reports/{analysis_id}_analysis.json" with open(report_path, 'w') as f: json.dump({ "repository_id": request.repository_id, "repo_path": str(analysis.repo_path), "total_files": analysis.total_files, "total_lines": analysis.total_lines, "languages": analysis.languages, "code_quality_score": analysis.code_quality_score, "architecture_assessment": analysis.architecture_assessment, "security_assessment": analysis.security_assessment, "executive_summary": analysis.executive_summary, "file_analyses": [ { "path": str(fa.path), "language": fa.language, "lines_of_code": fa.lines_of_code, "severity_score": fa.severity_score, "issues_found": [str(issue) for issue in fa.issues_found] if isinstance(fa.issues_found, (list, tuple)) else [], "recommendations": [str(rec) for rec in fa.recommendations] if isinstance(fa.recommendations, (list, tuple)) else [] } for fa in analysis.file_analyses ] }, f, indent=2) except Exception as report_err: print(f"ERROR during report generation: {report_err}") import traceback traceback.print_exc() raise print("✅ Report generated successfully, now calculating stats...") try: print("Calculating stats...") # Calculate stats stats = { "repository_id": request.repository_id, "total_files": analysis.total_files, "total_lines": analysis.total_lines, "languages": analysis.languages, "code_quality_score": analysis.code_quality_score, "high_quality_files": len([fa for fa in analysis.file_analyses if fa.severity_score >= 8]), "medium_quality_files": len([fa for fa in analysis.file_analyses if 5 <= fa.severity_score < 8]), "low_quality_files": len([fa for fa in analysis.file_analyses if fa.severity_score < 5]), "total_issues": sum(len(fa.issues_found) if isinstance(fa.issues_found, (list, tuple)) else 0 for fa in analysis.file_analyses) } # Pre-sanitize all file analyses before stats calculation if hasattr(analysis, 'file_analyses'): for fa in analysis.file_analyses: # Force issues_found to be a list if not isinstance(fa.issues_found, list): if isinstance(fa.issues_found, tuple): fa.issues_found = list(fa.issues_found) else: fa.issues_found = [] # Force recommendations to be a list if not isinstance(fa.recommendations, list): if isinstance(fa.recommendations, tuple): fa.recommendations = list(fa.recommendations) else: fa.recommendations = [] # Now calculate stats safely stats = { "repository_id": request.repository_id, "total_files": analysis.total_files, "total_lines": analysis.total_lines, "languages": analysis.languages, "code_quality_score": analysis.code_quality_score, "high_quality_files": len([fa for fa in analysis.file_analyses if fa.severity_score >= 8]), "medium_quality_files": len([fa for fa in analysis.file_analyses if 5 <= fa.severity_score < 8]), "low_quality_files": len([fa for fa in analysis.file_analyses if fa.severity_score < 5]), "total_issues": sum(len(fa.issues_found) for fa in analysis.file_analyses) } # Use dictionary instead of Pydantic model to avoid serialization issues return { "success": True, "message": "Repository analysis completed successfully", "analysis_id": analysis_id, "report_path": report_path, "stats": stats } except Exception as e: print(f"❌ Repository analysis failed: {str(e)}") return AnalysisResponse( success=False, message=f"Repository analysis failed: {str(e)}" ) finally: # Cleanup temporary directory if 'temp_dir' in locals(): if os.path.exists(temp_dir): shutil.rmtree(temp_dir) except HTTPException: raise except Exception as e: import traceback traceback.print_exc() print(f"❌ Repository analysis failed: {str(e)}") tb_lines = traceback.format_exception(type(e), e, e.__traceback__) print("FULL TRACEBACK:") for line in tb_lines: print(line.rstrip()) return { "success": False, "message": f"Repository analysis failed: {str(e)}", "analysis_id": None, "report_path": None, "stats": None } async def analyze_repository_fast(local_path: str, repository_id: str, user_id: str, max_files: int = 50): """Fast analysis with timeout and limited files for quick results.""" try: print(f"🚀 Starting FAST analysis for repository {repository_id}") # Set a timeout for fast analysis import asyncio timeout_seconds = 60 # 1 minute timeout for fast analysis async def run_analysis(): # Get repository files from API (limited to max_files) files_data = await get_repository_files_from_api(repository_id, user_id, max_files) if not files_data: raise Exception("No files found in repository") print(f"📁 Found {len(files_data)} files for fast analysis") # Create a simple analysis without AI processing from ai_analyze import FileAnalysis, RepositoryAnalysis file_analyses = [] total_lines = 0 languages = set() for file_path, content in files_data[:max_files]: # Limit to max_files # files_data is a list of tuples (file_path, content) # Basic analysis without AI lines = len(content.splitlines()) if content else 0 total_lines += lines # Enhanced language detection language = "Unknown" if '.' in file_path: ext = '.' + file_path.split('.')[-1].lower() language_map = { '.py': 'Python', '.js': 'JavaScript', '.ts': 'TypeScript', '.tsx': 'TypeScript', '.jsx': 'JavaScript', '.java': 'Java', '.cpp': 'C++', '.c': 'C', '.cs': 'C#', '.go': 'Go', '.rs': 'Rust', '.php': 'PHP', '.rb': 'Ruby', '.swift': 'Swift', '.kt': 'Kotlin', '.html': 'HTML', '.htm': 'HTML', '.css': 'CSS', '.scss': 'SCSS', '.sass': 'SASS', '.sql': 'SQL', '.json': 'JSON', '.yaml': 'YAML', '.yml': 'YAML', '.md': 'Markdown', '.txt': 'Text', '.xml': 'XML', '.sh': 'Shell', '.bash': 'Shell', '.zsh': 'Shell', '.fish': 'Shell', '.dockerfile': 'Docker', '.dockerignore': 'Docker', '.gitignore': 'Git', '.gitattributes': 'Git', '.env': 'Environment', '.ini': 'Config', '.cfg': 'Config', '.conf': 'Config', '.toml': 'TOML', '.lock': 'Lock File', '.log': 'Log', '.tmp': 'Temporary', '.temp': 'Temporary' } language = language_map.get(ext, 'Unknown') else: # Try to detect from filename filename = file_path.lower() if 'dockerfile' in filename: language = 'Docker' elif 'makefile' in filename: language = 'Makefile' elif 'readme' in filename: language = 'Markdown' elif 'license' in filename: language = 'Text' elif 'changelog' in filename: language = 'Text' languages.add(language) # Perform smart fast analysis issues_found = [] recommendations = [] complexity_score = 5.0 severity_score = 7.0 # Basic code quality analysis if lines > 500: issues_found.append("Large file - consider breaking into smaller modules") recommendations.append("Split into smaller, focused files") complexity_score += 2 severity_score -= 1 if lines < 10: issues_found.append("Very small file - might be incomplete") recommendations.append("Review if this file is necessary") severity_score -= 0.5 # Language-specific analysis if language == "Python": if "import" not in content and "def" not in content and "class" not in content: issues_found.append("Python file without imports, functions, or classes") recommendations.append("Add proper Python structure") severity_score -= 1 if "print(" in content and "def " not in content: issues_found.append("Contains print statements - consider logging") recommendations.append("Use proper logging instead of print statements") complexity_score += 1 elif language == "JavaScript": if "console.log" in content and "function" not in content: issues_found.append("Contains console.log statements") recommendations.append("Use proper logging or remove debug statements") complexity_score += 1 elif language == "Markdown": if lines < 5: issues_found.append("Very short documentation") recommendations.append("Add more detailed documentation") severity_score += 1 # Calculate final scores complexity_score = max(1.0, min(10.0, complexity_score)) severity_score = max(1.0, min(10.0, severity_score)) # Generate detailed analysis detailed_analysis = f"Fast analysis of {file_path}: {lines} lines, {language} code. " if issues_found: detailed_analysis += f"Issues found: {len(issues_found)}. " else: detailed_analysis += "No major issues detected. " detailed_analysis += f"Complexity: {complexity_score:.1f}/10, Quality: {severity_score:.1f}/10" # Create smart file analysis file_analysis = FileAnalysis( path=str(file_path), language=language, lines_of_code=lines, complexity_score=complexity_score, issues_found=issues_found if issues_found else ["No issues detected in fast analysis"], recommendations=recommendations if recommendations else ["File appears well-structured"], detailed_analysis=detailed_analysis, severity_score=severity_score ) file_analyses.append(file_analysis) # Create language count dictionary language_counts = {} for file_analysis in file_analyses: lang = file_analysis.language language_counts[lang] = language_counts.get(lang, 0) + 1 # Create repository analysis analysis = RepositoryAnalysis( repo_path=local_path, total_files=len(file_analyses), total_lines=total_lines, languages=language_counts, code_quality_score=7.5, # Default good score architecture_assessment="Fast analysis - architecture details require full analysis", security_assessment="Fast analysis - security details require full analysis", executive_summary=f"Fast analysis completed for {len(file_analyses)} files. Total lines: {total_lines}. Languages: {', '.join(language_counts.keys())}", file_analyses=file_analyses ) return analysis # Run with timeout analysis = await asyncio.wait_for(run_analysis(), timeout=timeout_seconds) print(f"✅ Fast analysis completed in under {timeout_seconds} seconds") return analysis except asyncio.TimeoutError: print(f"⏰ Fast analysis timed out after {timeout_seconds} seconds") raise Exception(f"Fast analysis timed out after {timeout_seconds} seconds") except Exception as e: print(f"❌ Fast analysis failed: {e}") raise e async def get_repository_files_from_api(repository_id: str, user_id: str, max_files: int = 100): """Get repository files from Git Integration Service API.""" try: print(f"🔍 [DEBUG] Getting repository files for {repository_id} with user {user_id}") # Get all files by scanning all directories recursively async with httpx.AsyncClient(timeout=30.0) as client: # First, get all directories from the repository print(f"🔍 [DEBUG] Getting all directories for repository") # Get all directories from database directories_query = f""" SELECT DISTINCT rd.relative_path FROM repository_directories rd WHERE rd.repository_id = '{repository_id}' ORDER BY rd.relative_path """ # We need to get all directories and then scan each one # Let's use a different approach - get all files directly from the database all_files_query = f""" SELECT file->>'relative_path' as relative_path, file->>'filename' as filename FROM repository_files rf, jsonb_array_elements(rf.files) as file WHERE rf.repository_id = '{repository_id}' ORDER BY file->>'relative_path' """ # Get all directories by making multiple structure requests all_directories = set() all_directories.add('') # Add root directory # First, get root structure structure_response = await client.get( f"{git_client.base_url}/api/github/repository/{repository_id}/structure", headers={'x-user-id': user_id} ) if structure_response.status_code != 200: raise Exception(f"Failed to get repository structure: {structure_response.text}") structure_data = structure_response.json() if not structure_data.get('success'): raise Exception(f"Git Integration Service error: {structure_data.get('message', 'Unknown error')}") # Get all directories from root structure structure_items = structure_data.get('data', {}).get('structure', []) directories_to_scan = [] for item in structure_items: if isinstance(item, dict) and item.get('type') == 'directory': dir_path = item.get('path', '') if dir_path: all_directories.add(dir_path) directories_to_scan.append(dir_path) print(f"🔍 [DEBUG] Found directory: {dir_path}") # Now scan each directory to find subdirectories for directory in directories_to_scan: try: print(f"🔍 [DEBUG] Getting structure for directory: '{directory}'") dir_structure_response = await client.get( f"{git_client.base_url}/api/github/repository/{repository_id}/structure", params={'path': directory}, headers={'x-user-id': user_id} ) if dir_structure_response.status_code == 200: dir_structure_data = dir_structure_response.json() if dir_structure_data.get('success'): dir_items = dir_structure_data.get('data', {}).get('structure', []) for item in dir_items: if isinstance(item, dict) and item.get('type') == 'directory': subdir_path = item.get('path', '') if subdir_path and subdir_path not in all_directories: all_directories.add(subdir_path) directories_to_scan.append(subdir_path) print(f"🔍 [DEBUG] Found subdirectory: {subdir_path}") else: print(f"⚠️ [DEBUG] Failed to get structure for directory '{directory}': {dir_structure_data.get('message')}") else: print(f"⚠️ [DEBUG] Failed to get structure for directory '{directory}': HTTP {dir_structure_response.status_code}") except Exception as e: print(f"⚠️ [DEBUG] Error getting structure for directory '{directory}': {e}") print(f"🔍 [DEBUG] Found {len(all_directories)} total directories to scan") # Scan each directory for files files_to_analyze = [] for directory in all_directories: try: print(f"🔍 [DEBUG] Scanning directory: '{directory}'") files_response = await client.get( f"{git_client.base_url}/api/github/repository/{repository_id}/files", params={'directory_path': directory} if directory else {}, headers={'x-user-id': user_id} ) if files_response.status_code == 200: files_data = files_response.json() if files_data.get('success'): dir_files = files_data.get('data', {}).get('files', []) for file_info in dir_files: file_path = file_info.get('relative_path', '') if file_path: files_to_analyze.append((file_path, None)) print(f"🔍 [DEBUG] Found file in '{directory}': {file_path}") else: print(f"⚠️ [DEBUG] Failed to get files from directory '{directory}': {files_data.get('message')}") else: print(f"⚠️ [DEBUG] Failed to get files from directory '{directory}': HTTP {files_response.status_code}") except Exception as e: print(f"⚠️ [DEBUG] Error scanning directory '{directory}': {e}") print(f"🔍 [DEBUG] Found {len(files_to_analyze)} total files after scanning all directories") print(f"🔍 [DEBUG] Found {len(files_to_analyze)} files to analyze") # Limit files if needed (0 means unlimited) if max_files > 0 and len(files_to_analyze) > max_files: files_to_analyze = files_to_analyze[:max_files] print(f"🔍 [DEBUG] Limited to {max_files} files") # Fetch file content for each file files_with_content = [] for i, (file_path, _) in enumerate(files_to_analyze): try: print(f"🔍 [DEBUG] Fetching content for file {i+1}/{len(files_to_analyze)}: {file_path}") # Get file content from Git Integration Service content_response = await client.get( f"{git_client.base_url}/api/github/repository/{repository_id}/file-content", params={'file_path': file_path}, headers={'x-user-id': user_id} ) if content_response.status_code == 200: content_data = content_response.json() if content_data.get('success'): # Content is nested in data.content content = content_data.get('data', {}).get('content', '') files_with_content.append((file_path, content)) print(f"🔍 [DEBUG] Successfully got content for {file_path} ({len(content)} chars)") else: print(f"Warning: Failed to get content for {file_path}: {content_data.get('message')}") else: print(f"Warning: Failed to get content for {file_path}: HTTP {content_response.status_code}") except Exception as e: print(f"Warning: Error getting content for {file_path}: {e}") continue print(f"🔍 [DEBUG] Returning {len(files_with_content)} files with content") return files_with_content except Exception as e: print(f"Error getting repository files from API: {e}") import traceback traceback.print_exc() return [] async def analyze_repository_with_optimizations(repo_path: str, repository_id: str, user_id: str, max_files: int = 100): """Analyze repository with rate limiting, caching, and content optimization.""" from pathlib import Path try: # Get repository files from Git Integration Service API files_to_analyze = await get_repository_files_from_api(repository_id, user_id, max_files) if not files_to_analyze: raise Exception("No files found to analyze") print(f"Starting optimized analysis of {len(files_to_analyze)} files...") file_analyses = [] processed_files = 0 for i, (file_path, content) in enumerate(files_to_analyze): print(f"Analyzing file {i+1}/{len(files_to_analyze)}: {file_path}") # Generate file hash for caching file_hash = hashlib.sha256((content or '').encode()).hexdigest() # Check cache first cached_analysis = await analysis_cache.get_cached_analysis(file_hash) if cached_analysis: print(f"Using cached analysis for {file_path}") # Convert cached dictionary back to analysis object from ai_analyze import FileAnalysis cached_obj = FileAnalysis( path=cached_analysis["path"], language=cached_analysis["language"], lines_of_code=cached_analysis["lines_of_code"], complexity_score=cached_analysis["complexity_score"], issues_found=cached_analysis["issues_found"], recommendations=cached_analysis["recommendations"], detailed_analysis=cached_analysis["detailed_analysis"], severity_score=cached_analysis["severity_score"] ) file_analyses.append(cached_obj) processed_files += 1 continue # Rate limiting await rate_limiter.wait_if_needed() # Optimize content for Claude API optimized_content = content_optimizer.optimize_content_for_claude(content) # Analyze file with memory try: # Convert string file path to Path object file_path_obj = Path(file_path) # Use enhanced analysis if available, fallback to standard if hasattr(analyzer, 'analyze_file_with_memory_enhanced'): print(f"🔍 [DEBUG] Using ENHANCED analysis method for {file_path}") analysis = await analyzer.analyze_file_with_memory_enhanced( file_path_obj, optimized_content, repository_id ) else: print(f"🔍 [DEBUG] Using STANDARD analysis method for {file_path}") analysis = await analyzer.analyze_file_with_memory( file_path_obj, optimized_content, repository_id ) # Cache the result analysis_dict = { "path": str(analysis.path), "language": analysis.language, "lines_of_code": analysis.lines_of_code, "complexity_score": analysis.complexity_score, "issues_found": analysis.issues_found, "recommendations": analysis.recommendations, "detailed_analysis": analysis.detailed_analysis, "severity_score": analysis.severity_score } await analysis_cache.cache_analysis(file_hash, analysis_dict) file_analyses.append(analysis) processed_files += 1 except Exception as e: print(f"Error analyzing {file_path}: {e}") # Continue with other files continue # Repository-level analysis print("Performing repository-level analysis...") # Use a temporary directory path since we don't have a local repo_path temp_repo_path = f"/tmp/repo_{repository_id}" if repo_path is None else repo_path # Create proper context_memories structure context_memories = { 'persistent_knowledge': [], 'similar_analyses': [] } # Repository-level analysis with enhanced context try: print(f"DEBUG: Calling analyze_repository_overview_with_memory...") architecture_assessment, security_assessment = await analyzer.analyze_repository_overview_with_memory( temp_repo_path, file_analyses, context_memories, repository_id ) print(f"DEBUG: analyze_repository_overview_with_memory completed") except Exception as ov_err: print(f"ERROR in analyze_repository_overview_with_memory: {ov_err}") import traceback traceback.print_exc() architecture_assessment = f"Error: {str(ov_err)}" security_assessment = f"Error: {str(ov_err)}" # Create repository analysis result from ai_analyze import RepositoryAnalysis # Calculate code quality score safely if file_analyses and len(file_analyses) > 0: valid_scores = [fa.severity_score for fa in file_analyses if fa.severity_score is not None] code_quality_score = sum(valid_scores) / len(valid_scores) if valid_scores else 5.0 else: code_quality_score = 5.0 # Calculate total lines safely total_lines = sum(fa.lines_of_code for fa in file_analyses if fa.lines_of_code is not None) if file_analyses else 0 # Get languages safely - count occurrences of each language if file_analyses: from collections import Counter language_list = [fa.language for fa in file_analyses if fa.language is not None] languages = dict(Counter(language_list)) else: languages = {} # DEBUG: Check file_analyses before creating RepositoryAnalysis print(f"DEBUG: About to create RepositoryAnalysis with {len(file_analyses)} file_analyses") if file_analyses: for i, fa in enumerate(file_analyses[:2]): try: print(f" FA[{i}]: path type={type(fa.path).__name__}, issues={type(fa.issues_found).__name__}, recs={type(fa.recommendations).__name__}") except Exception as debug_err: print(f" FA[{i}]: DEBUG ERROR - {debug_err}") return RepositoryAnalysis( repo_path=str(temp_repo_path), total_files=len(files_to_analyze), total_lines=total_lines, languages=languages, code_quality_score=code_quality_score, architecture_assessment=architecture_assessment or "Analysis in progress", security_assessment=security_assessment or "Analysis in progress", file_analyses=file_analyses, executive_summary=f"Analysis completed for {processed_files} files in repository {repository_id}", high_quality_files=[] ) except Exception as e: print(f"Error in optimized analysis: {e}") raise @app.get("/repository/{repository_id}/info") async def get_repository_info(repository_id: str, user_id: str): """Get repository information from git-integration service.""" try: repo_info = await git_client.get_repository_info(repository_id, user_id) return { "success": True, "repository_info": repo_info } except Exception as e: raise HTTPException( status_code=500, detail=f"Failed to get repository info: {str(e)}" ) @app.get("/reports/{filename}") async def download_report(filename: str): """Download analysis report.""" report_path = f"reports/{filename}" if not os.path.exists(report_path): raise HTTPException(status_code=404, detail="Report not found") return FileResponse( report_path, media_type='application/pdf', headers={ 'Content-Disposition': f'inline; filename="{filename}"' } ) @app.get("/memory/stats") async def get_memory_stats(): """Get memory system statistics.""" try: if not analyzer: raise HTTPException(status_code=500, detail="Analyzer not initialized") stats = await analyzer.memory_manager.get_memory_stats() return { "success": True, "memory_stats": stats } except Exception as e: raise HTTPException(status_code=500, detail=f"Failed to get memory stats: {str(e)}") @app.post("/memory/query") async def query_memory(query: str, repo_context: str = ""): """Query the memory system.""" try: if not analyzer: raise HTTPException(status_code=500, detail="Analyzer not initialized") result = await analyzer.query_memory(query, repo_context) return { "success": True, "query": query, "result": result } except Exception as e: raise HTTPException(status_code=500, detail=f"Memory query failed: {str(e)}") @app.get("/enhanced/status") async def get_enhanced_status(): """Get enhanced processing status and statistics.""" return { "success": True, "enhanced_available": ENHANCED_ANALYZER_AVAILABLE, "message": "Enhanced chunking system is active" } @app.post("/enhanced/toggle") async def toggle_enhanced_processing(enabled: bool = True): """Toggle enhanced processing on/off.""" return { "success": True, "message": f"Enhanced processing {'enabled' if enabled else 'disabled'}", "enhanced_enabled": enabled } if __name__ == "__main__": port = int(os.getenv('PORT', 8022)) host = os.getenv('HOST', '0.0.0.0') print(f"🚀 Starting AI Analysis Service on {host}:{port}") uvicorn.run(app, host=host, port=port)