#!/usr/bin/env python3 """ Robust GitHub Repository AI Analysis Tool Simplified version with better error handling and JSON parsing. """ import os import asyncio from pathlib import Path from typing import Dict, List, Optional, Tuple from datetime import datetime import argparse from dataclasses import dataclass import shutil import tempfile import json import re from collections import Counter # Core packages import anthropic from dotenv import load_dotenv import git # PDF generation from reportlab.lib.pagesizes import A4 from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle from reportlab.lib.enums import TA_CENTER, TA_LEFT from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak, Table, TableStyle from reportlab.lib import colors @dataclass class FileAnalysis: path: str language: str lines_of_code: int complexity_score: float issues_found: List[str] recommendations: List[str] detailed_analysis: str severity_score: float @dataclass class RepositoryAnalysis: repo_path: str total_files: int total_lines: int languages: Dict[str, int] architecture_assessment: str security_assessment: str code_quality_score: float file_analyses: List[FileAnalysis] executive_summary: str class RobustGitHubAnalyzer: def __init__(self, api_key: str): self.client = anthropic.Anthropic(api_key=api_key) self.temp_dir = None # Language mapping for file detection self.language_map = { '.py': 'Python', '.js': 'JavaScript', '.ts': 'TypeScript', '.tsx': 'TypeScript', '.jsx': 'JavaScript', '.java': 'Java', '.cpp': 'C++', '.c': 'C', '.cs': 'C#', '.go': 'Go', '.rs': 'Rust', '.php': 'PHP', '.rb': 'Ruby', '.swift': 'Swift', '.kt': 'Kotlin', '.html': 'HTML', '.css': 'CSS', '.scss': 'SCSS', '.sass': 'SASS', '.sql': 'SQL', '.yaml': 'YAML', '.yml': 'YAML', '.json': 'JSON', '.xml': 'XML', '.sh': 'Shell', '.dockerfile': 'Docker', '.md': 'Markdown', '.txt': 'Text' } # Code file extensions to analyze self.code_extensions = set(self.language_map.keys()) def clone_repository(self, repo_path: str) -> str: """Clone repository or use existing path.""" if os.path.exists(repo_path): print(f"Using existing repository: {repo_path}") return repo_path else: print(f"Cloning repository: {repo_path}") self.temp_dir = tempfile.mkdtemp(prefix="repo_analysis_") try: git.Repo.clone_from(repo_path, self.temp_dir) return self.temp_dir except Exception as e: raise Exception(f"Failed to clone repository: {e}") def get_file_language(self, file_path: Path) -> str: """Get programming language from file extension.""" return self.language_map.get(file_path.suffix.lower(), 'Unknown') def calculate_complexity_score(self, content: str) -> float: """Calculate basic complexity score based on code patterns.""" lines = content.split('\n') complexity_indicators = ['if', 'else', 'elif', 'for', 'while', 'try', 'except', 'catch', 'switch'] complexity = 1 for line in lines: line_lower = line.lower().strip() for indicator in complexity_indicators: if indicator in line_lower: complexity += 1 # Normalize to 1-10 scale return min(complexity / max(len(lines), 1) * 100, 10.0) async def analyze_file_comprehensive(self, file_path: Path, content: str) -> FileAnalysis: """Perform comprehensive file analysis using a single, robust prompt.""" language = self.get_file_language(file_path) lines_of_code = len([line for line in content.split('\n') if line.strip()]) complexity_score = self.calculate_complexity_score(content) # Truncate content if too long if len(content) > 4000: content = content[:4000] + "\n... [truncated for analysis]" print(f" Analyzing {file_path.name} ({language}, {lines_of_code} lines)") # Create comprehensive analysis prompt prompt = f""" You are a senior software engineer with 25 years of experience. Analyze this {language} code file: FILENAME: {file_path.name} LANGUAGE: {language} LINES OF CODE: {lines_of_code} CODE: ```{language.lower()} {content} ``` Provide a comprehensive analysis covering: 1. ISSUES FOUND: List specific problems, bugs, security vulnerabilities, or code smells 2. RECOMMENDATIONS: Actionable suggestions for improvement 3. CODE QUALITY: Overall assessment of code quality and maintainability 4. SECURITY: Any security concerns or vulnerabilities 5. PERFORMANCE: Potential performance issues or optimizations 6. BEST PRACTICES: Adherence to coding standards and best practices Provide your analysis in clear, structured text (not JSON). Be specific and actionable. Rate the overall code quality from 1-10 where 10 is excellent. ANALYSIS: """ try: message = self.client.messages.create( model="claude-3-5-sonnet-20241022", max_tokens=3000, temperature=0.1, messages=[{"role": "user", "content": prompt}] ) analysis_text = message.content[0].text.strip() # Extract severity score from analysis severity_match = re.search(r'(\d+(?:\.\d+)?)/10', analysis_text) severity_score = float(severity_match.group(1)) if severity_match else 5.0 # Parse issues and recommendations from the text issues = self.extract_issues_from_analysis(analysis_text) recommendations = self.extract_recommendations_from_analysis(analysis_text) return FileAnalysis( path=str(file_path.relative_to(Path(self.temp_dir or '.'))), language=language, lines_of_code=lines_of_code, complexity_score=complexity_score, issues_found=issues, recommendations=recommendations, detailed_analysis=analysis_text, severity_score=severity_score ) except Exception as e: print(f" Error analyzing {file_path.name}: {e}") return FileAnalysis( path=str(file_path), language=language, lines_of_code=lines_of_code, complexity_score=complexity_score, issues_found=[f"Analysis failed: {str(e)}"], recommendations=["Review file manually due to analysis error"], detailed_analysis=f"Analysis failed due to error: {str(e)}", severity_score=5.0 ) def extract_issues_from_analysis(self, analysis_text: str) -> List[str]: """Extract issues from analysis text.""" issues = [] lines = analysis_text.split('\n') # Look for common issue indicators issue_keywords = ['issue', 'problem', 'bug', 'vulnerability', 'error', 'warning', 'concern'] for line in lines: line_lower = line.lower().strip() if any(keyword in line_lower for keyword in issue_keywords): if line.strip() and not line.strip().startswith('#'): issues.append(line.strip()) return issues[:10] # Limit to top 10 issues def extract_recommendations_from_analysis(self, analysis_text: str) -> List[str]: """Extract recommendations from analysis text.""" recommendations = [] lines = analysis_text.split('\n') # Look for recommendation indicators rec_keywords = ['recommend', 'suggest', 'should', 'consider', 'improve'] for line in lines: line_lower = line.lower().strip() if any(keyword in line_lower for keyword in rec_keywords): if line.strip() and not line.strip().startswith('#'): recommendations.append(line.strip()) return recommendations[:10] # Limit to top 10 recommendations def scan_repository(self, repo_path: str, max_files: int = 50) -> List[Tuple[Path, str]]: """Scan repository and collect files for analysis.""" print(f"Scanning repository: {repo_path}") files_to_analyze = [] # Important files to always include important_files = { 'README.md', 'package.json', 'requirements.txt', 'Dockerfile', 'docker-compose.yml', 'tsconfig.json', 'next.config.js', 'tailwind.config.js', 'webpack.config.js', '.env.example' } for root, dirs, files in os.walk(repo_path): # Skip common build/cache directories dirs[:] = [d for d in dirs if not d.startswith('.') and d not in {'node_modules', '__pycache__', 'build', 'dist', 'target', 'venv', 'env', '.git', '.next', 'coverage'}] for file in files: if len(files_to_analyze) >= max_files: break file_path = Path(root) / file # Skip large files try: if file_path.stat().st_size > 1000000: # 1MB limit continue except: continue # Include important files or files with code extensions should_include = ( file.lower() in important_files or file_path.suffix.lower() in self.code_extensions or file.lower().startswith('dockerfile') ) if should_include: try: with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: content = f.read() if content.strip(): # Only non-empty files files_to_analyze.append((file_path, content)) except Exception as e: print(f"Could not read {file_path}: {e}") print(f"Found {len(files_to_analyze)} files to analyze") return files_to_analyze async def analyze_repository_overview(self, repo_path: str, file_analyses: List[FileAnalysis]) -> Tuple[str, str]: """Analyze repository architecture and security.""" print("Analyzing repository overview...") # Prepare summary data languages = dict(Counter(fa.language for fa in file_analyses)) total_lines = sum(fa.lines_of_code for fa in file_analyses) avg_quality = sum(fa.severity_score for fa in file_analyses) / len(file_analyses) if file_analyses else 5.0 # Get repository structure structure_lines = [] try: for root, dirs, files in os.walk(repo_path): dirs[:] = [d for d in dirs if not d.startswith('.') and d not in {'node_modules', '__pycache__'}] level = root.replace(repo_path, '').count(os.sep) indent = ' ' * level structure_lines.append(f"{indent}{os.path.basename(root)}/") for file in files[:3]: # Limit files shown per directory structure_lines.append(f"{indent} {file}") if len(structure_lines) > 50: # Limit total structure size break except Exception as e: structure_lines = [f"Error reading structure: {e}"] # Architecture analysis arch_prompt = f""" You are a Senior Software Architect with 25 years of experience. Analyze this repository: REPOSITORY STRUCTURE: {chr(10).join(structure_lines[:30])} STATISTICS: - Total files analyzed: {len(file_analyses)} - Total lines of code: {total_lines:,} - Languages: {languages} - Average code quality: {avg_quality:.1f}/10 TOP FILE ISSUES: {chr(10).join([f"- {fa.path}: {len(fa.issues_found)} issues" for fa in file_analyses[:10]])} Provide an architectural assessment covering: 1. Project type and purpose 2. Technology stack evaluation 3. Code organization and structure 4. Scalability and maintainability concerns 5. Key recommendations for improvement Keep response under 1500 words and focus on actionable insights. """ # Security analysis security_issues = [] for fa in file_analyses: security_issues.extend([issue for issue in fa.issues_found if any(keyword in issue.lower() for keyword in ['security', 'vulnerability', 'injection', 'xss', 'auth', 'password'])]) sec_prompt = f""" You are a Senior Security Engineer with 20+ years of experience. Security Analysis for repository with {len(file_analyses)} files: SECURITY ISSUES FOUND: {chr(10).join(security_issues[:20]) if security_issues else "No obvious security issues detected"} HIGH-RISK FILE TYPES PRESENT: {[lang for lang, count in languages.items() if lang in ['JavaScript', 'TypeScript', 'Python', 'PHP', 'SQL']]} Provide security assessment covering: 1. Overall security posture 2. Main security risks and vulnerabilities 3. Authentication and authorization concerns 4. Data protection and privacy issues 5. Immediate security priorities Keep response under 1000 words and focus on actionable security recommendations. """ try: # Run both analyses arch_task = self.client.messages.create( model="claude-3-5-sonnet-20241022", max_tokens=2000, temperature=0.1, messages=[{"role": "user", "content": arch_prompt}] ) sec_task = self.client.messages.create( model="claude-3-5-sonnet-20241022", max_tokens=1500, temperature=0.1, messages=[{"role": "user", "content": sec_prompt}] ) architecture_assessment = arch_task.content[0].text security_assessment = sec_task.content[0].text return architecture_assessment, security_assessment except Exception as e: return f"Architecture analysis failed: {e}", f"Security analysis failed: {e}" async def generate_executive_summary(self, analysis: RepositoryAnalysis) -> str: """Generate executive summary for leadership.""" print("Generating executive summary...") prompt = f""" You are presenting to C-level executives. Create an executive summary of this technical analysis: REPOSITORY METRICS: - Total Files: {analysis.total_files} - Lines of Code: {analysis.total_lines:,} - Languages: {analysis.languages} - Code Quality Score: {analysis.code_quality_score:.1f}/10 KEY FINDINGS: - Total issues identified: {sum(len(fa.issues_found) for fa in analysis.file_analyses)} - Files needing attention: {len([fa for fa in analysis.file_analyses if fa.severity_score < 7])} - High-quality files: {len([fa for fa in analysis.file_analyses if fa.severity_score >= 8])} Create an executive summary for non-technical leadership covering: 1. Business impact of code quality findings 2. Risk assessment and implications 3. Investment priorities and recommendations 4. Expected ROI from addressing technical debt 5. Competitive implications Focus on business outcomes, not technical details. Keep under 800 words. """ try: message = self.client.messages.create( model="claude-3-5-sonnet-20241022", max_tokens=1200, temperature=0.1, messages=[{"role": "user", "content": prompt}] ) return message.content[0].text except Exception as e: return f"Executive summary generation failed: {e}" def create_pdf_report(self, analysis: RepositoryAnalysis, output_path: str): """Generate comprehensive PDF report.""" print(f"Generating PDF report: {output_path}") doc = SimpleDocTemplate(output_path, pagesize=A4, leftMargin=72, rightMargin=72, topMargin=72, bottomMargin=72) styles = getSampleStyleSheet() story = [] # Custom styles title_style = ParagraphStyle( 'CustomTitle', parent=styles['Heading1'], fontSize=24, textColor=colors.darkblue, spaceAfter=30, alignment=TA_CENTER ) heading_style = ParagraphStyle( 'CustomHeading', parent=styles['Heading2'], fontSize=16, textColor=colors.darkblue, spaceBefore=20, spaceAfter=10 ) # Title Page story.append(Paragraph("Repository Analysis Report", title_style)) story.append(Spacer(1, 20)) story.append(Paragraph(f"Repository: {analysis.repo_path}", styles['Normal'])) story.append(Paragraph(f"Analysis Date: {datetime.now().strftime('%B %d, %Y at %H:%M')}", styles['Normal'])) story.append(Paragraph("Generated by: AI Senior Engineering Team", styles['Normal'])) story.append(PageBreak()) # Executive Summary story.append(Paragraph("Executive Summary", heading_style)) story.append(Paragraph(analysis.executive_summary, styles['Normal'])) story.append(PageBreak()) # Repository Overview story.append(Paragraph("Repository Overview", heading_style)) overview_data = [ ['Metric', 'Value'], ['Total Files Analyzed', str(analysis.total_files)], ['Total Lines of Code', f"{analysis.total_lines:,}"], ['Primary Languages', ', '.join(list(analysis.languages.keys())[:5])], ['Overall Code Quality', f"{analysis.code_quality_score:.1f}/10"], ] overview_table = Table(overview_data, colWidths=[200, 300]) overview_table.setStyle(TableStyle([ ('BACKGROUND', (0, 0), (-1, 0), colors.grey), ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke), ('ALIGN', (0, 0), (-1, -1), 'LEFT'), ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'), ('FONTSIZE', (0, 0), (-1, 0), 12), ('BOTTOMPADDING', (0, 0), (-1, 0), 12), ('BACKGROUND', (0, 1), (-1, -1), colors.beige), ('GRID', (0, 0), (-1, -1), 1, colors.black) ])) story.append(overview_table) story.append(Spacer(1, 20)) # Languages Distribution if analysis.languages: story.append(Paragraph("Language Distribution", heading_style)) lang_data = [['Language', 'Files']] for lang, count in sorted(analysis.languages.items(), key=lambda x: x[1], reverse=True): lang_data.append([lang, str(count)]) lang_table = Table(lang_data, colWidths=[200, 100]) lang_table.setStyle(TableStyle([ ('BACKGROUND', (0, 0), (-1, 0), colors.grey), ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke), ('ALIGN', (0, 0), (-1, -1), 'LEFT'), ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'), ('GRID', (0, 0), (-1, -1), 1, colors.black) ])) story.append(lang_table) story.append(PageBreak()) # Architecture Assessment story.append(Paragraph("Architecture Assessment", heading_style)) # Split long text into paragraphs arch_paragraphs = analysis.architecture_assessment.split('\n\n') for para in arch_paragraphs[:10]: # Limit paragraphs if para.strip(): story.append(Paragraph(para.strip(), styles['Normal'])) story.append(Spacer(1, 10)) story.append(PageBreak()) # Security Assessment story.append(Paragraph("Security Assessment", heading_style)) sec_paragraphs = analysis.security_assessment.split('\n\n') for para in sec_paragraphs[:10]: # Limit paragraphs if para.strip(): story.append(Paragraph(para.strip(), styles['Normal'])) story.append(Spacer(1, 10)) story.append(PageBreak()) # File Analysis Summary story.append(Paragraph("File Analysis Summary", heading_style)) # Summary statistics high_quality_files = [fa for fa in analysis.file_analyses if fa.severity_score >= 8] medium_quality_files = [fa for fa in analysis.file_analyses if 5 <= fa.severity_score < 8] low_quality_files = [fa for fa in analysis.file_analyses if fa.severity_score < 5] quality_data = [ ['Quality Level', 'Files', 'Percentage'], ['High Quality (8-10)', str(len(high_quality_files)), f"{len(high_quality_files)/len(analysis.file_analyses)*100:.1f}%"], ['Medium Quality (5-7)', str(len(medium_quality_files)), f"{len(medium_quality_files)/len(analysis.file_analyses)*100:.1f}%"], ['Low Quality (1-4)', str(len(low_quality_files)), f"{len(low_quality_files)/len(analysis.file_analyses)*100:.1f}%"] ] quality_table = Table(quality_data) quality_table.setStyle(TableStyle([ ('BACKGROUND', (0, 0), (-1, 0), colors.grey), ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke), ('ALIGN', (0, 0), (-1, -1), 'CENTER'), ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'), ('GRID', (0, 0), (-1, -1), 1, colors.black), ('BACKGROUND', (0, 1), (-1, 1), colors.lightgreen), ('BACKGROUND', (0, 2), (-1, 2), colors.lightyellow), ('BACKGROUND', (0, 3), (-1, 3), colors.lightcoral) ])) story.append(quality_table) story.append(Spacer(1, 20)) # Top Issues Found story.append(Paragraph("Files Requiring Attention", heading_style)) # Sort files by severity (lowest scores first - need most attention) files_by_priority = sorted(analysis.file_analyses, key=lambda x: x.severity_score) for i, file_analysis in enumerate(files_by_priority[:15]): # Top 15 files needing attention story.append(Paragraph(f"{i+1}. {file_analysis.path}", styles['Heading4'])) story.append(Paragraph(f"Language: {file_analysis.language} | Quality Score: {file_analysis.severity_score:.1f}/10 | Lines: {file_analysis.lines_of_code}", styles['Normal'])) # Show top issues if file_analysis.issues_found: story.append(Paragraph("Key Issues:", styles['Heading5'])) for issue in file_analysis.issues_found[:3]: # Top 3 issues story.append(Paragraph(f"• {issue}", styles['Normal'])) # Show top recommendations if file_analysis.recommendations: story.append(Paragraph("Recommendations:", styles['Heading5'])) for rec in file_analysis.recommendations[:2]: # Top 2 recommendations story.append(Paragraph(f"• {rec}", styles['Normal'])) story.append(Spacer(1, 15)) # Build PDF try: doc.build(story) print(f"✅ PDF report generated successfully: {output_path}") except Exception as e: print(f"❌ Error generating PDF: {e}") async def analyze_repository(self, repo_path: str, max_files: int = 50) -> RepositoryAnalysis: """Main analysis function.""" try: # Clone/access repository actual_repo_path = self.clone_repository(repo_path) # Scan files files_to_analyze = self.scan_repository(actual_repo_path, max_files) if not files_to_analyze: raise Exception("No files found to analyze") # Analyze each file print(f"Starting analysis of {len(files_to_analyze)} files...") file_analyses = [] for i, (file_path, content) in enumerate(files_to_analyze): print(f"Analyzing file {i+1}/{len(files_to_analyze)}: {file_path.name}") analysis = await self.analyze_file_comprehensive(file_path, content) file_analyses.append(analysis) # Small delay to avoid rate limiting await asyncio.sleep(0.2) # Repository-level analyses print("Performing repository-level analysis...") architecture_assessment, security_assessment = await self.analyze_repository_overview( actual_repo_path, file_analyses) # Calculate overall quality score avg_quality = sum(fa.severity_score for fa in file_analyses) / len(file_analyses) # Generate statistics languages = dict(Counter(fa.language for fa in file_analyses)) total_lines = sum(fa.lines_of_code for fa in file_analyses) # Create repository analysis repo_analysis = RepositoryAnalysis( repo_path=repo_path, total_files=len(file_analyses), total_lines=total_lines, languages=languages, architecture_assessment=architecture_assessment, security_assessment=security_assessment, code_quality_score=avg_quality, file_analyses=file_analyses, executive_summary="" ) # Generate executive summary print("Generating executive summary...") repo_analysis.executive_summary = await self.generate_executive_summary(repo_analysis) return repo_analysis finally: # Cleanup if self.temp_dir and os.path.exists(self.temp_dir): shutil.rmtree(self.temp_dir) print("Temporary files cleaned up") async def main(): # Load environment variables load_dotenv() parser = argparse.ArgumentParser(description="Robust GitHub Repository AI Analysis") parser.add_argument("repo_path", help="Repository path (local directory or Git URL)") parser.add_argument("--output", "-o", default="repository_analysis.pdf", help="Output PDF file path") parser.add_argument("--max-files", type=int, default=50, help="Maximum files to analyze") parser.add_argument("--api-key", help="Anthropic API key (overrides .env)") args = parser.parse_args() # Get API key api_key = args.api_key or os.getenv('ANTHROPIC_API_KEY') if not api_key: print("❌ Error: ANTHROPIC_API_KEY not found in .env file or command line") print("Please create a .env file with: ANTHROPIC_API_KEY=your_key_here") return 1 try: print("🚀 Starting Repository Analysis") print("=" * 60) print(f"Repository: {args.repo_path}") print(f"Max files: {args.max_files}") print(f"Output: {args.output}") print("=" * 60) # Initialize analyzer analyzer = RobustGitHubAnalyzer(api_key) # Perform analysis analysis = await analyzer.analyze_repository(args.repo_path, args.max_files) # Generate PDF report analyzer.create_pdf_report(analysis, args.output) # Print summary to console print("\n" + "=" * 60) print("🎯 ANALYSIS COMPLETE") print("=" * 60) print(f"📊 Repository Statistics:") print(f" • Files Analyzed: {analysis.total_files}") print(f" • Lines of Code: {analysis.total_lines:,}") print(f" • Languages: {len(analysis.languages)}") print(f" • Code Quality: {analysis.code_quality_score:.1f}/10") # Quality breakdown high_quality = len([fa for fa in analysis.file_analyses if fa.severity_score >= 8]) low_quality = len([fa for fa in analysis.file_analyses if fa.severity_score < 5]) print(f"\n📈 Quality Breakdown:") print(f" • High Quality Files: {high_quality}") print(f" • Files Needing Attention: {low_quality}") print(f" • Total Issues Found: {sum(len(fa.issues_found) for fa in analysis.file_analyses)}") print(f"\n📄 Detailed PDF Report: {args.output}") print("\n✅ Analysis completed successfully!") return 0 except Exception as e: print(f"❌ Error during analysis: {e}") return 1 if __name__ == "__main__": exit(asyncio.run(main()))