#!/usr/bin/env python3
"""
GitHub Repository AI Analysis Tool

Analyzes GitHub repositories using the Claude API for comprehensive code
insights.
"""

import argparse
import base64
import json
import mimetypes
import os
import shutil
import sys
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Dict, Optional

import git
import requests


class GitHubRepoAnalyzer:
    def __init__(self, anthropic_api_key: str):
        self.api_key = anthropic_api_key
        self.api_url = "https://api.anthropic.com/v1/messages"
        self.temp_dir = None

        # File extensions to analyze
        self.code_extensions = {
            '.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.cpp', '.c', '.h',
            '.cs', '.php', '.rb', '.go', '.rs', '.swift', '.kt', '.scala',
            '.html', '.css', '.scss', '.sass', '.less', '.vue', '.svelte',
            '.sql', '.sh', '.bash', '.yml', '.yaml', '.json', '.xml',
            '.dockerfile', '.md', '.rst', '.txt'
        }

        # Files to always include in analysis. Entries are lowercase because
        # they are matched against file.lower() in scan_repository().
        self.important_files = {
            'readme.md', 'readme.txt', 'package.json', 'requirements.txt',
            'cargo.toml', 'pom.xml', 'build.gradle', 'makefile', 'dockerfile',
            'docker-compose.yml', '.gitignore', 'setup.py', 'pyproject.toml'
        }

    def clone_repository(self, repo_url: str) -> str:
        """Clone a GitHub repository into a temporary directory."""
        print(f"Cloning repository: {repo_url}")
        self.temp_dir = tempfile.mkdtemp(prefix="github_analysis_")

        try:
            git.Repo.clone_from(repo_url, self.temp_dir)
            print(f"Repository cloned to: {self.temp_dir}")
            return self.temp_dir
        except git.exc.GitCommandError as e:
            raise RuntimeError(f"Failed to clone repository: {e}") from e

    def get_file_info(self, file_path: Path) -> Dict:
        """Get file information and content."""
        try:
            size = file_path.stat().st_size

            # Skip files larger than 1 MB
            if size > 1024 * 1024:
                return {
                    'path': str(file_path.relative_to(self.temp_dir)),
                    'size': size,
                    'content': '[File too large to analyze]',
                    'encoding': 'skipped'
                }

            # Try to read as UTF-8 text. Decoding is strict (the default)
            # so undecodable files raise UnicodeDecodeError and reach the
            # binary fallback below.
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                encoding = 'utf-8'
            except UnicodeDecodeError:
                # Text decoding failed; fall back to binary handling
                with open(file_path, 'rb') as f:
                    raw_content = f.read()
                if len(raw_content) < 10000:  # Only encode small binary files
                    content = base64.b64encode(raw_content).decode('ascii')
                    encoding = 'base64'
                else:
                    content = '[Binary file - content not included]'
                    encoding = 'binary'

            return {
                'path': str(file_path.relative_to(self.temp_dir)),
                'size': size,
                'content': content,
                'encoding': encoding,
                'mime_type': mimetypes.guess_type(str(file_path))[0]
            }
        except Exception as e:
            return {
                'path': str(file_path.relative_to(self.temp_dir)),
                'error': str(e),
                'content': '[Error reading file]'
            }
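    # Directory-walk note: scan_repository() below prunes hidden and
    # build/cache directories by mutating dirs[:] in place, which is the
    # documented way to stop os.walk from descending into them.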
    def scan_repository(self, max_files: int = 50) -> Dict:
        """Scan the repository and collect file information."""
        print("Scanning repository structure...")

        repo_data = {
            'structure': [],
            'files': [],
            'stats': {
                'total_files': 0,
                'analyzed_files': 0,
                'total_size': 0,
                'languages': {}
            }
        }

        # Build the directory structure listing
        for root, dirs, files in os.walk(self.temp_dir):
            # Skip hidden directories and common build/cache directories
            dirs[:] = [d for d in dirs
                       if not d.startswith('.')
                       and d not in {'node_modules', '__pycache__', 'build',
                                     'dist', 'target', 'venv', 'env'}]

            level = root.replace(self.temp_dir, '').count(os.sep)
            indent = '  ' * level
            folder_name = os.path.basename(root) if root != self.temp_dir else '.'
            repo_data['structure'].append(f"{indent}{folder_name}/")

            # Process files
            for file in files:
                if file.startswith('.'):
                    continue

                file_path = Path(root) / file
                repo_data['stats']['total_files'] += 1
                repo_data['stats']['total_size'] += file_path.stat().st_size

                # Track languages by extension
                ext = file_path.suffix.lower()
                if ext:
                    repo_data['stats']['languages'][ext] = \
                        repo_data['stats']['languages'].get(ext, 0) + 1

                # Add to structure
                repo_data['structure'].append(f"{indent}  {file}")

                # Analyze only important files and recognized code files,
                # up to the max_files cap
                should_analyze = (
                    file.lower() in self.important_files
                    or ext in self.code_extensions
                )
                if should_analyze and repo_data['stats']['analyzed_files'] < max_files:
                    file_info = self.get_file_info(file_path)
                    repo_data['files'].append(file_info)
                    repo_data['stats']['analyzed_files'] += 1

        return repo_data

    def call_claude_api(self, prompt: str, max_tokens: int = 4000) -> str:
        """Make an API call to Claude."""
        headers = {
            "Content-Type": "application/json",
            "x-api-key": self.api_key,
            "anthropic-version": "2023-06-01"
        }

        data = {
            "model": os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest"),
            "max_tokens": max_tokens,
            "messages": [
                {"role": "user", "content": prompt}
            ]
        }

        try:
            # A timeout keeps the request from hanging indefinitely
            response = requests.post(self.api_url, headers=headers,
                                     json=data, timeout=120)
            response.raise_for_status()
            result = response.json()
            return result['content'][0]['text']
        except requests.exceptions.RequestException as e:
            raise RuntimeError(f"API request failed: {e}") from e

    def analyze_repository_overview(self, repo_data: Dict) -> str:
        """Get a high-level repository analysis."""
        print("Analyzing repository overview...")

        # Limit structure size to keep the prompt manageable
        structure_summary = "\n".join(repo_data['structure'][:100])

        prompt = f"""
Analyze this GitHub repository and provide a comprehensive overview:

REPOSITORY STRUCTURE:
{structure_summary}

STATISTICS:
- Total files: {repo_data['stats']['total_files']}
- Files analyzed: {repo_data['stats']['analyzed_files']}
- Total size: {repo_data['stats']['total_size']} bytes
- Languages found: {dict(list(repo_data['stats']['languages'].items())[:10])}

Please provide:
1. **Project Type & Purpose**: What kind of project is this?
2. **Technology Stack**: What technologies, frameworks, and languages are used?
3. **Architecture Overview**: How is the project structured?
4. **Key Components**: What are the main modules/components?
5. **Development Setup**: What's needed to run this project?
6. **Code Quality Assessment**: Initial observations about code organization
"""

        return self.call_claude_api(prompt)

    def analyze_code_files(self, repo_data: Dict) -> str:
        """Analyze individual code files."""
        print("Analyzing code files...")

        # Prepare file contents for analysis: the first 20 text files, each
        # under 5,000 characters, to stay within the prompt budget
        files_content = []
        for file_info in repo_data['files'][:20]:
            if (file_info.get('encoding') == 'utf-8'
                    and len(file_info.get('content', '')) < 5000):
                files_content.append(
                    f"=== {file_info['path']} ===\n{file_info['content']}\n")

        files_text = "\n".join(files_content)

        prompt = f"""
Analyze these key files from the repository:

{files_text}

Please provide detailed analysis covering:
1. **Code Quality**: Code style, organization, and best practices
2. **Design Patterns**: What patterns and architectural approaches are used?
3. **Dependencies & Libraries**: Key external dependencies identified
4. **Potential Issues**: Any code smells, security concerns, or improvements needed
5. **Testing Strategy**: How is testing implemented (if at all)?
6. **Documentation**: Quality of inline documentation and comments
7. **Maintainability**: How maintainable and extensible is this code?
"""

        return self.call_claude_api(prompt, max_tokens=6000)
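    # Heuristic note: the security scan below selects files whose paths
    # contain keywords such as 'config' or 'secret'. This surfaces likely
    # candidates for the prompt but will not catch secrets embedded in
    # files with unrelated names.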
    def analyze_security_and_best_practices(self, repo_data: Dict) -> str:
        """Analyze security and best practices."""
        print("Analyzing security and best practices...")

        # Look for security-sensitive files by path keywords
        security_files = []
        for file_info in repo_data['files']:
            path_lower = file_info['path'].lower()
            if any(term in path_lower for term in
                   ['config', 'env', 'secret', 'key', 'auth', 'security']):
                if file_info.get('encoding') == 'utf-8':
                    security_files.append(
                        f"=== {file_info['path']} ===\n"
                        f"{file_info['content'][:2000]}\n")

        security_content = "\n".join(security_files[:10])

        prompt = f"""
Analyze this repository for security and best practices:

SECURITY-RELEVANT FILES:
{security_content}

FILE STRUCTURE ANALYSIS:
{json.dumps(repo_data['stats'], indent=2)}

Please analyze:
1. **Security Issues**: Potential security vulnerabilities or concerns
2. **Secret Management**: How are secrets/credentials handled?
3. **Dependencies**: Are there any vulnerable dependencies?
4. **Best Practices**: Adherence to language/framework best practices
5. **Configuration**: Are configurations properly externalized?
6. **Error Handling**: How are errors handled throughout the codebase?
7. **Recommendations**: Specific suggestions for improvement
"""

        return self.call_claude_api(prompt, max_tokens=5000)

    def generate_comprehensive_report(self, repo_url: str, overview: str,
                                      code_analysis: str,
                                      security_analysis: str) -> str:
        """Generate the final comprehensive report."""
        print("Generating comprehensive report...")

        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        report = f"""
# GitHub Repository Analysis Report

**Repository:** {repo_url}
**Analysis Date:** {timestamp}
**Analyzed by:** Claude AI Assistant

---

## Executive Summary

{overview}

---

## Detailed Code Analysis

{code_analysis}

---

## Security & Best Practices Analysis

{security_analysis}

---

## Recommendations Summary

Based on the analysis, here are the key recommendations for this repository:

1. **Immediate Actions**: Critical issues that should be addressed promptly
2. **Code Quality Improvements**: Suggestions for better maintainability
3. **Security Enhancements**: Steps to improve security posture
4. **Documentation**: Areas where documentation could be enhanced
5. **Architecture**: Potential architectural improvements

---

*This analysis was generated using AI and should be reviewed by human
developers for accuracy and context.*
"""

        return report
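    # Lifecycle note: analyze() below owns the temporary clone. The finally
    # block removes it even when cloning, scanning, or an API call raises.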
    def analyze(self, repo_url: str, output_file: Optional[str] = None,
                max_files: int = 50) -> str:
        """Main analysis function."""
        try:
            # Clone repository
            self.clone_repository(repo_url)

            # Scan repository structure and files
            repo_data = self.scan_repository(max_files=max_files)

            # Perform the different types of analysis
            overview = self.analyze_repository_overview(repo_data)
            code_analysis = self.analyze_code_files(repo_data)
            security_analysis = self.analyze_security_and_best_practices(repo_data)

            # Generate comprehensive report
            final_report = self.generate_comprehensive_report(
                repo_url, overview, code_analysis, security_analysis
            )

            # Save report if an output file was specified
            if output_file:
                with open(output_file, 'w', encoding='utf-8') as f:
                    f.write(final_report)
                print(f"Report saved to: {output_file}")

            return final_report
        finally:
            # Clean up the temporary directory
            if self.temp_dir and os.path.exists(self.temp_dir):
                shutil.rmtree(self.temp_dir)
                print("Temporary files cleaned up")


def main() -> int:
    parser = argparse.ArgumentParser(
        description="Analyze a GitHub repository using Claude AI")
    parser.add_argument("repo_url", help="GitHub repository URL")
    parser.add_argument("--api-key", required=True, help="Anthropic API key")
    parser.add_argument("--output", "-o", help="Output file path (optional)")
    parser.add_argument("--max-files", type=int, default=50,
                        help="Maximum number of files to analyze")

    args = parser.parse_args()

    # Initialize analyzer
    analyzer = GitHubRepoAnalyzer(args.api_key)

    try:
        print("Starting GitHub repository analysis...")
        print("=" * 50)

        # Perform analysis
        report = analyzer.analyze(args.repo_url, args.output, args.max_files)

        # Print the report if no output file was specified
        if not args.output:
            print("\n" + "=" * 50)
            print("ANALYSIS REPORT")
            print("=" * 50)
            print(report)

        print("\nAnalysis completed successfully!")
    except Exception as e:
        print(f"Error during analysis: {e}")
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())
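# Example invocation (script name, repository URL, and key are illustrative
# placeholders; substitute your own values):
#   python github_repo_analyzer.py https://github.com/octocat/Hello-World \
#       --api-key "$ANTHROPIC_API_KEY" --output report.md --max-files 40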