#!/usr/bin/env python3
"""
GitHub Repository AI Analysis Tool

Analyzes GitHub repositories using the Claude API for comprehensive code insights.
"""

import argparse
import base64
import json
import mimetypes
import os
import shutil
import sys
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Dict, Optional

import git  # GitPython
import requests

class GitHubRepoAnalyzer:
    def __init__(self, anthropic_api_key: str):
        self.api_key = anthropic_api_key
        self.api_url = "https://api.anthropic.com/v1/messages"
        self.temp_dir = None

        # File extensions to analyze
        self.code_extensions = {
            '.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.cpp', '.c', '.h',
            '.cs', '.php', '.rb', '.go', '.rs', '.swift', '.kt', '.scala',
            '.html', '.css', '.scss', '.sass', '.less', '.vue', '.svelte',
            '.sql', '.sh', '.bash', '.yml', '.yaml', '.json', '.xml',
            '.dockerfile', '.md', '.rst', '.txt'
        }

        # Files to always include in analysis
        self.important_files = {
            'README.md', 'readme.md', 'README.txt', 'readme.txt',
            'package.json', 'requirements.txt', 'Cargo.toml', 'pom.xml',
            'build.gradle', 'Makefile', 'dockerfile', 'Dockerfile',
            'docker-compose.yml', '.gitignore', 'setup.py', 'pyproject.toml'
        }

    def clone_repository(self, repo_url: str) -> str:
        """Clone the GitHub repository into a temporary directory."""
        print(f"Cloning repository: {repo_url}")

        self.temp_dir = tempfile.mkdtemp(prefix="github_analysis_")

        try:
            git.Repo.clone_from(repo_url, self.temp_dir)
            print(f"Repository cloned to: {self.temp_dir}")
            return self.temp_dir
        except git.exc.GitCommandError as e:
            raise RuntimeError(f"Failed to clone repository: {e}") from e

    def get_file_info(self, file_path: Path) -> Dict:
        """Get file information and content."""
        try:
            # Check file size (skip files larger than 1 MB)
            if file_path.stat().st_size > 1024 * 1024:
                return {
                    'path': str(file_path.relative_to(self.temp_dir)),
                    'size': file_path.stat().st_size,
                    'content': '[File too large to analyze]',
                    'encoding': 'skipped'
                }

            # Read strictly as UTF-8 so that binary files actually raise
            # UnicodeDecodeError and reach the fallback below
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                encoding = 'utf-8'
            except UnicodeDecodeError:
                # If text decoding fails, fall back to binary handling
                with open(file_path, 'rb') as f:
                    raw_content = f.read()
                if len(raw_content) < 10000:  # Only encode small binary files
                    content = base64.b64encode(raw_content).decode('ascii')
                    encoding = 'base64'
                else:
                    content = '[Binary file - content not included]'
                    encoding = 'binary'

            return {
                'path': str(file_path.relative_to(self.temp_dir)),
                'size': file_path.stat().st_size,
                'content': content,
                'encoding': encoding,
                'mime_type': mimetypes.guess_type(str(file_path))[0]
            }
        except Exception as e:
            return {
                'path': str(file_path.relative_to(self.temp_dir)),
                'error': str(e),
                'content': '[Error reading file]'
            }
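
    # A typical result for a small text file (illustrative values only):
    #   {'path': 'src/app.py', 'size': 1423, 'content': '...',
    #    'encoding': 'utf-8', 'mime_type': 'text/x-python'}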

    def scan_repository(self, max_files: int = 50) -> Dict:
        """Scan the repository and collect file information."""
        print("Scanning repository structure...")

        repo_data = {
            'structure': [],
            'files': [],
            'stats': {
                'total_files': 0,
                'analyzed_files': 0,
                'total_size': 0,
                'languages': {}
            }
        }

        # Get directory structure
        for root, dirs, files in os.walk(self.temp_dir):
            # Skip hidden directories and common build/cache directories
            dirs[:] = [d for d in dirs if not d.startswith('.') and
                       d not in {'node_modules', '__pycache__', 'build', 'dist', 'target', 'venv', 'env'}]

            level = root.replace(self.temp_dir, '').count(os.sep)
            indent = '  ' * level
            folder_name = os.path.basename(root) if root != self.temp_dir else '.'
            repo_data['structure'].append(f"{indent}{folder_name}/")

            # Process files
            for file in files:
                if file.startswith('.'):
                    continue

                file_path = Path(root) / file
                if not file_path.is_file():  # Skip broken symlinks and the like
                    continue

                repo_data['stats']['total_files'] += 1
                repo_data['stats']['total_size'] += file_path.stat().st_size

                # Track languages by extension
                ext = file_path.suffix.lower()
                if ext:
                    repo_data['stats']['languages'][ext] = repo_data['stats']['languages'].get(ext, 0) + 1

                # Add to structure
                repo_data['structure'].append(f"{indent}  {file}")

                # Analyze only known important files and recognized code files,
                # up to the max_files budget
                should_analyze = (
                    file.lower() in self.important_files or
                    ext in self.code_extensions
                )

                if should_analyze and repo_data['stats']['analyzed_files'] < max_files:
                    file_info = self.get_file_info(file_path)
                    repo_data['files'].append(file_info)
                    repo_data['stats']['analyzed_files'] += 1

        return repo_data
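
    # The resulting 'structure' list renders as an indented tree, e.g.
    # (illustrative):
    #   ./
    #     src/
    #       main.py
    #     README.md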

    def call_claude_api(self, prompt: str, max_tokens: int = 4000) -> str:
        """Make an API call to Claude."""
        headers = {
            "Content-Type": "application/json",
            "x-api-key": self.api_key,
            "anthropic-version": "2023-06-01"
        }

        data = {
            "model": os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest"),
            "max_tokens": max_tokens,
            "messages": [
                {"role": "user", "content": prompt}
            ]
        }

        try:
            # A generous timeout keeps a stalled connection from hanging the run
            response = requests.post(self.api_url, headers=headers, json=data, timeout=120)
            response.raise_for_status()

            result = response.json()
            return result['content'][0]['text']
        except requests.exceptions.RequestException as e:
            raise RuntimeError(f"API request failed: {e}") from e

    def analyze_repository_overview(self, repo_data: Dict) -> str:
        """Get a high-level repository analysis."""
        print("Analyzing repository overview...")

        structure_summary = "\n".join(repo_data['structure'][:100])  # Limit structure size

        prompt = f"""
Analyze this GitHub repository and provide a comprehensive overview:

REPOSITORY STRUCTURE:
{structure_summary}

STATISTICS:
- Total files: {repo_data['stats']['total_files']}
- Files analyzed: {repo_data['stats']['analyzed_files']}
- Total size: {repo_data['stats']['total_size']} bytes
- Languages found: {dict(list(repo_data['stats']['languages'].items())[:10])}

Please provide:
1. **Project Type & Purpose**: What kind of project is this?
2. **Technology Stack**: What technologies, frameworks, and languages are used?
3. **Architecture Overview**: How is the project structured?
4. **Key Components**: What are the main modules/components?
5. **Development Setup**: What's needed to run this project?
6. **Code Quality Assessment**: Initial observations about code organization
"""

        return self.call_claude_api(prompt)

    def analyze_code_files(self, repo_data: Dict) -> str:
        """Analyze individual code files."""
        print("Analyzing code files...")

        # Prepare file contents for analysis
        files_content = []
        for file_info in repo_data['files'][:20]:  # Limit to the first 20 files
            if file_info.get('encoding') == 'utf-8' and len(file_info.get('content', '')) < 5000:
                files_content.append(f"=== {file_info['path']} ===\n{file_info['content']}\n")

        files_text = "\n".join(files_content)

        prompt = f"""
Analyze these key files from the repository:

{files_text}

Please provide a detailed analysis covering:
1. **Code Quality**: Code style, organization, and best practices
2. **Design Patterns**: What patterns and architectural approaches are used?
3. **Dependencies & Libraries**: Key external dependencies identified
4. **Potential Issues**: Any code smells, security concerns, or improvements needed
5. **Testing Strategy**: How is testing implemented (if at all)?
6. **Documentation**: Quality of inline documentation and comments
7. **Maintainability**: How maintainable and extensible is this code?
"""

        return self.call_claude_api(prompt, max_tokens=6000)

    def analyze_security_and_best_practices(self, repo_data: Dict) -> str:
        """Analyze security and best practices."""
        print("Analyzing security and best practices...")

        # Look for security-sensitive files
        security_files = []
        for file_info in repo_data['files']:
            path_lower = file_info['path'].lower()
            if any(term in path_lower for term in ['config', 'env', 'secret', 'key', 'auth', 'security']):
                if file_info.get('encoding') == 'utf-8':
                    security_files.append(f"=== {file_info['path']} ===\n{file_info['content'][:2000]}\n")

        security_content = "\n".join(security_files[:10])

        prompt = f"""
Analyze this repository for security and best practices:

SECURITY-RELEVANT FILES:
{security_content}

FILE STRUCTURE ANALYSIS:
{json.dumps(repo_data['stats'], indent=2)}

Please analyze:
1. **Security Issues**: Potential security vulnerabilities or concerns
2. **Secret Management**: How are secrets/credentials handled?
3. **Dependencies**: Are there any vulnerable dependencies?
4. **Best Practices**: Adherence to language/framework best practices
5. **Configuration**: Are configurations properly externalized?
6. **Error Handling**: How are errors handled throughout the codebase?
7. **Recommendations**: Specific suggestions for improvement
"""

        return self.call_claude_api(prompt, max_tokens=5000)

    def generate_comprehensive_report(self, repo_url: str, overview: str, code_analysis: str, security_analysis: str) -> str:
        """Generate the final comprehensive report."""
        print("Generating comprehensive report...")

        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        report = f"""
# GitHub Repository Analysis Report

**Repository:** {repo_url}
**Analysis Date:** {timestamp}
**Analyzed by:** Claude AI Assistant

---

## Executive Summary

{overview}

---

## Detailed Code Analysis

{code_analysis}

---

## Security & Best Practices Analysis

{security_analysis}

---

## Recommendations Summary

The sections above group their recommendations into these categories:

1. **Immediate Actions**: Critical issues that should be addressed promptly
2. **Code Quality Improvements**: Suggestions for better maintainability
3. **Security Enhancements**: Steps to improve security posture
4. **Documentation**: Areas where documentation could be enhanced
5. **Architecture**: Potential architectural improvements

---

*This analysis was generated using AI and should be reviewed by human developers for accuracy and context.*
"""

        return report

    def analyze(self, repo_url: str, output_file: Optional[str] = None, max_files: int = 50) -> str:
        """Run the full analysis pipeline."""
        try:
            # Clone repository
            self.clone_repository(repo_url)

            # Scan repository structure and files
            repo_data = self.scan_repository(max_files)

            # Perform different types of analysis
            overview = self.analyze_repository_overview(repo_data)
            code_analysis = self.analyze_code_files(repo_data)
            security_analysis = self.analyze_security_and_best_practices(repo_data)

            # Generate comprehensive report
            final_report = self.generate_comprehensive_report(
                repo_url, overview, code_analysis, security_analysis
            )

            # Save report if an output file was specified
            if output_file:
                with open(output_file, 'w', encoding='utf-8') as f:
                    f.write(final_report)
                print(f"Report saved to: {output_file}")

            return final_report

        finally:
            # Clean up the temporary directory
            if self.temp_dir and os.path.exists(self.temp_dir):
                shutil.rmtree(self.temp_dir)
                print("Temporary files cleaned up")
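
# Programmatic use (sketch; assumes ANTHROPIC_API_KEY is set in the environment):
#
#   analyzer = GitHubRepoAnalyzer(os.environ["ANTHROPIC_API_KEY"])
#   report = analyzer.analyze("https://github.com/octocat/Hello-World",
#                             output_file="report.md", max_files=25)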


def main():
    parser = argparse.ArgumentParser(description="Analyze a GitHub repository using Claude AI")
    parser.add_argument("repo_url", help="GitHub repository URL")
    parser.add_argument("--api-key", required=True, help="Anthropic API key")
    parser.add_argument("--output", "-o", help="Output file path (optional)")
    parser.add_argument("--max-files", type=int, default=50, help="Maximum number of files to analyze")

    args = parser.parse_args()

    # Initialize analyzer
    analyzer = GitHubRepoAnalyzer(args.api_key)

    try:
        print("Starting GitHub repository analysis...")
        print("=" * 50)

        # Perform the analysis, forwarding the --max-files limit
        report = analyzer.analyze(args.repo_url, args.output, args.max_files)

        # Print the report if no output file was specified
        if not args.output:
            print("\n" + "=" * 50)
            print("ANALYSIS REPORT")
            print("=" * 50)
            print(report)

        print("\nAnalysis completed successfully!")

    except Exception as e:
        print(f"Error during analysis: {e}", file=sys.stderr)
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())