#!/usr/bin/env python3
"""
GitHub Repository AI Analysis Tool
Analyzes GitHub repositories using Claude API for comprehensive code insights.
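
Usage:
    python app.py <repo_url> --api-key <anthropic_api_key> [--output report.md] [--max-files 50]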
"""
import os
import sys
import git
import json
import requests
import tempfile
import shutil
from pathlib import Path
from typing import Dict, Optional
import argparse
from datetime import datetime
import mimetypes
import base64


class GitHubRepoAnalyzer:
    def __init__(self, anthropic_api_key: str):
        self.api_key = anthropic_api_key
        self.api_url = "https://api.anthropic.com/v1/messages"
        self.temp_dir = None
        # File extensions to analyze
        self.code_extensions = {
            '.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.cpp', '.c', '.h',
            '.cs', '.php', '.rb', '.go', '.rs', '.swift', '.kt', '.scala',
            '.html', '.css', '.scss', '.sass', '.less', '.vue', '.svelte',
            '.sql', '.sh', '.bash', '.yml', '.yaml', '.json', '.xml',
            '.dockerfile', '.md', '.rst', '.txt'
        }
        # Files to always include in analysis (kept lowercase, since
        # scan_repository matches against file.lower())
        self.important_files = {
            'readme.md', 'readme.txt', 'package.json', 'requirements.txt',
            'cargo.toml', 'pom.xml', 'build.gradle', 'makefile', 'dockerfile',
            'docker-compose.yml', '.gitignore', 'setup.py', 'pyproject.toml'
        }

    def clone_repository(self, repo_url: str) -> str:
        """Clone GitHub repository to temporary directory."""
        print(f"Cloning repository: {repo_url}")
        self.temp_dir = tempfile.mkdtemp(prefix="github_analysis_")
        try:
            git.Repo.clone_from(repo_url, self.temp_dir)
            print(f"Repository cloned to: {self.temp_dir}")
            return self.temp_dir
        except git.exc.GitCommandError as e:
            raise Exception(f"Failed to clone repository: {e}")

    def get_file_info(self, file_path: Path) -> Dict:
        """Get file information and content."""
        try:
            # Check file size (skip files larger than 1MB)
            if file_path.stat().st_size > 1024 * 1024:
                return {
                    'path': str(file_path.relative_to(self.temp_dir)),
                    'size': file_path.stat().st_size,
                    'content': '[File too large to analyze]',
                    'encoding': 'skipped'
                }
            # Try to read as strict UTF-8 text; with errors='ignore' the
            # UnicodeDecodeError fallback below could never trigger
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                encoding = 'utf-8'
            except UnicodeDecodeError:
                # If text fails, try binary for certain file types
                with open(file_path, 'rb') as f:
                    raw_content = f.read()
                if len(raw_content) < 10000:  # Only encode small binary files
                    content = base64.b64encode(raw_content).decode('ascii')
                    encoding = 'base64'
                else:
                    content = '[Binary file - content not included]'
                    encoding = 'binary'
            return {
                'path': str(file_path.relative_to(self.temp_dir)),
                'size': file_path.stat().st_size,
                'content': content,
                'encoding': encoding,
                'mime_type': mimetypes.guess_type(str(file_path))[0]
            }
        except Exception as e:
            return {
                'path': str(file_path.relative_to(self.temp_dir)),
                'error': str(e),
                'content': '[Error reading file]'
            }
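
    # Illustrative return value for a small UTF-8 text file (values made up):
    #   {'path': 'src/app.py', 'size': 1234, 'content': '<file text>',
    #    'encoding': 'utf-8', 'mime_type': 'text/x-python'}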

    def scan_repository(self, max_files: int = 50) -> Dict:
        """Scan repository and collect file information."""
        print("Scanning repository structure...")
        repo_data = {
            'structure': [],
            'files': [],
            'stats': {
                'total_files': 0,
                'analyzed_files': 0,
                'total_size': 0,
                'languages': {}
            }
        }
        # Get directory structure
        for root, dirs, files in os.walk(self.temp_dir):
            # Skip hidden directories and common build/cache directories
            dirs[:] = [d for d in dirs if not d.startswith('.') and
                       d not in {'node_modules', '__pycache__', 'build', 'dist', 'target', 'venv', 'env'}]
            level = root.replace(self.temp_dir, '').count(os.sep)
            indent = ' ' * level
            folder_name = os.path.basename(root) if root != self.temp_dir else '.'
            repo_data['structure'].append(f"{indent}{folder_name}/")
            # Process files
            for file in files:
                if file.startswith('.'):
                    continue
                file_path = Path(root) / file
                repo_data['stats']['total_files'] += 1
                repo_data['stats']['total_size'] += file_path.stat().st_size
                # Track languages
                ext = file_path.suffix.lower()
                if ext:
                    repo_data['stats']['languages'][ext] = repo_data['stats']['languages'].get(ext, 0) + 1
                # Add to structure
                repo_data['structure'].append(f"{indent} {file}")
                # Analyze only important files or recognized code files,
                # capped at max_files overall
                should_analyze = (
                    file.lower() in self.important_files or
                    ext in self.code_extensions
                )
                if should_analyze and repo_data['stats']['analyzed_files'] < max_files:
                    file_info = self.get_file_info(file_path)
                    repo_data['files'].append(file_info)
                    repo_data['stats']['analyzed_files'] += 1
        return repo_data
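
    # Illustrative shape of the returned repo_data (counts and names made up):
    #   {'structure': ['./', ' README.md', 'src/', '  app.py'],
    #    'files': [<dicts from get_file_info>],
    #    'stats': {'total_files': 120, 'analyzed_files': 50,
    #              'total_size': 734003, 'languages': {'.py': 40, '.md': 3}}}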

    def call_claude_api(self, prompt: str, max_tokens: int = 4000) -> str:
        """Make API call to Claude."""
        headers = {
            "Content-Type": "application/json",
            "x-api-key": self.api_key,
            "anthropic-version": "2023-06-01"
        }
        data = {
            "model": os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest"),
            "max_tokens": max_tokens,
            "messages": [
                {"role": "user", "content": prompt}
            ]
        }
        try:
            # Generous timeout so a stalled request cannot hang the client forever
            response = requests.post(self.api_url, headers=headers, json=data, timeout=300)
            response.raise_for_status()
            result = response.json()
            return result['content'][0]['text']
        except requests.exceptions.RequestException as e:
            raise Exception(f"API request failed: {e}")

    def analyze_repository_overview(self, repo_data: Dict) -> str:
        """Get high-level repository analysis."""
        print("Analyzing repository overview...")
        structure_summary = "\n".join(repo_data['structure'][:100])  # Limit structure size
        prompt = f"""
Analyze this GitHub repository and provide a comprehensive overview:

REPOSITORY STRUCTURE:
{structure_summary}

STATISTICS:
- Total files: {repo_data['stats']['total_files']}
- Files analyzed: {repo_data['stats']['analyzed_files']}
- Total size: {repo_data['stats']['total_size']} bytes
- Languages found: {dict(list(repo_data['stats']['languages'].items())[:10])}

Please provide:
1. **Project Type & Purpose**: What kind of project is this?
2. **Technology Stack**: What technologies, frameworks, and languages are used?
3. **Architecture Overview**: How is the project structured?
4. **Key Components**: What are the main modules/components?
5. **Development Setup**: What's needed to run this project?
6. **Code Quality Assessment**: Initial observations about code organization
"""
        return self.call_claude_api(prompt)

    def analyze_code_files(self, repo_data: Dict) -> str:
        """Analyze individual code files."""
        print("Analyzing code files...")
        # Prepare file contents for analysis
        files_content = []
        for file_info in repo_data['files'][:20]:  # Limit to first 20 files
            if file_info.get('encoding') == 'utf-8' and len(file_info.get('content', '')) < 5000:
                files_content.append(f"=== {file_info['path']} ===\n{file_info['content']}\n")
        files_text = "\n".join(files_content)
        prompt = f"""
Analyze these key files from the repository:

{files_text}

Please provide detailed analysis covering:
1. **Code Quality**: Code style, organization, and best practices
2. **Design Patterns**: What patterns and architectural approaches are used?
3. **Dependencies & Libraries**: Key external dependencies identified
4. **Potential Issues**: Any code smells, security concerns, or improvements needed
5. **Testing Strategy**: How is testing implemented (if at all)?
6. **Documentation**: Quality of inline documentation and comments
7. **Maintainability**: How maintainable and extensible is this code?
"""
        return self.call_claude_api(prompt, max_tokens=6000)

    def analyze_security_and_best_practices(self, repo_data: Dict) -> str:
        """Analyze security and best practices."""
        print("Analyzing security and best practices...")
        # Look for security-sensitive files
        security_files = []
        for file_info in repo_data['files']:
            path_lower = file_info['path'].lower()
            if any(term in path_lower for term in ['config', 'env', 'secret', 'key', 'auth', 'security']):
                if file_info.get('encoding') == 'utf-8':
                    security_files.append(f"=== {file_info['path']} ===\n{file_info['content'][:2000]}\n")
        security_content = "\n".join(security_files[:10])
        prompt = f"""
Analyze this repository for security and best practices:

SECURITY-RELEVANT FILES:
{security_content}

FILE STRUCTURE ANALYSIS:
{json.dumps(repo_data['stats'], indent=2)}

Please analyze:
1. **Security Issues**: Potential security vulnerabilities or concerns
2. **Secret Management**: How are secrets/credentials handled?
3. **Dependencies**: Are there any vulnerable dependencies?
4. **Best Practices**: Adherence to language/framework best practices
5. **Configuration**: Are configurations properly externalized?
6. **Error Handling**: How are errors handled throughout the codebase?
7. **Recommendations**: Specific suggestions for improvement
"""
        return self.call_claude_api(prompt, max_tokens=5000)

    def generate_comprehensive_report(self, repo_url: str, overview: str, code_analysis: str, security_analysis: str) -> str:
        """Generate final comprehensive report."""
        print("Generating comprehensive report...")
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        report = f"""
# GitHub Repository Analysis Report

**Repository:** {repo_url}
**Analysis Date:** {timestamp}
**Analyzed by:** Claude AI Assistant

---

## Executive Summary

{overview}

---

## Detailed Code Analysis

{code_analysis}

---

## Security & Best Practices Analysis

{security_analysis}

---

## Recommendations Summary

Based on the analysis, here are the key recommendations for this repository:

1. **Immediate Actions**: Critical issues that should be addressed promptly
2. **Code Quality Improvements**: Suggestions for better maintainability
3. **Security Enhancements**: Steps to improve security posture
4. **Documentation**: Areas where documentation could be enhanced
5. **Architecture**: Potential architectural improvements

---

*This analysis was generated using AI and should be reviewed by human developers for accuracy and context.*
"""
        return report

    def analyze(self, repo_url: str, output_file: Optional[str] = None, max_files: int = 50) -> str:
        """Main analysis function."""
        try:
            # Clone repository
            self.clone_repository(repo_url)
            # Scan repository structure and files
            repo_data = self.scan_repository(max_files=max_files)
            # Perform different types of analysis
            overview = self.analyze_repository_overview(repo_data)
            code_analysis = self.analyze_code_files(repo_data)
            security_analysis = self.analyze_security_and_best_practices(repo_data)
            # Generate comprehensive report
            final_report = self.generate_comprehensive_report(
                repo_url, overview, code_analysis, security_analysis
            )
            # Save report if output file specified
            if output_file:
                with open(output_file, 'w', encoding='utf-8') as f:
                    f.write(final_report)
                print(f"Report saved to: {output_file}")
            return final_report
        finally:
            # Cleanup temporary directory
            if self.temp_dir and os.path.exists(self.temp_dir):
                shutil.rmtree(self.temp_dir)
                print("Temporary files cleaned up")


def main():
    parser = argparse.ArgumentParser(description="Analyze GitHub repository using Claude AI")
    parser.add_argument("repo_url", help="GitHub repository URL")
    parser.add_argument("--api-key", required=True, help="Anthropic API key")
    parser.add_argument("--output", "-o", help="Output file path (optional)")
    parser.add_argument("--max-files", type=int, default=50, help="Maximum files to analyze")
    args = parser.parse_args()
    # Initialize analyzer
    analyzer = GitHubRepoAnalyzer(args.api_key)
    try:
        print("Starting GitHub repository analysis...")
        print("=" * 50)
        # Perform analysis, honoring --max-files
        report = analyzer.analyze(args.repo_url, args.output, max_files=args.max_files)
        # Print report if no output file specified
        if not args.output:
            print("\n" + "=" * 50)
            print("ANALYSIS REPORT")
            print("=" * 50)
            print(report)
        print("\nAnalysis completed successfully!")
    except Exception as e:
        print(f"Error during analysis: {e}")
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())
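
# Example invocation (public example repository; requires a valid Anthropic API key):
#   python app.py https://github.com/octocat/Hello-World \
#       --api-key "$ANTHROPIC_API_KEY" --output report.md --max-files 40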