#!/usr/bin/env python3
"""
Robust GitHub Repository AI Analysis Tool

Simplified version with defensive error handling; model responses are requested
as structured plain text and parsed with simple heuristics rather than JSON.
"""
import argparse
import asyncio
import os
import re
import shutil
import sys
import tempfile
from collections import Counter
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Tuple
from xml.sax.saxutils import escape

# Core packages
import anthropic
import git
from dotenv import load_dotenv

# PDF generation
from reportlab.lib import colors
from reportlab.lib.enums import TA_CENTER
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
from reportlab.platypus import (
    PageBreak,
    Paragraph,
    SimpleDocTemplate,
    Spacer,
    Table,
    TableStyle,
)


@dataclass
class FileAnalysis:
    path: str
    language: str
    lines_of_code: int
    complexity_score: float
    issues_found: List[str]
    recommendations: List[str]
    detailed_analysis: str
    severity_score: float  # model-reported quality rating, 1-10 (higher is better)


@dataclass
class RepositoryAnalysis:
    repo_path: str
    total_files: int
    total_lines: int
    languages: Dict[str, int]
    architecture_assessment: str
    security_assessment: str
    code_quality_score: float
    file_analyses: List[FileAnalysis]
    executive_summary: str


class RobustGitHubAnalyzer:
    def __init__(self, api_key: str):
        self.client = anthropic.Anthropic(api_key=api_key)
        self.temp_dir = None
        self.repo_root = None  # set once the repository is cloned or located

        # Language mapping for file detection
        self.language_map = {
            '.py': 'Python', '.js': 'JavaScript', '.ts': 'TypeScript',
            '.tsx': 'TypeScript', '.jsx': 'JavaScript', '.java': 'Java',
            '.cpp': 'C++', '.c': 'C', '.cs': 'C#', '.go': 'Go', '.rs': 'Rust',
            '.php': 'PHP', '.rb': 'Ruby', '.swift': 'Swift', '.kt': 'Kotlin',
            '.html': 'HTML', '.css': 'CSS', '.scss': 'SCSS', '.sass': 'SASS',
            '.sql': 'SQL', '.yaml': 'YAML', '.yml': 'YAML', '.json': 'JSON',
            '.xml': 'XML', '.sh': 'Shell', '.dockerfile': 'Docker',
            '.md': 'Markdown', '.txt': 'Text'
        }

        # Code file extensions to analyze
        self.code_extensions = set(self.language_map.keys())

    def clone_repository(self, repo_path: str) -> str:
        """Clone repository or use existing path."""
        if os.path.exists(repo_path):
            print(f"Using existing repository: {repo_path}")
            self.repo_root = repo_path
            return repo_path
        else:
            print(f"Cloning repository: {repo_path}")
            self.temp_dir = tempfile.mkdtemp(prefix="repo_analysis_")
            try:
                git.Repo.clone_from(repo_path, self.temp_dir)
                self.repo_root = self.temp_dir
                return self.temp_dir
            except Exception as e:
                raise RuntimeError(f"Failed to clone repository: {e}") from e

    def get_file_language(self, file_path: Path) -> str:
        """Get programming language from file extension."""
        return self.language_map.get(file_path.suffix.lower(), 'Unknown')
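    # For illustration: get_file_language(Path("src/App.tsx")) returns
    # "TypeScript", while an extension-less file such as "Makefile" falls
    # back to "Unknown".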

    def calculate_complexity_score(self, content: str) -> float:
        """Calculate basic complexity score based on code patterns."""
        lines = content.split('\n')
        complexity_indicators = ['if', 'else', 'elif', 'for', 'while', 'try', 'except', 'catch', 'switch']

        complexity = 1
        for line in lines:
            line_lower = line.lower().strip()
            for indicator in complexity_indicators:
                # Match whole words only, so e.g. 'notify' is not counted as 'if'
                if re.search(rf'\b{indicator}\b', line_lower):
                    complexity += 1

        # Normalize the indicator density to a 1-10 scale
        return min(complexity / max(len(lines), 1) * 100, 10.0)
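    # Worked example (hypothetical input): a 50-line file containing a single
    # "if" yields complexity 2 and a score of min(2 / 50 * 100, 10.0) == 4.0;
    # any file where roughly one line in ten branches saturates the 10.0 cap.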

    async def analyze_file_comprehensive(self, file_path: Path, content: str) -> FileAnalysis:
        """Perform comprehensive file analysis using a single, robust prompt."""
        language = self.get_file_language(file_path)
        lines_of_code = len([line for line in content.split('\n') if line.strip()])
        complexity_score = self.calculate_complexity_score(content)

        # Truncate content if too long
        if len(content) > 4000:
            content = content[:4000] + "\n... [truncated for analysis]"

        print(f"  Analyzing {file_path.name} ({language}, {lines_of_code} lines)")

        # Create comprehensive analysis prompt
        prompt = f"""
You are a senior software engineer with 25 years of experience. Analyze this {language} code file:

FILENAME: {file_path.name}
LANGUAGE: {language}
LINES OF CODE: {lines_of_code}

CODE:
```{language.lower()}
{content}
```

Provide a comprehensive analysis covering:

1. ISSUES FOUND: List specific problems, bugs, security vulnerabilities, or code smells
2. RECOMMENDATIONS: Actionable suggestions for improvement
3. CODE QUALITY: Overall assessment of code quality and maintainability
4. SECURITY: Any security concerns or vulnerabilities
5. PERFORMANCE: Potential performance issues or optimizations
6. BEST PRACTICES: Adherence to coding standards and best practices

Provide your analysis in clear, structured text (not JSON). Be specific and actionable.
Rate the overall code quality from 1-10 where 10 is excellent.

ANALYSIS:
"""

        try:
            # The Anthropic client is synchronous, so run the call in a worker
            # thread to avoid blocking the event loop.
            message = await asyncio.to_thread(
                self.client.messages.create,
                model=os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest"),
                max_tokens=3000,
                temperature=0.1,
                messages=[{"role": "user", "content": prompt}],
            )

            analysis_text = message.content[0].text.strip()

            # Extract the quality rating from the analysis (e.g. "7.5/10" -> 7.5)
            severity_match = re.search(r'(\d+(?:\.\d+)?)/10', analysis_text)
            severity_score = float(severity_match.group(1)) if severity_match else 5.0

            # Parse issues and recommendations from the text
            issues = self.extract_issues_from_analysis(analysis_text)
            recommendations = self.extract_recommendations_from_analysis(analysis_text)

            # Report the path relative to the repository root when possible
            try:
                display_path = str(file_path.relative_to(self.repo_root))
            except (TypeError, ValueError):
                display_path = str(file_path)

            return FileAnalysis(
                path=display_path,
                language=language,
                lines_of_code=lines_of_code,
                complexity_score=complexity_score,
                issues_found=issues,
                recommendations=recommendations,
                detailed_analysis=analysis_text,
                severity_score=severity_score,
            )

        except Exception as e:
            print(f"  Error analyzing {file_path.name}: {e}")
            return FileAnalysis(
                path=str(file_path),
                language=language,
                lines_of_code=lines_of_code,
                complexity_score=complexity_score,
                issues_found=[f"Analysis failed: {str(e)}"],
                recommendations=["Review file manually due to analysis error"],
                detailed_analysis=f"Analysis failed due to error: {str(e)}",
                severity_score=5.0,
            )

    def extract_issues_from_analysis(self, analysis_text: str) -> List[str]:
        """Extract issues from analysis text."""
        issues = []
        lines = analysis_text.split('\n')

        # Look for common issue indicators
        issue_keywords = ['issue', 'problem', 'bug', 'vulnerability', 'error', 'warning', 'concern']

        for line in lines:
            line_lower = line.lower().strip()
            if any(keyword in line_lower for keyword in issue_keywords):
                if line.strip() and not line.strip().startswith('#'):
                    issues.append(line.strip())

        return issues[:10]  # Limit to top 10 issues

    def extract_recommendations_from_analysis(self, analysis_text: str) -> List[str]:
        """Extract recommendations from analysis text."""
        recommendations = []
        lines = analysis_text.split('\n')

        # Look for recommendation indicators
        rec_keywords = ['recommend', 'suggest', 'should', 'consider', 'improve']

        for line in lines:
            line_lower = line.lower().strip()
            if any(keyword in line_lower for keyword in rec_keywords):
                if line.strip() and not line.strip().startswith('#'):
                    recommendations.append(line.strip())

        return recommendations[:10]  # Limit to top 10 recommendations

    def scan_repository(self, repo_path: str, max_files: int = 50) -> List[Tuple[Path, str]]:
        """Scan repository and collect files for analysis."""
        print(f"Scanning repository: {repo_path}")

        files_to_analyze = []

        # Important files to always include (lowercased, because filenames are
        # compared case-insensitively below)
        important_files = {
            'readme.md', 'package.json', 'requirements.txt', 'dockerfile',
            'docker-compose.yml', 'tsconfig.json', 'next.config.js',
            'tailwind.config.js', 'webpack.config.js', '.env.example'
        }

        for root, dirs, files in os.walk(repo_path):
            # Skip common build/cache directories
            dirs[:] = [d for d in dirs if not d.startswith('.') and
                       d not in {'node_modules', '__pycache__', 'build', 'dist', 'target',
                                 'venv', 'env', '.git', '.next', 'coverage'}]

            if len(files_to_analyze) >= max_files:
                break

            for file in files:
                if len(files_to_analyze) >= max_files:
                    break

                file_path = Path(root) / file

                # Skip large files (or files whose metadata cannot be read)
                try:
                    if file_path.stat().st_size > 1000000:  # 1MB limit
                        continue
                except OSError:
                    continue

                # Include important files or files with code extensions
                should_include = (
                    file.lower() in important_files or
                    file_path.suffix.lower() in self.code_extensions or
                    file.lower().startswith('dockerfile')
                )

                if should_include:
                    try:
                        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                            content = f.read()
                        if content.strip():  # Only non-empty files
                            files_to_analyze.append((file_path, content))
                    except Exception as e:
                        print(f"Could not read {file_path}: {e}")

        print(f"Found {len(files_to_analyze)} files to analyze")
        return files_to_analyze

    async def analyze_repository_overview(self, repo_path: str, file_analyses: List[FileAnalysis]) -> Tuple[str, str]:
        """Analyze repository architecture and security."""
        print("Analyzing repository overview...")

        # Prepare summary data
        languages = dict(Counter(fa.language for fa in file_analyses))
        total_lines = sum(fa.lines_of_code for fa in file_analyses)

        # Calculate average quality safely
        valid_scores = [fa.severity_score for fa in file_analyses if fa.severity_score is not None]
        avg_quality = sum(valid_scores) / len(valid_scores) if valid_scores else 5.0

        # Get repository structure
        structure_lines = []
        try:
            for root, dirs, files in os.walk(repo_path):
                dirs[:] = [d for d in dirs if not d.startswith('.') and d not in {'node_modules', '__pycache__'}]
                level = root.replace(repo_path, '').count(os.sep)
                indent = '  ' * level
                structure_lines.append(f"{indent}{os.path.basename(root)}/")
                for file in files[:3]:  # Limit files shown per directory
                    structure_lines.append(f"{indent}  {file}")
                if len(structure_lines) > 50:  # Limit total structure size
                    break
        except Exception as e:
            structure_lines = [f"Error reading structure: {e}"]

        # Architecture analysis
        arch_prompt = f"""
You are a Senior Software Architect with 25 years of experience.

Analyze this repository:

REPOSITORY STRUCTURE:
{chr(10).join(structure_lines[:30])}

STATISTICS:
- Total files analyzed: {len(file_analyses)}
- Total lines of code: {total_lines:,}
- Languages: {languages}
- Average code quality: {avg_quality:.1f}/10

TOP FILE ISSUES:
{chr(10).join([f"- {fa.path}: {len(fa.issues_found)} issues" for fa in file_analyses[:10]])}

Provide an architectural assessment covering:
1. Project type and purpose
2. Technology stack evaluation
3. Code organization and structure
4. Scalability and maintainability concerns
5. Key recommendations for improvement

Keep response under 1500 words and focus on actionable insights.
"""

        # Security analysis
        security_issues = []
        for fa in file_analyses:
            security_issues.extend([issue for issue in fa.issues_found if
                                    any(keyword in issue.lower() for keyword in
                                        ['security', 'vulnerability', 'injection', 'xss', 'auth', 'password'])])

        sec_prompt = f"""
You are a Senior Security Engineer with 20+ years of experience.

Security Analysis for repository with {len(file_analyses)} files:

SECURITY ISSUES FOUND:
{chr(10).join(security_issues[:20]) if security_issues else "No obvious security issues detected"}

HIGH-RISK FILE TYPES PRESENT:
{[lang for lang, count in languages.items() if lang in ['JavaScript', 'TypeScript', 'Python', 'PHP', 'SQL']]}

Provide security assessment covering:
1. Overall security posture
2. Main security risks and vulnerabilities
3. Authentication and authorization concerns
4. Data protection and privacy issues
5. Immediate security priorities

Keep response under 1000 words and focus on actionable security recommendations.
"""

        try:
            # Run both analyses concurrently; the synchronous client calls are
            # moved onto worker threads so they can actually overlap.
            arch_task = asyncio.to_thread(
                self.client.messages.create,
                model=os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest"),
                max_tokens=2000,
                temperature=0.1,
                messages=[{"role": "user", "content": arch_prompt}],
            )
            sec_task = asyncio.to_thread(
                self.client.messages.create,
                model=os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest"),
                max_tokens=1500,
                temperature=0.1,
                messages=[{"role": "user", "content": sec_prompt}],
            )
            arch_response, sec_response = await asyncio.gather(arch_task, sec_task)

            return arch_response.content[0].text, sec_response.content[0].text

        except Exception as e:
            return f"Architecture analysis failed: {e}", f"Security analysis failed: {e}"

    async def generate_executive_summary(self, analysis: RepositoryAnalysis) -> str:
        """Generate executive summary for leadership."""
        print("Generating executive summary...")

        prompt = f"""
You are presenting to C-level executives. Create an executive summary of this technical analysis:

REPOSITORY METRICS:
- Total Files: {analysis.total_files}
- Lines of Code: {analysis.total_lines:,}
- Languages: {analysis.languages}
- Code Quality Score: {analysis.code_quality_score:.1f}/10

KEY FINDINGS:
- Total issues identified: {sum(len(fa.issues_found) for fa in analysis.file_analyses)}
- Files needing attention: {len([fa for fa in analysis.file_analyses if fa.severity_score < 7])}
- High-quality files: {len([fa for fa in analysis.file_analyses if fa.severity_score >= 8])}

Create an executive summary for non-technical leadership covering:
1. Business impact of code quality findings
2. Risk assessment and implications
3. Investment priorities and recommendations
4. Expected ROI from addressing technical debt
5. Competitive implications

Focus on business outcomes, not technical details. Keep under 800 words.
"""

        try:
            message = await asyncio.to_thread(
                self.client.messages.create,
                model=os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest"),
                max_tokens=1200,
                temperature=0.1,
                messages=[{"role": "user", "content": prompt}],
            )
            return message.content[0].text
        except Exception as e:
            return f"Executive summary generation failed: {e}"

    def create_pdf_report(self, analysis: RepositoryAnalysis, output_path: str):
        """Generate comprehensive PDF report."""
        print(f"Generating PDF report: {output_path}")

        doc = SimpleDocTemplate(output_path, pagesize=A4,
                                leftMargin=72, rightMargin=72,
                                topMargin=72, bottomMargin=72)
        styles = getSampleStyleSheet()
        story = []

        # Custom styles
        title_style = ParagraphStyle(
            'CustomTitle',
            parent=styles['Heading1'],
            fontSize=24,
            textColor=colors.darkblue,
            spaceAfter=30,
            alignment=TA_CENTER
        )

        heading_style = ParagraphStyle(
            'CustomHeading',
            parent=styles['Heading2'],
            fontSize=16,
            textColor=colors.darkblue,
            spaceBefore=20,
            spaceAfter=10
        )

        # Title Page
        story.append(Paragraph("Repository Analysis Report", title_style))
        story.append(Spacer(1, 20))
        story.append(Paragraph(f"<b>Repository:</b> {escape(analysis.repo_path)}", styles['Normal']))
        story.append(Paragraph(f"<b>Analysis Date:</b> {datetime.now().strftime('%B %d, %Y at %H:%M')}", styles['Normal']))
        story.append(Paragraph("<b>Generated by:</b> AI Senior Engineering Team", styles['Normal']))
        story.append(PageBreak())

        # Executive Summary (free-form model text is XML-escaped so stray
        # '<' or '&' characters cannot break Paragraph's markup parser)
        story.append(Paragraph("Executive Summary", heading_style))
        story.append(Paragraph(escape(analysis.executive_summary), styles['Normal']))
        story.append(PageBreak())

        # Repository Overview
        story.append(Paragraph("Repository Overview", heading_style))

        overview_data = [
            ['Metric', 'Value'],
            ['Total Files Analyzed', str(analysis.total_files)],
            ['Total Lines of Code', f"{analysis.total_lines:,}"],
            ['Primary Languages', ', '.join(list(analysis.languages.keys())[:5]) if analysis.languages else 'Unknown'],
            ['Overall Code Quality', f"{analysis.code_quality_score:.1f}/10"],
        ]

        overview_table = Table(overview_data, colWidths=[200, 300])
        overview_table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, 0), 12),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
            ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
            ('GRID', (0, 0), (-1, -1), 1, colors.black)
        ]))

        story.append(overview_table)
        story.append(Spacer(1, 20))

        # Languages Distribution
        if analysis.languages:
            story.append(Paragraph("Language Distribution", heading_style))
            lang_data = [['Language', 'Files']]
            for lang, count in sorted(analysis.languages.items(), key=lambda x: x[1], reverse=True):
                lang_data.append([lang, str(count)])

            lang_table = Table(lang_data, colWidths=[200, 100])
            lang_table.setStyle(TableStyle([
                ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
                ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
                ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
                ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
                ('GRID', (0, 0), (-1, -1), 1, colors.black)
            ]))
            story.append(lang_table)
            story.append(PageBreak())

        # Architecture Assessment
        story.append(Paragraph("Architecture Assessment", heading_style))
        # Split long text into paragraphs
        arch_paragraphs = analysis.architecture_assessment.split('\n\n')
        for para in arch_paragraphs[:10]:  # Limit paragraphs
            if para.strip():
                story.append(Paragraph(escape(para.strip()), styles['Normal']))
                story.append(Spacer(1, 10))
        story.append(PageBreak())

        # Security Assessment
        story.append(Paragraph("Security Assessment", heading_style))
        sec_paragraphs = analysis.security_assessment.split('\n\n')
        for para in sec_paragraphs[:10]:  # Limit paragraphs
            if para.strip():
                story.append(Paragraph(escape(para.strip()), styles['Normal']))
                story.append(Spacer(1, 10))
        story.append(PageBreak())

        # File Analysis Summary
        story.append(Paragraph("File Analysis Summary", heading_style))

        # Summary statistics
        high_quality_files = [fa for fa in analysis.file_analyses if fa.severity_score >= 8]
        medium_quality_files = [fa for fa in analysis.file_analyses if 5 <= fa.severity_score < 8]
        low_quality_files = [fa for fa in analysis.file_analyses if fa.severity_score < 5]

        # Calculate percentages safely
        total_files = len(analysis.file_analyses) if analysis.file_analyses else 1
        quality_data = [
            ['Quality Level', 'Files', 'Percentage'],
            ['High Quality (8-10)', str(len(high_quality_files)), f"{len(high_quality_files)/total_files*100:.1f}%"],
            ['Medium Quality (5-7)', str(len(medium_quality_files)), f"{len(medium_quality_files)/total_files*100:.1f}%"],
            ['Low Quality (1-4)', str(len(low_quality_files)), f"{len(low_quality_files)/total_files*100:.1f}%"]
        ]

        quality_table = Table(quality_data)
        quality_table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
            ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('GRID', (0, 0), (-1, -1), 1, colors.black),
            ('BACKGROUND', (0, 1), (-1, 1), colors.lightgreen),
            ('BACKGROUND', (0, 2), (-1, 2), colors.lightyellow),
            ('BACKGROUND', (0, 3), (-1, 3), colors.lightcoral)
        ]))

        story.append(quality_table)
        story.append(Spacer(1, 20))

        # Files Requiring Attention
        story.append(Paragraph("Files Requiring Attention", heading_style))

        # Sort files by score (lowest first - they need the most attention)
        files_by_priority = sorted(analysis.file_analyses, key=lambda x: x.severity_score)

        for i, file_analysis in enumerate(files_by_priority[:15]):  # Top 15 files needing attention
            story.append(Paragraph(f"<b>{i+1}. {escape(file_analysis.path)}</b>", styles['Heading4']))
            story.append(Paragraph(
                f"Language: {file_analysis.language} | "
                f"Quality Score: {file_analysis.severity_score:.1f}/10 | "
                f"Lines: {file_analysis.lines_of_code}", styles['Normal']))

            # Show top issues
            if file_analysis.issues_found:
                story.append(Paragraph("Key Issues:", styles['Heading5']))
                for issue in file_analysis.issues_found[:3]:  # Top 3 issues
                    story.append(Paragraph(f"• {escape(issue)}", styles['Normal']))

            # Show top recommendations
            if file_analysis.recommendations:
                story.append(Paragraph("Recommendations:", styles['Heading5']))
                for rec in file_analysis.recommendations[:2]:  # Top 2 recommendations
                    story.append(Paragraph(f"• {escape(rec)}", styles['Normal']))

            story.append(Spacer(1, 15))

        # Build PDF
        try:
            doc.build(story)
            print(f"✅ PDF report generated successfully: {output_path}")
        except Exception as e:
            print(f"❌ Error generating PDF: {e}")

    async def analyze_repository(self, repo_path: str, max_files: int = 50) -> RepositoryAnalysis:
        """Main analysis function."""
        try:
            # Clone/access repository
            actual_repo_path = self.clone_repository(repo_path)

            # Scan files
            files_to_analyze = self.scan_repository(actual_repo_path, max_files)

            if not files_to_analyze:
                raise RuntimeError("No files found to analyze")

            # Analyze each file
            print(f"Starting analysis of {len(files_to_analyze)} files...")
            file_analyses = []

            for i, (file_path, content) in enumerate(files_to_analyze):
                print(f"Analyzing file {i+1}/{len(files_to_analyze)}: {file_path.name}")
                analysis = await self.analyze_file_comprehensive(file_path, content)
                file_analyses.append(analysis)

                # Small delay to avoid rate limiting
                await asyncio.sleep(0.2)

            # Repository-level analyses
            print("Performing repository-level analysis...")
            architecture_assessment, security_assessment = await self.analyze_repository_overview(
                actual_repo_path, file_analyses)

            # Calculate overall quality score safely
            valid_scores = [fa.severity_score for fa in file_analyses if fa.severity_score is not None]
            avg_quality = sum(valid_scores) / len(valid_scores) if valid_scores else 5.0

            # Generate statistics
            languages = dict(Counter(fa.language for fa in file_analyses))
            total_lines = sum(fa.lines_of_code for fa in file_analyses)

            # Create repository analysis
            repo_analysis = RepositoryAnalysis(
                repo_path=repo_path,
                total_files=len(file_analyses),
                total_lines=total_lines,
                languages=languages,
                architecture_assessment=architecture_assessment,
                security_assessment=security_assessment,
                code_quality_score=avg_quality,
                file_analyses=file_analyses,
                executive_summary=""
            )

            # Generate executive summary
            repo_analysis.executive_summary = await self.generate_executive_summary(repo_analysis)

            return repo_analysis

        finally:
            # Cleanup
            if self.temp_dir and os.path.exists(self.temp_dir):
                shutil.rmtree(self.temp_dir)
                self.temp_dir = None
                print("Temporary files cleaned up")

async def main():
    # Load environment variables
    load_dotenv()

    parser = argparse.ArgumentParser(description="Robust GitHub Repository AI Analysis")
    parser.add_argument("repo_path", help="Repository path (local directory or Git URL)")
    parser.add_argument("--output", "-o", default="repository_analysis.pdf",
                        help="Output PDF file path")
    parser.add_argument("--max-files", type=int, default=50,
                        help="Maximum files to analyze")
    parser.add_argument("--api-key", help="Anthropic API key (overrides .env)")

    args = parser.parse_args()

    # Get API key
    api_key = args.api_key or os.getenv('ANTHROPIC_API_KEY')
    if not api_key:
        print("❌ Error: ANTHROPIC_API_KEY not found in .env file or command line")
        print("Please create a .env file with: ANTHROPIC_API_KEY=your_key_here")
        return 1

    try:
        print("🚀 Starting Repository Analysis")
        print("=" * 60)
        print(f"Repository: {args.repo_path}")
        print(f"Max files: {args.max_files}")
        print(f"Output: {args.output}")
        print("=" * 60)

        # Initialize analyzer
        analyzer = RobustGitHubAnalyzer(api_key)

        # Perform analysis
        analysis = await analyzer.analyze_repository(args.repo_path, args.max_files)

        # Generate PDF report
        analyzer.create_pdf_report(analysis, args.output)

        # Print summary to console
        print("\n" + "=" * 60)
        print("🎯 ANALYSIS COMPLETE")
        print("=" * 60)
        print("📊 Repository Statistics:")
        print(f"   • Files Analyzed: {analysis.total_files}")
        print(f"   • Lines of Code: {analysis.total_lines:,}")
        print(f"   • Languages: {len(analysis.languages)}")
        print(f"   • Code Quality: {analysis.code_quality_score:.1f}/10")

        # Quality breakdown
        high_quality = len([fa for fa in analysis.file_analyses if fa.severity_score >= 8])
        low_quality = len([fa for fa in analysis.file_analyses if fa.severity_score < 5])

        print("\n📈 Quality Breakdown:")
        print(f"   • High Quality Files: {high_quality}")
        print(f"   • Files Needing Attention: {low_quality}")
        print(f"   • Total Issues Found: {sum(len(fa.issues_found) for fa in analysis.file_analyses)}")

        print(f"\n📄 Detailed PDF Report: {args.output}")
        print("\n✅ Analysis completed successfully!")

        return 0

    except Exception as e:
        print(f"❌ Error during analysis: {e}")
        return 1

if __name__ == "__main__":
    sys.exit(asyncio.run(main()))