#!/usr/bin/env python3
"""
GitHub Repository AI Analysis Tool
Analyzes GitHub repositories using Claude API for comprehensive code insights.
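
Usage:
    python app.py <repo_url> --api-key <anthropic_api_key> [--output report.md] [--max-files 50]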
"""
import os
import sys
import git
import json
import requests
import tempfile
import shutil
from pathlib import Path
from typing import Dict, Optional
import argparse
from datetime import datetime
import mimetypes
import base64


class GitHubRepoAnalyzer:
    def __init__(self, anthropic_api_key: str):
        self.api_key = anthropic_api_key
        self.api_url = "https://api.anthropic.com/v1/messages"
        self.temp_dir = None
        # File extensions to analyze
        self.code_extensions = {
            '.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.cpp', '.c', '.h',
            '.cs', '.php', '.rb', '.go', '.rs', '.swift', '.kt', '.scala',
            '.html', '.css', '.scss', '.sass', '.less', '.vue', '.svelte',
            '.sql', '.sh', '.bash', '.yml', '.yaml', '.json', '.xml',
            '.dockerfile', '.md', '.rst', '.txt'
        }
        # Files to always include in analysis (kept lowercase, since
        # scan_repository matches against file.lower())
        self.important_files = {
            'readme.md', 'readme.txt', 'package.json', 'requirements.txt',
            'cargo.toml', 'pom.xml', 'build.gradle', 'makefile', 'dockerfile',
            'docker-compose.yml', '.gitignore', 'setup.py', 'pyproject.toml'
        }

    def clone_repository(self, repo_url: str) -> str:
        """Clone GitHub repository to temporary directory."""
        print(f"Cloning repository: {repo_url}")
        self.temp_dir = tempfile.mkdtemp(prefix="github_analysis_")
        try:
            git.Repo.clone_from(repo_url, self.temp_dir)
            print(f"Repository cloned to: {self.temp_dir}")
            return self.temp_dir
        except git.exc.GitCommandError as e:
            raise Exception(f"Failed to clone repository: {e}")

    def get_file_info(self, file_path: Path) -> Dict:
        """Get file information and content."""
        try:
            # Check file size (skip files larger than 1MB)
            if file_path.stat().st_size > 1024 * 1024:
                return {
                    'path': str(file_path.relative_to(self.temp_dir)),
                    'size': file_path.stat().st_size,
                    'content': '[File too large to analyze]',
                    'encoding': 'skipped'
                }
            # Try to read as strict UTF-8 text; with errors='ignore' the
            # UnicodeDecodeError fallback below could never trigger
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                encoding = 'utf-8'
            except UnicodeDecodeError:
                # If text fails, try binary for certain file types
                with open(file_path, 'rb') as f:
                    raw_content = f.read()
                if len(raw_content) < 10000:  # Only encode small binary files
                    content = base64.b64encode(raw_content).decode('ascii')
                    encoding = 'base64'
                else:
                    content = '[Binary file - content not included]'
                    encoding = 'binary'
            return {
                'path': str(file_path.relative_to(self.temp_dir)),
                'size': file_path.stat().st_size,
                'content': content,
                'encoding': encoding,
                'mime_type': mimetypes.guess_type(str(file_path))[0]
            }
        except Exception as e:
            return {
                'path': str(file_path.relative_to(self.temp_dir)),
                'error': str(e),
                'content': '[Error reading file]'
            }
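
    # Illustrative return value for a small UTF-8 text file (values made up):
    #   {'path': 'src/app.py', 'size': 1234, 'content': '<file text>',
    #    'encoding': 'utf-8', 'mime_type': 'text/x-python'}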

    def scan_repository(self, max_files: int = 50) -> Dict:
        """Scan repository and collect file information."""
        print("Scanning repository structure...")
        repo_data = {
            'structure': [],
            'files': [],
            'stats': {
                'total_files': 0,
                'analyzed_files': 0,
                'total_size': 0,
                'languages': {}
            }
        }
        # Get directory structure
        for root, dirs, files in os.walk(self.temp_dir):
            # Skip hidden directories and common build/cache directories
            dirs[:] = [d for d in dirs if not d.startswith('.') and
                       d not in {'node_modules', '__pycache__', 'build', 'dist', 'target', 'venv', 'env'}]
            level = root.replace(self.temp_dir, '').count(os.sep)
            indent = ' ' * level
            folder_name = os.path.basename(root) if root != self.temp_dir else '.'
            repo_data['structure'].append(f"{indent}{folder_name}/")
            # Process files
            for file in files:
                if file.startswith('.'):
                    continue
                file_path = Path(root) / file
                repo_data['stats']['total_files'] += 1
                repo_data['stats']['total_size'] += file_path.stat().st_size
                # Track languages
                ext = file_path.suffix.lower()
                if ext:
                    repo_data['stats']['languages'][ext] = repo_data['stats']['languages'].get(ext, 0) + 1
                # Add to structure
                repo_data['structure'].append(f"{indent} {file}")
                # Analyze only important files or recognized code files,
                # capped at max_files overall
                should_analyze = (
                    file.lower() in self.important_files or
                    ext in self.code_extensions
                )
                if should_analyze and repo_data['stats']['analyzed_files'] < max_files:
                    file_info = self.get_file_info(file_path)
                    repo_data['files'].append(file_info)
                    repo_data['stats']['analyzed_files'] += 1
        return repo_data
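
    # Illustrative shape of the returned repo_data (counts and names made up):
    #   {'structure': ['./', ' README.md', 'src/', '  app.py'],
    #    'files': [<dicts from get_file_info>],
    #    'stats': {'total_files': 120, 'analyzed_files': 50,
    #              'total_size': 734003, 'languages': {'.py': 40, '.md': 3}}}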

    def call_claude_api(self, prompt: str, max_tokens: int = 4000) -> str:
        """Make API call to Claude."""
        headers = {
            "Content-Type": "application/json",
            "x-api-key": self.api_key,
            "anthropic-version": "2023-06-01"
        }
        data = {
            "model": os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest"),
            "max_tokens": max_tokens,
            "messages": [
                {"role": "user", "content": prompt}
            ]
        }
        try:
            # Generous timeout so a stalled request cannot hang the client forever
            response = requests.post(self.api_url, headers=headers, json=data, timeout=300)
            response.raise_for_status()
            result = response.json()
            return result['content'][0]['text']
        except requests.exceptions.RequestException as e:
            raise Exception(f"API request failed: {e}")

    def analyze_repository_overview(self, repo_data: Dict) -> str:
        """Get high-level repository analysis."""
        print("Analyzing repository overview...")
        structure_summary = "\n".join(repo_data['structure'][:100])  # Limit structure size
        prompt = f"""
Analyze this GitHub repository and provide a comprehensive overview:

REPOSITORY STRUCTURE:
{structure_summary}

STATISTICS:
- Total files: {repo_data['stats']['total_files']}
- Files analyzed: {repo_data['stats']['analyzed_files']}
- Total size: {repo_data['stats']['total_size']} bytes
- Languages found: {dict(list(repo_data['stats']['languages'].items())[:10])}

Please provide:
1. **Project Type & Purpose**: What kind of project is this?
2. **Technology Stack**: What technologies, frameworks, and languages are used?
3. **Architecture Overview**: How is the project structured?
4. **Key Components**: What are the main modules/components?
5. **Development Setup**: What's needed to run this project?
6. **Code Quality Assessment**: Initial observations about code organization
"""
        return self.call_claude_api(prompt)

    def analyze_code_files(self, repo_data: Dict) -> str:
        """Analyze individual code files."""
        print("Analyzing code files...")
        # Prepare file contents for analysis
        files_content = []
        for file_info in repo_data['files'][:20]:  # Limit to first 20 files
            if file_info.get('encoding') == 'utf-8' and len(file_info.get('content', '')) < 5000:
                files_content.append(f"=== {file_info['path']} ===\n{file_info['content']}\n")
        files_text = "\n".join(files_content)
        prompt = f"""
Analyze these key files from the repository:

{files_text}

Please provide detailed analysis covering:
1. **Code Quality**: Code style, organization, and best practices
2. **Design Patterns**: What patterns and architectural approaches are used?
3. **Dependencies & Libraries**: Key external dependencies identified
4. **Potential Issues**: Any code smells, security concerns, or improvements needed
5. **Testing Strategy**: How is testing implemented (if at all)?
6. **Documentation**: Quality of inline documentation and comments
7. **Maintainability**: How maintainable and extensible is this code?
"""
        return self.call_claude_api(prompt, max_tokens=6000)

    def analyze_security_and_best_practices(self, repo_data: Dict) -> str:
        """Analyze security and best practices."""
        print("Analyzing security and best practices...")
        # Look for security-sensitive files
        security_files = []
        for file_info in repo_data['files']:
            path_lower = file_info['path'].lower()
            if any(term in path_lower for term in ['config', 'env', 'secret', 'key', 'auth', 'security']):
                if file_info.get('encoding') == 'utf-8':
                    security_files.append(f"=== {file_info['path']} ===\n{file_info['content'][:2000]}\n")
        security_content = "\n".join(security_files[:10])
        prompt = f"""
Analyze this repository for security and best practices:

SECURITY-RELEVANT FILES:
{security_content}

FILE STRUCTURE ANALYSIS:
{json.dumps(repo_data['stats'], indent=2)}

Please analyze:
1. **Security Issues**: Potential security vulnerabilities or concerns
2. **Secret Management**: How are secrets/credentials handled?
3. **Dependencies**: Are there any vulnerable dependencies?
4. **Best Practices**: Adherence to language/framework best practices
5. **Configuration**: Are configurations properly externalized?
6. **Error Handling**: How are errors handled throughout the codebase?
7. **Recommendations**: Specific suggestions for improvement
"""
        return self.call_claude_api(prompt, max_tokens=5000)

    def generate_comprehensive_report(self, repo_url: str, overview: str, code_analysis: str, security_analysis: str) -> str:
        """Generate final comprehensive report."""
        print("Generating comprehensive report...")
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        report = f"""
# GitHub Repository Analysis Report

**Repository:** {repo_url}
**Analysis Date:** {timestamp}
**Analyzed by:** Claude AI Assistant

---

## Executive Summary

{overview}

---

## Detailed Code Analysis

{code_analysis}

---

## Security & Best Practices Analysis

{security_analysis}

---

## Recommendations Summary

Based on the analysis, here are the key recommendations for this repository:

1. **Immediate Actions**: Critical issues that should be addressed promptly
2. **Code Quality Improvements**: Suggestions for better maintainability
3. **Security Enhancements**: Steps to improve security posture
4. **Documentation**: Areas where documentation could be enhanced
5. **Architecture**: Potential architectural improvements

---

*This analysis was generated using AI and should be reviewed by human developers for accuracy and context.*
"""
        return report

    def analyze(self, repo_url: str, output_file: Optional[str] = None, max_files: int = 50) -> str:
        """Main analysis function."""
        try:
            # Clone repository
            self.clone_repository(repo_url)
            # Scan repository structure and files
            repo_data = self.scan_repository(max_files=max_files)
            # Perform different types of analysis
            overview = self.analyze_repository_overview(repo_data)
            code_analysis = self.analyze_code_files(repo_data)
            security_analysis = self.analyze_security_and_best_practices(repo_data)
            # Generate comprehensive report
            final_report = self.generate_comprehensive_report(
                repo_url, overview, code_analysis, security_analysis
            )
            # Save report if output file specified
            if output_file:
                with open(output_file, 'w', encoding='utf-8') as f:
                    f.write(final_report)
                print(f"Report saved to: {output_file}")
            return final_report
        finally:
            # Cleanup temporary directory
            if self.temp_dir and os.path.exists(self.temp_dir):
                shutil.rmtree(self.temp_dir)
                print("Temporary files cleaned up")


def main():
    parser = argparse.ArgumentParser(description="Analyze GitHub repository using Claude AI")
    parser.add_argument("repo_url", help="GitHub repository URL")
    parser.add_argument("--api-key", required=True, help="Anthropic API key")
    parser.add_argument("--output", "-o", help="Output file path (optional)")
    parser.add_argument("--max-files", type=int, default=50, help="Maximum files to analyze")
    args = parser.parse_args()
    # Initialize analyzer
    analyzer = GitHubRepoAnalyzer(args.api_key)
    try:
        print("Starting GitHub repository analysis...")
        print("=" * 50)
        # Perform analysis, honoring --max-files
        report = analyzer.analyze(args.repo_url, args.output, max_files=args.max_files)
        # Print report if no output file specified
        if not args.output:
            print("\n" + "=" * 50)
            print("ANALYSIS REPORT")
            print("=" * 50)
            print(report)
        print("\nAnalysis completed successfully!")
    except Exception as e:
        print(f"Error during analysis: {e}")
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())
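
# Example invocation (public example repository; requires a valid Anthropic API key):
#   python app.py https://github.com/octocat/Hello-World \
#       --api-key "$ANTHROPIC_API_KEY" --output report.md --max-files 40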