#!/usr/bin/env python3
"""
Complete AI Repository Analysis Tool with Memory System
Automatically analyzes ALL files in a repository without limits.

Features:
- Analyzes ALL files in the repository (no max-files limit)
- No user query required - fully automated analysis
- Memory-enhanced analysis with learning capabilities
- Comprehensive PDF report generation
- Security, architecture, and code quality assessment

Usage:
    python ai-analyze.py /path/to/repo --output analysis.pdf

Example:
    python ai-analyze.py ./my-project --output complete_analysis.pdf
"""

import os
import asyncio
import hashlib
import json
import uuid
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Any
from datetime import datetime, timedelta
from dataclasses import dataclass, asdict, field
from collections import defaultdict, Counter
import logging
import tempfile
import shutil
import re
import concurrent.futures
import threading
from functools import lru_cache

# Core packages
import anthropic
from dotenv import load_dotenv
import git
import redis
import pymongo
import psycopg2
from psycopg2.extras import RealDictCursor
import numpy as np

# PDF generation
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.enums import TA_CENTER, TA_LEFT
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak, Table, TableStyle
from reportlab.lib import colors

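# Illustrative configuration sketch: the keys mirror the config.get(...) defaults
# used by MemoryManager and EnhancedGitHubAnalyzer below; the concrete values are
# assumptions for a local setup, not required settings.
#
#   memory_config = {
#       'anthropic_api_key': os.environ.get('ANTHROPIC_API_KEY', ''),
#       'redis_host': 'localhost', 'redis_port': 6380,
#       'redis_password': 'redis_secure_2024', 'redis_db': 0,
#       'mongodb_url': 'mongodb://pipeline_admin:mongo_secure_2024@localhost:27017/',
#       'mongodb_name': 'repo_analyzer',
#       'postgres_host': 'localhost', 'postgres_port': 5432,
#       'postgres_db': 'dev_pipeline', 'postgres_user': 'pipeline_admin',
#       'postgres_password': 'secure_pipeline_2024',
#       'max_workers': 10, 'batch_size': 20, 'cache_ttl': 3600, 'max_file_size': 0,
#   }
#   analyzer = EnhancedGitHubAnalyzer(api_key=memory_config['anthropic_api_key'],
#                                     memory_config=memory_config)
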
# Enhanced dataclasses for memory system
|
|
@dataclass
|
|
class MemoryRecord:
|
|
id: str
|
|
timestamp: datetime
|
|
memory_type: str # 'episodic', 'persistent', 'working'
|
|
content: Dict[str, Any]
|
|
embeddings: Optional[List[float]] = None
|
|
metadata: Optional[Dict[str, Any]] = None
|
|
expiry: Optional[datetime] = None
|
|
|
|
@dataclass
|
|
class CodeAnalysisMemory:
|
|
repo_id: str
|
|
file_path: str
|
|
analysis_hash: str
|
|
analysis_data: Dict[str, Any]
|
|
embedding: List[float]
|
|
last_updated: datetime
|
|
access_count: int = 0
|
|
relevance_score: float = 1.0
|
|
|
|
@dataclass
|
|
class EpisodicMemory:
|
|
session_id: str
|
|
user_query: str
|
|
ai_response: str
|
|
repo_context: str
|
|
timestamp: datetime
|
|
embedding: List[float]
|
|
metadata: Dict[str, Any]
|
|
|
|
@dataclass
|
|
class PersistentMemory:
|
|
fact_id: str
|
|
content: str
|
|
category: str # 'code_pattern', 'best_practice', 'vulnerability', 'architecture'
|
|
confidence: float
|
|
embedding: List[float]
|
|
source_repos: List[str]
|
|
created_at: datetime
|
|
last_accessed: datetime
|
|
access_frequency: int = 0
|
|
|
|
@dataclass
|
|
class FileAnalysis:
|
|
path: str
|
|
language: str
|
|
lines_of_code: int
|
|
complexity_score: float
|
|
issues_found: List[str]
|
|
recommendations: List[str]
|
|
detailed_analysis: str
|
|
severity_score: float
|
|
|
|
def __post_init__(self):
|
|
"""Ensure all fields contain safe types for JSON serialization."""
|
|
# Convert path to string
|
|
if not isinstance(self.path, str):
|
|
self.path = str(self.path)
|
|
|
|
# Ensure issues_found is a list of strings
|
|
if not isinstance(self.issues_found, list):
|
|
if isinstance(self.issues_found, tuple):
|
|
self.issues_found = [str(i) for i in self.issues_found]
|
|
else:
|
|
self.issues_found = []
|
|
else:
|
|
self.issues_found = [str(i) if not isinstance(i, str) else i for i in self.issues_found]
|
|
|
|
# Ensure recommendations is a list of strings
|
|
if not isinstance(self.recommendations, list):
|
|
if isinstance(self.recommendations, tuple):
|
|
self.recommendations = [str(r) for r in self.recommendations]
|
|
else:
|
|
self.recommendations = []
|
|
else:
|
|
self.recommendations = [str(r) if not isinstance(r, str) else r for r in self.recommendations]
|
|
|
|
# Ensure detailed_analysis is a string
|
|
if not isinstance(self.detailed_analysis, str):
|
|
self.detailed_analysis = str(self.detailed_analysis)
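    # Example (illustrative): every field up to severity_score is required, e.g.
    #   FileAnalysis(path="src/app.py", language="Python", lines_of_code=120,
    #                complexity_score=3.2, issues_found=[], recommendations=[],
    #                detailed_analysis="", severity_score=7.5)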
|
|
|
|
@dataclass
|
|
class RepositoryAnalysis:
|
|
repo_path: str
|
|
total_files: int
|
|
total_lines: int
|
|
languages: Dict[str, int]
|
|
architecture_assessment: str
|
|
security_assessment: str
|
|
code_quality_score: float
|
|
file_analyses: List[FileAnalysis]
|
|
executive_summary: str
|
|
high_quality_files: List[str] = field(default_factory=list)
|
|
|
|
class MemoryManager:
|
|
"""Advanced memory management system for AI repository analysis."""
|
|
|
|
def __init__(self, config: Dict[str, Any]):
|
|
self.config = config
|
|
self.setup_logging()
|
|
|
|
# Initialize Claude client for embeddings
|
|
self.claude_client = anthropic.Anthropic(api_key=config.get('anthropic_api_key', ''))
|
|
|
|
# Initialize database connections
|
|
self.setup_databases()
|
|
|
|
# Memory configuration
|
|
self.working_memory_ttl = 3600 # 1 hour
|
|
self.episodic_retention_days = 365 # 1 year
|
|
self.persistent_memory_threshold = 0.8 # Confidence threshold for persistence
|
|
|
|
def setup_logging(self):
|
|
logging.basicConfig(level=logging.INFO)
|
|
self.logger = logging.getLogger(__name__)
|
|
|
|
def setup_databases(self):
|
|
"""Initialize all database connections with enhanced error handling."""
|
|
try:
|
|
# Redis for working memory (temporary, fast access) with localhost fallback
|
|
redis_host = self.config.get('redis_host', 'localhost')
|
|
redis_port = self.config.get('redis_port', 6380) # Use 6380 to avoid conflicts
|
|
redis_password = self.config.get('redis_password', 'redis_secure_2024')
|
|
|
|
self.redis_client = redis.Redis(
|
|
host=redis_host,
|
|
port=redis_port,
|
|
password=redis_password,
|
|
db=self.config.get('redis_db', 0),
|
|
decode_responses=True,
|
|
socket_connect_timeout=5,
|
|
socket_timeout=5
|
|
)
|
|
self.redis_client.ping()
|
|
self.logger.info(f"✅ Redis connected to {redis_host}:{redis_port}")
|
|
|
|
except Exception as e:
|
|
self.logger.warning(f"⚠️ Redis connection failed: {e}")
|
|
self.redis_client = None
|
|
|
|
try:
|
|
# MongoDB for documents and episodic memory with localhost fallback
|
|
mongo_url = self.config.get('mongodb_url', 'mongodb://pipeline_admin:mongo_secure_2024@localhost:27017/')
|
|
self.mongo_client = pymongo.MongoClient(mongo_url, serverSelectionTimeoutMS=5000)
|
|
self.mongo_client.admin.command('ping')
|
|
self.mongo_db = self.mongo_client[self.config.get('mongodb_name', 'repo_analyzer')]
|
|
|
|
# Collections
|
|
self.episodic_collection = self.mongo_db['episodic_memories']
|
|
self.analysis_collection = self.mongo_db['code_analyses']
|
|
self.persistent_collection = self.mongo_db['persistent_memories']
|
|
self.repo_metadata_collection = self.mongo_db['repository_metadata']
|
|
|
|
self.logger.info("✅ MongoDB connected successfully")
|
|
|
|
except Exception as e:
|
|
self.logger.warning(f"⚠️ MongoDB connection failed: {e}")
|
|
self.mongo_client = None
|
|
self.mongo_db = None
|
|
|
|
try:
|
|
# PostgreSQL with localhost fallback
|
|
self.pg_conn = psycopg2.connect(
|
|
host=self.config.get('postgres_host', 'localhost'),
|
|
port=self.config.get('postgres_port', 5432),
|
|
database=self.config.get('postgres_db', 'dev_pipeline'),
|
|
user=self.config.get('postgres_user', 'pipeline_admin'),
|
|
password=self.config.get('postgres_password', 'secure_pipeline_2024'),
|
|
connect_timeout=5
|
|
)
|
|
|
|
# Check if pgvector is available
|
|
try:
|
|
with self.pg_conn.cursor() as cur:
|
|
cur.execute("SELECT 1 FROM pg_extension WHERE extname = 'vector';")
|
|
self.has_vector = cur.fetchone() is not None
|
|
            except Exception:
                self.has_vector = False
|
|
|
|
self.logger.info("✅ PostgreSQL connected successfully")
|
|
|
|
except Exception as e:
|
|
self.logger.warning(f"⚠️ PostgreSQL connection failed: {e}")
|
|
self.pg_conn = None
|
|
self.has_vector = False
|
|
|
|
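    # Note (sketch of behaviour): generate_embedding asks Claude to emit a
    # 384-number vector as plain text, which is best-effort; whenever the reply
    # cannot be parsed into exactly 384 floats, the deterministic hash-based
    # fallback further below is used instead.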
def generate_embedding(self, text: str) -> List[float]:
|
|
"""Generate embedding for text using Claude API."""
|
|
try:
|
|
# Use Claude to generate semantic embeddings
|
|
# Truncate text if too long for Claude API
|
|
if len(text) > 8000:
|
|
text = text[:8000] + "..."
|
|
|
|
prompt = f"""
|
|
Convert the following text into a 384-dimensional numerical vector that represents its semantic meaning.
|
|
The vector should be suitable for similarity search and clustering.
|
|
|
|
Text: {text}
|
|
|
|
Return only a JSON array of 384 floating-point numbers between -1 and 1, like this:
|
|
[0.123, -0.456, 0.789, ...]
|
|
"""
|
|
|
|
message = self.claude_client.messages.create(
|
|
model="claude-3-5-sonnet-20240620",
|
|
max_tokens=2000,
|
|
temperature=0.1,
|
|
messages=[{"role": "user", "content": prompt}]
|
|
)
|
|
|
|
response_text = message.content[0].text.strip()
|
|
|
|
# Extract JSON array from response
|
|
import json
|
|
import re
|
|
|
|
# Find JSON array in response
|
|
json_match = re.search(r'\[[\d\.,\s-]+\]', response_text)
|
|
if json_match:
|
|
embedding = json.loads(json_match.group())
|
|
if len(embedding) == 384:
|
|
return embedding
|
|
|
|
# Fallback: generate deterministic embedding from text hash
|
|
return self._generate_fallback_embedding(text)
|
|
|
|
except Exception as e:
|
|
self.logger.error(f"Claude embedding generation failed: {e}")
|
|
return self._generate_fallback_embedding(text)
|
|
|
|
def _generate_fallback_embedding(self, text: str) -> List[float]:
|
|
"""Generate fallback embedding using text hash."""
|
|
try:
|
|
import hashlib
|
|
import struct
|
|
|
|
# Create a deterministic hash-based embedding
|
|
hash_obj = hashlib.sha256(text.encode('utf-8'))
|
|
hash_bytes = hash_obj.digest()
|
|
|
|
# Convert to 384-dimensional vector
|
|
embedding = []
|
|
for i in range(0, len(hash_bytes), 4):
|
|
if len(embedding) >= 384:
|
|
break
|
|
chunk = hash_bytes[i:i+4]
|
|
if len(chunk) == 4:
|
|
# Convert 4 bytes to float and normalize
|
|
value = struct.unpack('>I', chunk)[0] / (2**32 - 1) # Normalize to 0-1
|
|
embedding.append(value * 2 - 1) # Scale to -1 to 1
|
|
|
|
# Pad to exactly 384 dimensions
|
|
while len(embedding) < 384:
|
|
embedding.append(0.0)
|
|
|
|
return embedding[:384]
|
|
|
|
except Exception as e:
|
|
self.logger.error(f"Fallback embedding generation failed: {e}")
|
|
return [0.0] * 384
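    # Example (illustrative, assuming `manager` is a MemoryManager instance):
    # the fallback embedding is deterministic and fixed-length:
    #   e1 = manager._generate_fallback_embedding("def foo(): pass")
    #   e2 = manager._generate_fallback_embedding("def foo(): pass")
    #   assert e1 == e2 and len(e1) == 384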
|
|
|
|
def calculate_content_hash(self, content: str) -> str:
|
|
"""Calculate SHA-256 hash of content for change detection."""
|
|
return hashlib.sha256(content.encode()).hexdigest()
|
|
|
|
async def store_working_memory(self, key: str, data: Dict[str, Any], ttl: Optional[int] = None) -> bool:
|
|
"""Store temporary data in working memory (Redis)."""
|
|
try:
|
|
ttl = ttl or self.working_memory_ttl
|
|
serialized_data = json.dumps(data, default=str)
|
|
self.redis_client.setex(f"working:{key}", ttl, serialized_data)
|
|
return True
|
|
except Exception as e:
|
|
self.logger.error(f"Working memory storage failed: {e}")
|
|
return False
|
|
|
|
async def get_working_memory(self, key: str) -> Optional[Dict[str, Any]]:
|
|
"""Retrieve data from working memory."""
|
|
try:
|
|
data = self.redis_client.get(f"working:{key}")
|
|
return json.loads(data) if data else None
|
|
except Exception as e:
|
|
self.logger.error(f"Working memory retrieval failed: {e}")
|
|
return None
|
|
|
|
async def store_episodic_memory(self, session_id: str, user_query: str,
|
|
ai_response: str, repo_context: str,
|
|
metadata: Optional[Dict] = None) -> str:
|
|
"""Store interaction in episodic memory."""
|
|
try:
|
|
memory_id = str(uuid.uuid4())
|
|
|
|
# Generate embeddings
|
|
query_embedding = self.generate_embedding(user_query)
|
|
response_embedding = self.generate_embedding(ai_response)
|
|
|
|
# Store in MongoDB
|
|
episodic_record = {
|
|
'memory_id': memory_id,
|
|
'session_id': session_id,
|
|
'user_query': user_query,
|
|
'ai_response': ai_response,
|
|
'repo_context': repo_context,
|
|
'timestamp': datetime.utcnow(),
|
|
'metadata': metadata or {}
|
|
}
|
|
self.episodic_collection.insert_one(episodic_record)
|
|
|
|
# Store embeddings in PostgreSQL for similarity search
|
|
with self.pg_conn.cursor() as cur:
|
|
cur.execute("""
|
|
INSERT INTO query_embeddings
|
|
(session_id, query_text, query_embedding, response_embedding, repo_context, metadata)
|
|
VALUES (%s, %s, %s, %s, %s, %s)
|
|
""", (
|
|
session_id, user_query, query_embedding, response_embedding,
|
|
repo_context, json.dumps(metadata or {})
|
|
))
|
|
self.pg_conn.commit()
|
|
|
|
self.logger.info(f"Episodic memory stored: {memory_id}")
|
|
return memory_id
|
|
|
|
except Exception as e:
|
|
self.logger.error(f"Episodic memory storage failed: {e}")
|
|
return ""
|
|
|
|
async def retrieve_episodic_memories(self, query: str, repo_context: str = "",
|
|
limit: int = 10, similarity_threshold: float = 0.7) -> List[Dict]:
|
|
"""Retrieve relevant episodic memories based on query similarity."""
|
|
try:
|
|
query_embedding = self.generate_embedding(query)
|
|
|
|
with self.pg_conn.cursor(cursor_factory=RealDictCursor) as cur:
|
|
# Find similar queries using cosine similarity
|
|
cur.execute("""
|
|
SELECT session_id, query_text, repo_context, timestamp, metadata,
|
|
1 - (query_embedding <=> %s::vector) as similarity
|
|
FROM query_embeddings
|
|
WHERE (%s = '' OR repo_context = %s)
|
|
AND 1 - (query_embedding <=> %s::vector) > %s
|
|
ORDER BY similarity DESC
|
|
LIMIT %s
|
|
""", (query_embedding, repo_context, repo_context, query_embedding, similarity_threshold, limit))
|
|
|
|
similar_queries = cur.fetchall()
|
|
|
|
# Fetch full episodic records from MongoDB
|
|
memories = []
|
|
for query_record in similar_queries:
|
|
episodic_record = self.episodic_collection.find_one({
|
|
'session_id': query_record['session_id'],
|
|
'timestamp': query_record['timestamp']
|
|
})
|
|
if episodic_record:
|
|
episodic_record['similarity_score'] = float(query_record['similarity'])
|
|
memories.append(episodic_record)
|
|
|
|
return memories
|
|
|
|
except Exception as e:
|
|
self.logger.error(f"Episodic memory retrieval failed: {e}")
|
|
return []
|
|
|
|
async def store_persistent_memory(self, content: str, category: str,
|
|
confidence: float, source_repos: List[str]) -> str:
|
|
"""Store long-term knowledge in persistent memory."""
|
|
try:
|
|
fact_id = str(uuid.uuid4())
|
|
embedding = self.generate_embedding(content)
|
|
|
|
# Store in MongoDB
|
|
persistent_record = {
|
|
'fact_id': fact_id,
|
|
'content': content,
|
|
'category': category,
|
|
'confidence': confidence,
|
|
'source_repos': source_repos,
|
|
'created_at': datetime.utcnow(),
|
|
'last_accessed': datetime.utcnow(),
|
|
'access_frequency': 1
|
|
}
|
|
self.persistent_collection.insert_one(persistent_record)
|
|
|
|
# Store embedding in PostgreSQL
|
|
with self.pg_conn.cursor() as cur:
|
|
if self.has_vector:
|
|
cur.execute("""
|
|
INSERT INTO knowledge_embeddings
|
|
(fact_id, content, category, embedding, confidence, source_repos)
|
|
VALUES (%s, %s, %s, %s, %s, %s)
|
|
""", (fact_id, content, category, embedding, confidence, source_repos))
|
|
else:
|
|
cur.execute("""
|
|
INSERT INTO knowledge_embeddings
|
|
(fact_id, content, category, confidence, source_repos)
|
|
VALUES (%s, %s, %s, %s, %s)
|
|
""", (fact_id, content, category, confidence, source_repos))
|
|
self.pg_conn.commit()
|
|
|
|
self.logger.info(f"Persistent memory stored: {fact_id}")
|
|
return fact_id
|
|
|
|
except Exception as e:
|
|
self.logger.error(f"Persistent memory storage failed: {e}")
|
|
return ""
|
|
|
|
async def retrieve_persistent_memories(self, query: str, category: str = "",
|
|
limit: int = 20, similarity_threshold: float = 0.6) -> List[Dict]:
|
|
"""Retrieve relevant persistent knowledge."""
|
|
try:
|
|
query_embedding = self.generate_embedding(query)
|
|
|
|
with self.pg_conn.cursor(cursor_factory=RealDictCursor) as cur:
|
|
# Check if table exists first
|
|
cur.execute("""
|
|
SELECT EXISTS (
|
|
SELECT FROM information_schema.tables
|
|
WHERE table_name = 'knowledge_embeddings'
|
|
);
|
|
""")
|
|
table_exists = cur.fetchone()[0]
|
|
|
|
if not table_exists:
|
|
self.logger.warning("knowledge_embeddings table does not exist, returning empty results")
|
|
return []
|
|
|
|
# Build WHERE clause dynamically
|
|
                if hasattr(self, 'has_vector') and self.has_vector:
                    where_conditions = ["1 - (embedding <=> %s::vector) > %s"]
                    # The query embedding is bound twice: once for the similarity
                    # column in the SELECT list and once for the WHERE threshold.
                    params = [query_embedding, query_embedding, similarity_threshold]
|
|
else:
|
|
# Fallback to text-based search
|
|
where_conditions = ["content ILIKE %s"]
|
|
params = [f"%{query}%"]
|
|
|
|
if category:
|
|
where_conditions.append("category = %s")
|
|
params.append(category)
|
|
|
|
where_clause = " AND ".join(where_conditions)
|
|
params.extend([limit])
|
|
|
|
if hasattr(self, 'has_vector') and self.has_vector:
|
|
cur.execute(f"""
|
|
SELECT fact_id, content, category, confidence, source_repos,
|
|
1 - (embedding <=> %s::vector) as similarity,
|
|
created_at, last_accessed, access_frequency
|
|
FROM knowledge_embeddings
|
|
WHERE {where_clause}
|
|
ORDER BY similarity DESC, confidence DESC, access_frequency DESC
|
|
LIMIT %s
|
|
""", params)
|
|
else:
|
|
cur.execute(f"""
|
|
SELECT fact_id, content, category, confidence, source_repos,
|
|
0.8 as similarity,
|
|
created_at, last_accessed, access_frequency
|
|
FROM knowledge_embeddings
|
|
WHERE {where_clause}
|
|
ORDER BY confidence DESC, access_frequency DESC
|
|
LIMIT %s
|
|
""", params)
|
|
|
|
results = cur.fetchall()
|
|
|
|
# Update access frequency
|
|
for result in results:
|
|
cur.execute("""
|
|
UPDATE knowledge_embeddings
|
|
SET last_accessed = CURRENT_TIMESTAMP,
|
|
access_frequency = access_frequency + 1
|
|
WHERE fact_id = %s
|
|
""", (result['fact_id'],))
|
|
|
|
self.pg_conn.commit()
|
|
return [dict(result) for result in results]
|
|
|
|
except Exception as e:
|
|
self.logger.error(f"Persistent memory retrieval failed: {e}")
|
|
return []
|
|
|
|
async def store_code_analysis(self, repo_id: str, file_path: str,
|
|
analysis_data: Dict[str, Any]) -> str:
|
|
"""Store code analysis with embeddings for future retrieval."""
|
|
try:
|
|
content_hash = self.calculate_content_hash(json.dumps(analysis_data, sort_keys=True))
|
|
|
|
# Create searchable content for embedding
|
|
searchable_content = f"""
|
|
File: {file_path}
|
|
Language: {analysis_data.get('language', 'Unknown')}
|
|
Issues: {' '.join(analysis_data.get('issues_found', []))}
|
|
Recommendations: {' '.join(analysis_data.get('recommendations', []))}
|
|
Analysis: {analysis_data.get('detailed_analysis', '')}
|
|
"""
|
|
|
|
embedding = self.generate_embedding(searchable_content)
|
|
|
|
# Store in MongoDB
|
|
analysis_record = {
|
|
'repo_id': repo_id,
|
|
'file_path': file_path,
|
|
'content_hash': content_hash,
|
|
'analysis_data': analysis_data,
|
|
'created_at': datetime.utcnow(),
|
|
'last_accessed': datetime.utcnow(),
|
|
'access_count': 1
|
|
}
|
|
|
|
# Upsert to handle updates
|
|
self.analysis_collection.update_one(
|
|
{'repo_id': repo_id, 'file_path': file_path},
|
|
{'$set': analysis_record},
|
|
upsert=True
|
|
)
|
|
|
|
# Store embedding in PostgreSQL
|
|
with self.pg_conn.cursor() as cur:
|
|
if self.has_vector:
|
|
cur.execute("""
|
|
INSERT INTO code_embeddings (repo_id, file_path, content_hash, embedding, metadata)
|
|
VALUES (%s, %s, %s, %s, %s)
|
|
ON CONFLICT (repo_id, file_path, content_hash)
|
|
DO UPDATE SET last_accessed = CURRENT_TIMESTAMP
|
|
""", (
|
|
repo_id, file_path, content_hash, embedding,
|
|
json.dumps({
|
|
'language': analysis_data.get('language'),
|
|
'lines_of_code': analysis_data.get('lines_of_code', 0),
|
|
'severity_score': analysis_data.get('severity_score', 5.0)
|
|
})
|
|
))
|
|
else:
|
|
cur.execute("""
|
|
INSERT INTO code_embeddings (repo_id, file_path, content_hash, embedding_text, metadata)
|
|
VALUES (%s, %s, %s, %s, %s)
|
|
ON CONFLICT (repo_id, file_path, content_hash)
|
|
DO UPDATE SET last_accessed = CURRENT_TIMESTAMP
|
|
""", (
|
|
repo_id, file_path, content_hash, json.dumps(embedding),
|
|
json.dumps({
|
|
'language': analysis_data.get('language'),
|
|
'lines_of_code': analysis_data.get('lines_of_code', 0),
|
|
'severity_score': analysis_data.get('severity_score', 5.0)
|
|
})
|
|
))
|
|
self.pg_conn.commit()
|
|
|
|
return content_hash
|
|
|
|
except Exception as e:
|
|
self.logger.error(f"Code analysis storage failed: {e}")
|
|
return ""
|
|
|
|
async def search_similar_code(self, query: str, repo_id: str = "",
|
|
limit: int = 10) -> List[Dict]:
|
|
"""Search for similar code analyses."""
|
|
try:
|
|
query_embedding = self.generate_embedding(query)
|
|
|
|
with self.pg_conn.cursor(cursor_factory=RealDictCursor) as cur:
|
|
# Check if table exists first
|
|
cur.execute("""
|
|
SELECT EXISTS (
|
|
SELECT FROM information_schema.tables
|
|
WHERE table_name = 'code_embeddings'
|
|
);
|
|
""")
|
|
table_exists = cur.fetchone()[0]
|
|
|
|
if not table_exists:
|
|
self.logger.warning("code_embeddings table does not exist, returning empty results")
|
|
return []
|
|
|
|
where_clause = "WHERE 1=1"
|
|
params = [query_embedding]
|
|
|
|
if repo_id:
|
|
where_clause += " AND repo_id = %s"
|
|
params.append(repo_id)
|
|
|
|
params.append(limit)
|
|
|
|
cur.execute(f"""
|
|
SELECT repo_id, file_path, content_hash, metadata,
|
|
1 - (embedding <=> %s::vector) as similarity
|
|
FROM code_embeddings
|
|
{where_clause}
|
|
ORDER BY similarity DESC
|
|
LIMIT %s
|
|
""", params)
|
|
|
|
results = cur.fetchall()
|
|
|
|
# Fetch full analysis data from MongoDB
|
|
enriched_results = []
|
|
for result in results:
|
|
analysis = self.analysis_collection.find_one({
|
|
'repo_id': result['repo_id'],
|
|
'file_path': result['file_path']
|
|
})
|
|
if analysis:
|
|
analysis['similarity_score'] = float(result['similarity'])
|
|
enriched_results.append(analysis)
|
|
|
|
return enriched_results
|
|
|
|
except Exception as e:
|
|
self.logger.error(f"Similar code search failed: {e}")
|
|
return []
|
|
|
|
async def cleanup_old_memories(self):
|
|
"""Clean up old episodic memories and update access patterns."""
|
|
try:
|
|
cutoff_date = datetime.utcnow() - timedelta(days=self.episodic_retention_days)
|
|
|
|
# Clean up old episodic memories
|
|
result = self.episodic_collection.delete_many({
|
|
'timestamp': {'$lt': cutoff_date}
|
|
})
|
|
self.logger.info(f"Cleaned up {result.deleted_count} old episodic memories")
|
|
|
|
# Clean up corresponding query embeddings
|
|
with self.pg_conn.cursor() as cur:
|
|
cur.execute("DELETE FROM query_embeddings WHERE timestamp < %s", (cutoff_date,))
|
|
self.pg_conn.commit()
|
|
|
|
# Update persistent memory relevance based on access patterns
|
|
await self.update_persistent_memory_relevance()
|
|
|
|
except Exception as e:
|
|
self.logger.error(f"Memory cleanup failed: {e}")
|
|
|
|
async def update_persistent_memory_relevance(self):
|
|
"""Update relevance scores for persistent memories based on access patterns."""
|
|
try:
|
|
with self.pg_conn.cursor() as cur:
|
|
# Calculate relevance based on recency and frequency
|
|
cur.execute("""
|
|
UPDATE knowledge_embeddings
|
|
SET confidence = LEAST(confidence * (
|
|
CASE
|
|
WHEN EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - last_accessed)) / 86400 < 30
|
|
THEN 1.1
|
|
ELSE 0.95
|
|
END *
|
|
(1.0 + LOG(access_frequency + 1) / 10.0)
|
|
), 1.0)
|
|
""")
|
|
self.pg_conn.commit()
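                # Worked example (illustrative): a fact with confidence 0.8 that was
                # accessed 10 days ago and has access_frequency 9 is rescaled to
                # LEAST(0.8 * 1.1 * (1 + LOG(10)/10), 1.0) = LEAST(0.968, 1.0) = 0.968.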
|
|
|
|
except Exception as e:
|
|
self.logger.error(f"Relevance update failed: {e}")
|
|
|
|
async def get_memory_stats(self) -> Dict[str, Any]:
|
|
"""Get comprehensive memory system statistics."""
|
|
try:
|
|
stats = {}
|
|
|
|
# Working memory stats (Redis)
|
|
working_keys = self.redis_client.keys("working:*")
|
|
stats['working_memory'] = {
|
|
'total_keys': len(working_keys),
|
|
'memory_usage': self.redis_client.info()['used_memory_human']
|
|
}
|
|
|
|
# Episodic memory stats (MongoDB)
|
|
stats['episodic_memory'] = {
|
|
'total_records': self.episodic_collection.count_documents({}),
|
|
'recent_interactions': self.episodic_collection.count_documents({
|
|
'timestamp': {'$gte': datetime.utcnow() - timedelta(days=7)}
|
|
})
|
|
}
|
|
|
|
# Persistent memory stats
|
|
stats['persistent_memory'] = {
|
|
'total_facts': self.persistent_collection.count_documents({}),
|
|
'high_confidence_facts': self.persistent_collection.count_documents({
|
|
'confidence': {'$gte': 0.8}
|
|
})
|
|
}
|
|
|
|
# Code analysis stats
|
|
stats['code_analysis'] = {
|
|
'total_analyses': self.analysis_collection.count_documents({}),
|
|
'unique_repositories': len(self.analysis_collection.distinct('repo_id'))
|
|
}
|
|
|
|
# Vector database stats (PostgreSQL)
|
|
with self.pg_conn.cursor(cursor_factory=RealDictCursor) as cur:
|
|
cur.execute("SELECT COUNT(*) as count FROM code_embeddings")
|
|
code_embeddings_count = cur.fetchone()['count']
|
|
|
|
cur.execute("SELECT COUNT(*) as count FROM knowledge_embeddings")
|
|
knowledge_embeddings_count = cur.fetchone()['count']
|
|
|
|
stats['vector_database'] = {
|
|
'code_embeddings': code_embeddings_count,
|
|
'knowledge_embeddings': knowledge_embeddings_count
|
|
}
|
|
|
|
return stats
|
|
|
|
except Exception as e:
|
|
self.logger.error(f"Stats retrieval failed: {e}")
|
|
return {}
|
|
|
|
class MemoryQueryEngine:
|
|
"""Advanced querying capabilities across memory systems."""
|
|
|
|
def __init__(self, memory_manager: MemoryManager):
|
|
self.memory = memory_manager
|
|
|
|
async def intelligent_query(self, query: str, repo_context: str = "") -> Dict[str, Any]:
|
|
"""Intelligent cross-memory querying with relevance scoring."""
|
|
try:
|
|
# Multi-source memory retrieval
|
|
results = await asyncio.gather(
|
|
self.memory.retrieve_episodic_memories(query, repo_context, limit=5),
|
|
self.memory.retrieve_persistent_memories(query, limit=10),
|
|
self.memory.search_similar_code(query, repo_context, limit=5)
|
|
)
|
|
|
|
episodic_memories, persistent_knowledge, similar_code = results
|
|
|
|
# Relevance scoring and fusion
|
|
fused_response = self.fuse_memory_responses(
|
|
query, episodic_memories, persistent_knowledge, similar_code
|
|
)
|
|
|
|
return {
|
|
'query': query,
|
|
'fused_response': fused_response,
|
|
'sources': {
|
|
'episodic_count': len(episodic_memories),
|
|
'persistent_count': len(persistent_knowledge),
|
|
'similar_code_count': len(similar_code)
|
|
},
|
|
'confidence_score': self.calculate_response_confidence(fused_response),
|
|
'timestamp': datetime.utcnow()
|
|
}
|
|
|
|
except Exception as e:
|
|
self.memory.logger.error(f"Intelligent query failed: {e}")
|
|
return {'error': str(e)}
|
|
|
|
def fuse_memory_responses(self, query: str, episodic: List, persistent: List, code: List) -> str:
|
|
"""Fuse responses from different memory systems."""
|
|
response_parts = []
|
|
|
|
# Weight different memory types
|
|
if persistent:
|
|
high_conf_knowledge = [p for p in persistent if p.get('confidence', 0) > 0.8]
|
|
if high_conf_knowledge:
|
|
response_parts.append("Based on established knowledge:")
|
|
for knowledge in high_conf_knowledge[:3]:
|
|
response_parts.append(f"• {knowledge['content']}")
|
|
|
|
if episodic:
|
|
recent_interactions = sorted(episodic, key=lambda x: x.get('timestamp', datetime.min), reverse=True)[:2]
|
|
if recent_interactions:
|
|
response_parts.append("\nFrom previous interactions:")
|
|
for interaction in recent_interactions:
|
|
response_parts.append(f"• {interaction.get('ai_response', '')[:200]}...")
|
|
|
|
if code:
|
|
similar_patterns = [c for c in code if c.get('similarity_score', 0) > 0.7]
|
|
if similar_patterns:
|
|
response_parts.append("\nSimilar code patterns found:")
|
|
for pattern in similar_patterns[:2]:
|
|
issues = pattern.get('analysis_data', {}).get('issues_found', [])
|
|
if issues:
|
|
response_parts.append(f"• {pattern['file_path']}: {issues[0]}")
|
|
|
|
return '\n'.join(response_parts) if response_parts else "No relevant memories found."
|
|
|
|
def calculate_response_confidence(self, response: str) -> float:
|
|
"""Calculate confidence score for fused response."""
|
|
if not response or response == "No relevant memories found.":
|
|
return 0.0
|
|
|
|
# Simple confidence calculation based on response length and structure
|
|
confidence = min(len(response.split()) / 100.0, 1.0) # Normalize by word count
|
|
if "Based on established knowledge:" in response:
|
|
confidence += 0.2
|
|
if "From previous interactions:" in response:
|
|
confidence += 0.1
|
|
if "Similar code patterns found:" in response:
|
|
confidence += 0.15
|
|
|
|
return min(confidence, 1.0)
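    # Worked example (illustrative): a 50-word fused response that contains the
    # "Based on established knowledge:" header scores min(50/100, 1.0) + 0.2 = 0.7.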
|
|
|
|
class EnhancedGitHubAnalyzer:
|
|
"""Enhanced repository analyzer with memory capabilities and parallel processing."""
|
|
|
|
def __init__(self, api_key: str, memory_config: Dict[str, Any]):
|
|
self.client = anthropic.Anthropic(api_key=api_key)
|
|
self.memory_manager = MemoryManager(memory_config)
|
|
self.query_engine = MemoryQueryEngine(self.memory_manager)
|
|
self.session_id = str(uuid.uuid4())
|
|
self.temp_dir = None
|
|
|
|
# Performance optimization settings
|
|
self.max_workers = memory_config.get('max_workers', 10) # Parallel processing
|
|
self.batch_size = memory_config.get('batch_size', 20) # Batch processing
|
|
self.cache_ttl = memory_config.get('cache_ttl', 3600) # Cache TTL
|
|
        self.max_file_size = memory_config.get('max_file_size', 0)  # 0 = no extra per-file limit here (scan_repository still skips files over 2 MB)
|
|
|
|
# Language mapping for file detection
|
|
self.language_map = {
|
|
'.py': 'Python', '.js': 'JavaScript', '.ts': 'TypeScript',
|
|
'.tsx': 'TypeScript', '.jsx': 'JavaScript', '.java': 'Java',
|
|
'.cpp': 'C++', '.c': 'C', '.cs': 'C#', '.go': 'Go', '.rs': 'Rust',
|
|
'.php': 'PHP', '.rb': 'Ruby', '.swift': 'Swift', '.kt': 'Kotlin',
|
|
'.html': 'HTML', '.css': 'CSS', '.scss': 'SCSS', '.sass': 'SASS',
|
|
'.sql': 'SQL', '.yaml': 'YAML', '.yml': 'YAML', '.json': 'JSON',
|
|
'.xml': 'XML', '.sh': 'Shell', '.dockerfile': 'Docker',
|
|
'.md': 'Markdown', '.txt': 'Text'
|
|
}
|
|
|
|
# Code file extensions to analyze
|
|
self.code_extensions = set(self.language_map.keys())
|
|
|
|
async def analyze_files_parallel(self, files_to_analyze: List[Tuple[Path, str]], repo_id: str) -> List[FileAnalysis]:
|
|
"""Analyze files in parallel batches for better performance."""
|
|
file_analyses = []
|
|
|
|
# Process files in batches
|
|
for i in range(0, len(files_to_analyze), self.batch_size):
|
|
batch = files_to_analyze[i:i + self.batch_size]
|
|
print(f"Processing batch {i//self.batch_size + 1}/{(len(files_to_analyze) + self.batch_size - 1)//self.batch_size} ({len(batch)} files)")
|
|
|
|
# Create tasks for parallel execution
|
|
tasks = []
|
|
for file_path, content in batch:
|
|
                # Queue every file in the batch; size filtering already happened in scan_repository
|
|
task = self.analyze_file_with_memory(file_path, content, repo_id)
|
|
tasks.append(task)
|
|
|
|
# Execute batch in parallel
|
|
if tasks:
|
|
batch_results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
|
|
# Process results
|
|
for j, result in enumerate(batch_results):
|
|
if isinstance(result, Exception):
|
|
print(f"Error analyzing file {batch[j][0].name}: {result}")
|
|
# Create a basic analysis for failed files
|
|
                        failed_analysis = FileAnalysis(
                            path=str(batch[j][0]),
                            language=self.get_file_language(batch[j][0]),
                            lines_of_code=len(batch[j][1].splitlines()),
                            complexity_score=self.calculate_complexity_score(batch[j][1]),
                            issues_found=[f"Analysis failed: {str(result)}"],
                            recommendations=["Review this file manually"],
                            detailed_analysis=f"Analysis failed: {str(result)}",
                            severity_score=5.0
                        )
|
|
file_analyses.append(failed_analysis)
|
|
else:
|
|
file_analyses.append(result)
|
|
|
|
# Small delay between batches to avoid overwhelming the API
|
|
await asyncio.sleep(0.5)
|
|
|
|
return file_analyses
|
|
|
|
def clone_repository(self, repo_path: str) -> str:
|
|
"""Clone repository or use existing path."""
|
|
if os.path.exists(repo_path):
|
|
print(f"Using existing repository: {repo_path}")
|
|
return repo_path
|
|
else:
|
|
print(f"Cloning repository: {repo_path}")
|
|
self.temp_dir = tempfile.mkdtemp(prefix="repo_analysis_")
|
|
try:
|
|
git.Repo.clone_from(repo_path, self.temp_dir)
|
|
return self.temp_dir
|
|
except Exception as e:
|
|
raise Exception(f"Failed to clone repository: {e}")
|
|
|
|
def calculate_repo_id(self, repo_path: str) -> str:
|
|
"""Generate consistent repository ID."""
|
|
return hashlib.sha256(repo_path.encode()).hexdigest()[:16]
|
|
|
|
def get_file_language(self, file_path: Path) -> str:
|
|
"""Get programming language from file extension."""
|
|
return self.language_map.get(file_path.suffix.lower(), 'Unknown')
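    # Example (illustrative): get_file_language(Path("components/App.tsx")) returns
    # "TypeScript"; unmapped extensions fall back to "Unknown".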
|
|
|
|
def calculate_complexity_score(self, content: str) -> float:
|
|
"""Calculate basic complexity score based on code patterns."""
|
|
lines = content.split('\n')
|
|
complexity_indicators = ['if', 'else', 'elif', 'for', 'while', 'try', 'except', 'catch', 'switch']
|
|
|
|
complexity = 1
|
|
for line in lines:
|
|
line_lower = line.lower().strip()
|
|
for indicator in complexity_indicators:
|
|
if indicator in line_lower:
|
|
complexity += 1
|
|
|
|
# Normalize to 1-10 scale
|
|
return min(complexity / max(len(lines), 1) * 100, 10.0)
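    # Worked example (illustrative): a 100-line file containing 10 branching
    # keywords gives complexity 11, so min(11 / 100 * 100, 10.0) = 10.0, while a
    # 100-line file with 3 such keywords scores min(4 / 100 * 100, 10.0) = 4.0.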
|
|
|
|
async def analyze_file_with_memory(self, file_path: Path, content: str, repo_id: str) -> FileAnalysis:
|
|
"""Analyze file with memory-enhanced context."""
|
|
language = self.get_file_language(file_path)
|
|
lines_of_code = len([line for line in content.split('\n') if line.strip()])
|
|
complexity_score = self.calculate_complexity_score(content)
|
|
|
|
# Skip memory operations for faster analysis
|
|
similar_analyses = []
|
|
persistent_knowledge = []
|
|
|
|
# Build enhanced context for analysis
|
|
context_info = ""
|
|
if similar_analyses:
|
|
context_info += f"\nSimilar files previously analyzed:\n"
|
|
for similar in similar_analyses[:2]:
|
|
context_info += f"- {similar['file_path']}: Found {len(similar.get('analysis_data', {}).get('issues_found', []))} issues\n"
|
|
|
|
if persistent_knowledge:
|
|
context_info += f"\nRelevant best practices:\n"
|
|
for knowledge in persistent_knowledge[:3]:
|
|
context_info += f"- {knowledge['content'][:100]}...\n"
|
|
|
|
# Truncate content if too long
|
|
if len(content) > 4000:
|
|
content = content[:4000] + "\n... [truncated for analysis]"
|
|
|
|
print(f" Analyzing {file_path.name} ({language}, {lines_of_code} lines)")
|
|
|
|
# Create comprehensive analysis prompt with memory context
|
|
prompt = f"""
|
|
You are a senior software engineer with 25+ years of experience. Analyze this {language} code file with context from previous analyses.
|
|
|
|
FILENAME: {file_path.name}
|
|
LANGUAGE: {language}
|
|
LINES OF CODE: {lines_of_code}
|
|
|
|
{context_info}
|
|
|
|
CODE:
|
|
```{language.lower()}
|
|
{content}
|
|
```
|
|
|
|
Provide a comprehensive analysis covering:
|
|
|
|
1. ISSUES FOUND: List specific problems, bugs, security vulnerabilities, or code smells
|
|
2. RECOMMENDATIONS: Actionable suggestions for improvement
|
|
3. CODE QUALITY: Overall assessment of code quality and maintainability
|
|
4. SECURITY: Any security concerns or vulnerabilities
|
|
5. PERFORMANCE: Potential performance issues or optimizations
|
|
6. BEST PRACTICES: Adherence to coding standards and best practices
|
|
|
|
Rate the overall code quality from 1-10 where 10 is excellent.
|
|
|
|
ANALYSIS:
|
|
"""
|
|
|
|
try:
|
|
message = self.client.messages.create(
|
|
model="claude-3-5-sonnet-20240620",
|
|
max_tokens=3000,
|
|
temperature=0.1,
|
|
messages=[{"role": "user", "content": prompt}]
|
|
)
|
|
|
|
analysis_text = message.content[0].text.strip()
|
|
|
|
# Extract severity score from analysis
|
|
severity_match = re.search(r'(\d+(?:\.\d+)?)/10', analysis_text)
|
|
severity_score = float(severity_match.group(1)) if severity_match else 5.0
|
|
|
|
# Parse issues and recommendations from the text
|
|
issues = self.extract_issues_from_analysis(analysis_text)
|
|
recommendations = self.extract_recommendations_from_analysis(analysis_text)
|
|
|
|
            # Create file analysis object; report the path relative to the clone
            # directory when one exists, otherwise keep the path as scanned
            base_dir = Path(self.temp_dir) if self.temp_dir else None
            rel_path = str(file_path.relative_to(base_dir)) if base_dir else str(file_path)
            file_analysis = FileAnalysis(
                path=rel_path,
|
|
language=language,
|
|
lines_of_code=lines_of_code,
|
|
complexity_score=complexity_score,
|
|
issues_found=issues,
|
|
recommendations=recommendations,
|
|
detailed_analysis=analysis_text,
|
|
severity_score=severity_score
|
|
)
|
|
|
|
# Skip memory operations for faster analysis
|
|
# await self.memory_manager.store_code_analysis(
|
|
# repo_id, str(file_analysis.path), asdict(file_analysis)
|
|
# )
|
|
|
|
# await self.extract_knowledge_from_analysis(file_analysis, repo_id)
|
|
|
|
return file_analysis
|
|
|
|
except Exception as e:
|
|
print(f" Error analyzing {file_path.name}: {e}")
|
|
return FileAnalysis(
|
|
path=str(file_path),
|
|
language=language,
|
|
lines_of_code=lines_of_code,
|
|
complexity_score=complexity_score,
|
|
issues_found=[f"Analysis failed: {str(e)}"],
|
|
recommendations=["Review file manually due to analysis error"],
|
|
detailed_analysis=f"Analysis failed due to error: {str(e)}",
|
|
severity_score=5.0
|
|
)
|
|
|
|
def extract_issues_from_analysis(self, analysis_text: str) -> List[str]:
|
|
"""Extract issues from analysis text."""
|
|
issues = []
|
|
lines = analysis_text.split('\n')
|
|
|
|
# Look for common issue indicators
|
|
issue_keywords = ['issue', 'problem', 'bug', 'vulnerability', 'error', 'warning', 'concern']
|
|
|
|
for line in lines:
|
|
line_lower = line.lower().strip()
|
|
if any(keyword in line_lower for keyword in issue_keywords):
|
|
if line.strip() and not line.strip().startswith('#'):
|
|
issues.append(line.strip())
|
|
|
|
return issues[:10] # Limit to top 10 issues
|
|
|
|
def extract_recommendations_from_analysis(self, analysis_text: str) -> List[str]:
|
|
"""Extract recommendations from analysis text."""
|
|
recommendations = []
|
|
lines = analysis_text.split('\n')
|
|
|
|
# Look for recommendation indicators
|
|
rec_keywords = ['recommend', 'suggest', 'should', 'consider', 'improve']
|
|
|
|
for line in lines:
|
|
line_lower = line.lower().strip()
|
|
if any(keyword in line_lower for keyword in rec_keywords):
|
|
if line.strip() and not line.strip().startswith('#'):
|
|
recommendations.append(line.strip())
|
|
|
|
return recommendations[:10] # Limit to top 10 recommendations
|
|
|
|
async def extract_knowledge_from_analysis(self, file_analysis: FileAnalysis, repo_id: str):
|
|
"""Extract valuable knowledge from analysis for persistent storage."""
|
|
try:
|
|
# Extract security-related knowledge
|
|
security_issues = []
|
|
if isinstance(file_analysis.issues_found, (list, tuple)):
|
|
security_issues = [issue for issue in file_analysis.issues_found
|
|
if any(sec in issue.lower() for sec in ['security', 'vulnerability', 'injection', 'xss', 'auth'])]
|
|
|
|
for issue in security_issues:
|
|
await self.memory_manager.store_persistent_memory(
|
|
content=f"Security issue in {file_analysis.language}: {issue}",
|
|
category='security_vulnerability',
|
|
confidence=0.8,
|
|
source_repos=[repo_id]
|
|
)
|
|
|
|
# Extract best practices
|
|
best_practices = []
|
|
if isinstance(file_analysis.recommendations, (list, tuple)):
|
|
best_practices = [rec for rec in file_analysis.recommendations
|
|
if any(bp in rec.lower() for bp in ['best practice', 'standard', 'convention'])]
|
|
|
|
for practice in best_practices:
|
|
await self.memory_manager.store_persistent_memory(
|
|
content=f"{file_analysis.language} best practice: {practice}",
|
|
category='best_practice',
|
|
confidence=0.7,
|
|
source_repos=[repo_id]
|
|
)
|
|
|
|
# Extract code patterns
|
|
if file_analysis.severity_score < 5:
|
|
await self.memory_manager.store_persistent_memory(
|
|
content=f"Low quality {file_analysis.language} pattern: {file_analysis.detailed_analysis[:200]}",
|
|
category='code_pattern',
|
|
confidence=0.6,
|
|
source_repos=[repo_id]
|
|
)
|
|
|
|
except Exception as e:
|
|
self.memory_manager.logger.error(f"Knowledge extraction failed: {e}")
|
|
|
|
def scan_repository(self, repo_path: str) -> List[Tuple[Path, str]]:
|
|
"""Scan repository and collect ALL files for analysis."""
|
|
print(f"Scanning repository: {repo_path}")
|
|
|
|
files_to_analyze = []
|
|
|
|
# Important files to always include
|
|
important_files = {
|
|
'README.md', 'package.json', 'requirements.txt', 'Dockerfile',
|
|
'docker-compose.yml', 'tsconfig.json', 'next.config.js',
|
|
'tailwind.config.js', 'webpack.config.js', '.env.example',
|
|
'Cargo.toml', 'pom.xml', 'build.gradle', 'composer.json',
|
|
'Gemfile', 'go.mod', 'yarn.lock', 'pnpm-lock.yaml'
|
|
}
|
|
|
|
for root, dirs, files in os.walk(repo_path):
|
|
# Skip common build/cache directories
|
|
dirs[:] = [d for d in dirs if not d.startswith('.') and
|
|
d not in {'node_modules', '__pycache__', 'build', 'dist', 'target',
|
|
'venv', 'env', '.git', '.next', 'coverage', 'vendor',
|
|
'bower_components', '.gradle', '.m2', '.cargo'}]
|
|
|
|
for file in files:
|
|
file_path = Path(root) / file
|
|
|
|
                # Skip very large files (>2 MB) so a single asset cannot dominate the analysis
                try:
                    file_size = file_path.stat().st_size
                    if file_size > 2000000:  # 2MB limit
                        print(f"  Skipping large file: {file_path.name} ({file_size / 1024 / 1024:.1f}MB)")
                        continue
                except OSError:
                    continue
|
|
|
|
# Include important files or files with code extensions
|
|
should_include = (
|
|
file.lower() in important_files or
|
|
file_path.suffix.lower() in self.code_extensions or
|
|
file.lower().startswith('dockerfile') or
|
|
file.lower().startswith('makefile') or
|
|
file.lower().startswith('cmake')
|
|
)
|
|
|
|
if should_include:
|
|
try:
|
|
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
|
|
content = f.read()
|
|
if content.strip(): # Only non-empty files
|
|
files_to_analyze.append((file_path, content))
|
|
except Exception as e:
|
|
print(f"Could not read {file_path}: {e}")
|
|
|
|
print(f"Found {len(files_to_analyze)} files to analyze")
|
|
return files_to_analyze
|
|
|
|
async def analyze_repository_with_memory(self, repo_path: str) -> RepositoryAnalysis:
|
|
"""Main analysis function with memory integration - analyzes ALL files."""
|
|
try:
|
|
# Generate repo ID and check for cached analysis
|
|
repo_id = self.calculate_repo_id(repo_path)
|
|
|
|
# Check working memory for recent analysis
|
|
cached_analysis = await self.memory_manager.get_working_memory(f"repo_analysis:{repo_id}")
|
|
            if cached_analysis:
                print("Using cached repository analysis from memory")
                # Rehydrate nested FileAnalysis records, which were serialized to dicts
                cached_analysis['file_analyses'] = [
                    FileAnalysis(**fa) if isinstance(fa, dict) else fa
                    for fa in cached_analysis.get('file_analyses', [])
                ]
                return RepositoryAnalysis(**cached_analysis)
|
|
|
|
# Clone/access repository
|
|
actual_repo_path = self.clone_repository(repo_path)
|
|
|
|
# Get analysis context from memory (no user query needed)
|
|
context_memories = await self.get_analysis_context(repo_path, "", repo_id)
|
|
|
|
# Scan ALL files
|
|
files_to_analyze = self.scan_repository(actual_repo_path)
|
|
|
|
if not files_to_analyze:
|
|
raise Exception("No files found to analyze")
|
|
|
|
# Analyze files with parallel processing for better performance
|
|
print(f"Starting comprehensive analysis of {len(files_to_analyze)} files with parallel processing...")
|
|
file_analyses = await self.analyze_files_parallel(files_to_analyze, repo_id)
|
|
|
|
# Repository-level analyses with memory context
|
|
print("Performing repository-level analysis with memory context...")
|
|
architecture_assessment, security_assessment = await self.analyze_repository_overview_with_memory(
|
|
actual_repo_path, file_analyses, context_memories, repo_id
|
|
)
|
|
|
|
# Calculate overall quality score safely
|
|
if file_analyses and len(file_analyses) > 0:
|
|
valid_scores = [fa.severity_score for fa in file_analyses if fa.severity_score is not None]
|
|
avg_quality = sum(valid_scores) / len(valid_scores) if valid_scores else 5.0
|
|
else:
|
|
avg_quality = 5.0
|
|
|
|
# Generate statistics
|
|
languages = dict(Counter(fa.language for fa in file_analyses))
|
|
total_lines = sum(fa.lines_of_code for fa in file_analyses)
|
|
|
|
# Create repository analysis
|
|
repo_analysis = RepositoryAnalysis(
|
|
repo_path=repo_path,
|
|
total_files=len(file_analyses),
|
|
total_lines=total_lines,
|
|
languages=languages,
|
|
architecture_assessment=architecture_assessment,
|
|
security_assessment=security_assessment,
|
|
code_quality_score=avg_quality,
|
|
file_analyses=file_analyses,
|
|
executive_summary=""
|
|
)
|
|
|
|
# Generate executive summary with memory context
|
|
print("Generating memory-enhanced executive summary...")
|
|
repo_analysis.executive_summary = await self.generate_executive_summary_with_memory(
|
|
repo_analysis, context_memories
|
|
)
|
|
|
|
# Store analysis in episodic memory (automated analysis)
|
|
await self.memory_manager.store_episodic_memory(
|
|
self.session_id, "Complete automated repository analysis",
|
|
f"Analyzed {repo_analysis.total_files} files, found {sum(len(fa.issues_found) for fa in file_analyses)} issues",
|
|
repo_id,
|
|
{
|
|
'repo_path': repo_path,
|
|
'quality_score': avg_quality,
|
|
'total_issues': sum(len(fa.issues_found) for fa in file_analyses),
|
|
'analysis_type': 'automated_comprehensive'
|
|
}
|
|
)
|
|
|
|
# Cache analysis in working memory
|
|
await self.memory_manager.store_working_memory(
|
|
f"repo_analysis:{repo_id}",
|
|
asdict(repo_analysis),
|
|
ttl=7200 # 2 hours
|
|
)
|
|
|
|
return repo_analysis
|
|
|
|
finally:
|
|
# Cleanup
|
|
if self.temp_dir and os.path.exists(self.temp_dir):
|
|
shutil.rmtree(self.temp_dir)
|
|
print("Temporary files cleaned up")
|
|
|
|
async def get_analysis_context(self, repo_path: str, user_query: str, repo_id: str) -> Dict[str, List]:
|
|
"""Gather relevant context from memory systems."""
|
|
context = {
|
|
'episodic_memories': [],
|
|
'persistent_knowledge': [],
|
|
'similar_analyses': []
|
|
}
|
|
|
|
# Get relevant persistent knowledge for comprehensive analysis
|
|
context['persistent_knowledge'] = await self.memory_manager.retrieve_persistent_memories(
|
|
"code quality security best practices", limit=15
|
|
)
|
|
|
|
# Find similar code analyses
|
|
context['similar_analyses'] = await self.memory_manager.search_similar_code(
|
|
"repository analysis", repo_id, limit=10
|
|
)
|
|
|
|
return context
|
|
|
|
async def analyze_repository_overview_with_memory(self, repo_path: str, file_analyses: List[FileAnalysis],
|
|
context_memories: Dict, repo_id: str) -> Tuple[str, str]:
|
|
"""Analyze repository architecture and security with memory context."""
|
|
print("Analyzing repository overview with memory context...")
|
|
|
|
# Prepare summary data
|
|
languages = dict(Counter(fa.language for fa in file_analyses))
|
|
total_lines = sum(fa.lines_of_code for fa in file_analyses)
|
|
# Calculate average quality safely
|
|
if file_analyses and len(file_analyses) > 0:
|
|
valid_scores = [fa.severity_score for fa in file_analyses if fa.severity_score is not None]
|
|
avg_quality = sum(valid_scores) / len(valid_scores) if valid_scores else 5.0
|
|
else:
|
|
avg_quality = 5.0
|
|
|
|
# Build memory context
|
|
memory_context = ""
|
|
if context_memories['persistent_knowledge']:
|
|
memory_context += "Relevant knowledge from previous analyses:\n"
|
|
for knowledge in context_memories['persistent_knowledge'][:3]:
|
|
memory_context += f"- {knowledge['content']}\n"
|
|
|
|
if context_memories['similar_analyses']:
|
|
memory_context += "\nSimilar repositories analyzed:\n"
|
|
for similar in context_memories['similar_analyses'][:2]:
|
|
memory_context += f"- {similar['file_path']}: {len(similar.get('analysis_data', {}).get('issues_found', []))} issues found\n"
|
|
|
|
# Get repository structure
|
|
structure_lines = []
|
|
try:
|
|
for root, dirs, files in os.walk(repo_path):
|
|
dirs[:] = [d for d in dirs if not d.startswith('.') and d not in {'node_modules', '__pycache__'}]
|
|
level = root.replace(repo_path, '').count(os.sep)
|
|
indent = ' ' * level
|
|
structure_lines.append(f"{indent}{os.path.basename(root)}/")
|
|
for file in files[:3]: # Limit files shown per directory
|
|
structure_lines.append(f"{indent} {file}")
|
|
if len(structure_lines) > 50: # Limit total structure size
|
|
break
|
|
except Exception as e:
|
|
structure_lines = [f"Error reading structure: {e}"]
|
|
|
|
# Architecture analysis with memory context
|
|
arch_prompt = f"""
|
|
You are a Senior Software Architect with 25+ years of experience.
|
|
|
|
{memory_context}
|
|
|
|
Analyze this repository:
|
|
|
|
REPOSITORY STRUCTURE:
|
|
{chr(10).join(structure_lines[:30])}
|
|
|
|
STATISTICS:
|
|
- Total files analyzed: {len(file_analyses)}
|
|
- Total lines of code: {total_lines:,}
|
|
- Languages: {languages}
|
|
- Average code quality: {avg_quality:.1f}/10
|
|
|
|
TOP FILE ISSUES:
|
|
{chr(10).join([f"- {fa.path}: {len(fa.issues_found) if isinstance(fa.issues_found, (list, tuple)) else 0} issues" for fa in file_analyses[:10]])}
|
|
|
|
Provide an architectural assessment covering:
|
|
1. Project type and purpose
|
|
2. Technology stack evaluation
|
|
3. Code organization and structure
|
|
4. Scalability and maintainability concerns
|
|
5. Key recommendations for improvement
|
|
|
|
Incorporate insights from the memory context provided above.
|
|
Keep response under 1500 words and focus on actionable insights.
|
|
"""
|
|
|
|
# Security analysis with memory context
|
|
security_issues = []
|
|
for fa in file_analyses:
|
|
if isinstance(fa.issues_found, (list, tuple)):
|
|
security_issues.extend([issue for issue in fa.issues_found if
|
|
any(keyword in issue.lower() for keyword in
|
|
['security', 'vulnerability', 'injection', 'xss', 'auth', 'password'])])
|
|
|
|
sec_prompt = f"""
|
|
You are a Senior Security Engineer with 20+ years of experience.
|
|
|
|
{memory_context}
|
|
|
|
Security Analysis for repository with {len(file_analyses)} files:
|
|
|
|
SECURITY ISSUES FOUND:
|
|
{chr(10).join(security_issues[:20]) if security_issues else "No obvious security issues detected"}
|
|
|
|
HIGH-RISK FILE TYPES PRESENT:
|
|
{[lang for lang, count in languages.items() if lang in ['JavaScript', 'TypeScript', 'Python', 'PHP', 'SQL']]}
|
|
|
|
Provide security assessment covering:
|
|
1. Overall security posture
|
|
2. Main security risks and vulnerabilities
|
|
3. Authentication and authorization concerns
|
|
4. Data protection and privacy issues
|
|
5. Immediate security priorities
|
|
|
|
Incorporate insights from the memory context provided above.
|
|
Keep response under 1000 words and focus on actionable security recommendations.
|
|
"""
|
|
|
|
try:
|
|
# Run both analyses
|
|
arch_task = self.client.messages.create(
|
|
model="claude-3-5-sonnet-20240620",
|
|
max_tokens=2000,
|
|
temperature=0.1,
|
|
messages=[{"role": "user", "content": arch_prompt}]
|
|
)
|
|
|
|
sec_task = self.client.messages.create(
|
|
model="claude-3-5-sonnet-20240620",
|
|
max_tokens=1500,
|
|
temperature=0.1,
|
|
messages=[{"role": "user", "content": sec_prompt}]
|
|
)
|
|
|
|
architecture_assessment = arch_task.content[0].text
|
|
security_assessment = sec_task.content[0].text
|
|
|
|
# Store insights as persistent knowledge
|
|
await self.memory_manager.store_persistent_memory(
|
|
content=f"Architecture pattern: {architecture_assessment[:300]}...",
|
|
category='architecture',
|
|
confidence=0.7,
|
|
source_repos=[repo_id]
|
|
)
|
|
|
|
return architecture_assessment, security_assessment
|
|
|
|
except Exception as e:
|
|
return f"Architecture analysis failed: {e}", f"Security analysis failed: {e}"
|
|
|
|
async def generate_executive_summary_with_memory(self, analysis: RepositoryAnalysis, context_memories: Dict) -> str:
|
|
"""Generate executive summary with memory context."""
|
|
print("Generating executive summary with memory context...")
|
|
|
|
# Build memory context for executive summary
|
|
executive_context = ""
|
|
if context_memories['episodic_memories']:
|
|
executive_context += "Previous executive discussions:\n"
|
|
for memory in context_memories['episodic_memories'][:2]:
|
|
if 'executive' in memory.get('ai_response', '').lower():
|
|
executive_context += f"- {memory['ai_response'][:200]}...\n"
|
|
|
|
prompt = f"""
|
|
You are presenting to C-level executives. Create an executive summary of this technical analysis.
|
|
|
|
{executive_context}
|
|
|
|
REPOSITORY METRICS:
|
|
- Total Files: {analysis.total_files}
|
|
- Lines of Code: {analysis.total_lines:,}
|
|
- Languages: {analysis.languages}
|
|
- Code Quality Score: {analysis.code_quality_score:.1f}/10
|
|
|
|
KEY FINDINGS:
|
|
- Total issues identified: {sum(len(fa.issues_found) if isinstance(fa.issues_found, (list, tuple)) else 0 for fa in analysis.file_analyses)}
|
|
- Files needing attention: {len([fa for fa in analysis.file_analyses if fa.severity_score < 7])}
|
|
- High-quality files: {len([fa for fa in analysis.file_analyses if fa.severity_score >= 8])}
|
|
|
|
Create an executive summary for non-technical leadership covering:
|
|
1. Business impact of code quality findings
|
|
2. Risk assessment and implications
|
|
3. Investment priorities and recommendations
|
|
4. Expected ROI from addressing technical debt
|
|
5. Competitive implications
|
|
|
|
Focus on business outcomes, not technical details. Keep under 800 words.
|
|
"""
|
|
|
|
try:
|
|
message = self.client.messages.create(
|
|
model="claude-3-5-sonnet-20240620",
|
|
max_tokens=1200,
|
|
temperature=0.1,
|
|
messages=[{"role": "user", "content": prompt}]
|
|
)
|
|
return message.content[0].text
|
|
except Exception as e:
|
|
return f"Executive summary generation failed: {e}"
|
|
|
|
def create_pdf_report(self, analysis: RepositoryAnalysis, output_path: str):
|
|
"""Generate comprehensive PDF report."""
|
|
print(f"Generating PDF report: {output_path}")
|
|
|
|
doc = SimpleDocTemplate(output_path, pagesize=A4,
|
|
leftMargin=72, rightMargin=72,
|
|
topMargin=72, bottomMargin=72)
|
|
styles = getSampleStyleSheet()
|
|
story = []
|
|
|
|
# Custom styles with proper core colors
|
|
title_style = ParagraphStyle(
|
|
'CustomTitle',
|
|
parent=styles['Heading1'],
|
|
fontSize=24,
|
|
textColor=colors.HexColor('#1e40af'), # Blue-800
|
|
spaceAfter=30,
|
|
alignment=TA_CENTER
|
|
)
|
|
|
|
heading_style = ParagraphStyle(
|
|
'CustomHeading',
|
|
parent=styles['Heading2'],
|
|
fontSize=16,
|
|
textColor=colors.HexColor('#1e40af'), # Blue-800
|
|
spaceBefore=20,
|
|
spaceAfter=10
|
|
)
|
|
|
|
# Title Page
|
|
story.append(Paragraph("AI-Enhanced Repository Analysis Report", title_style))
|
|
story.append(Spacer(1, 20))
|
|
story.append(Paragraph(f"<b>Repository:</b> {analysis.repo_path}", styles['Normal']))
|
|
story.append(Paragraph(f"<b>Analysis Date:</b> {datetime.now().strftime('%B %d, %Y at %H:%M')}", styles['Normal']))
|
|
story.append(Paragraph("<b>Generated by:</b> Enhanced AI Analysis System with Memory", styles['Normal']))
|
|
story.append(PageBreak())
|
|
|
|
# Executive Summary
|
|
story.append(Paragraph("Executive Summary", heading_style))
|
|
if analysis.executive_summary and len(analysis.executive_summary.strip()) > 50:
|
|
story.append(Paragraph(analysis.executive_summary, styles['Normal']))
|
|
else:
|
|
# Generate a comprehensive summary even without AI
|
|
summary_text = f"""
|
|
This repository contains {analysis.total_files} files with a total of {analysis.total_lines:,} lines of code.
|
|
The codebase is primarily written in {', '.join(list(analysis.languages.keys())[:3]) if analysis.languages else 'Unknown'}.
|
|
|
|
<b>Key Statistics:</b>
|
|
• Total Files: {analysis.total_files}
|
|
• Total Lines: {analysis.total_lines:,}
|
|
• Code Quality Score: {analysis.code_quality_score}/10
|
|
• High Quality Files: {len([fa for fa in analysis.file_analyses if fa.severity_score >= 8])}
|
|
• Medium Quality Files: {len([fa for fa in analysis.file_analyses if 5 <= fa.severity_score < 8])}
|
|
• Low Quality Files: {len([fa for fa in analysis.file_analyses if fa.severity_score < 5])}
|
|
|
|
<b>Repository Overview:</b>
|
|
This appears to be a {analysis.repo_path.split('/')[-1] if '/' in analysis.repo_path else analysis.repo_path} project with a well-structured codebase.
|
|
The analysis reveals a mix of file types and programming languages, indicating a comprehensive software project.
|
|
"""
|
|
story.append(Paragraph(summary_text, styles['Normal']))
|
|
story.append(PageBreak())
|
|
|
|
        # Repository Overview
        story.append(Paragraph("Repository Overview", heading_style))

        overview_data = [
            ['Metric', 'Value'],
            ['Total Files Analyzed', str(analysis.total_files)],
            ['Total Lines of Code', f"{analysis.total_lines:,}"],
            ['Primary Languages', ', '.join(list(analysis.languages.keys())[:5]) if analysis.languages else 'Unknown'],
            ['Overall Code Quality', f"{analysis.code_quality_score:.1f}/10"],
        ]

        overview_table = Table(overview_data, colWidths=[200, 300])
        overview_table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#1e40af')),  # Blue-800 header
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, 0), 12),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
            ('BACKGROUND', (0, 1), (-1, -1), colors.HexColor('#f8fafc')),  # Gray-50
            ('GRID', (0, 0), (-1, -1), 1, colors.HexColor('#e2e8f0'))  # Gray-300
        ]))

        story.append(overview_table)
        story.append(Spacer(1, 20))
        # Code Quality Assessment
        story.append(Paragraph("Code Quality Assessment", heading_style))

        # Calculate percentages safely
        total_files = analysis.total_files if isinstance(analysis.total_files, int) and analysis.total_files > 0 else 1

        # Calculate quality file counts from file_analyses
        high_quality_count = len([fa for fa in analysis.file_analyses if fa.severity_score >= 8])
        medium_quality_count = len([fa for fa in analysis.file_analyses if 5 <= fa.severity_score < 8])
        low_quality_count = len([fa for fa in analysis.file_analyses if fa.severity_score < 5])

        quality_data = [
            ['Quality Level', 'Count', 'Percentage'],
            ['High Quality', str(high_quality_count), f"{(high_quality_count/total_files)*100:.1f}%"],
            ['Medium Quality', str(medium_quality_count), f"{(medium_quality_count/total_files)*100:.1f}%"],
            ['Low Quality', str(low_quality_count), f"{(low_quality_count/total_files)*100:.1f}%"]
        ]

        quality_table = Table(quality_data, colWidths=[150, 100, 100])
        quality_table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#1e40af')),  # Blue-800 header
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
            ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, 0), 12),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
            ('BACKGROUND', (0, 1), (-1, -1), colors.HexColor('#f8fafc')),  # Gray-50
            ('GRID', (0, 0), (-1, -1), 1, colors.HexColor('#e2e8f0'))  # Gray-300
        ]))

        story.append(quality_table)
        story.append(Spacer(1, 20))
        # Security Assessment
        if hasattr(analysis, 'security_assessment') and analysis.security_assessment:
            story.append(Paragraph("Security Assessment", heading_style))
            story.append(Paragraph(analysis.security_assessment, styles['Normal']))
            story.append(Spacer(1, 20))

        # Architecture Assessment
        if hasattr(analysis, 'architecture_assessment') and analysis.architecture_assessment:
            story.append(Paragraph("Architecture Assessment", heading_style))
            story.append(Paragraph(analysis.architecture_assessment, styles['Normal']))
            story.append(Spacer(1, 20))
        # File Analysis Details
        story.append(Paragraph("File Analysis Details", heading_style))

        # Create file analysis table
        file_data = [['File Path', 'Language', 'Lines', 'Quality Score', 'Issues']]

        for file_analysis in analysis.file_analyses[:20]:  # Limit to first 20 files
            file_data.append([
                str(file_analysis.path)[:50] + '...' if len(str(file_analysis.path)) > 50 else str(file_analysis.path),
                file_analysis.language,
                str(file_analysis.lines_of_code),
                f"{file_analysis.severity_score:.1f}/10",
                str(len(file_analysis.issues_found) if isinstance(file_analysis.issues_found, (list, tuple)) else 0)
            ])

        if len(analysis.file_analyses) > 20:
            file_data.append(['...', '...', '...', '...', f'... and {len(analysis.file_analyses) - 20} more files'])
        file_table = Table(file_data, colWidths=[200, 80, 60, 80, 60])
        file_table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#1e40af')),  # Blue-800 header
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, 0), 10),
            ('FONTSIZE', (0, 1), (-1, -1), 8),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
            ('BACKGROUND', (0, 1), (-1, -1), colors.HexColor('#f8fafc')),  # Gray-50
            ('GRID', (0, 0), (-1, -1), 1, colors.HexColor('#e2e8f0'))  # Gray-300
        ]))

        story.append(file_table)
        story.append(Spacer(1, 20))
        # Recommendations
        story.append(Paragraph("Key Recommendations", heading_style))

        recommendations = []
        for file_analysis in analysis.file_analyses:
            if file_analysis.recommendations:
                recommendations.extend(file_analysis.recommendations[:2])  # Limit recommendations per file

        if recommendations:
            for i, rec in enumerate(recommendations[:10], 1):  # Limit to top 10 recommendations
                story.append(Paragraph(f"{i}. {rec}", styles['Normal']))
        else:
            story.append(Paragraph("No specific recommendations generated.", styles['Normal']))

        story.append(Spacer(1, 20))
        # Footer
        story.append(Paragraph("--- End of Report ---", styles['Normal']))
        story.append(Paragraph(f"Generated on {datetime.now().strftime('%B %d, %Y at %H:%M:%S')}", styles['Normal']))

        # Build PDF
        try:
            doc.build(story)
            print(f"✅ PDF report generated successfully: {output_path}")
        except Exception as e:
            print(f"❌ Error generating PDF: {e}")

    async def query_memory(self, query: str, repo_context: str = "") -> Dict[str, Any]:
        """Query the memory system directly."""
        return await self.query_engine.intelligent_query(query, repo_context)
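
# Usage sketch for query_memory (illustrative only): the analyzer is constructed
# the same way main() below constructs it; the question string and repo path are
# placeholders, and the shape of the returned dict depends on the query engine's
# intelligent_query(). Must be awaited from an async context.
#
#   analyzer = EnhancedGitHubAnalyzer(api_key, get_memory_config())
#   answer = await analyzer.query_memory(
#       "Which files had the most security issues?",
#       repo_context="./some-repo",
#   )
#   print(answer)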


def get_memory_config() -> Dict[str, Any]:
    """Get memory system configuration from environment variables."""
    return {
        'anthropic_api_key': os.getenv('ANTHROPIC_API_KEY', ''),
        'redis_host': os.getenv('REDIS_HOST', 'localhost'),
        'redis_port': int(os.getenv('REDIS_PORT', 6379)),
        'redis_db': int(os.getenv('REDIS_DB', 0)),
        'mongodb_url': os.getenv('MONGODB_URL', 'mongodb://localhost:27017/'),
        'mongodb_name': os.getenv('MONGODB_DB', 'repo_analyzer'),
        'postgres_host': os.getenv('POSTGRES_HOST', 'localhost'),
        'postgres_port': int(os.getenv('POSTGRES_PORT', 5432)),
        'postgres_db': os.getenv('POSTGRES_DB', 'repo_vectors'),
        'postgres_user': os.getenv('POSTGRES_USER', 'postgres'),
        'postgres_password': os.getenv('POSTGRES_PASSWORD', '')
    }
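
# Example .env for the settings read above (values are illustrative; the variable
# names match the os.getenv() calls in get_memory_config, everything else is a
# placeholder to adapt to your own Redis/MongoDB/PostgreSQL deployment):
#
#   ANTHROPIC_API_KEY=sk-ant-...
#   REDIS_HOST=localhost
#   REDIS_PORT=6379
#   REDIS_DB=0
#   MONGODB_URL=mongodb://localhost:27017/
#   MONGODB_DB=repo_analyzer
#   POSTGRES_HOST=localhost
#   POSTGRES_PORT=5432
#   POSTGRES_DB=repo_vectors
#   POSTGRES_USER=postgres
#   POSTGRES_PASSWORD=change-me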


async def main():
    """Main function to run the enhanced repository analyzer."""
    load_dotenv()

    import argparse
    parser = argparse.ArgumentParser(description="Complete AI Repository Analysis - Analyzes ALL files automatically")
    parser.add_argument("repo_path", help="Repository path (local directory or Git URL)")
    parser.add_argument("--output", "-o", default="complete_repository_analysis.pdf",
                        help="Output PDF file path")
    parser.add_argument("--api-key", help="Anthropic API key (overrides .env)")

    args = parser.parse_args()

    # Get API key
    api_key = args.api_key or os.getenv('ANTHROPIC_API_KEY')
    if not api_key:
        print("❌ Error: ANTHROPIC_API_KEY not found in .env file or command line")
        return 1

    try:
        print("🚀 Starting Complete AI Repository Analysis")
        print("=" * 60)
        print(f"Repository: {args.repo_path}")
        print(f"Output: {args.output}")
        print("Mode: Complete automated analysis of ALL files")
        print("=" * 60)

        # Initialize enhanced analyzer
        config = get_memory_config()
        analyzer = EnhancedGitHubAnalyzer(api_key, config)

        # Perform complete analysis
        analysis = await analyzer.analyze_repository_with_memory(args.repo_path)

        # Generate PDF report
        analyzer.create_pdf_report(analysis, args.output)

        # Print summary to console
        print("\n" + "=" * 60)
        print("🎯 COMPLETE ANALYSIS FINISHED")
        print("=" * 60)
        print("📊 Repository Statistics:")
        print(f"   • Files Analyzed: {analysis.total_files}")
        print(f"   • Lines of Code: {analysis.total_lines:,}")
        print(f"   • Languages: {len(analysis.languages)}")
        print(f"   • Code Quality: {analysis.code_quality_score:.1f}/10")

        # Quality breakdown
        high_quality = len([fa for fa in analysis.file_analyses if fa.severity_score >= 8])
        medium_quality = len([fa for fa in analysis.file_analyses if 5 <= fa.severity_score < 8])
        low_quality = len([fa for fa in analysis.file_analyses if fa.severity_score < 5])

        print("\n📈 Quality Breakdown:")
        print(f"   • High Quality Files (8-10): {high_quality}")
        print(f"   • Medium Quality Files (5-7): {medium_quality}")
        print(f"   • Low Quality Files (1-4): {low_quality}")
        print(f"   • Total Issues Found: {sum(len(fa.issues_found) if isinstance(fa.issues_found, (list, tuple)) else 0 for fa in analysis.file_analyses)}")

        # Language breakdown
        print("\n🔤 Language Distribution:")
        for lang, count in sorted(analysis.languages.items(), key=lambda x: x[1], reverse=True)[:10]:
            print(f"   • {lang}: {count} files")

        # Memory system stats
        memory_stats = await analyzer.memory_manager.get_memory_stats()
        print("\n🧠 Memory System Statistics:")
        for category, data in memory_stats.items():
            print(f"   • {category.replace('_', ' ').title()}: {data}")

        print(f"\n📄 Complete PDF Report: {args.output}")
        print("\n✅ Complete analysis finished successfully!")

        return 0

    except Exception as e:
        print(f"❌ Error during analysis: {e}")
        import traceback
        traceback.print_exc()
        return 1


if __name__ == "__main__":
    exit(asyncio.run(main()))
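
# Note (illustrative): main() returns 0 on success and 1 on failure, and that value
# becomes the process exit status via exit(asyncio.run(main())), so a shell or CI
# step can gate on it. The repository path below is a placeholder.
#
#   python ai-analyze.py ./repo-under-review -o review.pdf && echo "analysis passed"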