diff --git a/docker-compose.yml b/docker-compose.yml
index 9c05177..f291a94 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -4,7 +4,7 @@ services:
   # =====================================
   postgres:
-    image: postgres:15
+    image: pgvector/pgvector:pg15
     container_name: pipeline_postgres
     environment:
       POSTGRES_USER: pipeline_admin
@@ -31,7 +31,7 @@ services:
     volumes:
       - redis_data:/data
     ports:
-      - "6379:6379"
+      - "6380:6379"
     networks:
       - pipeline_network
     healthcheck:
@@ -714,6 +714,55 @@ services:
       timeout: 10s
       retries: 3
       start_period: 40s
+
+  # =====================================
+  # AI Analysis Service
+  # =====================================
+
+  ai-analysis-service:
+    build: ./services/ai-analysis-service
+    container_name: pipeline_ai_analysis_service
+    ports:
+      - "8022:8022"
+    environment:
+      - PORT=8022
+      - HOST=0.0.0.0
+      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
+      - POSTGRES_HOST=postgres
+      - POSTGRES_PORT=5432
+      - POSTGRES_DB=dev_pipeline
+      - POSTGRES_USER=pipeline_admin
+      - POSTGRES_PASSWORD=secure_pipeline_2024
+      - REDIS_HOST=redis
+      - REDIS_PORT=6379
+      - REDIS_PASSWORD=redis_secure_2024
+      - MONGODB_URL=mongodb://pipeline_admin:mongo_secure_2024@mongodb:27017/
+      - MONGODB_DB=repo_analyzer
+      - JWT_ACCESS_SECRET=access-secret-key-2024-tech4biz-secure_pipeline_2024
+      - USER_AUTH_SERVICE_URL=http://user-auth:8011
+      - PYTHONUNBUFFERED=1
+    volumes:
+      - ai_analysis_logs:/app/logs
+      - ai_analysis_reports:/app/reports
+      - ai_analysis_temp:/app/temp
+    networks:
+      - pipeline_network
+    depends_on:
+      postgres:
+        condition: service_healthy
+      redis:
+        condition: service_healthy
+      mongodb:
+        condition: service_started
+      migrations:
+        condition: service_completed_successfully
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8022/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 60s
+    restart: unless-stopped
   # =====================================
   # Workflow Orchestration
   # =====================================
@@ -827,6 +876,12 @@ volumes:
     driver: local
   migration_state:
     driver: local
+  ai_analysis_logs:
+    driver: local
+  ai_analysis_reports:
+    driver: local
+  ai_analysis_temp:
+    driver: local

 # =====================================
 # Networks
@@ -834,11 +889,3 @@ volumes:
 networks:
   pipeline_network:
     driver: bridge
-  # =====================================
-  # Self-Improving Code Generator
-  # =====================================
-
-
-  # =====================================
-  # Self-Improving Code Generator
-  # =====================================
diff --git a/fix_provider_names.sql b/fix_provider_names.sql
new file mode 100644
index 0000000..d894fd2
--- /dev/null
+++ b/fix_provider_names.sql
@@ -0,0 +1,95 @@
+-- Fix provider_name based on repository URLs across ALL tables
+-- This script updates the provider_name field to match the actual provider from the repository URL
+
+-- =============================================
+-- 1. 
Fix all_repositories table +-- ============================================= +UPDATE all_repositories +SET provider_name = 'github' +WHERE repository_url LIKE '%github.com%' + OR repository_url LIKE '%github.io%'; + +UPDATE all_repositories +SET provider_name = 'gitlab' +WHERE repository_url LIKE '%gitlab.com%' + OR repository_url LIKE '%gitlab.io%'; + +UPDATE all_repositories +SET provider_name = 'bitbucket' +WHERE repository_url LIKE '%bitbucket.org%' + OR repository_url LIKE '%bitbucket.io%'; + +UPDATE all_repositories +SET provider_name = 'gitea' +WHERE repository_url LIKE '%gitea.com%' + OR repository_url LIKE '%gitea.io%'; + +-- ============================================= +-- 2. Fix repository_storage table (linked to all_repositories) +-- ============================================= +UPDATE repository_storage +SET provider_name = ar.provider_name +FROM all_repositories ar +WHERE repository_storage.repository_id = ar.id; + +-- ============================================= +-- 3. Fix repository_commit_details table (linked to all_repositories) +-- ============================================= +UPDATE repository_commit_details +SET provider_name = ar.provider_name +FROM all_repositories ar +WHERE repository_commit_details.repository_id = ar.id; + +-- ============================================= +-- 4. Fix repository_commit_files table (linked to all_repositories) +-- ============================================= +UPDATE repository_commit_files +SET provider_name = ar.provider_name +FROM all_repositories ar +WHERE repository_commit_files.repository_id = ar.id; + +-- ============================================= +-- 5. Fix repository_directories table (linked to all_repositories) +-- ============================================= +UPDATE repository_directories +SET provider_name = ar.provider_name +FROM all_repositories ar +WHERE repository_directories.repository_id = ar.id; + +-- ============================================= +-- 6. Fix repository_files table (linked to all_repositories) +-- ============================================= +UPDATE repository_files +SET provider_name = ar.provider_name +FROM all_repositories ar +WHERE repository_files.repository_id = ar.id; + +-- ============================================= +-- 7. 
Show results for verification +-- ============================================= + +-- Show all_repositories results +SELECT + 'all_repositories' as table_name, + repository_url, + repository_name, + owner_name, + provider_name, + CASE + WHEN repository_url LIKE '%github.com%' OR repository_url LIKE '%github.io%' THEN 'github' + WHEN repository_url LIKE '%gitlab.com%' OR repository_url LIKE '%gitlab.io%' THEN 'gitlab' + WHEN repository_url LIKE '%bitbucket.org%' OR repository_url LIKE '%bitbucket.io%' THEN 'bitbucket' + WHEN repository_url LIKE '%gitea.com%' OR repository_url LIKE '%gitea.io%' THEN 'gitea' + ELSE 'unknown' + END as detected_provider +FROM all_repositories +ORDER BY provider_name, repository_name; + +-- Show summary counts by provider +SELECT + 'Summary by Provider' as info, + provider_name, + COUNT(*) as count +FROM all_repositories +GROUP BY provider_name +ORDER BY provider_name; diff --git a/services/ai-analysis-service/001-schema.sql b/services/ai-analysis-service/001-schema.sql new file mode 100644 index 0000000..a775c8c --- /dev/null +++ b/services/ai-analysis-service/001-schema.sql @@ -0,0 +1,613 @@ +-- ================================================ +-- Repository Analyzer Memory System Database Migration +-- Version: 1.0 +-- Description: Complete database setup for AI memory system +-- ================================================ + +-- Enable required extensions +CREATE EXTENSION IF NOT EXISTS vector; +CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; + +-- ================================================ +-- CORE TABLES +-- ================================================ + +-- Code embeddings table for semantic search of analyzed code +CREATE TABLE IF NOT EXISTS code_embeddings ( + id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), + repo_id VARCHAR(255) NOT NULL, + file_path TEXT NOT NULL, + content_hash VARCHAR(64) NOT NULL, + embedding vector(384) NOT NULL, + metadata JSONB DEFAULT '{}', + created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + last_accessed TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + access_count INTEGER DEFAULT 0, + + -- Ensure uniqueness per repo/file/hash combination + CONSTRAINT unique_code_analysis UNIQUE(repo_id, file_path, content_hash) +); + +-- Query embeddings for episodic memory (user interactions) +CREATE TABLE IF NOT EXISTS query_embeddings ( + id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), + session_id VARCHAR(255) NOT NULL, + query_text TEXT NOT NULL, + query_embedding vector(384) NOT NULL, + response_embedding vector(384), + repo_context VARCHAR(255), + timestamp TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + metadata JSONB DEFAULT '{}', + + -- Index for session-based queries + CONSTRAINT valid_session_id CHECK (LENGTH(session_id) > 0) +); + +-- Persistent knowledge embeddings for long-term learning +CREATE TABLE IF NOT EXISTS knowledge_embeddings ( + id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), + fact_id VARCHAR(255) UNIQUE NOT NULL, + content TEXT NOT NULL, + category VARCHAR(100) NOT NULL, + embedding vector(384) NOT NULL, + confidence REAL DEFAULT 1.0 CHECK (confidence >= 0.0 AND confidence <= 1.0), + source_repos TEXT[] DEFAULT '{}', + created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + last_accessed TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + access_frequency INTEGER DEFAULT 0, + + -- Ensure valid categories + CONSTRAINT valid_category CHECK (category IN ('code_pattern', 'best_practice', 'vulnerability', 'architecture', 'security_vulnerability', 'performance')) +); + +-- 
Repository metadata for tracking analyzed repositories +CREATE TABLE IF NOT EXISTS repository_metadata ( + id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), + repo_id VARCHAR(255) UNIQUE NOT NULL, + repo_path TEXT NOT NULL, + repo_name VARCHAR(500), + primary_language VARCHAR(100), + total_files INTEGER DEFAULT 0, + total_lines INTEGER DEFAULT 0, + last_analyzed TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + analysis_count INTEGER DEFAULT 0, + quality_score REAL DEFAULT 5.0 CHECK (quality_score >= 0.0 AND quality_score <= 10.0), + metadata JSONB DEFAULT '{}' +); + +-- Session tracking for episodic memory correlation +CREATE TABLE IF NOT EXISTS analysis_sessions ( + id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), + session_id VARCHAR(255) UNIQUE NOT NULL, + user_identifier VARCHAR(255), + start_time TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + end_time TIMESTAMP WITH TIME ZONE, + total_queries INTEGER DEFAULT 0, + repositories_analyzed TEXT[] DEFAULT '{}', + session_metadata JSONB DEFAULT '{}' +); + +-- File analysis history for change tracking +CREATE TABLE IF NOT EXISTS file_analysis_history ( + id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), + repo_id VARCHAR(255) NOT NULL, + file_path TEXT NOT NULL, + content_hash VARCHAR(64) NOT NULL, + language VARCHAR(100), + lines_of_code INTEGER DEFAULT 0, + complexity_score REAL DEFAULT 0.0, + severity_score REAL DEFAULT 5.0 CHECK (severity_score >= 0.0 AND severity_score <= 10.0), + issues_count INTEGER DEFAULT 0, + analyzed_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + analysis_version VARCHAR(50) DEFAULT '1.0' +); + +-- Memory consolidation log for tracking knowledge extraction +CREATE TABLE IF NOT EXISTS memory_consolidation_log ( + id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), + source_type VARCHAR(50) NOT NULL, -- 'episodic', 'code_analysis', 'manual' + source_id VARCHAR(255) NOT NULL, + target_memory_type VARCHAR(50) NOT NULL, -- 'persistent', 'working' + target_id VARCHAR(255), + consolidation_confidence REAL DEFAULT 0.5, + consolidation_timestamp TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + consolidation_metadata JSONB DEFAULT '{}' +); + +-- ================================================ +-- PERFORMANCE INDEXES +-- ================================================ + +-- Code embeddings indexes +CREATE INDEX IF NOT EXISTS idx_code_embeddings_repo_id ON code_embeddings(repo_id); +CREATE INDEX IF NOT EXISTS idx_code_embeddings_file_path ON code_embeddings(file_path); +CREATE INDEX IF NOT EXISTS idx_code_embeddings_accessed ON code_embeddings(last_accessed DESC); +CREATE INDEX IF NOT EXISTS idx_code_embeddings_metadata ON code_embeddings USING gin(metadata); + +-- Vector similarity indexes (using IVFFlat for better performance) +CREATE INDEX IF NOT EXISTS idx_code_embeddings_vector +ON code_embeddings USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100); + +-- Query embeddings indexes +CREATE INDEX IF NOT EXISTS idx_query_embeddings_session ON query_embeddings(session_id); +CREATE INDEX IF NOT EXISTS idx_query_embeddings_timestamp ON query_embeddings(timestamp DESC); +CREATE INDEX IF NOT EXISTS idx_query_embeddings_repo_context ON query_embeddings(repo_context); +CREATE INDEX IF NOT EXISTS idx_query_embeddings_vector +ON query_embeddings USING ivfflat (query_embedding vector_cosine_ops) WITH (lists = 100); + +-- Knowledge embeddings indexes +CREATE INDEX IF NOT EXISTS idx_knowledge_embeddings_category ON knowledge_embeddings(category); +CREATE INDEX IF NOT EXISTS 
idx_knowledge_embeddings_confidence ON knowledge_embeddings(confidence DESC); +CREATE INDEX IF NOT EXISTS idx_knowledge_embeddings_access_freq ON knowledge_embeddings(access_frequency DESC); +CREATE INDEX IF NOT EXISTS idx_knowledge_embeddings_vector +ON knowledge_embeddings USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100); +CREATE INDEX IF NOT EXISTS idx_knowledge_source_repos ON knowledge_embeddings USING gin(source_repos); + +-- Repository metadata indexes +CREATE INDEX IF NOT EXISTS idx_repository_metadata_repo_id ON repository_metadata(repo_id); +CREATE INDEX IF NOT EXISTS idx_repository_metadata_analyzed ON repository_metadata(last_analyzed DESC); +CREATE INDEX IF NOT EXISTS idx_repository_metadata_language ON repository_metadata(primary_language); + +-- File history indexes +CREATE INDEX IF NOT EXISTS idx_file_history_repo_file ON file_analysis_history(repo_id, file_path); +CREATE INDEX IF NOT EXISTS idx_file_history_analyzed ON file_analysis_history(analyzed_at DESC); +CREATE INDEX IF NOT EXISTS idx_file_history_severity ON file_analysis_history(severity_score); + +-- ================================================ +-- MATERIALIZED VIEWS FOR COMMON QUERIES +-- ================================================ + +-- High confidence knowledge view +CREATE MATERIALIZED VIEW IF NOT EXISTS high_confidence_knowledge AS +SELECT + fact_id, + content, + category, + confidence, + source_repos, + created_at, + last_accessed, + access_frequency +FROM knowledge_embeddings +WHERE confidence > 0.8 +ORDER BY confidence DESC, access_frequency DESC; + +CREATE INDEX ON high_confidence_knowledge (category); +CREATE INDEX ON high_confidence_knowledge (confidence DESC); + +-- Repository quality summary view +CREATE MATERIALIZED VIEW IF NOT EXISTS repository_quality_summary AS +SELECT + rm.repo_id, + rm.repo_path, + rm.repo_name, + rm.primary_language, + rm.total_files, + rm.total_lines, + rm.quality_score, + rm.last_analyzed, + COUNT(ce.id) as total_embeddings, + AVG(fah.severity_score) as avg_file_quality, + COUNT(DISTINCT fah.file_path) as analyzed_files_count +FROM repository_metadata rm +LEFT JOIN code_embeddings ce ON rm.repo_id = ce.repo_id +LEFT JOIN file_analysis_history fah ON rm.repo_id = fah.repo_id +GROUP BY rm.repo_id, rm.repo_path, rm.repo_name, rm.primary_language, + rm.total_files, rm.total_lines, rm.quality_score, rm.last_analyzed; + +CREATE INDEX ON repository_quality_summary (quality_score DESC); +CREATE INDEX ON repository_quality_summary (last_analyzed DESC); + +-- Recent activity view +CREATE MATERIALIZED VIEW IF NOT EXISTS recent_activity AS +SELECT + 'query' as activity_type, + session_id as identifier, + query_text as description, + timestamp as activity_time, + repo_context +FROM query_embeddings +WHERE timestamp >= CURRENT_TIMESTAMP - INTERVAL '7 days' +UNION ALL +SELECT + 'analysis' as activity_type, + repo_id as identifier, + file_path as description, + analyzed_at as activity_time, + repo_id as repo_context +FROM file_analysis_history +WHERE analyzed_at >= CURRENT_TIMESTAMP - INTERVAL '7 days' +ORDER BY activity_time DESC; + +CREATE INDEX ON recent_activity (activity_time DESC); +CREATE INDEX ON recent_activity (activity_type); + +-- ================================================ +-- STORED FUNCTIONS AND PROCEDURES +-- ================================================ + +-- Function to refresh all materialized views +CREATE OR REPLACE FUNCTION refresh_memory_views() +RETURNS void AS $$ +BEGIN + REFRESH MATERIALIZED VIEW CONCURRENTLY high_confidence_knowledge; 
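+    -- Note: REFRESH ... CONCURRENTLY requires at least one UNIQUE index on the target
+    -- materialized view; the views above only define non-unique indexes, so these
+    -- refreshes will fail unless unique indexes are added or CONCURRENTLY is dropped.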
+ REFRESH MATERIALIZED VIEW CONCURRENTLY repository_quality_summary; + REFRESH MATERIALIZED VIEW CONCURRENTLY recent_activity; + + -- Log the refresh + INSERT INTO memory_consolidation_log ( + source_type, source_id, target_memory_type, target_id, + consolidation_confidence, consolidation_metadata + ) VALUES ( + 'system', 'materialized_views', 'system', 'view_refresh', + 1.0, '{"refresh_time": "' || CURRENT_TIMESTAMP || '"}'::jsonb + ); +END; +$$ LANGUAGE plpgsql; + +-- Function to calculate semantic similarity between texts +CREATE OR REPLACE FUNCTION calculate_similarity(embedding1 vector(384), embedding2 vector(384)) +RETURNS real AS $$ +BEGIN + RETURN 1 - (embedding1 <=> embedding2); +END; +$$ LANGUAGE plpgsql IMMUTABLE STRICT; + +-- Function to update access patterns +CREATE OR REPLACE FUNCTION update_access_pattern(table_name text, id_column text, id_value text) +RETURNS void AS $$ +BEGIN + CASE table_name + WHEN 'knowledge_embeddings' THEN + EXECUTE 'UPDATE knowledge_embeddings SET last_accessed = CURRENT_TIMESTAMP, access_frequency = access_frequency + 1 WHERE fact_id = $1' + USING id_value; + WHEN 'code_embeddings' THEN + EXECUTE 'UPDATE code_embeddings SET last_accessed = CURRENT_TIMESTAMP, access_count = access_count + 1 WHERE id = $1::uuid' + USING id_value; + ELSE + RAISE EXCEPTION 'Unsupported table: %', table_name; + END CASE; +END; +$$ LANGUAGE plpgsql; + +-- Function to cleanup old memories +CREATE OR REPLACE FUNCTION cleanup_old_memories(retention_days integer DEFAULT 365) +RETURNS integer AS $$ +DECLARE + deleted_count integer := 0; + cutoff_date timestamp; +BEGIN + cutoff_date := CURRENT_TIMESTAMP - (retention_days || ' days')::interval; + + -- Delete old query embeddings (episodic memories) + DELETE FROM query_embeddings WHERE timestamp < cutoff_date; + GET DIAGNOSTICS deleted_count = ROW_COUNT; + + -- Update knowledge confidence based on access patterns + UPDATE knowledge_embeddings + SET confidence = LEAST(confidence * ( + CASE + WHEN EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - last_accessed)) / 86400 < 30 + THEN 1.05 + ELSE 0.98 + END * + (1.0 + LOG(access_frequency + 1) / 20.0) + ), 1.0); + + -- Log cleanup activity + INSERT INTO memory_consolidation_log ( + source_type, source_id, target_memory_type, target_id, + consolidation_confidence, consolidation_metadata + ) VALUES ( + 'system', 'cleanup_function', 'system', 'memory_cleanup', + 1.0, ('{"deleted_records": ' || deleted_count || ', "cutoff_date": "' || cutoff_date || '"}')::jsonb + ); + + RETURN deleted_count; +END; +$$ LANGUAGE plpgsql; + +-- Function to find similar code patterns +CREATE OR REPLACE FUNCTION find_similar_code( + query_embedding vector(384), + repo_filter text DEFAULT NULL, + similarity_threshold real DEFAULT 0.7, + max_results integer DEFAULT 10 +) +RETURNS TABLE ( + id uuid, + repo_id varchar(255), + file_path text, + similarity real, + metadata jsonb +) AS $$ +BEGIN + RETURN QUERY + SELECT + ce.id, + ce.repo_id, + ce.file_path, + (1 - (ce.embedding <=> query_embedding))::real as similarity, + ce.metadata + FROM code_embeddings ce + WHERE (repo_filter IS NULL OR ce.repo_id = repo_filter) + AND (1 - (ce.embedding <=> query_embedding)) > similarity_threshold + ORDER BY similarity DESC + LIMIT max_results; +END; +$$ LANGUAGE plpgsql; + +-- Function to get knowledge by category +CREATE OR REPLACE FUNCTION get_knowledge_by_category( + category_filter varchar(100), + min_confidence real DEFAULT 0.5, + max_results integer DEFAULT 20 +) +RETURNS TABLE ( + fact_id varchar(255), + content text, + 
confidence real, + access_frequency integer, + source_repos text[] +) AS $$ +BEGIN + RETURN QUERY + SELECT + ke.fact_id, + ke.content, + ke.confidence, + ke.access_frequency, + ke.source_repos + FROM knowledge_embeddings ke + WHERE ke.category = category_filter + AND ke.confidence >= min_confidence + ORDER BY ke.confidence DESC, ke.access_frequency DESC + LIMIT max_results; +END; +$$ LANGUAGE plpgsql; + +-- ================================================ +-- TRIGGERS FOR AUTOMATIC MAINTENANCE +-- ================================================ + +-- Trigger function to update repository metadata when embeddings are added +CREATE OR REPLACE FUNCTION update_repository_stats() +RETURNS trigger AS $$ +BEGIN + -- Update or insert repository metadata + INSERT INTO repository_metadata (repo_id, repo_path, analysis_count, last_analyzed) + VALUES (NEW.repo_id, NEW.repo_id, 1, CURRENT_TIMESTAMP) + ON CONFLICT (repo_id) + DO UPDATE SET + analysis_count = repository_metadata.analysis_count + 1, + last_analyzed = CURRENT_TIMESTAMP; + + RETURN NEW; +END; +$$ LANGUAGE plpgsql; + +-- Create triggers +DROP TRIGGER IF EXISTS trigger_update_repo_stats ON code_embeddings; +CREATE TRIGGER trigger_update_repo_stats + AFTER INSERT ON code_embeddings + FOR EACH ROW + EXECUTE FUNCTION update_repository_stats(); + +-- Trigger to automatically update access patterns +CREATE OR REPLACE FUNCTION auto_update_access() +RETURNS trigger AS $$ +BEGIN + NEW.last_accessed = CURRENT_TIMESTAMP; + NEW.access_count = COALESCE(OLD.access_count, 0) + 1; + RETURN NEW; +END; +$$ LANGUAGE plpgsql; + +DROP TRIGGER IF EXISTS trigger_auto_access_update ON code_embeddings; +CREATE TRIGGER trigger_auto_access_update + BEFORE UPDATE ON code_embeddings + FOR EACH ROW + EXECUTE FUNCTION auto_update_access(); + +-- ================================================ +-- SECURITY AND PERMISSIONS +-- ================================================ + +-- Create roles for different access levels +DO $$ +BEGIN + IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'repo_analyzer_read') THEN + CREATE ROLE repo_analyzer_read; + END IF; + + IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'repo_analyzer_write') THEN + CREATE ROLE repo_analyzer_write; + END IF; + + IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'repo_analyzer_admin') THEN + CREATE ROLE repo_analyzer_admin; + END IF; +END +$$; + +-- Grant permissions +GRANT SELECT ON ALL TABLES IN SCHEMA public TO repo_analyzer_read; +GRANT SELECT ON high_confidence_knowledge TO repo_analyzer_read; +GRANT SELECT ON repository_quality_summary TO repo_analyzer_read; +GRANT SELECT ON recent_activity TO repo_analyzer_read; + +GRANT SELECT, INSERT, UPDATE ON ALL TABLES IN SCHEMA public TO repo_analyzer_write; +GRANT SELECT ON high_confidence_knowledge TO repo_analyzer_write; +GRANT SELECT ON repository_quality_summary TO repo_analyzer_write; +GRANT SELECT ON recent_activity TO repo_analyzer_write; +GRANT USAGE, SELECT ON ALL SEQUENCES IN SCHEMA public TO repo_analyzer_write; + +GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO repo_analyzer_admin; +GRANT ALL PRIVILEGES ON high_confidence_knowledge TO repo_analyzer_admin; +GRANT ALL PRIVILEGES ON repository_quality_summary TO repo_analyzer_admin; +GRANT ALL PRIVILEGES ON recent_activity TO repo_analyzer_admin; +GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA public TO repo_analyzer_admin; +GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA public TO repo_analyzer_admin; + +-- ================================================ 
+-- DATA VALIDATION AND CONSTRAINTS +-- ================================================ + +-- Add check constraints for data quality +-- Note: Vector dimensions are validated at insertion time, no need for runtime checks + +-- Add constraints for reasonable data ranges +DO $$ +BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_constraint WHERE conname = 'reasonable_lines_of_code') THEN + ALTER TABLE file_analysis_history ADD CONSTRAINT reasonable_lines_of_code + CHECK (lines_of_code >= 0 AND lines_of_code <= 1000000); + END IF; + + IF NOT EXISTS (SELECT 1 FROM pg_constraint WHERE conname = 'reasonable_complexity') THEN + ALTER TABLE file_analysis_history ADD CONSTRAINT reasonable_complexity + CHECK (complexity_score >= 0.0 AND complexity_score <= 100.0); + END IF; +END +$$; + +-- ================================================ +-- INITIAL DATA AND CONFIGURATION +-- ================================================ + +-- Insert initial system configuration +INSERT INTO memory_consolidation_log ( + source_type, source_id, target_memory_type, target_id, + consolidation_confidence, consolidation_metadata +) VALUES ( + 'system', 'database_migration', 'system', 'initial_setup', + 1.0, ('{"migration_version": "1.0", "setup_time": "' || CURRENT_TIMESTAMP || '"}')::jsonb +) ON CONFLICT DO NOTHING; + +-- Create initial knowledge categories +INSERT INTO knowledge_embeddings ( + fact_id, content, category, embedding, confidence, source_repos +) VALUES +( + 'init_security_001', + 'Always validate and sanitize user input to prevent injection attacks', + 'security_vulnerability', + array_fill(0.0, ARRAY[384])::vector(384), + 0.95, + ARRAY[]::text[] +), +( + 'init_performance_001', + 'Use appropriate data structures and algorithms for better performance', + 'performance', + array_fill(0.0, ARRAY[384])::vector(384), + 0.9, + ARRAY[]::text[] +), +( + 'init_best_practice_001', + 'Follow consistent naming conventions and code formatting standards', + 'best_practice', + array_fill(0.0, ARRAY[384])::vector(384), + 0.85, + ARRAY[]::text[] +) +ON CONFLICT (fact_id) DO NOTHING; + +-- ================================================ +-- BACKUP AND MAINTENANCE PROCEDURES +-- ================================================ + +-- Function to create backup of critical memory data +CREATE OR REPLACE FUNCTION backup_memory_data(backup_path text DEFAULT '/tmp/memory_backup') +RETURNS text AS $$ +DECLARE + backup_file text; + result_message text; +BEGIN + backup_file := backup_path || '_' || to_char(CURRENT_TIMESTAMP, 'YYYY-MM-DD_HH24-MI-SS') || '.sql'; + + -- This would need to be implemented with actual backup logic + -- For now, just return the intended backup file name + result_message := 'Backup would be created at: ' || backup_file; + + -- Log backup activity + INSERT INTO memory_consolidation_log ( + source_type, source_id, target_memory_type, target_id, + consolidation_confidence, consolidation_metadata + ) VALUES ( + 'system', 'backup_function', 'system', 'backup_created', + 1.0, ('{"backup_file": "' || backup_file || '"}')::jsonb + ); + + RETURN result_message; +END; +$$ LANGUAGE plpgsql; + +-- ================================================ +-- MONITORING AND ANALYTICS +-- ================================================ + +-- View for system health monitoring +CREATE OR REPLACE VIEW system_health_monitor AS +SELECT + 'code_embeddings' as table_name, + COUNT(*) as record_count, + MAX(created_at) as latest_record, + AVG(access_count) as avg_access_count +FROM code_embeddings +UNION ALL +SELECT + 'query_embeddings' as 
table_name, + COUNT(*) as record_count, + MAX(timestamp) as latest_record, + NULL as avg_access_count +FROM query_embeddings +UNION ALL +SELECT + 'knowledge_embeddings' as table_name, + COUNT(*) as record_count, + MAX(created_at) as latest_record, + AVG(access_frequency) as avg_access_count +FROM knowledge_embeddings; + +-- Function to get comprehensive system statistics +CREATE OR REPLACE FUNCTION get_system_statistics() +RETURNS jsonb AS $$ +DECLARE + stats jsonb; +BEGIN + SELECT jsonb_build_object( + 'total_code_embeddings', (SELECT COUNT(*) FROM code_embeddings), + 'total_query_embeddings', (SELECT COUNT(*) FROM query_embeddings), + 'total_knowledge_embeddings', (SELECT COUNT(*) FROM knowledge_embeddings), + 'unique_repositories', (SELECT COUNT(DISTINCT repo_id) FROM code_embeddings), + 'high_confidence_knowledge', (SELECT COUNT(*) FROM knowledge_embeddings WHERE confidence > 0.8), + 'recent_activity_7d', (SELECT COUNT(*) FROM query_embeddings WHERE timestamp >= CURRENT_TIMESTAMP - INTERVAL '7 days'), + 'average_code_quality', (SELECT AVG(quality_score) FROM repository_metadata), + 'last_updated', CURRENT_TIMESTAMP + ) INTO stats; + + RETURN stats; +END; +$$ LANGUAGE plpgsql; + +-- ================================================ +-- COMPLETION MESSAGE +-- ================================================ + +DO $$ +BEGIN + RAISE NOTICE '================================================'; + RAISE NOTICE 'Repository Analyzer Memory System Database Setup Complete'; + RAISE NOTICE '================================================'; + RAISE NOTICE 'Tables created: code_embeddings, query_embeddings, knowledge_embeddings'; + RAISE NOTICE 'Indexes created: Vector similarity indexes with IVFFlat'; + RAISE NOTICE 'Functions created: Similarity search, cleanup, statistics'; + RAISE NOTICE 'Materialized views created: High confidence knowledge, repository summary'; + RAISE NOTICE 'Triggers created: Auto-update repository stats and access patterns'; + RAISE NOTICE '================================================'; + RAISE NOTICE 'Ready for AI-enhanced repository analysis with persistent memory'; + RAISE NOTICE '================================================'; +END +$$; \ No newline at end of file diff --git a/services/ai-analysis-service/Dockerfile b/services/ai-analysis-service/Dockerfile new file mode 100644 index 0000000..9f3745a --- /dev/null +++ b/services/ai-analysis-service/Dockerfile @@ -0,0 +1,37 @@ +FROM python:3.11-slim + +# Set working directory +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + git \ + postgresql-client \ + curl \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + +# Copy requirements and install Python dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy the service code +COPY . . 
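+# requirements.txt is copied and installed in its own layer above, so the pip install
+# cache survives rebuilds that only change application code.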
+
+# Create necessary directories
+RUN mkdir -p /app/logs /app/temp /app/reports
+
+# Set environment variables
+ENV PYTHONPATH=/app
+ENV PYTHONUNBUFFERED=1
+ENV PORT=8022
+
+# Expose port
+EXPOSE 8022
+
+# Health check
+HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+    CMD curl -f http://localhost:8022/health || exit 1
+
+# Run migration and then start the service
+CMD ["sh", "-c", "python run_migration.py && python server.py"]
diff --git a/services/ai-analysis-service/README.md b/services/ai-analysis-service/README.md
new file mode 100644
index 0000000..111c0ff
--- /dev/null
+++ b/services/ai-analysis-service/README.md
@@ -0,0 +1,202 @@
+# Complete AI Repository Analysis Service
+
+A comprehensive AI-powered repository analysis tool that automatically analyzes **ALL files** in a repository without any limits or user queries required.
+
+## šŸš€ Features
+
+- **Complete Analysis**: Analyzes ALL files in the repository (no max-files limit)
+- **Fully Automated**: No user query required - runs completely automatically
+- **Memory-Enhanced**: Learns from previous analyses using advanced memory systems
+- **Comprehensive Reports**: Generates detailed PDF reports with executive summaries
+- **Multi-Database Support**: Uses PostgreSQL, MongoDB, and Redis for optimal performance
+- **Security Focus**: Identifies security vulnerabilities and code quality issues
+- **Architecture Assessment**: Provides architectural insights and recommendations
+
+## šŸ“‹ Requirements
+
+### System Dependencies
+- Python 3.8+
+- PostgreSQL with pgvector extension
+- MongoDB
+- Redis
+
+### Python Dependencies
+```bash
+pip install anthropic python-dotenv gitpython redis pymongo psycopg2-binary numpy reportlab
+```
+
+## šŸ› ļø Setup
+
+1. **Install Dependencies**:
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+2. **Database Setup**:
+   ```bash
+   # Run the database migration
+   psql -U postgres -d repo_vectors -f 001-schema.sql
+   ```
+
+3. 
**Environment Variables**: + Create a `.env` file with: + ```env + ANTHROPIC_API_KEY=your_api_key_here + REDIS_HOST=localhost + REDIS_PORT=6379 + REDIS_DB=0 + MONGODB_URL=mongodb://localhost:27017/ + MONGODB_DB=repo_analyzer + POSTGRES_HOST=localhost + POSTGRES_PORT=5432 + POSTGRES_DB=repo_vectors + POSTGRES_USER=postgres + POSTGRES_PASSWORD=your_password + ``` + +## šŸŽÆ Usage + +### Basic Usage +```bash +python ai-analyze.py /path/to/repository +``` + +### With Custom Output +```bash +python ai-analyze.py /path/to/repository --output my_analysis.pdf +``` + +### With API Key Override +```bash +python ai-analyze.py /path/to/repository --api-key your_api_key +``` + +## šŸ“Š What It Analyzes + +### File Types Supported +- **Programming Languages**: Python, JavaScript, TypeScript, Java, C++, C#, Go, Rust, PHP, Ruby, Swift, Kotlin +- **Web Technologies**: HTML, CSS, SCSS, SASS +- **Configuration Files**: JSON, YAML, XML, SQL +- **Build Files**: Dockerfile, Makefile, CMake, package.json, requirements.txt, Cargo.toml, pom.xml, build.gradle +- **Documentation**: README.md, Markdown files + +### Analysis Coverage +- **Code Quality**: Complexity, maintainability, best practices +- **Security**: Vulnerabilities, injection attacks, authentication issues +- **Architecture**: Project structure, scalability, design patterns +- **Performance**: Optimization opportunities, bottlenecks +- **Documentation**: Completeness and quality + +## šŸ“ˆ Output + +### Console Output +- Real-time analysis progress +- Repository statistics +- Quality breakdown by file +- Language distribution +- Memory system statistics + +### PDF Report +- Executive summary for leadership +- Repository overview with metrics +- Detailed file-by-file analysis +- Security assessment +- Architecture evaluation +- Recommendations and next steps + +## 🧠 Memory System + +The tool uses a sophisticated three-tier memory system: + +1. **Working Memory (Redis)**: Temporary, fast access for current analysis +2. **Episodic Memory (MongoDB)**: User interactions and analysis sessions +3. **Persistent Memory (PostgreSQL)**: Long-term knowledge and best practices + +This allows the tool to learn from previous analyses and provide increasingly accurate insights. + +## šŸ”§ Configuration + +### File Size Limits +- Default: 2MB per file (configurable in code) +- Large files are skipped with notification + +### Excluded Directories +- `.git`, `node_modules`, `__pycache__`, `build`, `dist`, `target` +- `venv`, `env`, `.next`, `coverage`, `vendor` +- `bower_components`, `.gradle`, `.m2`, `.cargo` + +### Rate Limiting +- 0.1 second delay between file analyses to avoid API rate limits +- Configurable in the code + +## šŸ“ Example Output + +``` +šŸš€ Starting Complete AI Repository Analysis +============================================================ +Repository: /path/to/my-project +Output: complete_repository_analysis.pdf +Mode: Complete automated analysis of ALL files +============================================================ + +Scanning repository: /path/to/my-project +Found 127 files to analyze +Starting comprehensive analysis of 127 files... +Analyzing file 1/127: main.py +Analyzing file 2/127: config.js +... 
+ +šŸŽÆ COMPLETE ANALYSIS FINISHED +============================================================ +šŸ“Š Repository Statistics: + • Files Analyzed: 127 + • Lines of Code: 15,432 + • Languages: 8 + • Code Quality: 7.2/10 + +šŸ“ˆ Quality Breakdown: + • High Quality Files (8-10): 45 + • Medium Quality Files (5-7): 67 + • Low Quality Files (1-4): 15 + • Total Issues Found: 89 + +šŸ”¤ Language Distribution: + • Python: 45 files + • JavaScript: 32 files + • TypeScript: 28 files + • HTML: 12 files + • CSS: 10 files + +šŸ“„ Complete PDF Report: complete_repository_analysis.pdf +āœ… Complete analysis finished successfully! +``` + +## 🚨 Troubleshooting + +### Common Issues + +1. **Database Connection Errors**: + - Ensure PostgreSQL, MongoDB, and Redis are running + - Check connection credentials in `.env` file + +2. **API Key Issues**: + - Verify Anthropic API key is valid and has sufficient credits + - Check rate limits if analysis fails + +3. **Memory Issues**: + - Large repositories may require more RAM + - Consider increasing system memory or processing in batches + +4. **File Permission Errors**: + - Ensure read access to repository files + - Check write permissions for output directory + +## šŸ¤ Contributing + +This is a complete automated analysis system. The tool will: +- Analyze every file in the repository +- Generate comprehensive reports +- Learn from previous analyses +- Provide actionable insights + +No user interaction required - just run and get results! diff --git a/services/ai-analysis-service/ai-analysis/adv_git_analyzer.py b/services/ai-analysis-service/ai-analysis/adv_git_analyzer.py new file mode 100644 index 0000000..a5f3860 --- /dev/null +++ b/services/ai-analysis-service/ai-analysis/adv_git_analyzer.py @@ -0,0 +1,710 @@ +#!/usr/bin/env python3 +""" +Robust GitHub Repository AI Analysis Tool +Simplified version with better error handling and JSON parsing. 
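+
+Usage:
+    python adv_git_analyzer.py <repo_path_or_git_url> [--output OUTPUT.pdf] [--max-files N] [--api-key KEY]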
+""" + +import os +import asyncio +from pathlib import Path +from typing import Dict, List, Optional, Tuple +from datetime import datetime +import argparse +from dataclasses import dataclass +import shutil +import tempfile +import json +import re +from collections import Counter + +# Core packages +import anthropic +from dotenv import load_dotenv +import git + +# PDF generation +from reportlab.lib.pagesizes import A4 +from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle +from reportlab.lib.enums import TA_CENTER, TA_LEFT +from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak, Table, TableStyle +from reportlab.lib import colors + +@dataclass +class FileAnalysis: + path: str + language: str + lines_of_code: int + complexity_score: float + issues_found: List[str] + recommendations: List[str] + detailed_analysis: str + severity_score: float + +@dataclass +class RepositoryAnalysis: + repo_path: str + total_files: int + total_lines: int + languages: Dict[str, int] + architecture_assessment: str + security_assessment: str + code_quality_score: float + file_analyses: List[FileAnalysis] + executive_summary: str + +class RobustGitHubAnalyzer: + def __init__(self, api_key: str): + self.client = anthropic.Anthropic(api_key=api_key) + self.temp_dir = None + + # Language mapping for file detection + self.language_map = { + '.py': 'Python', '.js': 'JavaScript', '.ts': 'TypeScript', + '.tsx': 'TypeScript', '.jsx': 'JavaScript', '.java': 'Java', + '.cpp': 'C++', '.c': 'C', '.cs': 'C#', '.go': 'Go', '.rs': 'Rust', + '.php': 'PHP', '.rb': 'Ruby', '.swift': 'Swift', '.kt': 'Kotlin', + '.html': 'HTML', '.css': 'CSS', '.scss': 'SCSS', '.sass': 'SASS', + '.sql': 'SQL', '.yaml': 'YAML', '.yml': 'YAML', '.json': 'JSON', + '.xml': 'XML', '.sh': 'Shell', '.dockerfile': 'Docker', + '.md': 'Markdown', '.txt': 'Text' + } + + # Code file extensions to analyze + self.code_extensions = set(self.language_map.keys()) + + def clone_repository(self, repo_path: str) -> str: + """Clone repository or use existing path.""" + if os.path.exists(repo_path): + print(f"Using existing repository: {repo_path}") + return repo_path + else: + print(f"Cloning repository: {repo_path}") + self.temp_dir = tempfile.mkdtemp(prefix="repo_analysis_") + try: + git.Repo.clone_from(repo_path, self.temp_dir) + return self.temp_dir + except Exception as e: + raise Exception(f"Failed to clone repository: {e}") + + def get_file_language(self, file_path: Path) -> str: + """Get programming language from file extension.""" + return self.language_map.get(file_path.suffix.lower(), 'Unknown') + + def calculate_complexity_score(self, content: str) -> float: + """Calculate basic complexity score based on code patterns.""" + lines = content.split('\n') + complexity_indicators = ['if', 'else', 'elif', 'for', 'while', 'try', 'except', 'catch', 'switch'] + + complexity = 1 + for line in lines: + line_lower = line.lower().strip() + for indicator in complexity_indicators: + if indicator in line_lower: + complexity += 1 + + # Normalize to 1-10 scale + return min(complexity / max(len(lines), 1) * 100, 10.0) + + async def analyze_file_comprehensive(self, file_path: Path, content: str) -> FileAnalysis: + """Perform comprehensive file analysis using a single, robust prompt.""" + language = self.get_file_language(file_path) + lines_of_code = len([line for line in content.split('\n') if line.strip()]) + complexity_score = self.calculate_complexity_score(content) + + # Truncate content if too long + if len(content) > 4000: + content = 
content[:4000] + "\n... [truncated for analysis]" + + print(f" Analyzing {file_path.name} ({language}, {lines_of_code} lines)") + + # Create comprehensive analysis prompt + prompt = f""" +You are a senior software engineer with 25 years of experience. Analyze this {language} code file: + +FILENAME: {file_path.name} +LANGUAGE: {language} +LINES OF CODE: {lines_of_code} + +CODE: +```{language.lower()} +{content} +``` + +Provide a comprehensive analysis covering: + +1. ISSUES FOUND: List specific problems, bugs, security vulnerabilities, or code smells +2. RECOMMENDATIONS: Actionable suggestions for improvement +3. CODE QUALITY: Overall assessment of code quality and maintainability +4. SECURITY: Any security concerns or vulnerabilities +5. PERFORMANCE: Potential performance issues or optimizations +6. BEST PRACTICES: Adherence to coding standards and best practices + +Provide your analysis in clear, structured text (not JSON). Be specific and actionable. +Rate the overall code quality from 1-10 where 10 is excellent. + +ANALYSIS: +""" + + try: + message = self.client.messages.create( + model="claude-3-5-sonnet-20241022", + max_tokens=3000, + temperature=0.1, + messages=[{"role": "user", "content": prompt}] + ) + + analysis_text = message.content[0].text.strip() + + # Extract severity score from analysis + severity_match = re.search(r'(\d+(?:\.\d+)?)/10', analysis_text) + severity_score = float(severity_match.group(1)) if severity_match else 5.0 + + # Parse issues and recommendations from the text + issues = self.extract_issues_from_analysis(analysis_text) + recommendations = self.extract_recommendations_from_analysis(analysis_text) + + return FileAnalysis( + path=str(file_path.relative_to(Path(self.temp_dir or '.'))), + language=language, + lines_of_code=lines_of_code, + complexity_score=complexity_score, + issues_found=issues, + recommendations=recommendations, + detailed_analysis=analysis_text, + severity_score=severity_score + ) + + except Exception as e: + print(f" Error analyzing {file_path.name}: {e}") + return FileAnalysis( + path=str(file_path), + language=language, + lines_of_code=lines_of_code, + complexity_score=complexity_score, + issues_found=[f"Analysis failed: {str(e)}"], + recommendations=["Review file manually due to analysis error"], + detailed_analysis=f"Analysis failed due to error: {str(e)}", + severity_score=5.0 + ) + + def extract_issues_from_analysis(self, analysis_text: str) -> List[str]: + """Extract issues from analysis text.""" + issues = [] + lines = analysis_text.split('\n') + + # Look for common issue indicators + issue_keywords = ['issue', 'problem', 'bug', 'vulnerability', 'error', 'warning', 'concern'] + + for line in lines: + line_lower = line.lower().strip() + if any(keyword in line_lower for keyword in issue_keywords): + if line.strip() and not line.strip().startswith('#'): + issues.append(line.strip()) + + return issues[:10] # Limit to top 10 issues + + def extract_recommendations_from_analysis(self, analysis_text: str) -> List[str]: + """Extract recommendations from analysis text.""" + recommendations = [] + lines = analysis_text.split('\n') + + # Look for recommendation indicators + rec_keywords = ['recommend', 'suggest', 'should', 'consider', 'improve'] + + for line in lines: + line_lower = line.lower().strip() + if any(keyword in line_lower for keyword in rec_keywords): + if line.strip() and not line.strip().startswith('#'): + recommendations.append(line.strip()) + + return recommendations[:10] # Limit to top 10 recommendations + + def 
scan_repository(self, repo_path: str, max_files: int = 50) -> List[Tuple[Path, str]]: + """Scan repository and collect files for analysis.""" + print(f"Scanning repository: {repo_path}") + + files_to_analyze = [] + + # Important files to always include + important_files = { + 'README.md', 'package.json', 'requirements.txt', 'Dockerfile', + 'docker-compose.yml', 'tsconfig.json', 'next.config.js', + 'tailwind.config.js', 'webpack.config.js', '.env.example' + } + + for root, dirs, files in os.walk(repo_path): + # Skip common build/cache directories + dirs[:] = [d for d in dirs if not d.startswith('.') and + d not in {'node_modules', '__pycache__', 'build', 'dist', 'target', + 'venv', 'env', '.git', '.next', 'coverage'}] + + for file in files: + if len(files_to_analyze) >= max_files: + break + + file_path = Path(root) / file + + # Skip large files + try: + if file_path.stat().st_size > 1000000: # 1MB limit + continue + except: + continue + + # Include important files or files with code extensions + should_include = ( + file.lower() in important_files or + file_path.suffix.lower() in self.code_extensions or + file.lower().startswith('dockerfile') + ) + + if should_include: + try: + with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: + content = f.read() + if content.strip(): # Only non-empty files + files_to_analyze.append((file_path, content)) + except Exception as e: + print(f"Could not read {file_path}: {e}") + + print(f"Found {len(files_to_analyze)} files to analyze") + return files_to_analyze + + async def analyze_repository_overview(self, repo_path: str, file_analyses: List[FileAnalysis]) -> Tuple[str, str]: + """Analyze repository architecture and security.""" + print("Analyzing repository overview...") + + # Prepare summary data + languages = dict(Counter(fa.language for fa in file_analyses)) + total_lines = sum(fa.lines_of_code for fa in file_analyses) + avg_quality = sum(fa.severity_score for fa in file_analyses) / len(file_analyses) if file_analyses else 5.0 + + # Get repository structure + structure_lines = [] + try: + for root, dirs, files in os.walk(repo_path): + dirs[:] = [d for d in dirs if not d.startswith('.') and d not in {'node_modules', '__pycache__'}] + level = root.replace(repo_path, '').count(os.sep) + indent = ' ' * level + structure_lines.append(f"{indent}{os.path.basename(root)}/") + for file in files[:3]: # Limit files shown per directory + structure_lines.append(f"{indent} {file}") + if len(structure_lines) > 50: # Limit total structure size + break + except Exception as e: + structure_lines = [f"Error reading structure: {e}"] + + # Architecture analysis + arch_prompt = f""" +You are a Senior Software Architect with 25 years of experience. + +Analyze this repository: + +REPOSITORY STRUCTURE: +{chr(10).join(structure_lines[:30])} + +STATISTICS: +- Total files analyzed: {len(file_analyses)} +- Total lines of code: {total_lines:,} +- Languages: {languages} +- Average code quality: {avg_quality:.1f}/10 + +TOP FILE ISSUES: +{chr(10).join([f"- {fa.path}: {len(fa.issues_found)} issues" for fa in file_analyses[:10]])} + +Provide an architectural assessment covering: +1. Project type and purpose +2. Technology stack evaluation +3. Code organization and structure +4. Scalability and maintainability concerns +5. Key recommendations for improvement + +Keep response under 1500 words and focus on actionable insights. 
+""" + + # Security analysis + security_issues = [] + for fa in file_analyses: + security_issues.extend([issue for issue in fa.issues_found if + any(keyword in issue.lower() for keyword in + ['security', 'vulnerability', 'injection', 'xss', 'auth', 'password'])]) + + sec_prompt = f""" +You are a Senior Security Engineer with 20+ years of experience. + +Security Analysis for repository with {len(file_analyses)} files: + +SECURITY ISSUES FOUND: +{chr(10).join(security_issues[:20]) if security_issues else "No obvious security issues detected"} + +HIGH-RISK FILE TYPES PRESENT: +{[lang for lang, count in languages.items() if lang in ['JavaScript', 'TypeScript', 'Python', 'PHP', 'SQL']]} + +Provide security assessment covering: +1. Overall security posture +2. Main security risks and vulnerabilities +3. Authentication and authorization concerns +4. Data protection and privacy issues +5. Immediate security priorities + +Keep response under 1000 words and focus on actionable security recommendations. +""" + + try: + # Run both analyses + arch_task = self.client.messages.create( + model="claude-3-5-sonnet-20241022", + max_tokens=2000, + temperature=0.1, + messages=[{"role": "user", "content": arch_prompt}] + ) + + sec_task = self.client.messages.create( + model="claude-3-5-sonnet-20241022", + max_tokens=1500, + temperature=0.1, + messages=[{"role": "user", "content": sec_prompt}] + ) + + architecture_assessment = arch_task.content[0].text + security_assessment = sec_task.content[0].text + + return architecture_assessment, security_assessment + + except Exception as e: + return f"Architecture analysis failed: {e}", f"Security analysis failed: {e}" + + async def generate_executive_summary(self, analysis: RepositoryAnalysis) -> str: + """Generate executive summary for leadership.""" + print("Generating executive summary...") + + prompt = f""" +You are presenting to C-level executives. Create an executive summary of this technical analysis: + +REPOSITORY METRICS: +- Total Files: {analysis.total_files} +- Lines of Code: {analysis.total_lines:,} +- Languages: {analysis.languages} +- Code Quality Score: {analysis.code_quality_score:.1f}/10 + +KEY FINDINGS: +- Total issues identified: {sum(len(fa.issues_found) for fa in analysis.file_analyses)} +- Files needing attention: {len([fa for fa in analysis.file_analyses if fa.severity_score < 7])} +- High-quality files: {len([fa for fa in analysis.file_analyses if fa.severity_score >= 8])} + +Create an executive summary for non-technical leadership covering: +1. Business impact of code quality findings +2. Risk assessment and implications +3. Investment priorities and recommendations +4. Expected ROI from addressing technical debt +5. Competitive implications + +Focus on business outcomes, not technical details. Keep under 800 words. 
+""" + + try: + message = self.client.messages.create( + model="claude-3-5-sonnet-20241022", + max_tokens=1200, + temperature=0.1, + messages=[{"role": "user", "content": prompt}] + ) + return message.content[0].text + except Exception as e: + return f"Executive summary generation failed: {e}" + + def create_pdf_report(self, analysis: RepositoryAnalysis, output_path: str): + """Generate comprehensive PDF report.""" + print(f"Generating PDF report: {output_path}") + + doc = SimpleDocTemplate(output_path, pagesize=A4, + leftMargin=72, rightMargin=72, + topMargin=72, bottomMargin=72) + styles = getSampleStyleSheet() + story = [] + + # Custom styles + title_style = ParagraphStyle( + 'CustomTitle', + parent=styles['Heading1'], + fontSize=24, + textColor=colors.darkblue, + spaceAfter=30, + alignment=TA_CENTER + ) + + heading_style = ParagraphStyle( + 'CustomHeading', + parent=styles['Heading2'], + fontSize=16, + textColor=colors.darkblue, + spaceBefore=20, + spaceAfter=10 + ) + + # Title Page + story.append(Paragraph("Repository Analysis Report", title_style)) + story.append(Spacer(1, 20)) + story.append(Paragraph(f"Repository: {analysis.repo_path}", styles['Normal'])) + story.append(Paragraph(f"Analysis Date: {datetime.now().strftime('%B %d, %Y at %H:%M')}", styles['Normal'])) + story.append(Paragraph("Generated by: AI Senior Engineering Team", styles['Normal'])) + story.append(PageBreak()) + + # Executive Summary + story.append(Paragraph("Executive Summary", heading_style)) + story.append(Paragraph(analysis.executive_summary, styles['Normal'])) + story.append(PageBreak()) + + # Repository Overview + story.append(Paragraph("Repository Overview", heading_style)) + + overview_data = [ + ['Metric', 'Value'], + ['Total Files Analyzed', str(analysis.total_files)], + ['Total Lines of Code', f"{analysis.total_lines:,}"], + ['Primary Languages', ', '.join(list(analysis.languages.keys())[:5])], + ['Overall Code Quality', f"{analysis.code_quality_score:.1f}/10"], + ] + + overview_table = Table(overview_data, colWidths=[200, 300]) + overview_table.setStyle(TableStyle([ + ('BACKGROUND', (0, 0), (-1, 0), colors.grey), + ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke), + ('ALIGN', (0, 0), (-1, -1), 'LEFT'), + ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'), + ('FONTSIZE', (0, 0), (-1, 0), 12), + ('BOTTOMPADDING', (0, 0), (-1, 0), 12), + ('BACKGROUND', (0, 1), (-1, -1), colors.beige), + ('GRID', (0, 0), (-1, -1), 1, colors.black) + ])) + + story.append(overview_table) + story.append(Spacer(1, 20)) + + # Languages Distribution + if analysis.languages: + story.append(Paragraph("Language Distribution", heading_style)) + lang_data = [['Language', 'Files']] + for lang, count in sorted(analysis.languages.items(), key=lambda x: x[1], reverse=True): + lang_data.append([lang, str(count)]) + + lang_table = Table(lang_data, colWidths=[200, 100]) + lang_table.setStyle(TableStyle([ + ('BACKGROUND', (0, 0), (-1, 0), colors.grey), + ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke), + ('ALIGN', (0, 0), (-1, -1), 'LEFT'), + ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'), + ('GRID', (0, 0), (-1, -1), 1, colors.black) + ])) + story.append(lang_table) + story.append(PageBreak()) + + # Architecture Assessment + story.append(Paragraph("Architecture Assessment", heading_style)) + # Split long text into paragraphs + arch_paragraphs = analysis.architecture_assessment.split('\n\n') + for para in arch_paragraphs[:10]: # Limit paragraphs + if para.strip(): + story.append(Paragraph(para.strip(), styles['Normal'])) + 
story.append(Spacer(1, 10)) + story.append(PageBreak()) + + # Security Assessment + story.append(Paragraph("Security Assessment", heading_style)) + sec_paragraphs = analysis.security_assessment.split('\n\n') + for para in sec_paragraphs[:10]: # Limit paragraphs + if para.strip(): + story.append(Paragraph(para.strip(), styles['Normal'])) + story.append(Spacer(1, 10)) + story.append(PageBreak()) + + # File Analysis Summary + story.append(Paragraph("File Analysis Summary", heading_style)) + + # Summary statistics + high_quality_files = [fa for fa in analysis.file_analyses if fa.severity_score >= 8] + medium_quality_files = [fa for fa in analysis.file_analyses if 5 <= fa.severity_score < 8] + low_quality_files = [fa for fa in analysis.file_analyses if fa.severity_score < 5] + + quality_data = [ + ['Quality Level', 'Files', 'Percentage'], + ['High Quality (8-10)', str(len(high_quality_files)), f"{len(high_quality_files)/len(analysis.file_analyses)*100:.1f}%"], + ['Medium Quality (5-7)', str(len(medium_quality_files)), f"{len(medium_quality_files)/len(analysis.file_analyses)*100:.1f}%"], + ['Low Quality (1-4)', str(len(low_quality_files)), f"{len(low_quality_files)/len(analysis.file_analyses)*100:.1f}%"] + ] + + quality_table = Table(quality_data) + quality_table.setStyle(TableStyle([ + ('BACKGROUND', (0, 0), (-1, 0), colors.grey), + ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke), + ('ALIGN', (0, 0), (-1, -1), 'CENTER'), + ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'), + ('GRID', (0, 0), (-1, -1), 1, colors.black), + ('BACKGROUND', (0, 1), (-1, 1), colors.lightgreen), + ('BACKGROUND', (0, 2), (-1, 2), colors.lightyellow), + ('BACKGROUND', (0, 3), (-1, 3), colors.lightcoral) + ])) + + story.append(quality_table) + story.append(Spacer(1, 20)) + + # Top Issues Found + story.append(Paragraph("Files Requiring Attention", heading_style)) + + # Sort files by severity (lowest scores first - need most attention) + files_by_priority = sorted(analysis.file_analyses, key=lambda x: x.severity_score) + + for i, file_analysis in enumerate(files_by_priority[:15]): # Top 15 files needing attention + story.append(Paragraph(f"{i+1}. 
{file_analysis.path}", styles['Heading4'])) + story.append(Paragraph(f"Language: {file_analysis.language} | Quality Score: {file_analysis.severity_score:.1f}/10 | Lines: {file_analysis.lines_of_code}", styles['Normal'])) + + # Show top issues + if file_analysis.issues_found: + story.append(Paragraph("Key Issues:", styles['Heading5'])) + for issue in file_analysis.issues_found[:3]: # Top 3 issues + story.append(Paragraph(f"• {issue}", styles['Normal'])) + + # Show top recommendations + if file_analysis.recommendations: + story.append(Paragraph("Recommendations:", styles['Heading5'])) + for rec in file_analysis.recommendations[:2]: # Top 2 recommendations + story.append(Paragraph(f"• {rec}", styles['Normal'])) + + story.append(Spacer(1, 15)) + + # Build PDF + try: + doc.build(story) + print(f"āœ… PDF report generated successfully: {output_path}") + except Exception as e: + print(f"āŒ Error generating PDF: {e}") + + async def analyze_repository(self, repo_path: str, max_files: int = 50) -> RepositoryAnalysis: + """Main analysis function.""" + try: + # Clone/access repository + actual_repo_path = self.clone_repository(repo_path) + + # Scan files + files_to_analyze = self.scan_repository(actual_repo_path, max_files) + + if not files_to_analyze: + raise Exception("No files found to analyze") + + # Analyze each file + print(f"Starting analysis of {len(files_to_analyze)} files...") + file_analyses = [] + + for i, (file_path, content) in enumerate(files_to_analyze): + print(f"Analyzing file {i+1}/{len(files_to_analyze)}: {file_path.name}") + analysis = await self.analyze_file_comprehensive(file_path, content) + file_analyses.append(analysis) + + # Small delay to avoid rate limiting + await asyncio.sleep(0.2) + + # Repository-level analyses + print("Performing repository-level analysis...") + architecture_assessment, security_assessment = await self.analyze_repository_overview( + actual_repo_path, file_analyses) + + # Calculate overall quality score + avg_quality = sum(fa.severity_score for fa in file_analyses) / len(file_analyses) + + # Generate statistics + languages = dict(Counter(fa.language for fa in file_analyses)) + total_lines = sum(fa.lines_of_code for fa in file_analyses) + + # Create repository analysis + repo_analysis = RepositoryAnalysis( + repo_path=repo_path, + total_files=len(file_analyses), + total_lines=total_lines, + languages=languages, + architecture_assessment=architecture_assessment, + security_assessment=security_assessment, + code_quality_score=avg_quality, + file_analyses=file_analyses, + executive_summary="" + ) + + # Generate executive summary + print("Generating executive summary...") + repo_analysis.executive_summary = await self.generate_executive_summary(repo_analysis) + + return repo_analysis + + finally: + # Cleanup + if self.temp_dir and os.path.exists(self.temp_dir): + shutil.rmtree(self.temp_dir) + print("Temporary files cleaned up") + +async def main(): + # Load environment variables + load_dotenv() + + parser = argparse.ArgumentParser(description="Robust GitHub Repository AI Analysis") + parser.add_argument("repo_path", help="Repository path (local directory or Git URL)") + parser.add_argument("--output", "-o", default="repository_analysis.pdf", + help="Output PDF file path") + parser.add_argument("--max-files", type=int, default=50, + help="Maximum files to analyze") + parser.add_argument("--api-key", help="Anthropic API key (overrides .env)") + + args = parser.parse_args() + + # Get API key + api_key = args.api_key or os.getenv('ANTHROPIC_API_KEY') + if not 
api_key: + print("āŒ Error: ANTHROPIC_API_KEY not found in .env file or command line") + print("Please create a .env file with: ANTHROPIC_API_KEY=your_key_here") + return 1 + + try: + print("šŸš€ Starting Repository Analysis") + print("=" * 60) + print(f"Repository: {args.repo_path}") + print(f"Max files: {args.max_files}") + print(f"Output: {args.output}") + print("=" * 60) + + # Initialize analyzer + analyzer = RobustGitHubAnalyzer(api_key) + + # Perform analysis + analysis = await analyzer.analyze_repository(args.repo_path, args.max_files) + + # Generate PDF report + analyzer.create_pdf_report(analysis, args.output) + + # Print summary to console + print("\n" + "=" * 60) + print("šŸŽÆ ANALYSIS COMPLETE") + print("=" * 60) + print(f"šŸ“Š Repository Statistics:") + print(f" • Files Analyzed: {analysis.total_files}") + print(f" • Lines of Code: {analysis.total_lines:,}") + print(f" • Languages: {len(analysis.languages)}") + print(f" • Code Quality: {analysis.code_quality_score:.1f}/10") + + # Quality breakdown + high_quality = len([fa for fa in analysis.file_analyses if fa.severity_score >= 8]) + low_quality = len([fa for fa in analysis.file_analyses if fa.severity_score < 5]) + + print(f"\nšŸ“ˆ Quality Breakdown:") + print(f" • High Quality Files: {high_quality}") + print(f" • Files Needing Attention: {low_quality}") + print(f" • Total Issues Found: {sum(len(fa.issues_found) for fa in analysis.file_analyses)}") + + print(f"\nšŸ“„ Detailed PDF Report: {args.output}") + print("\nāœ… Analysis completed successfully!") + + return 0 + + except Exception as e: + print(f"āŒ Error during analysis: {e}") + return 1 + +if __name__ == "__main__": + exit(asyncio.run(main())) \ No newline at end of file diff --git a/services/ai-analysis-service/ai-analysis/ai_blog_analysis.pdf b/services/ai-analysis-service/ai-analysis/ai_blog_analysis.pdf new file mode 100644 index 0000000..f42992f --- /dev/null +++ b/services/ai-analysis-service/ai-analysis/ai_blog_analysis.pdf @@ -0,0 +1,232 @@ +%PDF-1.4 +%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com +1 0 obj +<< +/F1 2 0 R /F2 3 0 R /F3 9 0 R +>> +endobj +2 0 obj +<< +/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font +>> +endobj +3 0 obj +<< +/BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding /Name /F2 /Subtype /Type1 /Type /Font +>> +endobj +4 0 obj +<< +/Contents 17 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 16 0 R /Resources << +/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] +>> /Rotate 0 /Trans << + +>> + /Type /Page +>> +endobj +5 0 obj +<< +/Contents 18 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 16 0 R /Resources << +/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] +>> /Rotate 0 /Trans << + +>> + /Type /Page +>> +endobj +6 0 obj +<< +/Contents 19 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 16 0 R /Resources << +/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] +>> /Rotate 0 /Trans << + +>> + /Type /Page +>> +endobj +7 0 obj +<< +/Contents 20 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 16 0 R /Resources << +/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] +>> /Rotate 0 /Trans << + +>> + /Type /Page +>> +endobj +8 0 obj +<< +/Contents 21 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 16 0 R /Resources << +/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] +>> /Rotate 0 /Trans << + +>> + /Type /Page +>> +endobj +9 0 obj +<< +/BaseFont /Helvetica-BoldOblique /Encoding /WinAnsiEncoding /Name /F3 /Subtype /Type1 /Type /Font +>> +endobj 
[binary content omitted: ai_blog_analysis.pdf is a 9-page ReportLab-generated PDF (created 2025-09-19); its ASCII85/Flate-compressed page streams, cross-reference table, and trailer are not human-readable]
diff --git a/services/ai-analysis-service/ai-analysis/analysis_report.md b/services/ai-analysis-service/ai-analysis/analysis_report.md new file mode 100644 index 0000000..d2da737 --- /dev/null +++
b/services/ai-analysis-service/ai-analysis/analysis_report.md @@ -0,0 +1,363 @@ + +# GitHub Repository Analysis Report + +**Repository:** https://github.com/TejasTeju-dev/AI-Blog +**Analysis Date:** 2025-09-19 11:09:14 +**Analyzed by:** Claude AI Assistant + +--- + +## Executive Summary + +Let me provide a comprehensive analysis: + +1. **Project Type & Purpose**: +This appears to be a modern web application built with Next.js, likely a blog or content platform with articles and topics sections. The extensive UI component library suggests it's a full-featured web application with a sophisticated user interface. + +2. **Technology Stack**: +- Frontend Framework: Next.js (React) +- Language: TypeScript +- Styling: Tailwind CSS +- Package Manager: pnpm +- UI Components: Extensive component library (possibly using shadcn/ui) +- State Management: Custom hooks +- Animations: Multiple background animation components + +3. **Architecture Overview**: +The project follows Next.js 13+ App Router structure: +``` +app/ # Main application routes +components/ # Reusable UI components +hooks/ # Custom React hooks +lib/ # Utility functions +public/ # Static assets +styles/ # Global styles +``` + +4. **Key Components**: +- **UI Components**: Comprehensive set of 40+ UI components including: + - Basic elements (Button, Input, Form) + - Navigation (Navbar, Menu, Breadcrumb) + - Feedback (Toast, Alert, Dialog) + - Data display (Table, Chart, Card) + - Layout (Grid, Sidebar) +- **Background Components**: + - AnimatedGrid + - FloatingElements + - ParticleField + - 3DBackground +- **Core Pages**: + - Home (page.tsx) + - Articles + - Blog + - Topics + - About + +5. **Development Setup**: +Required setup likely includes: +```bash +# Install dependencies +pnpm install + +# Development server +pnpm dev + +# Build +pnpm build +``` + +Requirements: +- Node.js +- pnpm +- TypeScript knowledge +- Understanding of Next.js and React + +6. **Code Quality Assessment**: +Strengths: +- Well-organized directory structure +- Consistent use of TypeScript +- Modular component architecture +- Separation of concerns (UI components, hooks, pages) +- Comprehensive UI component library +- Modern development practices (App Router, TypeScript) + +Areas for consideration: +- Large number of UI components might indicate need for documentation +- Multiple background components might need performance optimization +- Could benefit from API documentation +- Might need testing infrastructure (not visible in structure) + +Additional Observations: +- The project uses modern React patterns (hooks) +- Strong focus on UI/UX with multiple animation options +- Built with scalability in mind (modular structure) +- Follows Next.js best practices +- Uses modern tooling (pnpm, TypeScript, Tailwind) + +This appears to be a well-structured, modern web application with a strong focus on UI components and user experience. The architecture suggests it's built for scalability and maintainability. + +--- + +## Detailed Code Analysis + +I'll analyze each aspect of this Next.js project: + +1. **Code Quality** +- Strong TypeScript usage with proper type definitions and configurations +- Consistent code formatting and organization following Next.js 13+ conventions +- Clean project structure with clear separation of concerns +- Good use of modern React patterns and Next.js features +- Well-structured configuration files (next.config.js, tailwind.config.js, etc.) +- Follows React best practices with components organization + +2. 
**Design Patterns** +- Component-based architecture following React principles +- Server-side rendering approach using Next.js App Router +- Atomic design pattern evident in UI components organization +- Utility-first CSS approach with Tailwind +- Singleton pattern for configuration management +- Dependency injection through React context (seen in theme implementation) + +3. **Key Dependencies** +- Core: Next.js 14.2, React 19, TypeScript +- UI: Radix UI components, Tailwind CSS, shadcn/ui +- 3D: Three.js, React Three Fiber +- Forms: React Hook Form, Zod validation +- Utilities: clsx, tailwind-merge +- Development: PostCSS, TypeScript, ESLint + +4. **Potential Issues** +- Build errors being ignored (typescript.ignoreBuildErrors, eslint.ignoreDuringBuilds) +- Unoptimized images configuration could impact performance +- Missing error boundaries and proper error handling +- Security considerations for client-side rendering of 3D content +- No explicit API rate limiting or security headers +- Missing proper environment variable handling + +5. **Testing Strategy** +- No visible testing setup (Jest, React Testing Library, etc.) +- Missing unit tests, integration tests, and e2e tests +- Should add testing framework and implement test coverage +- Consider adding Cypress or Playwright for e2e testing + +6. **Documentation** +- Good README with clear project structure and setup instructions +- Missing JSDoc comments for components and functions +- Could benefit from more inline documentation +- API documentation could be improved +- Missing contribution guidelines and deployment docs + +7. **Maintainability** +Strengths: +- Clear project structure +- Modern tooling and frameworks +- Type safety with TypeScript +- Component modularity +- Consistent coding style + +Areas for Improvement: +- Add comprehensive testing +- Improve error handling +- Better documentation +- Implement proper CI/CD +- Add proper logging system +- Consider performance monitoring + +Additional Recommendations: + +1. Security: +```typescript +// Add security headers +const securityHeaders = [ + { key: 'X-XSS-Protection', value: '1; mode=block' }, + { key: 'X-Frame-Options', value: 'SAMEORIGIN' }, + { key: 'X-Content-Type-Options', value: 'nosniff' }, +] +``` + +2. Error Handling: +```typescript +// Add error boundary component +class ErrorBoundary extends React.Component { + static getDerivedStateFromError(error) { + return { hasError: true }; + } + + componentDidCatch(error, errorInfo) { + // Log error to service + } +} +``` + +3. Testing Setup: +```json +// Add to package.json +{ + "jest": { + "setupFilesAfterEnv": ["/jest.setup.js"], + "testEnvironment": "jsdom" + }, + "scripts": { + "test": "jest", + "test:watch": "jest --watch", + "test:coverage": "jest --coverage" + } +} +``` + +4. Performance Monitoring: +```typescript +// Add performance monitoring +export function reportWebVitals(metric) { + if (metric.label === 'web-vital') { + console.log(metric); // Send to analytics + } +} +``` + +The project has a solid foundation but would benefit from these improvements for production readiness. + +--- + +## Security & Best Practices Analysis + +I'll analyze the repository based on the provided files and structure: + +1. 
**Security Issues**: +- āš ļø ESLint and TypeScript build errors are being ignored (`ignoreDuringBuilds: true` and `ignoreBuildErrors: true`), which could mask security-related issues +- āš ļø Image optimization is disabled (`unoptimized: true`), which could lead to performance and security concerns +- āœ… Remote image patterns are properly restricted to specific domains (unsplash.com) +- āš ļø No explicit CSP (Content Security Policy) configuration visible + +2. **Secret Management**: +- āœ… Uses environment variables (process.env) +- āš ļø No visible secret management solution or environment validation +- šŸ” Recommend implementing a secret management solution (e.g., Vault, AWS Secrets Manager) + +3. **Dependencies**: +- Cannot fully assess without package.json +- Using Next.js and Tailwind CSS which are generally well-maintained +- šŸ” Recommend implementing dependency scanning (e.g., Snyk, OWASP Dependency-Check) + +4. **Best Practices**: +āœ… Good: +- TypeScript implementation with strict mode enabled +- Proper module resolution and ES6 target +- Well-organized file structure +- Using modern module systems +- Proper tailwind configuration + +āš ļø Concerns: +- Disabling TypeScript and ESLint checks in production +- Multiple next.config files (both .js and .mjs) +- No visible testing configuration + +5. **Configuration**: +āœ… Good: +- Environment-based configuration for basePath +- Proper TypeScript configuration +- Well-structured Tailwind configuration + +āš ļø Concerns: +- Duplicate next.config files might cause confusion +- Some hardcoded values could be externalized +- No visible staging/production environment separation + +6. **Error Handling**: +- Cannot fully assess without application code +- āš ļø Disabling TypeScript and ESLint checks could mask error handling issues +- šŸ” Recommend implementing proper error boundaries and logging + +7. **Recommendations**: + +Security: +```typescript +// Enable TypeScript and ESLint checks +const nextConfig = { + eslint: { + ignoreDuringBuilds: false, + }, + typescript: { + ignoreBuildErrors: false, + } +} +``` + +Configuration: +```javascript +// Consolidate next.config files +// Add proper environment validation +const validateEnv = () => { + const required = ['API_KEY', 'DATABASE_URL']; + required.forEach(key => { + if (!process.env[key]) throw new Error(`Missing ${key}`); + }); +} +``` + +Best Practices: +1. Implement proper CSP: +```javascript +// next.config.js +{ + async headers() { + return [ + { + source: '/:path*', + headers: [ + { + key: 'Content-Security-Policy', + value: "default-src 'self';" + } + ] + } + ] + } +} +``` + +2. Enable image optimization: +```javascript +images: { + unoptimized: false, + domains: ['images.unsplash.com'], +} +``` + +Additional Recommendations: +1. Implement security headers +2. Add input validation +3. Set up proper error boundaries +4. Add proper testing configuration +5. Implement API rate limiting +6. Add security scanning in CI/CD +7. Implement proper logging +8. Add environment validation +9. Consider implementing authentication/authorization +10. Add proper CORS configuration + +Environment Setup: +```bash +# .env.example +NODE_ENV=development +API_KEY= +DATABASE_URL= +``` + +This analysis is based on the configuration files provided. For a more comprehensive security assessment, access to the actual application code, API endpoints, and authentication mechanisms would be needed. 
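+
+As a follow-up to the configuration findings above, checks like these are easy to automate. The sketch below is hypothetical (it is not part of this repository or of the analysis service); the helper name `scan_next_config` and the regex patterns are illustrative assumptions that simply grep a checkout for the risky `next.config` flags called out in this section:
+
+```python
+# Hypothetical helper: flag risky Next.js build settings in a cloned repo.
+import re
+import sys
+from pathlib import Path
+
+RISKY_PATTERNS = {
+    "typescript.ignoreBuildErrors": re.compile(r"ignoreBuildErrors\s*:\s*true"),
+    "eslint.ignoreDuringBuilds": re.compile(r"ignoreDuringBuilds\s*:\s*true"),
+    "images.unoptimized": re.compile(r"unoptimized\s*:\s*true"),
+}
+
+def scan_next_config(repo_root: str) -> dict:
+    """Return {config_file: [flag_name, ...]} for any risky flags found."""
+    findings = {}
+    for config in Path(repo_root).glob("next.config.*"):
+        text = config.read_text(encoding="utf-8", errors="ignore")
+        hits = [name for name, pattern in RISKY_PATTERNS.items() if pattern.search(text)]
+        if hits:
+            findings[str(config)] = hits
+    return findings
+
+if __name__ == "__main__":
+    print(scan_next_config(sys.argv[1] if len(sys.argv) > 1 else "."))
+```
+
+Running it against the repository root would list each `next.config.*` file together with the flags it sets, which could feed directly into the recommendations below.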
+ +--- + +## Recommendations Summary + +Based on the analysis, here are the key recommendations for this repository: + +1. **Immediate Actions**: Critical issues that should be addressed promptly +2. **Code Quality Improvements**: Suggestions for better maintainability +3. **Security Enhancements**: Steps to improve security posture +4. **Documentation**: Areas where documentation could be enhanced +5. **Architecture**: Potential architectural improvements + +--- + +*This analysis was generated using AI and should be reviewed by human developers for accuracy and context.* diff --git a/services/ai-analysis-service/ai-analysis/app.py b/services/ai-analysis-service/ai-analysis/app.py new file mode 100644 index 0000000..58a89d9 --- /dev/null +++ b/services/ai-analysis-service/ai-analysis/app.py @@ -0,0 +1,391 @@ +#!/usr/bin/env python3 +""" +GitHub Repository AI Analysis Tool +Analyzes GitHub repositories using Claude API for comprehensive code insights. +""" + +import os +import git +import json +import requests +import tempfile +import shutil +from pathlib import Path +from typing import Dict, List, Optional, Tuple +import argparse +from datetime import datetime +import mimetypes +import base64 + +class GitHubRepoAnalyzer: + def __init__(self, anthropic_api_key: str): + self.api_key = anthropic_api_key + self.api_url = "https://api.anthropic.com/v1/messages" + self.temp_dir = None + + # File extensions to analyze + self.code_extensions = { + '.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.cpp', '.c', '.h', + '.cs', '.php', '.rb', '.go', '.rs', '.swift', '.kt', '.scala', + '.html', '.css', '.scss', '.sass', '.less', '.vue', '.svelte', + '.sql', '.sh', '.bash', '.yml', '.yaml', '.json', '.xml', + '.dockerfile', '.md', '.rst', '.txt' + } + + # Files to always include in analysis + self.important_files = { + 'README.md', 'readme.md', 'README.txt', 'readme.txt', + 'package.json', 'requirements.txt', 'Cargo.toml', 'pom.xml', + 'build.gradle', 'Makefile', 'dockerfile', 'Dockerfile', + 'docker-compose.yml', '.gitignore', 'setup.py', 'pyproject.toml' + } + + def clone_repository(self, repo_url: str) -> str: + """Clone GitHub repository to temporary directory.""" + print(f"Cloning repository: {repo_url}") + + self.temp_dir = tempfile.mkdtemp(prefix="github_analysis_") + + try: + git.Repo.clone_from(repo_url, self.temp_dir) + print(f"Repository cloned to: {self.temp_dir}") + return self.temp_dir + except git.exc.GitCommandError as e: + raise Exception(f"Failed to clone repository: {e}") + + def get_file_info(self, file_path: Path) -> Dict: + """Get file information and content.""" + try: + # Check file size (skip files larger than 1MB) + if file_path.stat().st_size > 1024 * 1024: + return { + 'path': str(file_path.relative_to(self.temp_dir)), + 'size': file_path.stat().st_size, + 'content': '[File too large to analyze]', + 'encoding': 'skipped' + } + + # Try to read as text + try: + with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: + content = f.read() + encoding = 'utf-8' + except UnicodeDecodeError: + # If text fails, try binary for certain file types + with open(file_path, 'rb') as f: + raw_content = f.read() + if len(raw_content) < 10000: # Only encode small binary files + content = base64.b64encode(raw_content).decode('ascii') + encoding = 'base64' + else: + content = '[Binary file - content not included]' + encoding = 'binary' + + return { + 'path': str(file_path.relative_to(self.temp_dir)), + 'size': file_path.stat().st_size, + 'content': content, + 'encoding': encoding, + 
'mime_type': mimetypes.guess_type(str(file_path))[0] + } + except Exception as e: + return { + 'path': str(file_path.relative_to(self.temp_dir)), + 'error': str(e), + 'content': '[Error reading file]' + } + + def scan_repository(self, max_files: int = 50) -> Dict: + """Scan repository and collect file information.""" + print("Scanning repository structure...") + + repo_data = { + 'structure': [], + 'files': [], + 'stats': { + 'total_files': 0, + 'analyzed_files': 0, + 'total_size': 0, + 'languages': {} + } + } + + # Get directory structure + for root, dirs, files in os.walk(self.temp_dir): + # Skip hidden directories and common build/cache directories + dirs[:] = [d for d in dirs if not d.startswith('.') and + d not in {'node_modules', '__pycache__', 'build', 'dist', 'target', 'venv', 'env'}] + + level = root.replace(self.temp_dir, '').count(os.sep) + indent = ' ' * level + folder_name = os.path.basename(root) if root != self.temp_dir else '.' + repo_data['structure'].append(f"{indent}{folder_name}/") + + # Process files + for file in files: + if file.startswith('.'): + continue + + file_path = Path(root) / file + repo_data['stats']['total_files'] += 1 + repo_data['stats']['total_size'] += file_path.stat().st_size + + # Track languages + ext = file_path.suffix.lower() + if ext: + repo_data['stats']['languages'][ext] = repo_data['stats']['languages'].get(ext, 0) + 1 + + # Add to structure + repo_data['structure'].append(f"{indent} {file}") + + # Decide if we should analyze this file + should_analyze = ( + file.lower() in self.important_files or + ext in self.code_extensions or + repo_data['stats']['analyzed_files'] < max_files + ) + + if should_analyze and repo_data['stats']['analyzed_files'] < max_files: + file_info = self.get_file_info(file_path) + repo_data['files'].append(file_info) + repo_data['stats']['analyzed_files'] += 1 + + return repo_data + + def call_claude_api(self, prompt: str, max_tokens: int = 4000) -> str: + """Make API call to Claude.""" + headers = { + "Content-Type": "application/json", + "x-api-key": self.api_key, + "anthropic-version": "2023-06-01" + } + + data = { + "model": "claude-3-5-sonnet-20241022", + "max_tokens": max_tokens, + "messages": [ + {"role": "user", "content": prompt} + ] + } + + try: + response = requests.post(self.api_url, headers=headers, json=data) + response.raise_for_status() + + result = response.json() + return result['content'][0]['text'] + except requests.exceptions.RequestException as e: + raise Exception(f"API request failed: {e}") + + def analyze_repository_overview(self, repo_data: Dict) -> str: + """Get high-level repository analysis.""" + print("Analyzing repository overview...") + + structure_summary = "\n".join(repo_data['structure'][:100]) # Limit structure size + + prompt = f""" +Analyze this GitHub repository and provide a comprehensive overview: + +REPOSITORY STRUCTURE: +{structure_summary} + +STATISTICS: +- Total files: {repo_data['stats']['total_files']} +- Files analyzed: {repo_data['stats']['analyzed_files']} +- Total size: {repo_data['stats']['total_size']} bytes +- Languages found: {dict(list(repo_data['stats']['languages'].items())[:10])} + +Please provide: +1. **Project Type & Purpose**: What kind of project is this? +2. **Technology Stack**: What technologies, frameworks, and languages are used? +3. **Architecture Overview**: How is the project structured? +4. **Key Components**: What are the main modules/components? +5. **Development Setup**: What's needed to run this project? +6. 
**Code Quality Assessment**: Initial observations about code organization +""" + + return self.call_claude_api(prompt) + + def analyze_code_files(self, repo_data: Dict) -> str: + """Analyze individual code files.""" + print("Analyzing code files...") + + # Prepare file contents for analysis + files_content = [] + for file_info in repo_data['files'][:20]: # Limit to first 20 files + if file_info.get('encoding') == 'utf-8' and len(file_info.get('content', '')) < 5000: + files_content.append(f"=== {file_info['path']} ===\n{file_info['content']}\n") + + files_text = "\n".join(files_content) + + prompt = f""" +Analyze these key files from the repository: + +{files_text} + +Please provide detailed analysis covering: +1. **Code Quality**: Code style, organization, and best practices +2. **Design Patterns**: What patterns and architectural approaches are used? +3. **Dependencies & Libraries**: Key external dependencies identified +4. **Potential Issues**: Any code smells, security concerns, or improvements needed +5. **Testing Strategy**: How is testing implemented (if at all)? +6. **Documentation**: Quality of inline documentation and comments +7. **Maintainability**: How maintainable and extensible is this code? +""" + + return self.call_claude_api(prompt, max_tokens=6000) + + def analyze_security_and_best_practices(self, repo_data: Dict) -> str: + """Analyze security and best practices.""" + print("Analyzing security and best practices...") + + # Look for security-sensitive files + security_files = [] + for file_info in repo_data['files']: + path_lower = file_info['path'].lower() + if any(term in path_lower for term in ['config', 'env', 'secret', 'key', 'auth', 'security']): + if file_info.get('encoding') == 'utf-8': + security_files.append(f"=== {file_info['path']} ===\n{file_info['content'][:2000]}\n") + + security_content = "\n".join(security_files[:10]) + + prompt = f""" +Analyze this repository for security and best practices: + +SECURITY-RELEVANT FILES: +{security_content} + +FILE STRUCTURE ANALYSIS: +{json.dumps(repo_data['stats'], indent=2)} + +Please analyze: +1. **Security Issues**: Potential security vulnerabilities or concerns +2. **Secret Management**: How are secrets/credentials handled? +3. **Dependencies**: Are there any vulnerable dependencies? +4. **Best Practices**: Adherence to language/framework best practices +5. **Configuration**: Are configurations properly externalized? +6. **Error Handling**: How are errors handled throughout the codebase? +7. **Recommendations**: Specific suggestions for improvement +""" + + return self.call_claude_api(prompt, max_tokens=5000) + + def generate_comprehensive_report(self, repo_url: str, overview: str, code_analysis: str, security_analysis: str) -> str: + """Generate final comprehensive report.""" + print("Generating comprehensive report...") + + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + report = f""" +# GitHub Repository Analysis Report + +**Repository:** {repo_url} +**Analysis Date:** {timestamp} +**Analyzed by:** Claude AI Assistant + +--- + +## Executive Summary + +{overview} + +--- + +## Detailed Code Analysis + +{code_analysis} + +--- + +## Security & Best Practices Analysis + +{security_analysis} + +--- + +## Recommendations Summary + +Based on the analysis, here are the key recommendations for this repository: + +1. **Immediate Actions**: Critical issues that should be addressed promptly +2. **Code Quality Improvements**: Suggestions for better maintainability +3. 
**Security Enhancements**: Steps to improve security posture +4. **Documentation**: Areas where documentation could be enhanced +5. **Architecture**: Potential architectural improvements + +--- + +*This analysis was generated using AI and should be reviewed by human developers for accuracy and context.* +""" + + return report + + def analyze(self, repo_url: str, output_file: Optional[str] = None) -> str: + """Main analysis function.""" + try: + # Clone repository + self.clone_repository(repo_url) + + # Scan repository structure and files + repo_data = self.scan_repository() + + # Perform different types of analysis + overview = self.analyze_repository_overview(repo_data) + code_analysis = self.analyze_code_files(repo_data) + security_analysis = self.analyze_security_and_best_practices(repo_data) + + # Generate comprehensive report + final_report = self.generate_comprehensive_report( + repo_url, overview, code_analysis, security_analysis + ) + + # Save report if output file specified + if output_file: + with open(output_file, 'w', encoding='utf-8') as f: + f.write(final_report) + print(f"Report saved to: {output_file}") + + return final_report + + finally: + # Cleanup temporary directory + if self.temp_dir and os.path.exists(self.temp_dir): + shutil.rmtree(self.temp_dir) + print("Temporary files cleaned up") + +def main(): + parser = argparse.ArgumentParser(description="Analyze GitHub repository using Claude AI") + parser.add_argument("repo_url", help="GitHub repository URL") + parser.add_argument("--api-key", required=True, help="Anthropic API key") + parser.add_argument("--output", "-o", help="Output file path (optional)") + parser.add_argument("--max-files", type=int, default=50, help="Maximum files to analyze") + + args = parser.parse_args() + + # Initialize analyzer + analyzer = GitHubRepoAnalyzer(args.api_key) + + try: + print("Starting GitHub repository analysis...") + print("=" * 50) + + # Perform analysis + report = analyzer.analyze(args.repo_url, args.output) + + # Print report if no output file specified + if not args.output: + print("\n" + "=" * 50) + print("ANALYSIS REPORT") + print("=" * 50) + print(report) + + print("\nAnalysis completed successfully!") + + except Exception as e: + print(f"Error during analysis: {e}") + return 1 + + return 0 + +if __name__ == "__main__": + exit(main()) \ No newline at end of file diff --git a/services/ai-analysis-service/ai-analysis/github_analyzer.py b/services/ai-analysis-service/ai-analysis/github_analyzer.py new file mode 100644 index 0000000..3eadf50 --- /dev/null +++ b/services/ai-analysis-service/ai-analysis/github_analyzer.py @@ -0,0 +1,391 @@ +#!/usr/bin/env python3 +""" +GitHub Repository AI Analysis Tool +Analyzes GitHub repositories using Claude API for comprehensive code insights. +""" + +import os +import git +import json +import tempfile +import shutil +from pathlib import Path +from typing import Dict, List, Optional, Tuple +import argparse +from datetime import datetime +import mimetypes +import base64 +from dotenv import load_dotenv +import anthropic + +class GitHubRepoAnalyzer: + def __init__(self, anthropic_api_key: str = None): + # Load environment variables + load_dotenv() + + # Get API key from parameter or environment + self.api_key = anthropic_api_key or os.getenv('ANTHROPIC_API_KEY') + if not self.api_key: + raise ValueError("Anthropic API key not found. 
Please set ANTHROPIC_API_KEY in .env file or pass as parameter.") + + # Initialize Anthropic client + self.client = anthropic.Anthropic(api_key=self.api_key) + self.temp_dir = None + + # File extensions to analyze + self.code_extensions = { + '.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.cpp', '.c', '.h', + '.cs', '.php', '.rb', '.go', '.rs', '.swift', '.kt', '.scala', + '.html', '.css', '.scss', '.sass', '.less', '.vue', '.svelte', + '.sql', '.sh', '.bash', '.yml', '.yaml', '.json', '.xml', + '.dockerfile', '.md', '.rst', '.txt' + } + + # Files to always include in analysis + self.important_files = { + 'README.md', 'readme.md', 'README.txt', 'readme.txt', + 'package.json', 'requirements.txt', 'Cargo.toml', 'pom.xml', + 'build.gradle', 'Makefile', 'dockerfile', 'Dockerfile', + 'docker-compose.yml', '.gitignore', 'setup.py', 'pyproject.toml' + } + + def clone_repository(self, repo_url: str) -> str: + """Clone GitHub repository to temporary directory.""" + print(f"Cloning repository: {repo_url}") + + self.temp_dir = tempfile.mkdtemp(prefix="github_analysis_") + + try: + git.Repo.clone_from(repo_url, self.temp_dir) + print(f"Repository cloned to: {self.temp_dir}") + return self.temp_dir + except git.exc.GitCommandError as e: + raise Exception(f"Failed to clone repository: {e}") + + def get_file_info(self, file_path: Path) -> Dict: + """Get file information and content.""" + try: + # Check file size (skip files larger than 1MB) + if file_path.stat().st_size > 1024 * 1024: + return { + 'path': str(file_path.relative_to(self.temp_dir)), + 'size': file_path.stat().st_size, + 'content': '[File too large to analyze]', + 'encoding': 'skipped' + } + + # Try to read as text + try: + with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: + content = f.read() + encoding = 'utf-8' + except UnicodeDecodeError: + # If text fails, try binary for certain file types + with open(file_path, 'rb') as f: + raw_content = f.read() + if len(raw_content) < 10000: # Only encode small binary files + content = base64.b64encode(raw_content).decode('ascii') + encoding = 'base64' + else: + content = '[Binary file - content not included]' + encoding = 'binary' + + return { + 'path': str(file_path.relative_to(self.temp_dir)), + 'size': file_path.stat().st_size, + 'content': content, + 'encoding': encoding, + 'mime_type': mimetypes.guess_type(str(file_path))[0] + } + except Exception as e: + return { + 'path': str(file_path.relative_to(self.temp_dir)), + 'error': str(e), + 'content': '[Error reading file]' + } + + def scan_repository(self, max_files: int = 50) -> Dict: + """Scan repository and collect file information.""" + print("Scanning repository structure...") + + repo_data = { + 'structure': [], + 'files': [], + 'stats': { + 'total_files': 0, + 'analyzed_files': 0, + 'total_size': 0, + 'languages': {} + } + } + + # Get directory structure + for root, dirs, files in os.walk(self.temp_dir): + # Skip hidden directories and common build/cache directories + dirs[:] = [d for d in dirs if not d.startswith('.') and + d not in {'node_modules', '__pycache__', 'build', 'dist', 'target', 'venv', 'env'}] + + level = root.replace(self.temp_dir, '').count(os.sep) + indent = ' ' * level + folder_name = os.path.basename(root) if root != self.temp_dir else '.' 
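+            # Note: the 'dirs[:] = [...]' slice assignment above mutates the list
+            # provided by os.walk() in place, which is what actually prunes the
+            # skipped directories from traversal; rebinding 'dirs' to a new list
+            # would have no effect on which subdirectories os.walk() visits.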
+ repo_data['structure'].append(f"{indent}{folder_name}/") + + # Process files + for file in files: + if file.startswith('.'): + continue + + file_path = Path(root) / file + repo_data['stats']['total_files'] += 1 + repo_data['stats']['total_size'] += file_path.stat().st_size + + # Track languages + ext = file_path.suffix.lower() + if ext: + repo_data['stats']['languages'][ext] = repo_data['stats']['languages'].get(ext, 0) + 1 + + # Add to structure + repo_data['structure'].append(f"{indent} {file}") + + # Decide if we should analyze this file + should_analyze = ( + file.lower() in self.important_files or + ext in self.code_extensions or + repo_data['stats']['analyzed_files'] < max_files + ) + + if should_analyze and repo_data['stats']['analyzed_files'] < max_files: + file_info = self.get_file_info(file_path) + repo_data['files'].append(file_info) + repo_data['stats']['analyzed_files'] += 1 + + return repo_data + + def call_claude_api(self, prompt: str, max_tokens: int = 4000) -> str: + """Make API call to Claude using official Anthropic client.""" + try: + message = self.client.messages.create( + model="claude-3-sonnet-20240229", + max_tokens=max_tokens, + messages=[ + {"role": "user", "content": prompt} + ] + ) + + return message.content[0].text + + except Exception as e: + raise Exception(f"Claude API call failed: {e}") + + def analyze_repository_overview(self, repo_data: Dict) -> str: + """Get high-level repository analysis.""" + print("Analyzing repository overview...") + + structure_summary = "\n".join(repo_data['structure'][:100]) # Limit structure size + + prompt = f""" +Analyze this GitHub repository and provide a comprehensive overview: + +REPOSITORY STRUCTURE: +{structure_summary} + +STATISTICS: +- Total files: {repo_data['stats']['total_files']} +- Files analyzed: {repo_data['stats']['analyzed_files']} +- Total size: {repo_data['stats']['total_size']} bytes +- Languages found: {dict(list(repo_data['stats']['languages'].items())[:10])} + +Please provide: +1. **Project Type & Purpose**: What kind of project is this? +2. **Technology Stack**: What technologies, frameworks, and languages are used? +3. **Architecture Overview**: How is the project structured? +4. **Key Components**: What are the main modules/components? +5. **Development Setup**: What's needed to run this project? +6. **Code Quality Assessment**: Initial observations about code organization +""" + + return self.call_claude_api(prompt) + + def analyze_code_files(self, repo_data: Dict) -> str: + """Analyze individual code files.""" + print("Analyzing code files...") + + # Prepare file contents for analysis + files_content = [] + for file_info in repo_data['files'][:20]: # Limit to first 20 files + if file_info.get('encoding') == 'utf-8' and len(file_info.get('content', '')) < 5000: + files_content.append(f"=== {file_info['path']} ===\n{file_info['content']}\n") + + files_text = "\n".join(files_content) + + prompt = f""" +Analyze these key files from the repository: + +{files_text} + +Please provide detailed analysis covering: +1. **Code Quality**: Code style, organization, and best practices +2. **Design Patterns**: What patterns and architectural approaches are used? +3. **Dependencies & Libraries**: Key external dependencies identified +4. **Potential Issues**: Any code smells, security concerns, or improvements needed +5. **Testing Strategy**: How is testing implemented (if at all)? +6. **Documentation**: Quality of inline documentation and comments +7. 
**Maintainability**: How maintainable and extensible is this code? +""" + + return self.call_claude_api(prompt, max_tokens=6000) + + def analyze_security_and_best_practices(self, repo_data: Dict) -> str: + """Analyze security and best practices.""" + print("Analyzing security and best practices...") + + # Look for security-sensitive files + security_files = [] + for file_info in repo_data['files']: + path_lower = file_info['path'].lower() + if any(term in path_lower for term in ['config', 'env', 'secret', 'key', 'auth', 'security']): + if file_info.get('encoding') == 'utf-8': + security_files.append(f"=== {file_info['path']} ===\n{file_info['content'][:2000]}\n") + + security_content = "\n".join(security_files[:10]) + + prompt = f""" +Analyze this repository for security and best practices: + +SECURITY-RELEVANT FILES: +{security_content} + +FILE STRUCTURE ANALYSIS: +{json.dumps(repo_data['stats'], indent=2)} + +Please analyze: +1. **Security Issues**: Potential security vulnerabilities or concerns +2. **Secret Management**: How are secrets/credentials handled? +3. **Dependencies**: Are there any vulnerable dependencies? +4. **Best Practices**: Adherence to language/framework best practices +5. **Configuration**: Are configurations properly externalized? +6. **Error Handling**: How are errors handled throughout the codebase? +7. **Recommendations**: Specific suggestions for improvement +""" + + return self.call_claude_api(prompt, max_tokens=5000) + + def generate_comprehensive_report(self, repo_url: str, overview: str, code_analysis: str, security_analysis: str) -> str: + """Generate final comprehensive report.""" + print("Generating comprehensive report...") + + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + report = f""" +# GitHub Repository Analysis Report + +**Repository:** {repo_url} +**Analysis Date:** {timestamp} +**Analyzed by:** Claude AI Assistant + +--- + +## Executive Summary + +{overview} + +--- + +## Detailed Code Analysis + +{code_analysis} + +--- + +## Security & Best Practices Analysis + +{security_analysis} + +--- + +## Recommendations Summary + +Based on the analysis, here are the key recommendations for this repository: + +1. **Immediate Actions**: Critical issues that should be addressed promptly +2. **Code Quality Improvements**: Suggestions for better maintainability +3. **Security Enhancements**: Steps to improve security posture +4. **Documentation**: Areas where documentation could be enhanced +5. 
**Architecture**: Potential architectural improvements + +--- + +*This analysis was generated using AI and should be reviewed by human developers for accuracy and context.* +""" + + return report + + def analyze(self, repo_url: str, output_file: Optional[str] = None) -> str: + """Main analysis function.""" + try: + # Clone repository + self.clone_repository(repo_url) + + # Scan repository structure and files + repo_data = self.scan_repository() + + # Perform different types of analysis + overview = self.analyze_repository_overview(repo_data) + code_analysis = self.analyze_code_files(repo_data) + security_analysis = self.analyze_security_and_best_practices(repo_data) + + # Generate comprehensive report + final_report = self.generate_comprehensive_report( + repo_url, overview, code_analysis, security_analysis + ) + + # Save report if output file specified + if output_file: + with open(output_file, 'w', encoding='utf-8') as f: + f.write(final_report) + print(f"Report saved to: {output_file}") + + return final_report + + finally: + # Cleanup temporary directory + if self.temp_dir and os.path.exists(self.temp_dir): + shutil.rmtree(self.temp_dir) + print("Temporary files cleaned up") + +def main(): + parser = argparse.ArgumentParser(description="Analyze GitHub repository using Claude AI") + parser.add_argument("repo_url", help="GitHub repository URL") + parser.add_argument("--api-key", help="Anthropic API key (optional if set in .env)") + parser.add_argument("--output", "-o", help="Output file path (optional)") + parser.add_argument("--max-files", type=int, default=50, help="Maximum files to analyze") + + args = parser.parse_args() + + try: + # Initialize analyzer + analyzer = GitHubRepoAnalyzer(args.api_key) + + print("Starting GitHub repository analysis...") + print("=" * 50) + + # Perform analysis + report = analyzer.analyze(args.repo_url, args.output) + + # Print report if no output file specified + if not args.output: + print("\n" + "=" * 50) + print("ANALYSIS REPORT") + print("=" * 50) + print(report) + + print("\nAnalysis completed successfully!") + + except Exception as e: + print(f"Error during analysis: {e}") + return 1 + + return 0 + +if __name__ == "__main__": + exit(main()) \ No newline at end of file diff --git a/services/ai-analysis-service/ai-analysis/requirements.txt b/services/ai-analysis-service/ai-analysis/requirements.txt new file mode 100644 index 0000000..50994fd --- /dev/null +++ b/services/ai-analysis-service/ai-analysis/requirements.txt @@ -0,0 +1,69 @@ +# Core AI and API +anthropic>=0.7.0 +openai>=1.0.0 + +# Environment management +python-dotenv>=1.0.0 + +# Git operations +GitPython>=3.1.0 + +# PDF generation +reportlab>=4.0.0 +matplotlib>=3.7.0 +pillow>=10.0.0 + +# Code analysis and parsing +ast-comments>=1.1.0 +astroid>=3.0.0 +pygments>=2.15.0 +radon>=6.0.1 +bandit>=1.7.5 +flake8>=6.0.0 +pylint>=3.0.0 + +# File operations and utilities +pathlib2>=2.3.7 +chardet>=5.2.0 +python-magic>=0.4.27 + +# Async operations +aiohttp>=3.8.0 +aiofiles>=23.0.0 +asyncio-throttle>=1.0.2 + +# Data processing +pandas>=2.0.0 +numpy>=1.24.0 +python-dateutil>=2.8.0 + +# Web scraping (for additional repo info) +requests>=2.31.0 +beautifulsoup4>=4.12.0 + +# Testing and code quality +pytest>=7.4.0 +pytest-asyncio>=0.21.0 +coverage>=7.3.0 + +# Additional utilities for advanced analysis +networkx>=3.1.0 # For dependency graph analysis +graphviz>=0.20.0 # For visualization +jinja2>=3.1.0 # For report templating +markdown>=3.4.0 # For markdown processing +pyyaml>=6.0.0 # For YAML config files 
+toml>=0.10.2 # For TOML config files +xmltodict>=0.13.0 # For XML processing + +# Performance monitoring +psutil>=5.9.0 +memory-profiler>=0.61.0 + +# Progress bars and UI +tqdm>=4.65.0 +rich>=13.5.0 +click>=8.1.0 + +# Security scanning +safety>=2.3.0 +pip-audit>=2.6.0 \ No newline at end of file diff --git a/services/ai-analysis-service/ai-analyze.py b/services/ai-analysis-service/ai-analyze.py new file mode 100644 index 0000000..7a9ac4b --- /dev/null +++ b/services/ai-analysis-service/ai-analyze.py @@ -0,0 +1,1570 @@ +#!/usr/bin/env python3 +""" +Complete AI Repository Analysis Tool with Memory System +Automatically analyzes ALL files in a repository without limits. + +Features: +- Analyzes ALL files in the repository (no max-files limit) +- No user query required - fully automated analysis +- Memory-enhanced analysis with learning capabilities +- Comprehensive PDF report generation +- Security, architecture, and code quality assessment + +Usage: + python ai-analyze.py /path/to/repo --output analysis.pdf + +Example: + python ai-analyze.py ./my-project --output complete_analysis.pdf +""" + +import os +import asyncio +import hashlib +import json +import uuid +from pathlib import Path +from typing import Dict, List, Optional, Tuple, Any +from datetime import datetime, timedelta +from dataclasses import dataclass, asdict +from collections import defaultdict, Counter +import logging +import tempfile +import shutil +import re + +# Core packages +import anthropic +from dotenv import load_dotenv +import git +import redis +import pymongo +import psycopg2 +from psycopg2.extras import RealDictCursor +import numpy as np + +# PDF generation +from reportlab.lib.pagesizes import A4 +from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle +from reportlab.lib.enums import TA_CENTER, TA_LEFT +from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak, Table, TableStyle +from reportlab.lib import colors + +# Enhanced dataclasses for memory system +@dataclass +class MemoryRecord: + id: str + timestamp: datetime + memory_type: str # 'episodic', 'persistent', 'working' + content: Dict[str, Any] + embeddings: Optional[List[float]] = None + metadata: Optional[Dict[str, Any]] = None + expiry: Optional[datetime] = None + +@dataclass +class CodeAnalysisMemory: + repo_id: str + file_path: str + analysis_hash: str + analysis_data: Dict[str, Any] + embedding: List[float] + last_updated: datetime + access_count: int = 0 + relevance_score: float = 1.0 + +@dataclass +class EpisodicMemory: + session_id: str + user_query: str + ai_response: str + repo_context: str + timestamp: datetime + embedding: List[float] + metadata: Dict[str, Any] + +@dataclass +class PersistentMemory: + fact_id: str + content: str + category: str # 'code_pattern', 'best_practice', 'vulnerability', 'architecture' + confidence: float + embedding: List[float] + source_repos: List[str] + created_at: datetime + last_accessed: datetime + access_frequency: int = 0 + +@dataclass +class FileAnalysis: + path: str + language: str + lines_of_code: int + complexity_score: float + issues_found: List[str] + recommendations: List[str] + detailed_analysis: str + severity_score: float + +@dataclass +class RepositoryAnalysis: + repo_path: str + total_files: int + total_lines: int + languages: Dict[str, int] + architecture_assessment: str + security_assessment: str + code_quality_score: float + file_analyses: List[FileAnalysis] + executive_summary: str + +class MemoryManager: + """Advanced memory management system for AI repository 
analysis.""" + + def __init__(self, config: Dict[str, Any]): + self.config = config + self.setup_logging() + + # Initialize Claude client for embeddings + self.claude_client = anthropic.Anthropic(api_key=config.get('anthropic_api_key', '')) + + # Initialize database connections + self.setup_databases() + + # Memory configuration + self.working_memory_ttl = 3600 # 1 hour + self.episodic_retention_days = 365 # 1 year + self.persistent_memory_threshold = 0.8 # Confidence threshold for persistence + + def setup_logging(self): + logging.basicConfig(level=logging.INFO) + self.logger = logging.getLogger(__name__) + + def setup_databases(self): + """Initialize all database connections.""" + try: + # Redis for working memory (temporary, fast access) + self.redis_client = redis.Redis( + host=self.config.get('redis_host', 'localhost'), + port=self.config.get('redis_port', 6379), + db=self.config.get('redis_db', 0), + decode_responses=True + ) + + # MongoDB for documents and episodic memory + self.mongo_client = pymongo.MongoClient( + self.config.get('mongodb_url', 'mongodb://localhost:27017/') + ) + self.mongo_db = self.mongo_client[self.config.get('mongodb_name', 'repo_analyzer')] + + # Collections + self.episodic_collection = self.mongo_db['episodic_memories'] + self.analysis_collection = self.mongo_db['code_analyses'] + self.persistent_collection = self.mongo_db['persistent_memories'] + self.repo_metadata_collection = self.mongo_db['repository_metadata'] + + # PostgreSQL with pgvector for vector operations + self.pg_conn = psycopg2.connect( + host=self.config.get('postgres_host', 'localhost'), + port=self.config.get('postgres_port', 5432), + database=self.config.get('postgres_db', 'dev_pipeline'), + user=self.config.get('postgres_user', 'pipeline_admin'), + password=self.config.get('postgres_password', 'secure_pipeline_2024') + ) + + # Check if pgvector is available + try: + with self.pg_conn.cursor() as cur: + cur.execute("SELECT 1 FROM pg_extension WHERE extname = 'vector';") + self.has_vector = cur.fetchone() is not None + except: + self.has_vector = False + + self.logger.info("All database connections established successfully") + + except Exception as e: + self.logger.error(f"Database setup failed: {e}") + raise + + def generate_embedding(self, text: str) -> List[float]: + """Generate embedding for text using Claude API.""" + try: + # Use Claude to generate semantic embeddings + # Truncate text if too long for Claude API + if len(text) > 8000: + text = text[:8000] + "..." + + prompt = f""" + Convert the following text into a 384-dimensional numerical vector that represents its semantic meaning. + The vector should be suitable for similarity search and clustering. + + Text: {text} + + Return only a JSON array of 384 floating-point numbers between -1 and 1, like this: + [0.123, -0.456, 0.789, ...] 
+ """ + + message = self.claude_client.messages.create( + model="claude-3-5-sonnet-20240620", + max_tokens=2000, + temperature=0.1, + messages=[{"role": "user", "content": prompt}] + ) + + response_text = message.content[0].text.strip() + + # Extract JSON array from response + import json + import re + + # Find JSON array in response + json_match = re.search(r'\[[\d\.,\s-]+\]', response_text) + if json_match: + embedding = json.loads(json_match.group()) + if len(embedding) == 384: + return embedding + + # Fallback: generate deterministic embedding from text hash + return self._generate_fallback_embedding(text) + + except Exception as e: + self.logger.error(f"Claude embedding generation failed: {e}") + return self._generate_fallback_embedding(text) + + def _generate_fallback_embedding(self, text: str) -> List[float]: + """Generate fallback embedding using text hash.""" + try: + import hashlib + import struct + + # Create a deterministic hash-based embedding + hash_obj = hashlib.sha256(text.encode('utf-8')) + hash_bytes = hash_obj.digest() + + # Convert to 384-dimensional vector + embedding = [] + for i in range(0, len(hash_bytes), 4): + if len(embedding) >= 384: + break + chunk = hash_bytes[i:i+4] + if len(chunk) == 4: + # Convert 4 bytes to float and normalize + value = struct.unpack('>I', chunk)[0] / (2**32 - 1) # Normalize to 0-1 + embedding.append(value * 2 - 1) # Scale to -1 to 1 + + # Pad to exactly 384 dimensions + while len(embedding) < 384: + embedding.append(0.0) + + return embedding[:384] + + except Exception as e: + self.logger.error(f"Fallback embedding generation failed: {e}") + return [0.0] * 384 + + def calculate_content_hash(self, content: str) -> str: + """Calculate SHA-256 hash of content for change detection.""" + return hashlib.sha256(content.encode()).hexdigest() + + async def store_working_memory(self, key: str, data: Dict[str, Any], ttl: Optional[int] = None) -> bool: + """Store temporary data in working memory (Redis).""" + try: + ttl = ttl or self.working_memory_ttl + serialized_data = json.dumps(data, default=str) + self.redis_client.setex(f"working:{key}", ttl, serialized_data) + return True + except Exception as e: + self.logger.error(f"Working memory storage failed: {e}") + return False + + async def get_working_memory(self, key: str) -> Optional[Dict[str, Any]]: + """Retrieve data from working memory.""" + try: + data = self.redis_client.get(f"working:{key}") + return json.loads(data) if data else None + except Exception as e: + self.logger.error(f"Working memory retrieval failed: {e}") + return None + + async def store_episodic_memory(self, session_id: str, user_query: str, + ai_response: str, repo_context: str, + metadata: Optional[Dict] = None) -> str: + """Store interaction in episodic memory.""" + try: + memory_id = str(uuid.uuid4()) + + # Generate embeddings + query_embedding = self.generate_embedding(user_query) + response_embedding = self.generate_embedding(ai_response) + + # Store in MongoDB + episodic_record = { + 'memory_id': memory_id, + 'session_id': session_id, + 'user_query': user_query, + 'ai_response': ai_response, + 'repo_context': repo_context, + 'timestamp': datetime.utcnow(), + 'metadata': metadata or {} + } + self.episodic_collection.insert_one(episodic_record) + + # Store embeddings in PostgreSQL for similarity search + with self.pg_conn.cursor() as cur: + cur.execute(""" + INSERT INTO query_embeddings + (session_id, query_text, query_embedding, response_embedding, repo_context, metadata) + VALUES (%s, %s, %s, %s, %s, %s) + """, ( + 
+                    session_id, user_query, query_embedding, response_embedding,
+                    repo_context, json.dumps(metadata or {})
+                ))
+                self.pg_conn.commit()
+
+            self.logger.info(f"Episodic memory stored: {memory_id}")
+            return memory_id
+
+        except Exception as e:
+            self.logger.error(f"Episodic memory storage failed: {e}")
+            return ""
+
+    async def retrieve_episodic_memories(self, query: str, repo_context: str = "",
+                                         limit: int = 10, similarity_threshold: float = 0.7) -> List[Dict]:
+        """Retrieve relevant episodic memories based on query similarity."""
+        try:
+            query_embedding = self.generate_embedding(query)
+
+            with self.pg_conn.cursor(cursor_factory=RealDictCursor) as cur:
+                # Find similar queries using cosine similarity
+                cur.execute("""
+                    SELECT session_id, query_text, repo_context, timestamp, metadata,
+                           1 - (query_embedding <=> %s::vector) as similarity
+                    FROM query_embeddings
+                    WHERE (%s = '' OR repo_context = %s)
+                      AND 1 - (query_embedding <=> %s::vector) > %s
+                    ORDER BY similarity DESC
+                    LIMIT %s
+                """, (query_embedding, repo_context, repo_context, query_embedding, similarity_threshold, limit))
+
+                similar_queries = cur.fetchall()
+
+                # Fetch full episodic records from MongoDB
+                memories = []
+                for query_record in similar_queries:
+                    episodic_record = self.episodic_collection.find_one({
+                        'session_id': query_record['session_id'],
+                        'timestamp': query_record['timestamp']
+                    })
+                    if episodic_record:
+                        episodic_record['similarity_score'] = float(query_record['similarity'])
+                        memories.append(episodic_record)
+
+                return memories
+
+        except Exception as e:
+            self.logger.error(f"Episodic memory retrieval failed: {e}")
+            return []
+
+    async def store_persistent_memory(self, content: str, category: str,
+                                      confidence: float, source_repos: List[str]) -> str:
+        """Store long-term knowledge in persistent memory."""
+        try:
+            fact_id = str(uuid.uuid4())
+            embedding = self.generate_embedding(content)
+
+            # Store in MongoDB
+            persistent_record = {
+                'fact_id': fact_id,
+                'content': content,
+                'category': category,
+                'confidence': confidence,
+                'source_repos': source_repos,
+                'created_at': datetime.utcnow(),
+                'last_accessed': datetime.utcnow(),
+                'access_frequency': 1
+            }
+            self.persistent_collection.insert_one(persistent_record)
+
+            # Store embedding in PostgreSQL
+            with self.pg_conn.cursor() as cur:
+                if self.has_vector:
+                    cur.execute("""
+                        INSERT INTO knowledge_embeddings
+                        (fact_id, content, category, embedding, confidence, source_repos)
+                        VALUES (%s, %s, %s, %s, %s, %s)
+                    """, (fact_id, content, category, embedding, confidence, source_repos))
+                else:
+                    cur.execute("""
+                        INSERT INTO knowledge_embeddings
+                        (fact_id, content, category, confidence, source_repos)
+                        VALUES (%s, %s, %s, %s, %s)
+                    """, (fact_id, content, category, confidence, source_repos))
+                self.pg_conn.commit()
+
+            self.logger.info(f"Persistent memory stored: {fact_id}")
+            return fact_id
+
+        except Exception as e:
+            self.logger.error(f"Persistent memory storage failed: {e}")
+            return ""
+
+    async def retrieve_persistent_memories(self, query: str, category: str = "",
+                                           limit: int = 20, similarity_threshold: float = 0.6) -> List[Dict]:
+        """Retrieve relevant persistent knowledge."""
+        try:
+            query_embedding = self.generate_embedding(query)
+
+            with self.pg_conn.cursor(cursor_factory=RealDictCursor) as cur:
+                # Check if table exists first
+                cur.execute("""
+                    SELECT EXISTS (
+                        SELECT FROM information_schema.tables
+                        WHERE table_name = 'knowledge_embeddings'
+                    ) AS table_exists;
+                """)
+                # RealDictCursor returns rows keyed by column name, so read the aliased column
+                table_exists = cur.fetchone()['table_exists']
+
+                if not table_exists:
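+                    # Guard for fresh environments where 001-schema.sql (applied via migrate.sh
+                    # or migrate_database.py) has not been run yet: degrade to an empty result
+                    # instead of raising inside the per-file analysis loop.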
self.logger.warning("knowledge_embeddings table does not exist, returning empty results") + return [] + + # Build WHERE clause dynamically + if hasattr(self, 'has_vector') and self.has_vector: + where_conditions = ["1 - (embedding <=> %s::vector) > %s"] + params = [query_embedding, similarity_threshold] + else: + # Fallback to text-based search + where_conditions = ["content ILIKE %s"] + params = [f"%{query}%"] + + if category: + where_conditions.append("category = %s") + params.append(category) + + where_clause = " AND ".join(where_conditions) + params.extend([limit]) + + if hasattr(self, 'has_vector') and self.has_vector: + cur.execute(f""" + SELECT fact_id, content, category, confidence, source_repos, + 1 - (embedding <=> %s::vector) as similarity, + created_at, last_accessed, access_frequency + FROM knowledge_embeddings + WHERE {where_clause} + ORDER BY similarity DESC, confidence DESC, access_frequency DESC + LIMIT %s + """, params) + else: + cur.execute(f""" + SELECT fact_id, content, category, confidence, source_repos, + 0.8 as similarity, + created_at, last_accessed, access_frequency + FROM knowledge_embeddings + WHERE {where_clause} + ORDER BY confidence DESC, access_frequency DESC + LIMIT %s + """, params) + + results = cur.fetchall() + + # Update access frequency + for result in results: + cur.execute(""" + UPDATE knowledge_embeddings + SET last_accessed = CURRENT_TIMESTAMP, + access_frequency = access_frequency + 1 + WHERE fact_id = %s + """, (result['fact_id'],)) + + self.pg_conn.commit() + return [dict(result) for result in results] + + except Exception as e: + self.logger.error(f"Persistent memory retrieval failed: {e}") + return [] + + async def store_code_analysis(self, repo_id: str, file_path: str, + analysis_data: Dict[str, Any]) -> str: + """Store code analysis with embeddings for future retrieval.""" + try: + content_hash = self.calculate_content_hash(json.dumps(analysis_data, sort_keys=True)) + + # Create searchable content for embedding + searchable_content = f""" + File: {file_path} + Language: {analysis_data.get('language', 'Unknown')} + Issues: {' '.join(analysis_data.get('issues_found', []))} + Recommendations: {' '.join(analysis_data.get('recommendations', []))} + Analysis: {analysis_data.get('detailed_analysis', '')} + """ + + embedding = self.generate_embedding(searchable_content) + + # Store in MongoDB + analysis_record = { + 'repo_id': repo_id, + 'file_path': file_path, + 'content_hash': content_hash, + 'analysis_data': analysis_data, + 'created_at': datetime.utcnow(), + 'last_accessed': datetime.utcnow(), + 'access_count': 1 + } + + # Upsert to handle updates + self.analysis_collection.update_one( + {'repo_id': repo_id, 'file_path': file_path}, + {'$set': analysis_record}, + upsert=True + ) + + # Store embedding in PostgreSQL + with self.pg_conn.cursor() as cur: + if self.has_vector: + cur.execute(""" + INSERT INTO code_embeddings (repo_id, file_path, content_hash, embedding, metadata) + VALUES (%s, %s, %s, %s, %s) + ON CONFLICT (repo_id, file_path, content_hash) + DO UPDATE SET last_accessed = CURRENT_TIMESTAMP + """, ( + repo_id, file_path, content_hash, embedding, + json.dumps({ + 'language': analysis_data.get('language'), + 'lines_of_code': analysis_data.get('lines_of_code', 0), + 'severity_score': analysis_data.get('severity_score', 5.0) + }) + )) + else: + cur.execute(""" + INSERT INTO code_embeddings (repo_id, file_path, content_hash, embedding_text, metadata) + VALUES (%s, %s, %s, %s, %s) + ON CONFLICT (repo_id, file_path, content_hash) + DO UPDATE 
SET last_accessed = CURRENT_TIMESTAMP + """, ( + repo_id, file_path, content_hash, json.dumps(embedding), + json.dumps({ + 'language': analysis_data.get('language'), + 'lines_of_code': analysis_data.get('lines_of_code', 0), + 'severity_score': analysis_data.get('severity_score', 5.0) + }) + )) + self.pg_conn.commit() + + return content_hash + + except Exception as e: + self.logger.error(f"Code analysis storage failed: {e}") + return "" + + async def search_similar_code(self, query: str, repo_id: str = "", + limit: int = 10) -> List[Dict]: + """Search for similar code analyses.""" + try: + query_embedding = self.generate_embedding(query) + + with self.pg_conn.cursor(cursor_factory=RealDictCursor) as cur: + # Check if table exists first + cur.execute(""" + SELECT EXISTS ( + SELECT FROM information_schema.tables + WHERE table_name = 'code_embeddings' + ); + """) + table_exists = cur.fetchone()[0] + + if not table_exists: + self.logger.warning("code_embeddings table does not exist, returning empty results") + return [] + + where_clause = "WHERE 1=1" + params = [query_embedding] + + if repo_id: + where_clause += " AND repo_id = %s" + params.append(repo_id) + + params.append(limit) + + cur.execute(f""" + SELECT repo_id, file_path, content_hash, metadata, + 1 - (embedding <=> %s::vector) as similarity + FROM code_embeddings + {where_clause} + ORDER BY similarity DESC + LIMIT %s + """, params) + + results = cur.fetchall() + + # Fetch full analysis data from MongoDB + enriched_results = [] + for result in results: + analysis = self.analysis_collection.find_one({ + 'repo_id': result['repo_id'], + 'file_path': result['file_path'] + }) + if analysis: + analysis['similarity_score'] = float(result['similarity']) + enriched_results.append(analysis) + + return enriched_results + + except Exception as e: + self.logger.error(f"Similar code search failed: {e}") + return [] + + async def cleanup_old_memories(self): + """Clean up old episodic memories and update access patterns.""" + try: + cutoff_date = datetime.utcnow() - timedelta(days=self.episodic_retention_days) + + # Clean up old episodic memories + result = self.episodic_collection.delete_many({ + 'timestamp': {'$lt': cutoff_date} + }) + self.logger.info(f"Cleaned up {result.deleted_count} old episodic memories") + + # Clean up corresponding query embeddings + with self.pg_conn.cursor() as cur: + cur.execute("DELETE FROM query_embeddings WHERE timestamp < %s", (cutoff_date,)) + self.pg_conn.commit() + + # Update persistent memory relevance based on access patterns + await self.update_persistent_memory_relevance() + + except Exception as e: + self.logger.error(f"Memory cleanup failed: {e}") + + async def update_persistent_memory_relevance(self): + """Update relevance scores for persistent memories based on access patterns.""" + try: + with self.pg_conn.cursor() as cur: + # Calculate relevance based on recency and frequency + cur.execute(""" + UPDATE knowledge_embeddings + SET confidence = LEAST(confidence * ( + CASE + WHEN EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - last_accessed)) / 86400 < 30 + THEN 1.1 + ELSE 0.95 + END * + (1.0 + LOG(access_frequency + 1) / 10.0) + ), 1.0) + """) + self.pg_conn.commit() + + except Exception as e: + self.logger.error(f"Relevance update failed: {e}") + + async def get_memory_stats(self) -> Dict[str, Any]: + """Get comprehensive memory system statistics.""" + try: + stats = {} + + # Working memory stats (Redis) + working_keys = self.redis_client.keys("working:*") + stats['working_memory'] = { + 'total_keys': 
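+                # KEYS is O(N) over the keyspace; acceptable here because working memory only
+                # holds a handful of short-lived "working:*" entries, and used_memory_human
+                # is reported directly by Redis INFO.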
len(working_keys), + 'memory_usage': self.redis_client.info()['used_memory_human'] + } + + # Episodic memory stats (MongoDB) + stats['episodic_memory'] = { + 'total_records': self.episodic_collection.count_documents({}), + 'recent_interactions': self.episodic_collection.count_documents({ + 'timestamp': {'$gte': datetime.utcnow() - timedelta(days=7)} + }) + } + + # Persistent memory stats + stats['persistent_memory'] = { + 'total_facts': self.persistent_collection.count_documents({}), + 'high_confidence_facts': self.persistent_collection.count_documents({ + 'confidence': {'$gte': 0.8} + }) + } + + # Code analysis stats + stats['code_analysis'] = { + 'total_analyses': self.analysis_collection.count_documents({}), + 'unique_repositories': len(self.analysis_collection.distinct('repo_id')) + } + + # Vector database stats (PostgreSQL) + with self.pg_conn.cursor(cursor_factory=RealDictCursor) as cur: + cur.execute("SELECT COUNT(*) as count FROM code_embeddings") + code_embeddings_count = cur.fetchone()['count'] + + cur.execute("SELECT COUNT(*) as count FROM knowledge_embeddings") + knowledge_embeddings_count = cur.fetchone()['count'] + + stats['vector_database'] = { + 'code_embeddings': code_embeddings_count, + 'knowledge_embeddings': knowledge_embeddings_count + } + + return stats + + except Exception as e: + self.logger.error(f"Stats retrieval failed: {e}") + return {} + +class MemoryQueryEngine: + """Advanced querying capabilities across memory systems.""" + + def __init__(self, memory_manager: MemoryManager): + self.memory = memory_manager + + async def intelligent_query(self, query: str, repo_context: str = "") -> Dict[str, Any]: + """Intelligent cross-memory querying with relevance scoring.""" + try: + # Multi-source memory retrieval + results = await asyncio.gather( + self.memory.retrieve_episodic_memories(query, repo_context, limit=5), + self.memory.retrieve_persistent_memories(query, limit=10), + self.memory.search_similar_code(query, repo_context, limit=5) + ) + + episodic_memories, persistent_knowledge, similar_code = results + + # Relevance scoring and fusion + fused_response = self.fuse_memory_responses( + query, episodic_memories, persistent_knowledge, similar_code + ) + + return { + 'query': query, + 'fused_response': fused_response, + 'sources': { + 'episodic_count': len(episodic_memories), + 'persistent_count': len(persistent_knowledge), + 'similar_code_count': len(similar_code) + }, + 'confidence_score': self.calculate_response_confidence(fused_response), + 'timestamp': datetime.utcnow() + } + + except Exception as e: + self.memory.logger.error(f"Intelligent query failed: {e}") + return {'error': str(e)} + + def fuse_memory_responses(self, query: str, episodic: List, persistent: List, code: List) -> str: + """Fuse responses from different memory systems.""" + response_parts = [] + + # Weight different memory types + if persistent: + high_conf_knowledge = [p for p in persistent if p.get('confidence', 0) > 0.8] + if high_conf_knowledge: + response_parts.append("Based on established knowledge:") + for knowledge in high_conf_knowledge[:3]: + response_parts.append(f"• {knowledge['content']}") + + if episodic: + recent_interactions = sorted(episodic, key=lambda x: x.get('timestamp', datetime.min), reverse=True)[:2] + if recent_interactions: + response_parts.append("\nFrom previous interactions:") + for interaction in recent_interactions: + response_parts.append(f"• {interaction.get('ai_response', '')[:200]}...") + + if code: + similar_patterns = [c for c in code if 
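+                                    # 0.7 mirrors the default similarity_threshold used for
+                                    # episodic retrieval, so only reasonably close code matches
+                                    # are surfaced in the fused answer.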
c.get('similarity_score', 0) > 0.7] + if similar_patterns: + response_parts.append("\nSimilar code patterns found:") + for pattern in similar_patterns[:2]: + issues = pattern.get('analysis_data', {}).get('issues_found', []) + if issues: + response_parts.append(f"• {pattern['file_path']}: {issues[0]}") + + return '\n'.join(response_parts) if response_parts else "No relevant memories found." + + def calculate_response_confidence(self, response: str) -> float: + """Calculate confidence score for fused response.""" + if not response or response == "No relevant memories found.": + return 0.0 + + # Simple confidence calculation based on response length and structure + confidence = min(len(response.split()) / 100.0, 1.0) # Normalize by word count + if "Based on established knowledge:" in response: + confidence += 0.2 + if "From previous interactions:" in response: + confidence += 0.1 + if "Similar code patterns found:" in response: + confidence += 0.15 + + return min(confidence, 1.0) + +class EnhancedGitHubAnalyzer: + """Enhanced repository analyzer with memory capabilities.""" + + def __init__(self, api_key: str, memory_config: Dict[str, Any]): + self.client = anthropic.Anthropic(api_key=api_key) + self.memory_manager = MemoryManager(memory_config) + self.query_engine = MemoryQueryEngine(self.memory_manager) + self.session_id = str(uuid.uuid4()) + self.temp_dir = None + + # Language mapping for file detection + self.language_map = { + '.py': 'Python', '.js': 'JavaScript', '.ts': 'TypeScript', + '.tsx': 'TypeScript', '.jsx': 'JavaScript', '.java': 'Java', + '.cpp': 'C++', '.c': 'C', '.cs': 'C#', '.go': 'Go', '.rs': 'Rust', + '.php': 'PHP', '.rb': 'Ruby', '.swift': 'Swift', '.kt': 'Kotlin', + '.html': 'HTML', '.css': 'CSS', '.scss': 'SCSS', '.sass': 'SASS', + '.sql': 'SQL', '.yaml': 'YAML', '.yml': 'YAML', '.json': 'JSON', + '.xml': 'XML', '.sh': 'Shell', '.dockerfile': 'Docker', + '.md': 'Markdown', '.txt': 'Text' + } + + # Code file extensions to analyze + self.code_extensions = set(self.language_map.keys()) + + def clone_repository(self, repo_path: str) -> str: + """Clone repository or use existing path.""" + if os.path.exists(repo_path): + print(f"Using existing repository: {repo_path}") + return repo_path + else: + print(f"Cloning repository: {repo_path}") + self.temp_dir = tempfile.mkdtemp(prefix="repo_analysis_") + try: + git.Repo.clone_from(repo_path, self.temp_dir) + return self.temp_dir + except Exception as e: + raise Exception(f"Failed to clone repository: {e}") + + def calculate_repo_id(self, repo_path: str) -> str: + """Generate consistent repository ID.""" + return hashlib.sha256(repo_path.encode()).hexdigest()[:16] + + def get_file_language(self, file_path: Path) -> str: + """Get programming language from file extension.""" + return self.language_map.get(file_path.suffix.lower(), 'Unknown') + + def calculate_complexity_score(self, content: str) -> float: + """Calculate basic complexity score based on code patterns.""" + lines = content.split('\n') + complexity_indicators = ['if', 'else', 'elif', 'for', 'while', 'try', 'except', 'catch', 'switch'] + + complexity = 1 + for line in lines: + line_lower = line.lower().strip() + for indicator in complexity_indicators: + if indicator in line_lower: + complexity += 1 + + # Normalize to 1-10 scale + return min(complexity / max(len(lines), 1) * 100, 10.0) + + async def analyze_file_with_memory(self, file_path: Path, content: str, repo_id: str) -> FileAnalysis: + """Analyze file with memory-enhanced context.""" + language = 
self.get_file_language(file_path) + lines_of_code = len([line for line in content.split('\n') if line.strip()]) + complexity_score = self.calculate_complexity_score(content) + + # Check for similar code patterns in memory + similar_analyses = await self.memory_manager.search_similar_code( + f"{language} {file_path.name}", repo_id, limit=3 + ) + + # Get relevant knowledge from persistent memory + persistent_knowledge = await self.memory_manager.retrieve_persistent_memories( + f"{language} code quality security", category="", limit=5 + ) + + # Build enhanced context for analysis + context_info = "" + if similar_analyses: + context_info += f"\nSimilar files previously analyzed:\n" + for similar in similar_analyses[:2]: + context_info += f"- {similar['file_path']}: Found {len(similar.get('analysis_data', {}).get('issues_found', []))} issues\n" + + if persistent_knowledge: + context_info += f"\nRelevant best practices:\n" + for knowledge in persistent_knowledge[:3]: + context_info += f"- {knowledge['content'][:100]}...\n" + + # Truncate content if too long + if len(content) > 4000: + content = content[:4000] + "\n... [truncated for analysis]" + + print(f" Analyzing {file_path.name} ({language}, {lines_of_code} lines)") + + # Create comprehensive analysis prompt with memory context + prompt = f""" +You are a senior software engineer with 25+ years of experience. Analyze this {language} code file with context from previous analyses. + +FILENAME: {file_path.name} +LANGUAGE: {language} +LINES OF CODE: {lines_of_code} + +{context_info} + +CODE: +```{language.lower()} +{content} +``` + +Provide a comprehensive analysis covering: + +1. ISSUES FOUND: List specific problems, bugs, security vulnerabilities, or code smells +2. RECOMMENDATIONS: Actionable suggestions for improvement +3. CODE QUALITY: Overall assessment of code quality and maintainability +4. SECURITY: Any security concerns or vulnerabilities +5. PERFORMANCE: Potential performance issues or optimizations +6. BEST PRACTICES: Adherence to coding standards and best practices + +Rate the overall code quality from 1-10 where 10 is excellent. 
+ +ANALYSIS: +""" + + try: + message = self.client.messages.create( + model="claude-3-5-sonnet-20240620", + max_tokens=3000, + temperature=0.1, + messages=[{"role": "user", "content": prompt}] + ) + + analysis_text = message.content[0].text.strip() + + # Extract severity score from analysis + severity_match = re.search(r'(\d+(?:\.\d+)?)/10', analysis_text) + severity_score = float(severity_match.group(1)) if severity_match else 5.0 + + # Parse issues and recommendations from the text + issues = self.extract_issues_from_analysis(analysis_text) + recommendations = self.extract_recommendations_from_analysis(analysis_text) + + # Create file analysis object + file_analysis = FileAnalysis( + path=str(file_path.relative_to(Path(self.temp_dir or '.'))), + language=language, + lines_of_code=lines_of_code, + complexity_score=complexity_score, + issues_found=issues, + recommendations=recommendations, + detailed_analysis=analysis_text, + severity_score=severity_score + ) + + # Store analysis in memory for future reference + await self.memory_manager.store_code_analysis( + repo_id, str(file_analysis.path), asdict(file_analysis) + ) + + # Extract knowledge for persistent memory + await self.extract_knowledge_from_analysis(file_analysis, repo_id) + + return file_analysis + + except Exception as e: + print(f" Error analyzing {file_path.name}: {e}") + return FileAnalysis( + path=str(file_path), + language=language, + lines_of_code=lines_of_code, + complexity_score=complexity_score, + issues_found=[f"Analysis failed: {str(e)}"], + recommendations=["Review file manually due to analysis error"], + detailed_analysis=f"Analysis failed due to error: {str(e)}", + severity_score=5.0 + ) + + def extract_issues_from_analysis(self, analysis_text: str) -> List[str]: + """Extract issues from analysis text.""" + issues = [] + lines = analysis_text.split('\n') + + # Look for common issue indicators + issue_keywords = ['issue', 'problem', 'bug', 'vulnerability', 'error', 'warning', 'concern'] + + for line in lines: + line_lower = line.lower().strip() + if any(keyword in line_lower for keyword in issue_keywords): + if line.strip() and not line.strip().startswith('#'): + issues.append(line.strip()) + + return issues[:10] # Limit to top 10 issues + + def extract_recommendations_from_analysis(self, analysis_text: str) -> List[str]: + """Extract recommendations from analysis text.""" + recommendations = [] + lines = analysis_text.split('\n') + + # Look for recommendation indicators + rec_keywords = ['recommend', 'suggest', 'should', 'consider', 'improve'] + + for line in lines: + line_lower = line.lower().strip() + if any(keyword in line_lower for keyword in rec_keywords): + if line.strip() and not line.strip().startswith('#'): + recommendations.append(line.strip()) + + return recommendations[:10] # Limit to top 10 recommendations + + async def extract_knowledge_from_analysis(self, file_analysis: FileAnalysis, repo_id: str): + """Extract valuable knowledge from analysis for persistent storage.""" + try: + # Extract security-related knowledge + security_issues = [issue for issue in file_analysis.issues_found + if any(sec in issue.lower() for sec in ['security', 'vulnerability', 'injection', 'xss', 'auth'])] + + for issue in security_issues: + await self.memory_manager.store_persistent_memory( + content=f"Security issue in {file_analysis.language}: {issue}", + category='security_vulnerability', + confidence=0.8, + source_repos=[repo_id] + ) + + # Extract best practices + best_practices = [rec for rec in 
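+                              # Keyword filter only: a recommendation is promoted to persistent
+                              # memory when it explicitly mentions "best practice", "standard",
+                              # or "convention".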
file_analysis.recommendations + if any(bp in rec.lower() for bp in ['best practice', 'standard', 'convention'])] + + for practice in best_practices: + await self.memory_manager.store_persistent_memory( + content=f"{file_analysis.language} best practice: {practice}", + category='best_practice', + confidence=0.7, + source_repos=[repo_id] + ) + + # Extract code patterns + if file_analysis.severity_score < 5: + await self.memory_manager.store_persistent_memory( + content=f"Low quality {file_analysis.language} pattern: {file_analysis.detailed_analysis[:200]}", + category='code_pattern', + confidence=0.6, + source_repos=[repo_id] + ) + + except Exception as e: + self.memory_manager.logger.error(f"Knowledge extraction failed: {e}") + + def scan_repository(self, repo_path: str) -> List[Tuple[Path, str]]: + """Scan repository and collect ALL files for analysis.""" + print(f"Scanning repository: {repo_path}") + + files_to_analyze = [] + + # Important files to always include + important_files = { + 'README.md', 'package.json', 'requirements.txt', 'Dockerfile', + 'docker-compose.yml', 'tsconfig.json', 'next.config.js', + 'tailwind.config.js', 'webpack.config.js', '.env.example', + 'Cargo.toml', 'pom.xml', 'build.gradle', 'composer.json', + 'Gemfile', 'go.mod', 'yarn.lock', 'pnpm-lock.yaml' + } + + for root, dirs, files in os.walk(repo_path): + # Skip common build/cache directories + dirs[:] = [d for d in dirs if not d.startswith('.') and + d not in {'node_modules', '__pycache__', 'build', 'dist', 'target', + 'venv', 'env', '.git', '.next', 'coverage', 'vendor', + 'bower_components', '.gradle', '.m2', '.cargo'}] + + for file in files: + file_path = Path(root) / file + + # Skip large files (increased limit for comprehensive analysis) + try: + if file_path.stat().st_size > 2000000: # 2MB limit + print(f" Skipping large file: {file_path.name} ({file_path.stat().st_size / 1024 / 1024:.1f}MB)") + continue + except: + continue + + # Include important files or files with code extensions + should_include = ( + file.lower() in important_files or + file_path.suffix.lower() in self.code_extensions or + file.lower().startswith('dockerfile') or + file.lower().startswith('makefile') or + file.lower().startswith('cmake') + ) + + if should_include: + try: + with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: + content = f.read() + if content.strip(): # Only non-empty files + files_to_analyze.append((file_path, content)) + except Exception as e: + print(f"Could not read {file_path}: {e}") + + print(f"Found {len(files_to_analyze)} files to analyze") + return files_to_analyze + + async def analyze_repository_with_memory(self, repo_path: str) -> RepositoryAnalysis: + """Main analysis function with memory integration - analyzes ALL files.""" + try: + # Generate repo ID and check for cached analysis + repo_id = self.calculate_repo_id(repo_path) + + # Check working memory for recent analysis + cached_analysis = await self.memory_manager.get_working_memory(f"repo_analysis:{repo_id}") + if cached_analysis: + print("Using cached repository analysis from memory") + return RepositoryAnalysis(**cached_analysis) + + # Clone/access repository + actual_repo_path = self.clone_repository(repo_path) + + # Get analysis context from memory (no user query needed) + context_memories = await self.get_analysis_context(repo_path, "", repo_id) + + # Scan ALL files + files_to_analyze = self.scan_repository(actual_repo_path) + + if not files_to_analyze: + raise Exception("No files found to analyze") + + # Analyze each file with 
memory context + print(f"Starting comprehensive analysis of {len(files_to_analyze)} files...") + file_analyses = [] + + for i, (file_path, content) in enumerate(files_to_analyze): + print(f"Analyzing file {i+1}/{len(files_to_analyze)}: {file_path.name}") + analysis = await self.analyze_file_with_memory(file_path, content, repo_id) + file_analyses.append(analysis) + + # Small delay to avoid rate limiting + await asyncio.sleep(0.1) + + # Repository-level analyses with memory context + print("Performing repository-level analysis with memory context...") + architecture_assessment, security_assessment = await self.analyze_repository_overview_with_memory( + actual_repo_path, file_analyses, context_memories, repo_id + ) + + # Calculate overall quality score + avg_quality = sum(fa.severity_score for fa in file_analyses) / len(file_analyses) + + # Generate statistics + languages = dict(Counter(fa.language for fa in file_analyses)) + total_lines = sum(fa.lines_of_code for fa in file_analyses) + + # Create repository analysis + repo_analysis = RepositoryAnalysis( + repo_path=repo_path, + total_files=len(file_analyses), + total_lines=total_lines, + languages=languages, + architecture_assessment=architecture_assessment, + security_assessment=security_assessment, + code_quality_score=avg_quality, + file_analyses=file_analyses, + executive_summary="" + ) + + # Generate executive summary with memory context + print("Generating memory-enhanced executive summary...") + repo_analysis.executive_summary = await self.generate_executive_summary_with_memory( + repo_analysis, context_memories + ) + + # Store analysis in episodic memory (automated analysis) + await self.memory_manager.store_episodic_memory( + self.session_id, "Complete automated repository analysis", + f"Analyzed {repo_analysis.total_files} files, found {sum(len(fa.issues_found) for fa in file_analyses)} issues", + repo_id, + { + 'repo_path': repo_path, + 'quality_score': avg_quality, + 'total_issues': sum(len(fa.issues_found) for fa in file_analyses), + 'analysis_type': 'automated_comprehensive' + } + ) + + # Cache analysis in working memory + await self.memory_manager.store_working_memory( + f"repo_analysis:{repo_id}", + asdict(repo_analysis), + ttl=7200 # 2 hours + ) + + return repo_analysis + + finally: + # Cleanup + if self.temp_dir and os.path.exists(self.temp_dir): + shutil.rmtree(self.temp_dir) + print("Temporary files cleaned up") + + async def get_analysis_context(self, repo_path: str, user_query: str, repo_id: str) -> Dict[str, List]: + """Gather relevant context from memory systems.""" + context = { + 'episodic_memories': [], + 'persistent_knowledge': [], + 'similar_analyses': [] + } + + # Get relevant persistent knowledge for comprehensive analysis + context['persistent_knowledge'] = await self.memory_manager.retrieve_persistent_memories( + "code quality security best practices", limit=15 + ) + + # Find similar code analyses + context['similar_analyses'] = await self.memory_manager.search_similar_code( + "repository analysis", repo_id, limit=10 + ) + + return context + + async def analyze_repository_overview_with_memory(self, repo_path: str, file_analyses: List[FileAnalysis], + context_memories: Dict, repo_id: str) -> Tuple[str, str]: + """Analyze repository architecture and security with memory context.""" + print("Analyzing repository overview with memory context...") + + # Prepare summary data + languages = dict(Counter(fa.language for fa in file_analyses)) + total_lines = sum(fa.lines_of_code for fa in file_analyses) + avg_quality 
= sum(fa.severity_score for fa in file_analyses) / len(file_analyses) if file_analyses else 5.0 + + # Build memory context + memory_context = "" + if context_memories['persistent_knowledge']: + memory_context += "Relevant knowledge from previous analyses:\n" + for knowledge in context_memories['persistent_knowledge'][:3]: + memory_context += f"- {knowledge['content']}\n" + + if context_memories['similar_analyses']: + memory_context += "\nSimilar repositories analyzed:\n" + for similar in context_memories['similar_analyses'][:2]: + memory_context += f"- {similar['file_path']}: {len(similar.get('analysis_data', {}).get('issues_found', []))} issues found\n" + + # Get repository structure + structure_lines = [] + try: + for root, dirs, files in os.walk(repo_path): + dirs[:] = [d for d in dirs if not d.startswith('.') and d not in {'node_modules', '__pycache__'}] + level = root.replace(repo_path, '').count(os.sep) + indent = ' ' * level + structure_lines.append(f"{indent}{os.path.basename(root)}/") + for file in files[:3]: # Limit files shown per directory + structure_lines.append(f"{indent} {file}") + if len(structure_lines) > 50: # Limit total structure size + break + except Exception as e: + structure_lines = [f"Error reading structure: {e}"] + + # Architecture analysis with memory context + arch_prompt = f""" +You are a Senior Software Architect with 25+ years of experience. + +{memory_context} + +Analyze this repository: + +REPOSITORY STRUCTURE: +{chr(10).join(structure_lines[:30])} + +STATISTICS: +- Total files analyzed: {len(file_analyses)} +- Total lines of code: {total_lines:,} +- Languages: {languages} +- Average code quality: {avg_quality:.1f}/10 + +TOP FILE ISSUES: +{chr(10).join([f"- {fa.path}: {len(fa.issues_found)} issues" for fa in file_analyses[:10]])} + +Provide an architectural assessment covering: +1. Project type and purpose +2. Technology stack evaluation +3. Code organization and structure +4. Scalability and maintainability concerns +5. Key recommendations for improvement + +Incorporate insights from the memory context provided above. +Keep response under 1500 words and focus on actionable insights. +""" + + # Security analysis with memory context + security_issues = [] + for fa in file_analyses: + security_issues.extend([issue for issue in fa.issues_found if + any(keyword in issue.lower() for keyword in + ['security', 'vulnerability', 'injection', 'xss', 'auth', 'password'])]) + + sec_prompt = f""" +You are a Senior Security Engineer with 20+ years of experience. + +{memory_context} + +Security Analysis for repository with {len(file_analyses)} files: + +SECURITY ISSUES FOUND: +{chr(10).join(security_issues[:20]) if security_issues else "No obvious security issues detected"} + +HIGH-RISK FILE TYPES PRESENT: +{[lang for lang, count in languages.items() if lang in ['JavaScript', 'TypeScript', 'Python', 'PHP', 'SQL']]} + +Provide security assessment covering: +1. Overall security posture +2. Main security risks and vulnerabilities +3. Authentication and authorization concerns +4. Data protection and privacy issues +5. Immediate security priorities + +Incorporate insights from the memory context provided above. +Keep response under 1000 words and focus on actionable security recommendations. 
+""" + + try: + # Run both analyses + arch_task = self.client.messages.create( + model="claude-3-5-sonnet-20240620", + max_tokens=2000, + temperature=0.1, + messages=[{"role": "user", "content": arch_prompt}] + ) + + sec_task = self.client.messages.create( + model="claude-3-5-sonnet-20240620", + max_tokens=1500, + temperature=0.1, + messages=[{"role": "user", "content": sec_prompt}] + ) + + architecture_assessment = arch_task.content[0].text + security_assessment = sec_task.content[0].text + + # Store insights as persistent knowledge + await self.memory_manager.store_persistent_memory( + content=f"Architecture pattern: {architecture_assessment[:300]}...", + category='architecture', + confidence=0.7, + source_repos=[repo_id] + ) + + return architecture_assessment, security_assessment + + except Exception as e: + return f"Architecture analysis failed: {e}", f"Security analysis failed: {e}" + + async def generate_executive_summary_with_memory(self, analysis: RepositoryAnalysis, context_memories: Dict) -> str: + """Generate executive summary with memory context.""" + print("Generating executive summary with memory context...") + + # Build memory context for executive summary + executive_context = "" + if context_memories['episodic_memories']: + executive_context += "Previous executive discussions:\n" + for memory in context_memories['episodic_memories'][:2]: + if 'executive' in memory.get('ai_response', '').lower(): + executive_context += f"- {memory['ai_response'][:200]}...\n" + + prompt = f""" +You are presenting to C-level executives. Create an executive summary of this technical analysis. + +{executive_context} + +REPOSITORY METRICS: +- Total Files: {analysis.total_files} +- Lines of Code: {analysis.total_lines:,} +- Languages: {analysis.languages} +- Code Quality Score: {analysis.code_quality_score:.1f}/10 + +KEY FINDINGS: +- Total issues identified: {sum(len(fa.issues_found) for fa in analysis.file_analyses)} +- Files needing attention: {len([fa for fa in analysis.file_analyses if fa.severity_score < 7])} +- High-quality files: {len([fa for fa in analysis.file_analyses if fa.severity_score >= 8])} + +Create an executive summary for non-technical leadership covering: +1. Business impact of code quality findings +2. Risk assessment and implications +3. Investment priorities and recommendations +4. Expected ROI from addressing technical debt +5. Competitive implications + +Focus on business outcomes, not technical details. Keep under 800 words. 
+""" + + try: + message = self.client.messages.create( + model="claude-3-5-sonnet-20240620", + max_tokens=1200, + temperature=0.1, + messages=[{"role": "user", "content": prompt}] + ) + return message.content[0].text + except Exception as e: + return f"Executive summary generation failed: {e}" + + def create_pdf_report(self, analysis: RepositoryAnalysis, output_path: str): + """Generate comprehensive PDF report.""" + print(f"Generating PDF report: {output_path}") + + doc = SimpleDocTemplate(output_path, pagesize=A4, + leftMargin=72, rightMargin=72, + topMargin=72, bottomMargin=72) + styles = getSampleStyleSheet() + story = [] + + # Custom styles + title_style = ParagraphStyle( + 'CustomTitle', + parent=styles['Heading1'], + fontSize=24, + textColor=colors.darkblue, + spaceAfter=30, + alignment=TA_CENTER + ) + + heading_style = ParagraphStyle( + 'CustomHeading', + parent=styles['Heading2'], + fontSize=16, + textColor=colors.darkblue, + spaceBefore=20, + spaceAfter=10 + ) + + # Title Page + story.append(Paragraph("AI-Enhanced Repository Analysis Report", title_style)) + story.append(Spacer(1, 20)) + story.append(Paragraph(f"Repository: {analysis.repo_path}", styles['Normal'])) + story.append(Paragraph(f"Analysis Date: {datetime.now().strftime('%B %d, %Y at %H:%M')}", styles['Normal'])) + story.append(Paragraph("Generated by: Enhanced AI Analysis System with Memory", styles['Normal'])) + story.append(PageBreak()) + + # Executive Summary + story.append(Paragraph("Executive Summary", heading_style)) + story.append(Paragraph(analysis.executive_summary, styles['Normal'])) + story.append(PageBreak()) + + # Repository Overview + story.append(Paragraph("Repository Overview", heading_style)) + + overview_data = [ + ['Metric', 'Value'], + ['Total Files Analyzed', str(analysis.total_files)], + ['Total Lines of Code', f"{analysis.total_lines:,}"], + ['Primary Languages', ', '.join(list(analysis.languages.keys())[:5])], + ['Overall Code Quality', f"{analysis.code_quality_score:.1f}/10"], + ] + + overview_table = Table(overview_data, colWidths=[200, 300]) + overview_table.setStyle(TableStyle([ + ('BACKGROUND', (0, 0), (-1, 0), colors.grey), + ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke), + ('ALIGN', (0, 0), (-1, -1), 'LEFT'), + ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'), + ('FONTSIZE', (0, 0), (-1, 0), 12), + ('BOTTOMPADDING', (0, 0), (-1, 0), 12), + ('BACKGROUND', (0, 1), (-1, -1), colors.beige), + ('GRID', (0, 0), (-1, -1), 1, colors.black) + ])) + + story.append(overview_table) + story.append(Spacer(1, 20)) + + # Build PDF + try: + doc.build(story) + print(f"āœ… PDF report generated successfully: {output_path}") + except Exception as e: + print(f"āŒ Error generating PDF: {e}") + + async def query_memory(self, query: str, repo_context: str = "") -> Dict[str, Any]: + """Query the memory system directly.""" + return await self.query_engine.intelligent_query(query, repo_context) + +def get_memory_config() -> Dict[str, Any]: + """Get memory system configuration from environment variables.""" + return { + 'anthropic_api_key': os.getenv('ANTHROPIC_API_KEY', ''), + 'redis_host': os.getenv('REDIS_HOST', 'localhost'), + 'redis_port': int(os.getenv('REDIS_PORT', 6379)), + 'redis_db': int(os.getenv('REDIS_DB', 0)), + 'mongodb_url': os.getenv('MONGODB_URL', 'mongodb://localhost:27017/'), + 'mongodb_name': os.getenv('MONGODB_DB', 'repo_analyzer'), + 'postgres_host': os.getenv('POSTGRES_HOST', 'localhost'), + 'postgres_port': int(os.getenv('POSTGRES_PORT', 5432)), + 'postgres_db': os.getenv('POSTGRES_DB', 
'repo_vectors'), + 'postgres_user': os.getenv('POSTGRES_USER', 'postgres'), + 'postgres_password': os.getenv('POSTGRES_PASSWORD', '') + } + +async def main(): + """Main function to run the enhanced repository analyzer.""" + load_dotenv() + + import argparse + parser = argparse.ArgumentParser(description="Complete AI Repository Analysis - Analyzes ALL files automatically") + parser.add_argument("repo_path", help="Repository path (local directory or Git URL)") + parser.add_argument("--output", "-o", default="complete_repository_analysis.pdf", + help="Output PDF file path") + parser.add_argument("--api-key", help="Anthropic API key (overrides .env)") + + args = parser.parse_args() + + # Get API key + api_key = args.api_key or os.getenv('ANTHROPIC_API_KEY') + if not api_key: + print("āŒ Error: ANTHROPIC_API_KEY not found in .env file or command line") + return 1 + + try: + print("šŸš€ Starting Complete AI Repository Analysis") + print("=" * 60) + print(f"Repository: {args.repo_path}") + print(f"Output: {args.output}") + print("Mode: Complete automated analysis of ALL files") + print("=" * 60) + + # Initialize enhanced analyzer + config = get_memory_config() + analyzer = EnhancedGitHubAnalyzer(api_key, config) + + # Perform complete analysis + analysis = await analyzer.analyze_repository_with_memory(args.repo_path) + + # Generate PDF report + analyzer.create_pdf_report(analysis, args.output) + + # Print summary to console + print("\n" + "=" * 60) + print("šŸŽÆ COMPLETE ANALYSIS FINISHED") + print("=" * 60) + print(f"šŸ“Š Repository Statistics:") + print(f" • Files Analyzed: {analysis.total_files}") + print(f" • Lines of Code: {analysis.total_lines:,}") + print(f" • Languages: {len(analysis.languages)}") + print(f" • Code Quality: {analysis.code_quality_score:.1f}/10") + + # Quality breakdown + high_quality = len([fa for fa in analysis.file_analyses if fa.severity_score >= 8]) + medium_quality = len([fa for fa in analysis.file_analyses if 5 <= fa.severity_score < 8]) + low_quality = len([fa for fa in analysis.file_analyses if fa.severity_score < 5]) + + print(f"\nšŸ“ˆ Quality Breakdown:") + print(f" • High Quality Files (8-10): {high_quality}") + print(f" • Medium Quality Files (5-7): {medium_quality}") + print(f" • Low Quality Files (1-4): {low_quality}") + print(f" • Total Issues Found: {sum(len(fa.issues_found) for fa in analysis.file_analyses)}") + + # Language breakdown + print(f"\nšŸ”¤ Language Distribution:") + for lang, count in sorted(analysis.languages.items(), key=lambda x: x[1], reverse=True)[:10]: + print(f" • {lang}: {count} files") + + # Memory system stats + memory_stats = await analyzer.memory_manager.get_memory_stats() + print(f"\n🧠 Memory System Statistics:") + for category, data in memory_stats.items(): + print(f" • {category.replace('_', ' ').title()}: {data}") + + print(f"\nšŸ“„ Complete PDF Report: {args.output}") + print("\nāœ… Complete analysis finished successfully!") + + return 0 + + except Exception as e: + print(f"āŒ Error during analysis: {e}") + import traceback + traceback.print_exc() + return 1 + +if __name__ == "__main__": + exit(asyncio.run(main())) \ No newline at end of file diff --git a/services/ai-analysis-service/env.example b/services/ai-analysis-service/env.example new file mode 100644 index 0000000..dc3beee --- /dev/null +++ b/services/ai-analysis-service/env.example @@ -0,0 +1,46 @@ +# AI Analysis Service Environment Configuration + +# Service Configuration +PORT=8022 +HOST=0.0.0.0 +NODE_ENV=development + +# AI API Keys 
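+# Read by ai-analyze.py and server.py via os.getenv('ANTHROPIC_API_KEY'); used for both
+# code analysis and embedding generation. Keep real keys out of version control.
+#
+# Example local run once this file has been copied to .env (path is illustrative):
+#   python ai-analyze.py ./my-project --output complete_analysis.pdf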
+ANTHROPIC_API_KEY=your_anthropic_api_key_here + +# Database Configuration +POSTGRES_HOST=localhost +POSTGRES_PORT=5432 +POSTGRES_DB=dev_pipeline +POSTGRES_USER=pipeline_admin +POSTGRES_PASSWORD=secure_pipeline_2024 + +# Redis Configuration +REDIS_HOST=localhost +REDIS_PORT=6379 +REDIS_PASSWORD=redis_secure_2024 +REDIS_DB=0 + +# MongoDB Configuration +MONGODB_URL=mongodb://pipeline_admin:mongo_secure_2024@localhost:27017/ +MONGODB_DB=repo_analyzer + +# JWT Configuration +JWT_ACCESS_SECRET=access-secret-key-2024-tech4biz-secure_pipeline_2024 + +# Service URLs +USER_AUTH_SERVICE_URL=http://localhost:8011 + +# Analysis Configuration +MAX_FILES_PER_ANALYSIS=100 +MAX_FILE_SIZE_MB=2 +ANALYSIS_TIMEOUT_SECONDS=300 + +# Memory System Configuration +WORKING_MEMORY_TTL=3600 +EPISODIC_RETENTION_DAYS=365 +PERSISTENT_MEMORY_THRESHOLD=0.8 + +# Logging Configuration +LOG_LEVEL=INFO +LOG_FILE_PATH=/app/logs/ai-analysis.log diff --git a/services/ai-analysis-service/migrate.sh b/services/ai-analysis-service/migrate.sh new file mode 100755 index 0000000..0c21c31 --- /dev/null +++ b/services/ai-analysis-service/migrate.sh @@ -0,0 +1,104 @@ +#!/bin/bash + +# Database Migration Script using psql +# Executes the complete 001-schema.sql file + +set -e # Exit on any error + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Load environment variables +if [ -f .env ]; then + export $(cat .env | grep -v '^#' | xargs) +fi + +# Database connection parameters +DB_HOST=${POSTGRES_HOST:-localhost} +DB_PORT=${POSTGRES_PORT:-5432} +DB_NAME=${POSTGRES_DB:-dev_pipeline} +DB_USER=${POSTGRES_USER:-pipeline_admin} +DB_PASSWORD=${POSTGRES_PASSWORD:-secure_pipeline_2024} + +# Schema file +SCHEMA_FILE="001-schema.sql" + +echo -e "${BLUE}šŸ”§ AI Repository Analysis Database Migration${NC}" +echo "==================================================" +echo -e "Database: ${YELLOW}${DB_NAME}@${DB_HOST}:${DB_PORT}${NC}" +echo -e "User: ${YELLOW}${DB_USER}${NC}" +echo -e "Schema file: ${YELLOW}${SCHEMA_FILE}${NC}" +echo "" + +# Check if psql is available +if ! command -v psql &> /dev/null; then + echo -e "${RED}āŒ psql command not found!${NC}" + echo "Please install PostgreSQL client tools:" + echo " Ubuntu/Debian: sudo apt-get install postgresql-client" + echo " CentOS/RHEL: sudo yum install postgresql" + echo " macOS: brew install postgresql" + exit 1 +fi + +# Check if schema file exists +if [ ! 
-f "$SCHEMA_FILE" ]; then + echo -e "${RED}āŒ Schema file not found: ${SCHEMA_FILE}${NC}" + exit 1 +fi + +echo -e "${BLUE}• Executing migration...${NC}" + +# Set password for psql +export PGPASSWORD="$DB_PASSWORD" + +# Run migration +if psql -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" -d "$DB_NAME" \ + -f "$SCHEMA_FILE" \ + -v ON_ERROR_STOP=1 \ + --echo-errors \ + --echo-queries; then + + echo -e "${GREEN}āœ… Migration completed successfully!${NC}" + + # Verify migration + echo -e "${BLUE}• Verifying migration...${NC}" + + TABLES=$(psql -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" -d "$DB_NAME" -t -c " + SELECT table_name + FROM information_schema.tables + WHERE table_schema = 'public' + AND table_name IN ('code_embeddings', 'query_embeddings', 'knowledge_embeddings', + 'repository_metadata', 'analysis_sessions', 'file_analysis_history') + ORDER BY table_name; + " | tr -d ' ') + + if [ -n "$TABLES" ]; then + TABLE_COUNT=$(echo "$TABLES" | wc -l) + echo -e "${GREEN}āœ“ Found ${TABLE_COUNT} core tables: ${TABLES}${NC}" + else + echo -e "${YELLOW}⚠ Could not verify table creation${NC}" + fi + + # Check for pgvector extension + VECTOR_AVAILABLE=$(psql -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" -d "$DB_NAME" -t -c " + SELECT EXISTS(SELECT 1 FROM pg_extension WHERE extname = 'vector'); + " | tr -d ' ') + + if [ "$VECTOR_AVAILABLE" = "t" ]; then + echo -e "${GREEN}āœ“ pgvector extension is available${NC}" + else + echo -e "${YELLOW}⚠ pgvector extension not available - vector operations will be limited${NC}" + fi + + echo "" + echo -e "${GREEN}šŸš€ Database migration completed successfully!${NC}" + echo -e "${GREEN}šŸ“Š Production-level database ready for AI repository analysis${NC}" + +else + echo -e "${RED}āŒ Migration failed!${NC}" + exit 1 +fi diff --git a/services/ai-analysis-service/migrate_database.py b/services/ai-analysis-service/migrate_database.py new file mode 100644 index 0000000..694d6db --- /dev/null +++ b/services/ai-analysis-service/migrate_database.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python3 +""" +Database Migration Script using psql command +Executes the complete 001-schema.sql file using PostgreSQL's psql command +""" + +import os +import subprocess +import sys +from dotenv import load_dotenv +import logging + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s') +logger = logging.getLogger(__name__) + +def run_migration(): + """Run the database migration using psql command.""" + load_dotenv() + + # Database connection parameters + db_config = { + 'host': os.getenv('POSTGRES_HOST', 'localhost'), + 'port': os.getenv('POSTGRES_PORT', 5432), + 'database': os.getenv('POSTGRES_DB', 'dev_pipeline'), + 'user': os.getenv('POSTGRES_USER', 'pipeline_admin'), + 'password': os.getenv('POSTGRES_PASSWORD', 'secure_pipeline_2024') + } + + # Schema file path + schema_file = os.path.join(os.path.dirname(__file__), '001-schema.sql') + + if not os.path.exists(schema_file): + logger.error(f"āŒ Schema file not found: {schema_file}") + return False + + try: + logger.info("šŸ”§ Starting database migration with psql...") + logger.info(f" • Database: {db_config['database']}@{db_config['host']}:{db_config['port']}") + logger.info(f" • User: {db_config['user']}") + logger.info(f" • Schema file: {schema_file}") + + # Set PGPASSWORD environment variable for psql + env = os.environ.copy() + env['PGPASSWORD'] = db_config['password'] + + # Build psql command + psql_cmd = [ + 'psql', + '-h', db_config['host'], + '-p', str(db_config['port']), + '-U', db_config['user'], 
+ '-d', db_config['database'], + '-f', schema_file, + '-v', 'ON_ERROR_STOP=1', # Stop on first error + '--echo-errors', # Show errors + '--echo-queries' # Show queries being executed + ] + + logger.info(" • Executing migration...") + logger.info(f" • Command: {' '.join(psql_cmd)}") + + # Run psql command + result = subprocess.run( + psql_cmd, + env=env, + capture_output=True, + text=True, + timeout=300 # 5 minute timeout + ) + + # Check if psql command exists + if result.returncode == 127: + logger.error("āŒ psql command not found. Please install PostgreSQL client tools.") + logger.error(" On Ubuntu/Debian: sudo apt-get install postgresql-client") + logger.error(" On CentOS/RHEL: sudo yum install postgresql") + return False + + # Check for errors + if result.returncode != 0: + logger.error(f"āŒ Migration failed with return code: {result.returncode}") + if result.stderr: + logger.error("STDERR:") + logger.error(result.stderr) + if result.stdout: + logger.error("STDOUT:") + logger.error(result.stdout) + return False + + # Log success + logger.info("āœ… Migration completed successfully!") + + if result.stdout: + logger.info("Migration output:") + # Filter out common psql output noise + lines = result.stdout.split('\n') + for line in lines: + if line.strip() and not line.startswith('SET') and not line.startswith('NOTICE'): + logger.info(f" {line}") + + # Verify migration by checking if key tables exist + logger.info(" • Verifying migration...") + + verify_cmd = [ + 'psql', + '-h', db_config['host'], + '-p', str(db_config['port']), + '-U', db_config['user'], + '-d', db_config['database'], + '-t', # tuples only + '-c', """ + SELECT table_name + FROM information_schema.tables + WHERE table_schema = 'public' + AND table_name IN ('code_embeddings', 'query_embeddings', 'knowledge_embeddings', + 'repository_metadata', 'analysis_sessions', 'file_analysis_history') + ORDER BY table_name; + """ + ] + + verify_result = subprocess.run( + verify_cmd, + env=env, + capture_output=True, + text=True, + timeout=30 + ) + + if verify_result.returncode == 0: + tables = [line.strip() for line in verify_result.stdout.split('\n') if line.strip()] + logger.info(f" āœ“ Found {len(tables)} core tables: {', '.join(tables)}") + else: + logger.warning(" ⚠ Could not verify table creation") + + # Check for pgvector extension + vector_cmd = [ + 'psql', + '-h', db_config['host'], + '-p', str(db_config['port']), + '-U', db_config['user'], + '-d', db_config['database'], + '-t', + '-c', "SELECT EXISTS(SELECT 1 FROM pg_extension WHERE extname = 'vector');" + ] + + vector_result = subprocess.run( + vector_cmd, + env=env, + capture_output=True, + text=True, + timeout=30 + ) + + if vector_result.returncode == 0: + has_vector = vector_result.stdout.strip() == 't' + if has_vector: + logger.info(" āœ“ pgvector extension is available") + else: + logger.warning(" ⚠ pgvector extension not available - vector operations will be limited") + + logger.info("šŸš€ Database migration completed successfully!") + logger.info("šŸ“Š Production-level database ready for AI repository analysis") + + return True + + except subprocess.TimeoutExpired: + logger.error("āŒ Migration timed out after 5 minutes") + return False + except FileNotFoundError: + logger.error("āŒ psql command not found. 
Please install PostgreSQL client tools.") + return False + except Exception as e: + logger.error(f"āŒ Migration failed: {e}") + return False + +def check_psql_available(): + """Check if psql command is available.""" + try: + result = subprocess.run(['psql', '--version'], capture_output=True, text=True) + if result.returncode == 0: + logger.info(f"āœ“ Found psql: {result.stdout.strip()}") + return True + else: + return False + except FileNotFoundError: + return False + +if __name__ == "__main__": + logger.info("šŸ”§ AI Repository Analysis Database Migration") + logger.info("=" * 50) + + # Check if psql is available + if not check_psql_available(): + logger.error("āŒ psql command not found!") + logger.error("Please install PostgreSQL client tools:") + logger.error(" Ubuntu/Debian: sudo apt-get install postgresql-client") + logger.error(" CentOS/RHEL: sudo yum install postgresql") + logger.error(" macOS: brew install postgresql") + sys.exit(1) + + # Run migration + success = run_migration() + sys.exit(0 if success else 1) diff --git a/services/ai-analysis-service/requirements.txt b/services/ai-analysis-service/requirements.txt new file mode 100644 index 0000000..78e4a11 --- /dev/null +++ b/services/ai-analysis-service/requirements.txt @@ -0,0 +1,25 @@ +# Core AI and API dependencies +anthropic>=0.7.0 +python-dotenv>=1.0.0 + +# Web framework +fastapi>=0.104.1 +uvicorn>=0.24.0 +pydantic>=2.5.0 + +# Git operations +GitPython>=3.1.40 + +# Database dependencies +redis>=4.5.0 +pymongo>=4.5.0 +psycopg2-binary>=2.9.7 + +# Data processing +numpy>=1.24.0 + +# PDF generation +reportlab>=4.0.0 + +# Optional: For better performance (if needed) +# sentence-transformers>=2.2.2 # Commented out - using Claude API instead diff --git a/services/ai-analysis-service/run_migration.py b/services/ai-analysis-service/run_migration.py new file mode 100644 index 0000000..595fe47 --- /dev/null +++ b/services/ai-analysis-service/run_migration.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 +""" +AI Analysis Service Database Migration Runner +Runs the database migration for AI Analysis Service during container startup. +""" + +import os +import sys +import subprocess +import time +from pathlib import Path + +def log(message): + """Log with timestamp.""" + print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] {message}") + +def check_database_connection(): + """Check if database is available.""" + try: + import psycopg2 + from dotenv import load_dotenv + + load_dotenv() + + conn = psycopg2.connect( + host=os.getenv('POSTGRES_HOST', 'localhost'), + port=os.getenv('POSTGRES_PORT', 5432), + database=os.getenv('POSTGRES_DB', 'dev_pipeline'), + user=os.getenv('POSTGRES_USER', 'pipeline_admin'), + password=os.getenv('POSTGRES_PASSWORD', 'secure_pipeline_2024') + ) + conn.close() + return True + except Exception as e: + log(f"Database connection failed: {e}") + return False + +def run_migration(): + """Run the database migration.""" + try: + log("Starting AI Analysis Service database migration...") + + # Check if database is available + max_retries = 30 + retry_count = 0 + + while retry_count < max_retries: + if check_database_connection(): + log("Database connection successful") + break + else: + retry_count += 1 + log(f"Database not ready, retrying in 2 seconds... 
+                time.sleep(2)
+        else:
+            log("ERROR: Could not connect to database after 60 seconds")
+            return False
+
+        # Run the migration script
+        schema_file = Path(__file__).parent / "001-schema.sql"
+        if not schema_file.exists():
+            log("ERROR: Schema file not found")
+            return False
+
+        log(f"Running migration from {schema_file}")
+
+        # Use psql to run the migration
+        env = os.environ.copy()
+        env['PGPASSWORD'] = os.getenv('POSTGRES_PASSWORD', 'secure_pipeline_2024')
+
+        result = subprocess.run([
+            'psql',
+            '-h', os.getenv('POSTGRES_HOST', 'localhost'),
+            '-p', os.getenv('POSTGRES_PORT', '5432'),
+            '-U', os.getenv('POSTGRES_USER', 'pipeline_admin'),
+            '-d', os.getenv('POSTGRES_DB', 'dev_pipeline'),
+            '-f', str(schema_file),
+            '-v', 'ON_ERROR_STOP=1'
+        ], env=env, capture_output=True, text=True)
+
+        if result.returncode == 0:
+            log("✅ AI Analysis Service database migration completed successfully")
+            return True
+        else:
+            log(f"❌ Migration failed: {result.stderr}")
+            return False
+
+    except Exception as e:
+        log(f"❌ Migration error: {e}")
+        return False
+
+if __name__ == "__main__":
+    success = run_migration()
+    sys.exit(0 if success else 1)
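run_migration.py is documented as running during container startup, but the Dockerfile and entrypoint are not part of this patch. One way it might be wired up, assuming both scripts live in /app as server.py already expects, is a small launcher along the lines of the sketch below (entrypoint.py is a hypothetical name, not something in the commit).

# entrypoint.py - hypothetical launcher sketch: apply the migration, then start the API.
# Paths assume the service's /app layout; adjust to match the real Dockerfile.
import subprocess
import sys

migration = subprocess.run([sys.executable, "/app/run_migration.py"])
if migration.returncode != 0:
    # Fail fast so the container exits and the orchestrator can restart it.
    sys.exit(migration.returncode)

# Start the HTTP server only once the schema is in place.
subprocess.run([sys.executable, "/app/server.py"], check=True)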
diff --git a/services/ai-analysis-service/server.py b/services/ai-analysis-service/server.py
new file mode 100644
index 0000000..3de8039
--- /dev/null
+++ b/services/ai-analysis-service/server.py
@@ -0,0 +1,230 @@
+#!/usr/bin/env python3
+"""
+AI Analysis Service HTTP Server
+Provides REST API endpoints for repository analysis.
+"""
+
+import os
+import asyncio
+import json
+import tempfile
+import shutil
+from pathlib import Path
+from typing import Dict, Any, Optional
+from datetime import datetime
+
+from fastapi import FastAPI, HTTPException, BackgroundTasks
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import FileResponse
+from pydantic import BaseModel
+import uvicorn
+
+# Import the AI analysis components
+# Note: ai-analyze.py has a hyphen, so we need to handle the import specially
+import sys
+import importlib.util
+
+# Load the ai-analyze.py module
+spec = importlib.util.spec_from_file_location("ai_analyze", "/app/ai-analyze.py")
+ai_analyze_module = importlib.util.module_from_spec(spec)
+sys.modules["ai_analyze"] = ai_analyze_module
+spec.loader.exec_module(ai_analyze_module)
+
+# Now import the classes
+from ai_analyze import EnhancedGitHubAnalyzer, get_memory_config
+
+app = FastAPI(
+    title="AI Analysis Service",
+    description="AI-powered repository analysis with memory system",
+    version="1.0.0"
+)
+
+# CORS middleware
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Global analyzer instance
+analyzer = None
+
+class AnalysisRequest(BaseModel):
+    repo_path: str
+    output_format: str = "pdf"  # pdf, json
+    max_files: int = 50
+
+class AnalysisResponse(BaseModel):
+    success: bool
+    message: str
+    # Optional[...] so the error path can return None values under Pydantic v2
+    analysis_id: Optional[str] = None
+    report_path: Optional[str] = None
+    stats: Optional[Dict[str, Any]] = None
+
+@app.on_event("startup")
+async def startup_event():
+    """Initialize the analyzer on startup."""
+    global analyzer
+    try:
+        # Load environment variables
+        from dotenv import load_dotenv
+        load_dotenv()
+
+        # Get API key
+        api_key = os.getenv('ANTHROPIC_API_KEY')
+        if not api_key:
+            raise Exception("ANTHROPIC_API_KEY not found in environment")
+
+        # Initialize analyzer
+        config = get_memory_config()
+        analyzer = EnhancedGitHubAnalyzer(api_key, config)
+
+        print("✅ AI Analysis Service initialized successfully")
+    except Exception as e:
+        print(f"❌ Failed to initialize AI Analysis Service: {e}")
+        raise
+
+@app.get("/health")
+async def health_check():
+    """Health check endpoint."""
+    return {
+        "status": "healthy",
+        "service": "ai-analysis-service",
+        "timestamp": datetime.now().isoformat(),
+        "version": "1.0.0"
+    }
+
+@app.post("/analyze", response_model=AnalysisResponse)
+async def analyze_repository(request: AnalysisRequest, background_tasks: BackgroundTasks):
+    """Analyze a repository."""
+    try:
+        if not analyzer:
+            raise HTTPException(status_code=500, detail="Analyzer not initialized")
+
+        # Generate unique analysis ID
+        analysis_id = f"analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+
+        # Create temporary directory for this analysis
+        temp_dir = tempfile.mkdtemp(prefix=f"ai_analysis_{analysis_id}_")
+
+        try:
+            # Run analysis
+            analysis = await analyzer.analyze_repository_with_memory(
+                request.repo_path,
+                max_files=request.max_files
+            )
+
+            # Generate report
+            if request.output_format == "pdf":
+                report_path = f"/app/reports/{analysis_id}_analysis.pdf"
+                analyzer.create_pdf_report(analysis, report_path)
+            else:
+                report_path = f"/app/reports/{analysis_id}_analysis.json"
+                with open(report_path, 'w') as f:
+                    json.dump({
+                        "repo_path": analysis.repo_path,
+                        "total_files": analysis.total_files,
+                        "total_lines": analysis.total_lines,
+                        "languages": analysis.languages,
+                        "code_quality_score": analysis.code_quality_score,
+                        "architecture_assessment": analysis.architecture_assessment,
+                        "security_assessment": analysis.security_assessment,
+                        "executive_summary": analysis.executive_summary,
+                        "file_analyses": [
+                            {
+                                "path": fa.path,
+                                "language": fa.language,
+                                "lines_of_code": fa.lines_of_code,
+                                "severity_score": fa.severity_score,
+                                "issues_found": fa.issues_found,
+                                "recommendations": fa.recommendations
+                            } for fa in analysis.file_analyses
+                        ]
+                    }, f, indent=2)
+
+            # Calculate stats
+            stats = {
+                "total_files": analysis.total_files,
+                "total_lines": analysis.total_lines,
+                "languages": analysis.languages,
+                "code_quality_score": analysis.code_quality_score,
+                "high_quality_files": len([fa for fa in analysis.file_analyses if fa.severity_score >= 8]),
+                "medium_quality_files": len([fa for fa in analysis.file_analyses if 5 <= fa.severity_score < 8]),
+                "low_quality_files": len([fa for fa in analysis.file_analyses if fa.severity_score < 5]),
+                "total_issues": sum(len(fa.issues_found) for fa in analysis.file_analyses)
+            }
+
+            return AnalysisResponse(
+                success=True,
+                message="Analysis completed successfully",
+                analysis_id=analysis_id,
+                report_path=report_path,
+                stats=stats
+            )
+
+        finally:
+            # Cleanup temporary directory
+            if os.path.exists(temp_dir):
+                shutil.rmtree(temp_dir)
+
+    except Exception as e:
+        return AnalysisResponse(
+            success=False,
+            message=f"Analysis failed: {str(e)}",
+            analysis_id=None,
+            report_path=None,
+            stats=None
+        )
+
+@app.get("/reports/{filename}")
+async def download_report(filename: str):
+    """Download analysis report."""
+    report_path = f"/app/reports/{filename}"
+    if not os.path.exists(report_path):
+        raise HTTPException(status_code=404, detail="Report not found")
+
+    return FileResponse(
+        report_path,
+        media_type='application/octet-stream',
+        filename=filename
+    )
+
+@app.get("/memory/stats")
+async def get_memory_stats():
+    """Get memory system statistics."""
+    try:
+        if not analyzer:
+            raise HTTPException(status_code=500, detail="Analyzer not initialized")
+
+        stats = await analyzer.memory_manager.get_memory_stats()
+        return {
+            "success": True,
+            "memory_stats": stats
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Failed to get memory stats: {str(e)}")
+
+@app.post("/memory/query")
+async def query_memory(query: str, repo_context: str = ""):
+    """Query the memory system."""
+    try:
+        if not analyzer:
+            raise HTTPException(status_code=500, detail="Analyzer not initialized")
+
+        result = await analyzer.query_memory(query, repo_context)
+        return {
+            "success": True,
+            "query": query,
+            "result": result
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Memory query failed: {str(e)}")
+
+if __name__ == "__main__":
+    port = int(os.getenv('PORT', 8022))
+    host = os.getenv('HOST', '0.0.0.0')
+
+    print(f"🚀 Starting AI Analysis Service on {host}:{port}")
+    uvicorn.run(app, host=host, port=port)
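For reference, a minimal client sketch against the endpoints defined above, using only the Python standard library. It assumes the service is reachable at localhost:8022 (the default PORT above) and that repo_path points somewhere the service container can read; the example path is made up.

# Sketch: call the AI Analysis Service directly. The payload mirrors AnalysisRequest.
import json
import urllib.request

BASE = "http://localhost:8022"

# Health check
with urllib.request.urlopen(f"{BASE}/health", timeout=10) as resp:
    print(json.load(resp))

# Trigger an analysis
payload = json.dumps({
    "repo_path": "/app/temp/example-repo",  # hypothetical path inside the container
    "output_format": "json",
    "max_files": 10,
}).encode("utf-8")
req = urllib.request.Request(
    f"{BASE}/analyze",
    data=payload,
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(req, timeout=600) as resp:  # analysis can take minutes
    result = json.load(resp)
print(result.get("success"), result.get("report_path"))

Going through the gateway instead would use the /api/ai-analysis prefix added in the next file, which strips the prefix before forwarding to this service.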
diff --git a/services/api-gateway/src/server.js b/services/api-gateway/src/server.js
index 4c64e39..2a57093 100644
--- a/services/api-gateway/src/server.js
+++ b/services/api-gateway/src/server.js
@@ -68,6 +68,7 @@ const serviceTargets = {
   DASHBOARD_URL: process.env.DASHBOARD_URL || 'http://localhost:8008',
   SELF_IMPROVING_GENERATOR_URL: process.env.SELF_IMPROVING_GENERATOR_URL || 'http://localhost:8007',
   AI_MOCKUP_URL: process.env.AI_MOCKUP_URL || 'http://localhost:8021',
+  AI_ANALYSIS_URL: process.env.AI_ANALYSIS_URL || 'http://localhost:8022',
 };
 
 // Log service targets for debugging
@@ -1984,6 +1985,76 @@ app.use('/api/mockup',
   }
 );
 
+// AI Analysis Service - Direct HTTP forwarding
+console.log('🔧 Registering /api/ai-analysis proxy route...');
+app.use('/api/ai-analysis',
+  createServiceLimiter(200),
+  // Allow unauthenticated access for AI analysis (public feature)
+  (req, res, next) => {
+    console.log(`🤖 [AI ANALYSIS PROXY] ${req.method} ${req.originalUrl}`);
+    return next();
+  },
+  (req, res, next) => {
+    const aiAnalysisServiceUrl = serviceTargets.AI_ANALYSIS_URL;
+    // Strip the /api/ai-analysis prefix so /api/ai-analysis/analyze -> /analyze at target
+    const rewrittenPath = (req.originalUrl || '').replace(/^\/api\/ai-analysis/, '');
+    const targetUrl = `${aiAnalysisServiceUrl}${rewrittenPath}`;
+    console.log(`🔄 [AI ANALYSIS PROXY] ${req.method} ${req.originalUrl} → ${targetUrl}`);
+
+    res.setTimeout(300000, () => { // 5 minutes timeout for analysis
+      console.error('❌ [AI ANALYSIS PROXY] Response timeout');
+      if (!res.headersSent) {
+        res.status(504).json({ error: 'Gateway timeout', service: 'ai-analysis' });
+      }
+    });
+
+    const options = {
+      method: req.method,
+      url: targetUrl,
+      headers: {
+        'Content-Type': 'application/json',
+        'User-Agent': 'API-Gateway/1.0',
+        'Connection': 'keep-alive',
+        'Authorization': req.headers.authorization,
+        'X-User-ID': req.user?.id || req.user?.userId,
+        ...(req.user?.role && { 'X-User-Role': req.user.role })
+      },
+      timeout: 240000, // 4 minutes timeout
+      validateStatus: () => true,
+      maxRedirects: 0,
+      maxContentLength: 100 * 1024 * 1024, // 100MB max content length
+      maxBodyLength: 100 * 1024 * 1024 // 100MB max body length
+    };
+
+    if (req.method === 'POST' || req.method === 'PUT' || req.method === 'PATCH') {
+      options.data = req.body || {};
+      console.log(`📦 [AI ANALYSIS PROXY] Request body:`, JSON.stringify(req.body));
+    }
+
+    axios(options)
+      .then(response => {
+        console.log(`✅ [AI ANALYSIS PROXY] Response: ${response.status} for ${req.method} ${req.originalUrl}`);
+        if (!res.headersSent) {
+          res.status(response.status).json(response.data);
+        }
+      })
+      .catch(error => {
+        console.error(`❌ [AI ANALYSIS PROXY ERROR]:`, error.message);
+        if (!res.headersSent) {
+          if (error.response) {
+            res.status(error.response.status).json(error.response.data);
+          } else {
+            res.status(502).json({
+              error: 'AI Analysis service unavailable',
+              message: error.code || error.message,
+              service: 'ai-analysis'
+            });
+          }
+        }
+      });
+  }
+);
+
 // Gateway management endpoints
 app.get('/api/gateway/info', authMiddleware.verifyToken, (req, res) => {
   res.json({
@@ -2041,9 +2112,10 @@ app.get('/', (req, res) => {
       deploy: '/api/deploy',
       dashboard: '/api/dashboard',
       self_improving: '/api/self-improving',
-      mockup: '/api/mockup',
-      unison: '/api/unison',
-      unified: '/api/recommendations'
+      mockup: '/api/mockup',
+      ai_analysis: '/api/ai-analysis',
+      unison: '/api/unison',
+      unified: '/api/recommendations'
     },
     websocket: {
       endpoint: '/socket.io/',