diff --git a/docker-compose.yml b/docker-compose.yml
index 9c05177..f291a94 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -4,7 +4,7 @@ services:
# =====================================
postgres:
- image: postgres:15
+ image: pgvector/pgvector:pg15
container_name: pipeline_postgres
environment:
POSTGRES_USER: pipeline_admin
@@ -31,7 +31,7 @@ services:
volumes:
- redis_data:/data
ports:
- - "6379:6379"
+ - "6380:6379"
networks:
- pipeline_network
healthcheck:
@@ -714,6 +714,55 @@ services:
timeout: 10s
retries: 3
start_period: 40s
+
+ # =====================================
+ # AI Analysis Service
+ # =====================================
+
+ ai-analysis-service:
+ build: ./services/ai-analysis-service
+ container_name: pipeline_ai_analysis_service
+ ports:
+ - "8022:8022"
+ environment:
+ - PORT=8022
+ - HOST=0.0.0.0
+      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}  # supplied via the host environment or a .env file; never commit the raw key
+ - POSTGRES_HOST=postgres
+ - POSTGRES_PORT=5432
+ - POSTGRES_DB=dev_pipeline
+ - POSTGRES_USER=pipeline_admin
+ - POSTGRES_PASSWORD=secure_pipeline_2024
+ - REDIS_HOST=redis
+ - REDIS_PORT=6379
+ - REDIS_PASSWORD=redis_secure_2024
+ - MONGODB_URL=mongodb://pipeline_admin:mongo_secure_2024@mongodb:27017/
+ - MONGODB_DB=repo_analyzer
+ - JWT_ACCESS_SECRET=access-secret-key-2024-tech4biz-secure_pipeline_2024
+ - USER_AUTH_SERVICE_URL=http://user-auth:8011
+ - PYTHONUNBUFFERED=1
+ volumes:
+ - ai_analysis_logs:/app/logs
+ - ai_analysis_reports:/app/reports
+ - ai_analysis_temp:/app/temp
+ networks:
+ - pipeline_network
+ depends_on:
+ postgres:
+ condition: service_healthy
+ redis:
+ condition: service_healthy
+ mongodb:
+ condition: service_started
+ migrations:
+ condition: service_completed_successfully
+ healthcheck:
+ test: ["CMD", "curl", "-f", "http://localhost:8022/health"]
+ interval: 30s
+ timeout: 10s
+ retries: 3
+ start_period: 60s
+ restart: unless-stopped
# =====================================
# Workflow Orchestration
# =====================================
@@ -827,6 +876,12 @@ volumes:
driver: local
migration_state:
driver: local
+ ai_analysis_logs:
+ driver: local
+ ai_analysis_reports:
+ driver: local
+ ai_analysis_temp:
+ driver: local
# =====================================
# Networks
@@ -834,11 +889,3 @@ volumes:
networks:
pipeline_network:
driver: bridge
- # =====================================
- # Self-Improving Code Generator
- # =====================================
-
-
- # =====================================
- # Self-Improving Code Generator
- # =====================================
diff --git a/fix_provider_names.sql b/fix_provider_names.sql
new file mode 100644
index 0000000..d894fd2
--- /dev/null
+++ b/fix_provider_names.sql
@@ -0,0 +1,95 @@
+-- Fix provider_name based on repository URLs across ALL tables
+-- This script updates the provider_name field to match the actual provider from the repository URL
+
+-- =============================================
+-- 1. Fix all_repositories table
+-- =============================================
+UPDATE all_repositories
+SET provider_name = 'github'
+WHERE repository_url LIKE '%github.com%'
+ OR repository_url LIKE '%github.io%';
+
+UPDATE all_repositories
+SET provider_name = 'gitlab'
+WHERE repository_url LIKE '%gitlab.com%'
+ OR repository_url LIKE '%gitlab.io%';
+
+UPDATE all_repositories
+SET provider_name = 'bitbucket'
+WHERE repository_url LIKE '%bitbucket.org%'
+ OR repository_url LIKE '%bitbucket.io%';
+
+UPDATE all_repositories
+SET provider_name = 'gitea'
+WHERE repository_url LIKE '%gitea.com%'
+ OR repository_url LIKE '%gitea.io%';
+
+-- =============================================
+-- 2. Fix repository_storage table (linked to all_repositories)
+-- =============================================
+UPDATE repository_storage
+SET provider_name = ar.provider_name
+FROM all_repositories ar
+WHERE repository_storage.repository_id = ar.id;
+
+-- =============================================
+-- 3. Fix repository_commit_details table (linked to all_repositories)
+-- =============================================
+UPDATE repository_commit_details
+SET provider_name = ar.provider_name
+FROM all_repositories ar
+WHERE repository_commit_details.repository_id = ar.id;
+
+-- =============================================
+-- 4. Fix repository_commit_files table (linked to all_repositories)
+-- =============================================
+UPDATE repository_commit_files
+SET provider_name = ar.provider_name
+FROM all_repositories ar
+WHERE repository_commit_files.repository_id = ar.id;
+
+-- =============================================
+-- 5. Fix repository_directories table (linked to all_repositories)
+-- =============================================
+UPDATE repository_directories
+SET provider_name = ar.provider_name
+FROM all_repositories ar
+WHERE repository_directories.repository_id = ar.id;
+
+-- =============================================
+-- 6. Fix repository_files table (linked to all_repositories)
+-- =============================================
+UPDATE repository_files
+SET provider_name = ar.provider_name
+FROM all_repositories ar
+WHERE repository_files.repository_id = ar.id;
+
+-- =============================================
+-- 7. Show results for verification
+-- =============================================
+
+-- Show all_repositories results
+SELECT
+ 'all_repositories' as table_name,
+ repository_url,
+ repository_name,
+ owner_name,
+ provider_name,
+ CASE
+ WHEN repository_url LIKE '%github.com%' OR repository_url LIKE '%github.io%' THEN 'github'
+ WHEN repository_url LIKE '%gitlab.com%' OR repository_url LIKE '%gitlab.io%' THEN 'gitlab'
+ WHEN repository_url LIKE '%bitbucket.org%' OR repository_url LIKE '%bitbucket.io%' THEN 'bitbucket'
+ WHEN repository_url LIKE '%gitea.com%' OR repository_url LIKE '%gitea.io%' THEN 'gitea'
+ ELSE 'unknown'
+ END as detected_provider
+FROM all_repositories
+ORDER BY provider_name, repository_name;
+
+-- Show summary counts by provider
+SELECT
+ 'Summary by Provider' as info,
+ provider_name,
+ COUNT(*) as count
+FROM all_repositories
+GROUP BY provider_name
+ORDER BY provider_name;
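+
+-- =============================================
+-- 8. Example invocation (illustrative)
+-- =============================================
+-- The UPDATEs above are idempotent, so the script can be re-run safely.
+-- Assuming the docker-compose Postgres service defined in this repository,
+-- one way to apply it is:
+--   psql -h localhost -p 5432 -U pipeline_admin -d dev_pipeline -f fix_provider_names.sql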
diff --git a/services/ai-analysis-service/001-schema.sql b/services/ai-analysis-service/001-schema.sql
new file mode 100644
index 0000000..a775c8c
--- /dev/null
+++ b/services/ai-analysis-service/001-schema.sql
@@ -0,0 +1,613 @@
+-- ================================================
+-- Repository Analyzer Memory System Database Migration
+-- Version: 1.0
+-- Description: Complete database setup for AI memory system
+-- ================================================
+
+-- Enable required extensions
+CREATE EXTENSION IF NOT EXISTS vector;
+CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
+
+-- ================================================
+-- CORE TABLES
+-- ================================================
+
+-- Code embeddings table for semantic search of analyzed code
+CREATE TABLE IF NOT EXISTS code_embeddings (
+ id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
+ repo_id VARCHAR(255) NOT NULL,
+ file_path TEXT NOT NULL,
+ content_hash VARCHAR(64) NOT NULL,
+ embedding vector(384) NOT NULL,
+ metadata JSONB DEFAULT '{}',
+ created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+ last_accessed TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+ access_count INTEGER DEFAULT 0,
+
+ -- Ensure uniqueness per repo/file/hash combination
+ CONSTRAINT unique_code_analysis UNIQUE(repo_id, file_path, content_hash)
+);
+
+-- Query embeddings for episodic memory (user interactions)
+CREATE TABLE IF NOT EXISTS query_embeddings (
+ id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
+ session_id VARCHAR(255) NOT NULL,
+ query_text TEXT NOT NULL,
+ query_embedding vector(384) NOT NULL,
+ response_embedding vector(384),
+ repo_context VARCHAR(255),
+ timestamp TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+ metadata JSONB DEFAULT '{}',
+
+ -- Index for session-based queries
+ CONSTRAINT valid_session_id CHECK (LENGTH(session_id) > 0)
+);
+
+-- Persistent knowledge embeddings for long-term learning
+CREATE TABLE IF NOT EXISTS knowledge_embeddings (
+ id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
+ fact_id VARCHAR(255) UNIQUE NOT NULL,
+ content TEXT NOT NULL,
+ category VARCHAR(100) NOT NULL,
+ embedding vector(384) NOT NULL,
+ confidence REAL DEFAULT 1.0 CHECK (confidence >= 0.0 AND confidence <= 1.0),
+ source_repos TEXT[] DEFAULT '{}',
+ created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+ last_accessed TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+ access_frequency INTEGER DEFAULT 0,
+
+ -- Ensure valid categories
+ CONSTRAINT valid_category CHECK (category IN ('code_pattern', 'best_practice', 'vulnerability', 'architecture', 'security_vulnerability', 'performance'))
+);
+
+-- Repository metadata for tracking analyzed repositories
+CREATE TABLE IF NOT EXISTS repository_metadata (
+ id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
+ repo_id VARCHAR(255) UNIQUE NOT NULL,
+ repo_path TEXT NOT NULL,
+ repo_name VARCHAR(500),
+ primary_language VARCHAR(100),
+ total_files INTEGER DEFAULT 0,
+ total_lines INTEGER DEFAULT 0,
+ last_analyzed TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+ analysis_count INTEGER DEFAULT 0,
+ quality_score REAL DEFAULT 5.0 CHECK (quality_score >= 0.0 AND quality_score <= 10.0),
+ metadata JSONB DEFAULT '{}'
+);
+
+-- Session tracking for episodic memory correlation
+CREATE TABLE IF NOT EXISTS analysis_sessions (
+ id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
+ session_id VARCHAR(255) UNIQUE NOT NULL,
+ user_identifier VARCHAR(255),
+ start_time TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+ end_time TIMESTAMP WITH TIME ZONE,
+ total_queries INTEGER DEFAULT 0,
+ repositories_analyzed TEXT[] DEFAULT '{}',
+ session_metadata JSONB DEFAULT '{}'
+);
+
+-- File analysis history for change tracking
+CREATE TABLE IF NOT EXISTS file_analysis_history (
+ id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
+ repo_id VARCHAR(255) NOT NULL,
+ file_path TEXT NOT NULL,
+ content_hash VARCHAR(64) NOT NULL,
+ language VARCHAR(100),
+ lines_of_code INTEGER DEFAULT 0,
+ complexity_score REAL DEFAULT 0.0,
+ severity_score REAL DEFAULT 5.0 CHECK (severity_score >= 0.0 AND severity_score <= 10.0),
+ issues_count INTEGER DEFAULT 0,
+ analyzed_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+ analysis_version VARCHAR(50) DEFAULT '1.0'
+);
+
+-- Memory consolidation log for tracking knowledge extraction
+CREATE TABLE IF NOT EXISTS memory_consolidation_log (
+ id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
+ source_type VARCHAR(50) NOT NULL, -- 'episodic', 'code_analysis', 'manual'
+ source_id VARCHAR(255) NOT NULL,
+ target_memory_type VARCHAR(50) NOT NULL, -- 'persistent', 'working'
+ target_id VARCHAR(255),
+ consolidation_confidence REAL DEFAULT 0.5,
+ consolidation_timestamp TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+ consolidation_metadata JSONB DEFAULT '{}'
+);
+
+-- ================================================
+-- PERFORMANCE INDEXES
+-- ================================================
+
+-- Code embeddings indexes
+CREATE INDEX IF NOT EXISTS idx_code_embeddings_repo_id ON code_embeddings(repo_id);
+CREATE INDEX IF NOT EXISTS idx_code_embeddings_file_path ON code_embeddings(file_path);
+CREATE INDEX IF NOT EXISTS idx_code_embeddings_accessed ON code_embeddings(last_accessed DESC);
+CREATE INDEX IF NOT EXISTS idx_code_embeddings_metadata ON code_embeddings USING gin(metadata);
+
+-- Vector similarity indexes (using IVFFlat for better performance)
+CREATE INDEX IF NOT EXISTS idx_code_embeddings_vector
+ON code_embeddings USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);
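+-- Note: IVFFlat trades recall for speed; recall can be raised per session with
+--   SET ivfflat.probes = 10;   -- pgvector's default is 1 probe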
+
+-- Query embeddings indexes
+CREATE INDEX IF NOT EXISTS idx_query_embeddings_session ON query_embeddings(session_id);
+CREATE INDEX IF NOT EXISTS idx_query_embeddings_timestamp ON query_embeddings(timestamp DESC);
+CREATE INDEX IF NOT EXISTS idx_query_embeddings_repo_context ON query_embeddings(repo_context);
+CREATE INDEX IF NOT EXISTS idx_query_embeddings_vector
+ON query_embeddings USING ivfflat (query_embedding vector_cosine_ops) WITH (lists = 100);
+
+-- Knowledge embeddings indexes
+CREATE INDEX IF NOT EXISTS idx_knowledge_embeddings_category ON knowledge_embeddings(category);
+CREATE INDEX IF NOT EXISTS idx_knowledge_embeddings_confidence ON knowledge_embeddings(confidence DESC);
+CREATE INDEX IF NOT EXISTS idx_knowledge_embeddings_access_freq ON knowledge_embeddings(access_frequency DESC);
+CREATE INDEX IF NOT EXISTS idx_knowledge_embeddings_vector
+ON knowledge_embeddings USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);
+CREATE INDEX IF NOT EXISTS idx_knowledge_source_repos ON knowledge_embeddings USING gin(source_repos);
+
+-- Repository metadata indexes
+CREATE INDEX IF NOT EXISTS idx_repository_metadata_repo_id ON repository_metadata(repo_id);
+CREATE INDEX IF NOT EXISTS idx_repository_metadata_analyzed ON repository_metadata(last_analyzed DESC);
+CREATE INDEX IF NOT EXISTS idx_repository_metadata_language ON repository_metadata(primary_language);
+
+-- File history indexes
+CREATE INDEX IF NOT EXISTS idx_file_history_repo_file ON file_analysis_history(repo_id, file_path);
+CREATE INDEX IF NOT EXISTS idx_file_history_analyzed ON file_analysis_history(analyzed_at DESC);
+CREATE INDEX IF NOT EXISTS idx_file_history_severity ON file_analysis_history(severity_score);
+
+-- ================================================
+-- MATERIALIZED VIEWS FOR COMMON QUERIES
+-- ================================================
+
+-- High confidence knowledge view
+CREATE MATERIALIZED VIEW IF NOT EXISTS high_confidence_knowledge AS
+SELECT
+ fact_id,
+ content,
+ category,
+ confidence,
+ source_repos,
+ created_at,
+ last_accessed,
+ access_frequency
+FROM knowledge_embeddings
+WHERE confidence > 0.8
+ORDER BY confidence DESC, access_frequency DESC;
+
+CREATE INDEX ON high_confidence_knowledge (category);
+CREATE INDEX ON high_confidence_knowledge (confidence DESC);
+
+-- Repository quality summary view
+CREATE MATERIALIZED VIEW IF NOT EXISTS repository_quality_summary AS
+SELECT
+ rm.repo_id,
+ rm.repo_path,
+ rm.repo_name,
+ rm.primary_language,
+ rm.total_files,
+ rm.total_lines,
+ rm.quality_score,
+ rm.last_analyzed,
+ COUNT(ce.id) as total_embeddings,
+ AVG(fah.severity_score) as avg_file_quality,
+ COUNT(DISTINCT fah.file_path) as analyzed_files_count
+FROM repository_metadata rm
+LEFT JOIN code_embeddings ce ON rm.repo_id = ce.repo_id
+LEFT JOIN file_analysis_history fah ON rm.repo_id = fah.repo_id
+GROUP BY rm.repo_id, rm.repo_path, rm.repo_name, rm.primary_language,
+ rm.total_files, rm.total_lines, rm.quality_score, rm.last_analyzed;
+
+CREATE INDEX ON repository_quality_summary (quality_score DESC);
+CREATE INDEX ON repository_quality_summary (last_analyzed DESC);
+
+-- Recent activity view
+CREATE MATERIALIZED VIEW IF NOT EXISTS recent_activity AS
+SELECT
+ 'query' as activity_type,
+ session_id as identifier,
+ query_text as description,
+ timestamp as activity_time,
+ repo_context
+FROM query_embeddings
+WHERE timestamp >= CURRENT_TIMESTAMP - INTERVAL '7 days'
+UNION ALL
+SELECT
+ 'analysis' as activity_type,
+ repo_id as identifier,
+ file_path as description,
+ analyzed_at as activity_time,
+ repo_id as repo_context
+FROM file_analysis_history
+WHERE analyzed_at >= CURRENT_TIMESTAMP - INTERVAL '7 days'
+ORDER BY activity_time DESC;
+
+CREATE INDEX ON recent_activity (activity_time DESC);
+CREATE INDEX ON recent_activity (activity_type);
+
+-- ================================================
+-- STORED FUNCTIONS AND PROCEDURES
+-- ================================================
+
+-- Function to refresh all materialized views
+CREATE OR REPLACE FUNCTION refresh_memory_views()
+RETURNS void AS $$
+BEGIN
+    -- Plain refreshes: REFRESH ... CONCURRENTLY requires a unique index on
+    -- each materialized view, which these views do not define.
+    REFRESH MATERIALIZED VIEW high_confidence_knowledge;
+    REFRESH MATERIALIZED VIEW repository_quality_summary;
+    REFRESH MATERIALIZED VIEW recent_activity;
+
+ -- Log the refresh
+ INSERT INTO memory_consolidation_log (
+ source_type, source_id, target_memory_type, target_id,
+ consolidation_confidence, consolidation_metadata
+ ) VALUES (
+ 'system', 'materialized_views', 'system', 'view_refresh',
+ 1.0, '{"refresh_time": "' || CURRENT_TIMESTAMP || '"}'::jsonb
+ );
+END;
+$$ LANGUAGE plpgsql;
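+
+-- Example usage (a sketch): refresh on demand, or schedule it if the optional
+-- pg_cron extension is available (an assumption -- pg_cron is not set up by this script):
+--   SELECT refresh_memory_views();
+--   SELECT cron.schedule('refresh-memory-views', '0 3 * * *', 'SELECT refresh_memory_views()');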
+
+-- Function to calculate semantic similarity between texts
+CREATE OR REPLACE FUNCTION calculate_similarity(embedding1 vector(384), embedding2 vector(384))
+RETURNS real AS $$
+BEGIN
+ RETURN 1 - (embedding1 <=> embedding2);
+END;
+$$ LANGUAGE plpgsql IMMUTABLE STRICT;
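+
+-- Example (illustrative): similarity between two stored code embeddings
+--   SELECT calculate_similarity(e1.embedding, e2.embedding) AS similarity
+--   FROM code_embeddings e1, code_embeddings e2
+--   WHERE e1.id <> e2.id
+--   LIMIT 5;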
+
+-- Function to update access patterns
+CREATE OR REPLACE FUNCTION update_access_pattern(table_name text, id_column text, id_value text)
+RETURNS void AS $$
+BEGIN
+ CASE table_name
+ WHEN 'knowledge_embeddings' THEN
+ EXECUTE 'UPDATE knowledge_embeddings SET last_accessed = CURRENT_TIMESTAMP, access_frequency = access_frequency + 1 WHERE fact_id = $1'
+ USING id_value;
+ WHEN 'code_embeddings' THEN
+ EXECUTE 'UPDATE code_embeddings SET last_accessed = CURRENT_TIMESTAMP, access_count = access_count + 1 WHERE id = $1::uuid'
+ USING id_value;
+ ELSE
+ RAISE EXCEPTION 'Unsupported table: %', table_name;
+ END CASE;
+END;
+$$ LANGUAGE plpgsql;
+
+-- Function to cleanup old memories
+CREATE OR REPLACE FUNCTION cleanup_old_memories(retention_days integer DEFAULT 365)
+RETURNS integer AS $$
+DECLARE
+ deleted_count integer := 0;
+ cutoff_date timestamp;
+BEGIN
+ cutoff_date := CURRENT_TIMESTAMP - (retention_days || ' days')::interval;
+
+ -- Delete old query embeddings (episodic memories)
+ DELETE FROM query_embeddings WHERE timestamp < cutoff_date;
+ GET DIAGNOSTICS deleted_count = ROW_COUNT;
+
+ -- Update knowledge confidence based on access patterns
+ UPDATE knowledge_embeddings
+ SET confidence = LEAST(confidence * (
+ CASE
+ WHEN EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - last_accessed)) / 86400 < 30
+ THEN 1.05
+ ELSE 0.98
+ END *
+ (1.0 + LOG(access_frequency + 1) / 20.0)
+ ), 1.0);
+
+ -- Log cleanup activity
+ INSERT INTO memory_consolidation_log (
+ source_type, source_id, target_memory_type, target_id,
+ consolidation_confidence, consolidation_metadata
+ ) VALUES (
+ 'system', 'cleanup_function', 'system', 'memory_cleanup',
+ 1.0, ('{"deleted_records": ' || deleted_count || ', "cutoff_date": "' || cutoff_date || '"}')::jsonb
+ );
+
+ RETURN deleted_count;
+END;
+$$ LANGUAGE plpgsql;
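+
+-- Example (illustrative): drop episodic memories older than ~6 months
+--   SELECT cleanup_old_memories(180);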
+
+-- Function to find similar code patterns
+CREATE OR REPLACE FUNCTION find_similar_code(
+ query_embedding vector(384),
+ repo_filter text DEFAULT NULL,
+ similarity_threshold real DEFAULT 0.7,
+ max_results integer DEFAULT 10
+)
+RETURNS TABLE (
+ id uuid,
+ repo_id varchar(255),
+ file_path text,
+ similarity real,
+ metadata jsonb
+) AS $$
+BEGIN
+ RETURN QUERY
+ SELECT
+ ce.id,
+ ce.repo_id,
+ ce.file_path,
+ (1 - (ce.embedding <=> query_embedding))::real as similarity,
+ ce.metadata
+ FROM code_embeddings ce
+ WHERE (repo_filter IS NULL OR ce.repo_id = repo_filter)
+ AND (1 - (ce.embedding <=> query_embedding)) > similarity_threshold
+ ORDER BY similarity DESC
+ LIMIT max_results;
+END;
+$$ LANGUAGE plpgsql;
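+
+-- Example (illustrative; in practice the query vector comes from the
+-- application's embedding model rather than from SQL):
+--   SELECT id, repo_id, file_path, similarity
+--   FROM find_similar_code(
+--       (SELECT embedding FROM code_embeddings LIMIT 1),  -- stand-in query vector
+--       NULL,   -- no repo filter
+--       0.75,   -- similarity threshold
+--       5       -- max results
+--   );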
+
+-- Function to get knowledge by category
+CREATE OR REPLACE FUNCTION get_knowledge_by_category(
+ category_filter varchar(100),
+ min_confidence real DEFAULT 0.5,
+ max_results integer DEFAULT 20
+)
+RETURNS TABLE (
+ fact_id varchar(255),
+ content text,
+ confidence real,
+ access_frequency integer,
+ source_repos text[]
+) AS $$
+BEGIN
+ RETURN QUERY
+ SELECT
+ ke.fact_id,
+ ke.content,
+ ke.confidence,
+ ke.access_frequency,
+ ke.source_repos
+ FROM knowledge_embeddings ke
+ WHERE ke.category = category_filter
+ AND ke.confidence >= min_confidence
+ ORDER BY ke.confidence DESC, ke.access_frequency DESC
+ LIMIT max_results;
+END;
+$$ LANGUAGE plpgsql;
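+
+-- Example (illustrative): high-confidence security findings
+--   SELECT * FROM get_knowledge_by_category('security_vulnerability', 0.8, 10);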
+
+-- ================================================
+-- TRIGGERS FOR AUTOMATIC MAINTENANCE
+-- ================================================
+
+-- Trigger function to update repository metadata when embeddings are added
+CREATE OR REPLACE FUNCTION update_repository_stats()
+RETURNS trigger AS $$
+BEGIN
+ -- Update or insert repository metadata
+ INSERT INTO repository_metadata (repo_id, repo_path, analysis_count, last_analyzed)
+ VALUES (NEW.repo_id, NEW.repo_id, 1, CURRENT_TIMESTAMP)
+ ON CONFLICT (repo_id)
+ DO UPDATE SET
+ analysis_count = repository_metadata.analysis_count + 1,
+ last_analyzed = CURRENT_TIMESTAMP;
+
+ RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+
+-- Create triggers
+DROP TRIGGER IF EXISTS trigger_update_repo_stats ON code_embeddings;
+CREATE TRIGGER trigger_update_repo_stats
+ AFTER INSERT ON code_embeddings
+ FOR EACH ROW
+ EXECUTE FUNCTION update_repository_stats();
+
+-- Trigger to automatically update access patterns
+CREATE OR REPLACE FUNCTION auto_update_access()
+RETURNS trigger AS $$
+BEGIN
+ NEW.last_accessed = CURRENT_TIMESTAMP;
+ NEW.access_count = COALESCE(OLD.access_count, 0) + 1;
+ RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+
+DROP TRIGGER IF EXISTS trigger_auto_access_update ON code_embeddings;
+CREATE TRIGGER trigger_auto_access_update
+ BEFORE UPDATE ON code_embeddings
+ FOR EACH ROW
+ EXECUTE FUNCTION auto_update_access();
+
+-- ================================================
+-- SECURITY AND PERMISSIONS
+-- ================================================
+
+-- Create roles for different access levels
+DO $$
+BEGIN
+ IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'repo_analyzer_read') THEN
+ CREATE ROLE repo_analyzer_read;
+ END IF;
+
+ IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'repo_analyzer_write') THEN
+ CREATE ROLE repo_analyzer_write;
+ END IF;
+
+ IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'repo_analyzer_admin') THEN
+ CREATE ROLE repo_analyzer_admin;
+ END IF;
+END
+$$;
+
+-- Grant permissions
+GRANT SELECT ON ALL TABLES IN SCHEMA public TO repo_analyzer_read;
+GRANT SELECT ON high_confidence_knowledge TO repo_analyzer_read;
+GRANT SELECT ON repository_quality_summary TO repo_analyzer_read;
+GRANT SELECT ON recent_activity TO repo_analyzer_read;
+
+GRANT SELECT, INSERT, UPDATE ON ALL TABLES IN SCHEMA public TO repo_analyzer_write;
+GRANT SELECT ON high_confidence_knowledge TO repo_analyzer_write;
+GRANT SELECT ON repository_quality_summary TO repo_analyzer_write;
+GRANT SELECT ON recent_activity TO repo_analyzer_write;
+GRANT USAGE, SELECT ON ALL SEQUENCES IN SCHEMA public TO repo_analyzer_write;
+
+GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO repo_analyzer_admin;
+GRANT ALL PRIVILEGES ON high_confidence_knowledge TO repo_analyzer_admin;
+GRANT ALL PRIVILEGES ON repository_quality_summary TO repo_analyzer_admin;
+GRANT ALL PRIVILEGES ON recent_activity TO repo_analyzer_admin;
+GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA public TO repo_analyzer_admin;
+GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA public TO repo_analyzer_admin;
+
+-- ================================================
+-- DATA VALIDATION AND CONSTRAINTS
+-- ================================================
+
+-- Add check constraints for data quality
+-- Note: Vector dimensions are validated at insertion time, no need for runtime checks
+
+-- Add constraints for reasonable data ranges
+DO $$
+BEGIN
+ IF NOT EXISTS (SELECT 1 FROM pg_constraint WHERE conname = 'reasonable_lines_of_code') THEN
+ ALTER TABLE file_analysis_history ADD CONSTRAINT reasonable_lines_of_code
+ CHECK (lines_of_code >= 0 AND lines_of_code <= 1000000);
+ END IF;
+
+ IF NOT EXISTS (SELECT 1 FROM pg_constraint WHERE conname = 'reasonable_complexity') THEN
+ ALTER TABLE file_analysis_history ADD CONSTRAINT reasonable_complexity
+ CHECK (complexity_score >= 0.0 AND complexity_score <= 100.0);
+ END IF;
+END
+$$;
+
+-- ================================================
+-- INITIAL DATA AND CONFIGURATION
+-- ================================================
+
+-- Insert initial system configuration
+INSERT INTO memory_consolidation_log (
+ source_type, source_id, target_memory_type, target_id,
+ consolidation_confidence, consolidation_metadata
+) VALUES (
+ 'system', 'database_migration', 'system', 'initial_setup',
+ 1.0, ('{"migration_version": "1.0", "setup_time": "' || CURRENT_TIMESTAMP || '"}')::jsonb
+) ON CONFLICT DO NOTHING;
+
+-- Create initial knowledge categories
+INSERT INTO knowledge_embeddings (
+ fact_id, content, category, embedding, confidence, source_repos
+) VALUES
+(
+ 'init_security_001',
+ 'Always validate and sanitize user input to prevent injection attacks',
+ 'security_vulnerability',
+ array_fill(0.0, ARRAY[384])::vector(384),
+ 0.95,
+ ARRAY[]::text[]
+),
+(
+ 'init_performance_001',
+ 'Use appropriate data structures and algorithms for better performance',
+ 'performance',
+ array_fill(0.0, ARRAY[384])::vector(384),
+ 0.9,
+ ARRAY[]::text[]
+),
+(
+ 'init_best_practice_001',
+ 'Follow consistent naming conventions and code formatting standards',
+ 'best_practice',
+ array_fill(0.0, ARRAY[384])::vector(384),
+ 0.85,
+ ARRAY[]::text[]
+)
+ON CONFLICT (fact_id) DO NOTHING;
+
+-- ================================================
+-- BACKUP AND MAINTENANCE PROCEDURES
+-- ================================================
+
+-- Function to create backup of critical memory data
+CREATE OR REPLACE FUNCTION backup_memory_data(backup_path text DEFAULT '/tmp/memory_backup')
+RETURNS text AS $$
+DECLARE
+ backup_file text;
+ result_message text;
+BEGIN
+ backup_file := backup_path || '_' || to_char(CURRENT_TIMESTAMP, 'YYYY-MM-DD_HH24-MI-SS') || '.sql';
+
+ -- This would need to be implemented with actual backup logic
+ -- For now, just return the intended backup file name
+ result_message := 'Backup would be created at: ' || backup_file;
+
+ -- Log backup activity
+ INSERT INTO memory_consolidation_log (
+ source_type, source_id, target_memory_type, target_id,
+ consolidation_confidence, consolidation_metadata
+ ) VALUES (
+ 'system', 'backup_function', 'system', 'backup_created',
+ 1.0, ('{"backup_file": "' || backup_file || '"}')::jsonb
+ );
+
+ RETURN result_message;
+END;
+$$ LANGUAGE plpgsql;
+
+-- ================================================
+-- MONITORING AND ANALYTICS
+-- ================================================
+
+-- View for system health monitoring
+CREATE OR REPLACE VIEW system_health_monitor AS
+SELECT
+ 'code_embeddings' as table_name,
+ COUNT(*) as record_count,
+ MAX(created_at) as latest_record,
+ AVG(access_count) as avg_access_count
+FROM code_embeddings
+UNION ALL
+SELECT
+ 'query_embeddings' as table_name,
+ COUNT(*) as record_count,
+ MAX(timestamp) as latest_record,
+ NULL as avg_access_count
+FROM query_embeddings
+UNION ALL
+SELECT
+ 'knowledge_embeddings' as table_name,
+ COUNT(*) as record_count,
+ MAX(created_at) as latest_record,
+ AVG(access_frequency) as avg_access_count
+FROM knowledge_embeddings;
+
+-- Function to get comprehensive system statistics
+CREATE OR REPLACE FUNCTION get_system_statistics()
+RETURNS jsonb AS $$
+DECLARE
+ stats jsonb;
+BEGIN
+ SELECT jsonb_build_object(
+ 'total_code_embeddings', (SELECT COUNT(*) FROM code_embeddings),
+ 'total_query_embeddings', (SELECT COUNT(*) FROM query_embeddings),
+ 'total_knowledge_embeddings', (SELECT COUNT(*) FROM knowledge_embeddings),
+ 'unique_repositories', (SELECT COUNT(DISTINCT repo_id) FROM code_embeddings),
+ 'high_confidence_knowledge', (SELECT COUNT(*) FROM knowledge_embeddings WHERE confidence > 0.8),
+ 'recent_activity_7d', (SELECT COUNT(*) FROM query_embeddings WHERE timestamp >= CURRENT_TIMESTAMP - INTERVAL '7 days'),
+ 'average_code_quality', (SELECT AVG(quality_score) FROM repository_metadata),
+ 'last_updated', CURRENT_TIMESTAMP
+ ) INTO stats;
+
+ RETURN stats;
+END;
+$$ LANGUAGE plpgsql;
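+
+-- Example (illustrative):
+--   SELECT jsonb_pretty(get_system_statistics());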
+
+-- ================================================
+-- COMPLETION MESSAGE
+-- ================================================
+
+DO $$
+BEGIN
+ RAISE NOTICE '================================================';
+ RAISE NOTICE 'Repository Analyzer Memory System Database Setup Complete';
+ RAISE NOTICE '================================================';
+ RAISE NOTICE 'Tables created: code_embeddings, query_embeddings, knowledge_embeddings';
+ RAISE NOTICE 'Indexes created: Vector similarity indexes with IVFFlat';
+ RAISE NOTICE 'Functions created: Similarity search, cleanup, statistics';
+ RAISE NOTICE 'Materialized views created: High confidence knowledge, repository summary';
+ RAISE NOTICE 'Triggers created: Auto-update repository stats and access patterns';
+ RAISE NOTICE '================================================';
+ RAISE NOTICE 'Ready for AI-enhanced repository analysis with persistent memory';
+ RAISE NOTICE '================================================';
+END
+$$;
\ No newline at end of file
diff --git a/services/ai-analysis-service/Dockerfile b/services/ai-analysis-service/Dockerfile
new file mode 100644
index 0000000..9f3745a
--- /dev/null
+++ b/services/ai-analysis-service/Dockerfile
@@ -0,0 +1,37 @@
+FROM python:3.11-slim
+
+# Set working directory
+WORKDIR /app
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+ git \
+ postgresql-client \
+ curl \
+ build-essential \
+ && rm -rf /var/lib/apt/lists/*
+
+# Copy requirements and install Python dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the service code
+COPY . .
+
+# Create necessary directories
+RUN mkdir -p /app/logs /app/temp /app/reports
+
+# Set environment variables
+ENV PYTHONPATH=/app
+ENV PYTHONUNBUFFERED=1
+ENV PORT=8022
+
+# Expose port
+EXPOSE 8022
+
+# Health check
+HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+ CMD curl -f http://localhost:8022/health || exit 1
+
+# Run migration and then start the service
+CMD ["sh", "-c", "python run_migration.py && python server.py"]
diff --git a/services/ai-analysis-service/README.md b/services/ai-analysis-service/README.md
new file mode 100644
index 0000000..111c0ff
--- /dev/null
+++ b/services/ai-analysis-service/README.md
@@ -0,0 +1,202 @@
+# Complete AI Repository Analysis Service
+
+A comprehensive AI-powered repository analysis tool that automatically analyzes **ALL files** in a repository, with no file-count limit and no user query required.
+
+## Features
+
+- **Complete Analysis**: Analyzes ALL files in the repository (no max-files limit)
+- **Fully Automated**: No user query required - runs completely automatically
+- **Memory-Enhanced**: Learns from previous analyses using advanced memory systems
+- **Comprehensive Reports**: Generates detailed PDF reports with executive summaries
+- **Multi-Database Support**: Uses PostgreSQL, MongoDB, and Redis for optimal performance
+- **Security Focus**: Identifies security vulnerabilities and code quality issues
+- **Architecture Assessment**: Provides architectural insights and recommendations
+
+## Requirements
+
+### System Dependencies
+- Python 3.8+
+- PostgreSQL with pgvector extension
+- MongoDB
+- Redis
+
+### Python Dependencies
+```bash
+pip install anthropic python-dotenv GitPython redis pymongo psycopg2-binary numpy reportlab
+```
+
+## Setup
+
+1. **Install Dependencies**:
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+2. **Database Setup**:
+ ```bash
+ # Run the database migration
+ psql -U postgres -d repo_vectors -f 001-schema.sql
+ ```
+
+3. **Environment Variables**:
+ Create a `.env` file with:
+ ```env
+ ANTHROPIC_API_KEY=your_api_key_here
+ REDIS_HOST=localhost
+ REDIS_PORT=6379
+ REDIS_DB=0
+ MONGODB_URL=mongodb://localhost:27017/
+ MONGODB_DB=repo_analyzer
+ POSTGRES_HOST=localhost
+ POSTGRES_PORT=5432
+ POSTGRES_DB=repo_vectors
+ POSTGRES_USER=postgres
+ POSTGRES_PASSWORD=your_password
+ ```
+
+## Usage
+
+### Basic Usage
+```bash
+python ai-analyze.py /path/to/repository
+```
+
+### With Custom Output
+```bash
+python ai-analyze.py /path/to/repository --output my_analysis.pdf
+```
+
+### With API Key Override
+```bash
+python ai-analyze.py /path/to/repository --api-key your_api_key
+```
+
+## What It Analyzes
+
+### File Types Supported
+- **Programming Languages**: Python, JavaScript, TypeScript, Java, C++, C#, Go, Rust, PHP, Ruby, Swift, Kotlin
+- **Web Technologies**: HTML, CSS, SCSS, SASS
+- **Configuration Files**: JSON, YAML, XML, SQL
+- **Build Files**: Dockerfile, Makefile, CMake, package.json, requirements.txt, Cargo.toml, pom.xml, build.gradle
+- **Documentation**: README.md, Markdown files
+
+### Analysis Coverage
+- **Code Quality**: Complexity, maintainability, best practices
+- **Security**: Vulnerabilities, injection attacks, authentication issues
+- **Architecture**: Project structure, scalability, design patterns
+- **Performance**: Optimization opportunities, bottlenecks
+- **Documentation**: Completeness and quality
+
+## Output
+
+### Console Output
+- Real-time analysis progress
+- Repository statistics
+- Quality breakdown by file
+- Language distribution
+- Memory system statistics
+
+### PDF Report
+- Executive summary for leadership
+- Repository overview with metrics
+- Detailed file-by-file analysis
+- Security assessment
+- Architecture evaluation
+- Recommendations and next steps
+
+## Memory System
+
+The tool uses a sophisticated three-tier memory system:
+
+1. **Working Memory (Redis)**: Temporary, fast access for current analysis
+2. **Episodic Memory (MongoDB)**: User interactions and analysis sessions
+3. **Persistent Memory (PostgreSQL)**: Long-term knowledge and best practices
+
+This allows the tool to learn from previous analyses and provide increasingly accurate insights.
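+
+As a rough sketch of how the three tiers fit together (illustrative only — the client
+setup and the key/collection names below are assumptions, not the service's actual
+internals):
+
+```python
+import redis                     # working memory
+from pymongo import MongoClient  # episodic memory
+import psycopg2                  # persistent memory (pgvector)
+
+# Working memory: short-lived analysis state, keyed per session
+r = redis.Redis(host="localhost", port=6379, db=0)
+r.setex("session:123:current_file", 3600, "src/main.py")
+
+# Episodic memory: a record of the interaction itself
+mongo = MongoClient("mongodb://localhost:27017/")
+mongo["repo_analyzer"]["sessions"].insert_one(
+    {"session_id": "123", "repo": "my-project", "queries": 1}
+)
+
+# Persistent memory: durable knowledge, queried later by category or similarity
+pg = psycopg2.connect(dbname="repo_vectors", user="postgres", password="your_password")
+with pg, pg.cursor() as cur:
+    cur.execute(
+        "SELECT fact_id, content FROM knowledge_embeddings "
+        "WHERE category = %s ORDER BY confidence DESC LIMIT 5",
+        ("best_practice",),
+    )
+    print(cur.fetchall())
+```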
+
+## Configuration
+
+### File Size Limits
+- Default: 2MB per file (configurable in code)
+- Large files are skipped with notification
+
+### Excluded Directories
+- `.git`, `node_modules`, `__pycache__`, `build`, `dist`, `target`
+- `venv`, `env`, `.next`, `coverage`, `vendor`
+- `bower_components`, `.gradle`, `.m2`, `.cargo`
+
+### Rate Limiting
+- 0.1 second delay between file analyses to avoid API rate limits
+- Configurable in the code
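+
+These limits are set as constants in the analyzer code rather than in a config file.
+A sketch of the kind of constants involved (the names below are illustrative, not the
+service's actual identifiers):
+
+```python
+# Hypothetical tuning constants for the analyzer
+MAX_FILE_SIZE_BYTES = 2 * 1024 * 1024   # skip files larger than 2 MB
+ANALYSIS_DELAY_SECONDS = 0.1            # pause between files to respect API rate limits
+EXCLUDED_DIRS = {
+    ".git", "node_modules", "__pycache__", "build", "dist", "target",
+    "venv", "env", ".next", "coverage", "vendor",
+    "bower_components", ".gradle", ".m2", ".cargo",
+}
+```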
+
+## Example Output
+
+```
+Starting Complete AI Repository Analysis
+============================================================
+Repository: /path/to/my-project
+Output: complete_repository_analysis.pdf
+Mode: Complete automated analysis of ALL files
+============================================================
+
+Scanning repository: /path/to/my-project
+Found 127 files to analyze
+Starting comprehensive analysis of 127 files...
+Analyzing file 1/127: main.py
+Analyzing file 2/127: config.js
+...
+
+COMPLETE ANALYSIS FINISHED
+============================================================
+Repository Statistics:
+   • Files Analyzed: 127
+   • Lines of Code: 15,432
+   • Languages: 8
+   • Code Quality: 7.2/10
+
+Quality Breakdown:
+   • High Quality Files (8-10): 45
+   • Medium Quality Files (5-7): 67
+   • Low Quality Files (1-4): 15
+   • Total Issues Found: 89
+
+Language Distribution:
+   • Python: 45 files
+   • JavaScript: 32 files
+   • TypeScript: 28 files
+   • HTML: 12 files
+   • CSS: 10 files
+
+Complete PDF Report: complete_repository_analysis.pdf
+Complete analysis finished successfully!
+```
+
+## Troubleshooting
+
+### Common Issues
+
+1. **Database Connection Errors**:
+ - Ensure PostgreSQL, MongoDB, and Redis are running
+ - Check connection credentials in `.env` file
+
+2. **API Key Issues**:
+ - Verify Anthropic API key is valid and has sufficient credits
+ - Check rate limits if analysis fails
+
+3. **Memory Issues**:
+ - Large repositories may require more RAM
+ - Consider increasing system memory or processing in batches
+
+4. **File Permission Errors**:
+ - Ensure read access to repository files
+ - Check write permissions for output directory
+
+## Contributing
+
+This is a complete automated analysis system. The tool will:
+- Analyze every file in the repository
+- Generate comprehensive reports
+- Learn from previous analyses
+- Provide actionable insights
+
+No user interaction required - just run and get results!
diff --git a/services/ai-analysis-service/ai-analysis/adv_git_analyzer.py b/services/ai-analysis-service/ai-analysis/adv_git_analyzer.py
new file mode 100644
index 0000000..a5f3860
--- /dev/null
+++ b/services/ai-analysis-service/ai-analysis/adv_git_analyzer.py
@@ -0,0 +1,710 @@
+#!/usr/bin/env python3
+"""
+Robust GitHub Repository AI Analysis Tool
+Simplified version with better error handling and JSON parsing.
+"""
+
+import os
+import asyncio
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+from datetime import datetime
+import argparse
+from dataclasses import dataclass
+import shutil
+import tempfile
+import json
+import re
+from collections import Counter
+
+# Core packages
+import anthropic
+from dotenv import load_dotenv
+import git
+
+# PDF generation
+from reportlab.lib.pagesizes import A4
+from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
+from reportlab.lib.enums import TA_CENTER, TA_LEFT
+from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak, Table, TableStyle
+from reportlab.lib import colors
+
+@dataclass
+class FileAnalysis:
+ path: str
+ language: str
+ lines_of_code: int
+ complexity_score: float
+ issues_found: List[str]
+ recommendations: List[str]
+ detailed_analysis: str
+ severity_score: float
+
+@dataclass
+class RepositoryAnalysis:
+ repo_path: str
+ total_files: int
+ total_lines: int
+ languages: Dict[str, int]
+ architecture_assessment: str
+ security_assessment: str
+ code_quality_score: float
+ file_analyses: List[FileAnalysis]
+ executive_summary: str
+
+class RobustGitHubAnalyzer:
+ def __init__(self, api_key: str):
+ self.client = anthropic.Anthropic(api_key=api_key)
+        self.temp_dir = None
+        self.repo_root = None  # absolute path of the repository being analyzed (local dir or temp clone)
+
+ # Language mapping for file detection
+ self.language_map = {
+ '.py': 'Python', '.js': 'JavaScript', '.ts': 'TypeScript',
+ '.tsx': 'TypeScript', '.jsx': 'JavaScript', '.java': 'Java',
+ '.cpp': 'C++', '.c': 'C', '.cs': 'C#', '.go': 'Go', '.rs': 'Rust',
+ '.php': 'PHP', '.rb': 'Ruby', '.swift': 'Swift', '.kt': 'Kotlin',
+ '.html': 'HTML', '.css': 'CSS', '.scss': 'SCSS', '.sass': 'SASS',
+ '.sql': 'SQL', '.yaml': 'YAML', '.yml': 'YAML', '.json': 'JSON',
+ '.xml': 'XML', '.sh': 'Shell', '.dockerfile': 'Docker',
+ '.md': 'Markdown', '.txt': 'Text'
+ }
+
+ # Code file extensions to analyze
+ self.code_extensions = set(self.language_map.keys())
+
+ def clone_repository(self, repo_path: str) -> str:
+ """Clone repository or use existing path."""
+ if os.path.exists(repo_path):
+ print(f"Using existing repository: {repo_path}")
+            self.repo_root = os.path.abspath(repo_path)
+            return self.repo_root
+ else:
+ print(f"Cloning repository: {repo_path}")
+ self.temp_dir = tempfile.mkdtemp(prefix="repo_analysis_")
+ try:
+                git.Repo.clone_from(repo_path, self.temp_dir)
+                self.repo_root = self.temp_dir
+                return self.temp_dir
+ except Exception as e:
+ raise Exception(f"Failed to clone repository: {e}")
+
+ def get_file_language(self, file_path: Path) -> str:
+ """Get programming language from file extension."""
+ return self.language_map.get(file_path.suffix.lower(), 'Unknown')
+
+ def calculate_complexity_score(self, content: str) -> float:
+ """Calculate basic complexity score based on code patterns."""
+ lines = content.split('\n')
+ complexity_indicators = ['if', 'else', 'elif', 'for', 'while', 'try', 'except', 'catch', 'switch']
+
+ complexity = 1
+ for line in lines:
+ line_lower = line.lower().strip()
+ for indicator in complexity_indicators:
+ if indicator in line_lower:
+ complexity += 1
+
+ # Normalize to 1-10 scale
+ return min(complexity / max(len(lines), 1) * 100, 10.0)
+
+ async def analyze_file_comprehensive(self, file_path: Path, content: str) -> FileAnalysis:
+ """Perform comprehensive file analysis using a single, robust prompt."""
+ language = self.get_file_language(file_path)
+ lines_of_code = len([line for line in content.split('\n') if line.strip()])
+ complexity_score = self.calculate_complexity_score(content)
+
+ # Truncate content if too long
+ if len(content) > 4000:
+ content = content[:4000] + "\n... [truncated for analysis]"
+
+ print(f" Analyzing {file_path.name} ({language}, {lines_of_code} lines)")
+
+ # Create comprehensive analysis prompt
+ prompt = f"""
+You are a senior software engineer with 25 years of experience. Analyze this {language} code file:
+
+FILENAME: {file_path.name}
+LANGUAGE: {language}
+LINES OF CODE: {lines_of_code}
+
+CODE:
+```{language.lower()}
+{content}
+```
+
+Provide a comprehensive analysis covering:
+
+1. ISSUES FOUND: List specific problems, bugs, security vulnerabilities, or code smells
+2. RECOMMENDATIONS: Actionable suggestions for improvement
+3. CODE QUALITY: Overall assessment of code quality and maintainability
+4. SECURITY: Any security concerns or vulnerabilities
+5. PERFORMANCE: Potential performance issues or optimizations
+6. BEST PRACTICES: Adherence to coding standards and best practices
+
+Provide your analysis in clear, structured text (not JSON). Be specific and actionable.
+Rate the overall code quality from 1-10 where 10 is excellent.
+
+ANALYSIS:
+"""
+
+ try:
+ message = self.client.messages.create(
+ model="claude-3-5-sonnet-20241022",
+ max_tokens=3000,
+ temperature=0.1,
+ messages=[{"role": "user", "content": prompt}]
+ )
+
+ analysis_text = message.content[0].text.strip()
+
+ # Extract severity score from analysis
+ severity_match = re.search(r'(\d+(?:\.\d+)?)/10', analysis_text)
+ severity_score = float(severity_match.group(1)) if severity_match else 5.0
+
+ # Parse issues and recommendations from the text
+ issues = self.extract_issues_from_analysis(analysis_text)
+ recommendations = self.extract_recommendations_from_analysis(analysis_text)
+
+ return FileAnalysis(
+                path=os.path.relpath(str(file_path), self.repo_root or '.'),
+ language=language,
+ lines_of_code=lines_of_code,
+ complexity_score=complexity_score,
+ issues_found=issues,
+ recommendations=recommendations,
+ detailed_analysis=analysis_text,
+ severity_score=severity_score
+ )
+
+ except Exception as e:
+ print(f" Error analyzing {file_path.name}: {e}")
+ return FileAnalysis(
+ path=str(file_path),
+ language=language,
+ lines_of_code=lines_of_code,
+ complexity_score=complexity_score,
+ issues_found=[f"Analysis failed: {str(e)}"],
+ recommendations=["Review file manually due to analysis error"],
+ detailed_analysis=f"Analysis failed due to error: {str(e)}",
+ severity_score=5.0
+ )
+
+ def extract_issues_from_analysis(self, analysis_text: str) -> List[str]:
+ """Extract issues from analysis text."""
+ issues = []
+ lines = analysis_text.split('\n')
+
+ # Look for common issue indicators
+ issue_keywords = ['issue', 'problem', 'bug', 'vulnerability', 'error', 'warning', 'concern']
+
+ for line in lines:
+ line_lower = line.lower().strip()
+ if any(keyword in line_lower for keyword in issue_keywords):
+ if line.strip() and not line.strip().startswith('#'):
+ issues.append(line.strip())
+
+ return issues[:10] # Limit to top 10 issues
+
+ def extract_recommendations_from_analysis(self, analysis_text: str) -> List[str]:
+ """Extract recommendations from analysis text."""
+ recommendations = []
+ lines = analysis_text.split('\n')
+
+ # Look for recommendation indicators
+ rec_keywords = ['recommend', 'suggest', 'should', 'consider', 'improve']
+
+ for line in lines:
+ line_lower = line.lower().strip()
+ if any(keyword in line_lower for keyword in rec_keywords):
+ if line.strip() and not line.strip().startswith('#'):
+ recommendations.append(line.strip())
+
+ return recommendations[:10] # Limit to top 10 recommendations
+
+ def scan_repository(self, repo_path: str, max_files: int = 50) -> List[Tuple[Path, str]]:
+ """Scan repository and collect files for analysis."""
+ print(f"Scanning repository: {repo_path}")
+
+ files_to_analyze = []
+
+ # Important files to always include
+ important_files = {
+ 'README.md', 'package.json', 'requirements.txt', 'Dockerfile',
+ 'docker-compose.yml', 'tsconfig.json', 'next.config.js',
+ 'tailwind.config.js', 'webpack.config.js', '.env.example'
+ }
+
+ for root, dirs, files in os.walk(repo_path):
+ # Skip common build/cache directories
+ dirs[:] = [d for d in dirs if not d.startswith('.') and
+ d not in {'node_modules', '__pycache__', 'build', 'dist', 'target',
+ 'venv', 'env', '.git', '.next', 'coverage'}]
+
+ for file in files:
+ if len(files_to_analyze) >= max_files:
+ break
+
+ file_path = Path(root) / file
+
+ # Skip large files
+ try:
+ if file_path.stat().st_size > 1000000: # 1MB limit
+ continue
+ except:
+ continue
+
+ # Include important files or files with code extensions
+ should_include = (
+ file.lower() in important_files or
+ file_path.suffix.lower() in self.code_extensions or
+ file.lower().startswith('dockerfile')
+ )
+
+ if should_include:
+ try:
+ with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+ content = f.read()
+ if content.strip(): # Only non-empty files
+ files_to_analyze.append((file_path, content))
+ except Exception as e:
+ print(f"Could not read {file_path}: {e}")
+
+ print(f"Found {len(files_to_analyze)} files to analyze")
+ return files_to_analyze
+
+ async def analyze_repository_overview(self, repo_path: str, file_analyses: List[FileAnalysis]) -> Tuple[str, str]:
+ """Analyze repository architecture and security."""
+ print("Analyzing repository overview...")
+
+ # Prepare summary data
+ languages = dict(Counter(fa.language for fa in file_analyses))
+ total_lines = sum(fa.lines_of_code for fa in file_analyses)
+ avg_quality = sum(fa.severity_score for fa in file_analyses) / len(file_analyses) if file_analyses else 5.0
+
+ # Get repository structure
+ structure_lines = []
+ try:
+ for root, dirs, files in os.walk(repo_path):
+ dirs[:] = [d for d in dirs if not d.startswith('.') and d not in {'node_modules', '__pycache__'}]
+ level = root.replace(repo_path, '').count(os.sep)
+ indent = ' ' * level
+ structure_lines.append(f"{indent}{os.path.basename(root)}/")
+ for file in files[:3]: # Limit files shown per directory
+ structure_lines.append(f"{indent} {file}")
+ if len(structure_lines) > 50: # Limit total structure size
+ break
+ except Exception as e:
+ structure_lines = [f"Error reading structure: {e}"]
+
+ # Architecture analysis
+ arch_prompt = f"""
+You are a Senior Software Architect with 25 years of experience.
+
+Analyze this repository:
+
+REPOSITORY STRUCTURE:
+{chr(10).join(structure_lines[:30])}
+
+STATISTICS:
+- Total files analyzed: {len(file_analyses)}
+- Total lines of code: {total_lines:,}
+- Languages: {languages}
+- Average code quality: {avg_quality:.1f}/10
+
+TOP FILE ISSUES:
+{chr(10).join([f"- {fa.path}: {len(fa.issues_found)} issues" for fa in file_analyses[:10]])}
+
+Provide an architectural assessment covering:
+1. Project type and purpose
+2. Technology stack evaluation
+3. Code organization and structure
+4. Scalability and maintainability concerns
+5. Key recommendations for improvement
+
+Keep response under 1500 words and focus on actionable insights.
+"""
+
+ # Security analysis
+ security_issues = []
+ for fa in file_analyses:
+ security_issues.extend([issue for issue in fa.issues_found if
+ any(keyword in issue.lower() for keyword in
+ ['security', 'vulnerability', 'injection', 'xss', 'auth', 'password'])])
+
+ sec_prompt = f"""
+You are a Senior Security Engineer with 20+ years of experience.
+
+Security Analysis for repository with {len(file_analyses)} files:
+
+SECURITY ISSUES FOUND:
+{chr(10).join(security_issues[:20]) if security_issues else "No obvious security issues detected"}
+
+HIGH-RISK FILE TYPES PRESENT:
+{[lang for lang, count in languages.items() if lang in ['JavaScript', 'TypeScript', 'Python', 'PHP', 'SQL']]}
+
+Provide security assessment covering:
+1. Overall security posture
+2. Main security risks and vulnerabilities
+3. Authentication and authorization concerns
+4. Data protection and privacy issues
+5. Immediate security priorities
+
+Keep response under 1000 words and focus on actionable security recommendations.
+"""
+
+ try:
+ # Run both analyses
+ arch_task = self.client.messages.create(
+ model="claude-3-5-sonnet-20241022",
+ max_tokens=2000,
+ temperature=0.1,
+ messages=[{"role": "user", "content": arch_prompt}]
+ )
+
+ sec_task = self.client.messages.create(
+ model="claude-3-5-sonnet-20241022",
+ max_tokens=1500,
+ temperature=0.1,
+ messages=[{"role": "user", "content": sec_prompt}]
+ )
+
+ architecture_assessment = arch_task.content[0].text
+ security_assessment = sec_task.content[0].text
+
+ return architecture_assessment, security_assessment
+
+ except Exception as e:
+ return f"Architecture analysis failed: {e}", f"Security analysis failed: {e}"
+
+ async def generate_executive_summary(self, analysis: RepositoryAnalysis) -> str:
+ """Generate executive summary for leadership."""
+ print("Generating executive summary...")
+
+ prompt = f"""
+You are presenting to C-level executives. Create an executive summary of this technical analysis:
+
+REPOSITORY METRICS:
+- Total Files: {analysis.total_files}
+- Lines of Code: {analysis.total_lines:,}
+- Languages: {analysis.languages}
+- Code Quality Score: {analysis.code_quality_score:.1f}/10
+
+KEY FINDINGS:
+- Total issues identified: {sum(len(fa.issues_found) for fa in analysis.file_analyses)}
+- Files needing attention: {len([fa for fa in analysis.file_analyses if fa.severity_score < 7])}
+- High-quality files: {len([fa for fa in analysis.file_analyses if fa.severity_score >= 8])}
+
+Create an executive summary for non-technical leadership covering:
+1. Business impact of code quality findings
+2. Risk assessment and implications
+3. Investment priorities and recommendations
+4. Expected ROI from addressing technical debt
+5. Competitive implications
+
+Focus on business outcomes, not technical details. Keep under 800 words.
+"""
+
+ try:
+ message = self.client.messages.create(
+ model="claude-3-5-sonnet-20241022",
+ max_tokens=1200,
+ temperature=0.1,
+ messages=[{"role": "user", "content": prompt}]
+ )
+ return message.content[0].text
+ except Exception as e:
+ return f"Executive summary generation failed: {e}"
+
+ def create_pdf_report(self, analysis: RepositoryAnalysis, output_path: str):
+ """Generate comprehensive PDF report."""
+ print(f"Generating PDF report: {output_path}")
+
+ doc = SimpleDocTemplate(output_path, pagesize=A4,
+ leftMargin=72, rightMargin=72,
+ topMargin=72, bottomMargin=72)
+ styles = getSampleStyleSheet()
+ story = []
+
+ # Custom styles
+ title_style = ParagraphStyle(
+ 'CustomTitle',
+ parent=styles['Heading1'],
+ fontSize=24,
+ textColor=colors.darkblue,
+ spaceAfter=30,
+ alignment=TA_CENTER
+ )
+
+ heading_style = ParagraphStyle(
+ 'CustomHeading',
+ parent=styles['Heading2'],
+ fontSize=16,
+ textColor=colors.darkblue,
+ spaceBefore=20,
+ spaceAfter=10
+ )
+
+ # Title Page
+ story.append(Paragraph("Repository Analysis Report", title_style))
+ story.append(Spacer(1, 20))
+ story.append(Paragraph(f"Repository: {analysis.repo_path}", styles['Normal']))
+ story.append(Paragraph(f"Analysis Date: {datetime.now().strftime('%B %d, %Y at %H:%M')}", styles['Normal']))
+ story.append(Paragraph("Generated by: AI Senior Engineering Team", styles['Normal']))
+ story.append(PageBreak())
+
+ # Executive Summary
+ story.append(Paragraph("Executive Summary", heading_style))
+ story.append(Paragraph(analysis.executive_summary, styles['Normal']))
+ story.append(PageBreak())
+
+ # Repository Overview
+ story.append(Paragraph("Repository Overview", heading_style))
+
+ overview_data = [
+ ['Metric', 'Value'],
+ ['Total Files Analyzed', str(analysis.total_files)],
+ ['Total Lines of Code', f"{analysis.total_lines:,}"],
+ ['Primary Languages', ', '.join(list(analysis.languages.keys())[:5])],
+ ['Overall Code Quality', f"{analysis.code_quality_score:.1f}/10"],
+ ]
+
+ overview_table = Table(overview_data, colWidths=[200, 300])
+ overview_table.setStyle(TableStyle([
+ ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
+ ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
+ ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
+ ('FONTSIZE', (0, 0), (-1, 0), 12),
+ ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
+ ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
+ ('GRID', (0, 0), (-1, -1), 1, colors.black)
+ ]))
+
+ story.append(overview_table)
+ story.append(Spacer(1, 20))
+
+ # Languages Distribution
+ if analysis.languages:
+ story.append(Paragraph("Language Distribution", heading_style))
+ lang_data = [['Language', 'Files']]
+ for lang, count in sorted(analysis.languages.items(), key=lambda x: x[1], reverse=True):
+ lang_data.append([lang, str(count)])
+
+ lang_table = Table(lang_data, colWidths=[200, 100])
+ lang_table.setStyle(TableStyle([
+ ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
+ ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
+ ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
+ ('GRID', (0, 0), (-1, -1), 1, colors.black)
+ ]))
+ story.append(lang_table)
+ story.append(PageBreak())
+
+ # Architecture Assessment
+ story.append(Paragraph("Architecture Assessment", heading_style))
+ # Split long text into paragraphs
+ arch_paragraphs = analysis.architecture_assessment.split('\n\n')
+ for para in arch_paragraphs[:10]: # Limit paragraphs
+ if para.strip():
+ story.append(Paragraph(para.strip(), styles['Normal']))
+ story.append(Spacer(1, 10))
+ story.append(PageBreak())
+
+ # Security Assessment
+ story.append(Paragraph("Security Assessment", heading_style))
+ sec_paragraphs = analysis.security_assessment.split('\n\n')
+ for para in sec_paragraphs[:10]: # Limit paragraphs
+ if para.strip():
+ story.append(Paragraph(para.strip(), styles['Normal']))
+ story.append(Spacer(1, 10))
+ story.append(PageBreak())
+
+ # File Analysis Summary
+ story.append(Paragraph("File Analysis Summary", heading_style))
+
+ # Summary statistics
+ high_quality_files = [fa for fa in analysis.file_analyses if fa.severity_score >= 8]
+ medium_quality_files = [fa for fa in analysis.file_analyses if 5 <= fa.severity_score < 8]
+ low_quality_files = [fa for fa in analysis.file_analyses if fa.severity_score < 5]
+
+ quality_data = [
+ ['Quality Level', 'Files', 'Percentage'],
+ ['High Quality (8-10)', str(len(high_quality_files)), f"{len(high_quality_files)/len(analysis.file_analyses)*100:.1f}%"],
+ ['Medium Quality (5-7)', str(len(medium_quality_files)), f"{len(medium_quality_files)/len(analysis.file_analyses)*100:.1f}%"],
+ ['Low Quality (1-4)', str(len(low_quality_files)), f"{len(low_quality_files)/len(analysis.file_analyses)*100:.1f}%"]
+ ]
+
+ quality_table = Table(quality_data)
+ quality_table.setStyle(TableStyle([
+ ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
+ ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
+ ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
+ ('GRID', (0, 0), (-1, -1), 1, colors.black),
+ ('BACKGROUND', (0, 1), (-1, 1), colors.lightgreen),
+ ('BACKGROUND', (0, 2), (-1, 2), colors.lightyellow),
+ ('BACKGROUND', (0, 3), (-1, 3), colors.lightcoral)
+ ]))
+
+ story.append(quality_table)
+ story.append(Spacer(1, 20))
+
+ # Top Issues Found
+ story.append(Paragraph("Files Requiring Attention", heading_style))
+
+ # Sort files by severity (lowest scores first - need most attention)
+ files_by_priority = sorted(analysis.file_analyses, key=lambda x: x.severity_score)
+
+ for i, file_analysis in enumerate(files_by_priority[:15]): # Top 15 files needing attention
+ story.append(Paragraph(f"{i+1}. {file_analysis.path}", styles['Heading4']))
+ story.append(Paragraph(f"Language: {file_analysis.language} | Quality Score: {file_analysis.severity_score:.1f}/10 | Lines: {file_analysis.lines_of_code}", styles['Normal']))
+
+ # Show top issues
+ if file_analysis.issues_found:
+ story.append(Paragraph("Key Issues:", styles['Heading5']))
+ for issue in file_analysis.issues_found[:3]: # Top 3 issues
+                    story.append(Paragraph(f"• {issue}", styles['Normal']))
+
+ # Show top recommendations
+ if file_analysis.recommendations:
+ story.append(Paragraph("Recommendations:", styles['Heading5']))
+ for rec in file_analysis.recommendations[:2]: # Top 2 recommendations
+                    story.append(Paragraph(f"• {rec}", styles['Normal']))
+
+ story.append(Spacer(1, 15))
+
+ # Build PDF
+ try:
+ doc.build(story)
+            print(f"PDF report generated successfully: {output_path}")
+ except Exception as e:
+            print(f"Error generating PDF: {e}")
+
+ async def analyze_repository(self, repo_path: str, max_files: int = 50) -> RepositoryAnalysis:
+ """Main analysis function."""
+ try:
+ # Clone/access repository
+ actual_repo_path = self.clone_repository(repo_path)
+
+ # Scan files
+ files_to_analyze = self.scan_repository(actual_repo_path, max_files)
+
+ if not files_to_analyze:
+ raise Exception("No files found to analyze")
+
+ # Analyze each file
+ print(f"Starting analysis of {len(files_to_analyze)} files...")
+ file_analyses = []
+
+ for i, (file_path, content) in enumerate(files_to_analyze):
+ print(f"Analyzing file {i+1}/{len(files_to_analyze)}: {file_path.name}")
+ analysis = await self.analyze_file_comprehensive(file_path, content)
+ file_analyses.append(analysis)
+
+ # Small delay to avoid rate limiting
+ await asyncio.sleep(0.2)
+
+ # Repository-level analyses
+ print("Performing repository-level analysis...")
+ architecture_assessment, security_assessment = await self.analyze_repository_overview(
+ actual_repo_path, file_analyses)
+
+ # Calculate overall quality score
+ avg_quality = sum(fa.severity_score for fa in file_analyses) / len(file_analyses)
+
+ # Generate statistics
+ languages = dict(Counter(fa.language for fa in file_analyses))
+ total_lines = sum(fa.lines_of_code for fa in file_analyses)
+
+ # Create repository analysis
+ repo_analysis = RepositoryAnalysis(
+ repo_path=repo_path,
+ total_files=len(file_analyses),
+ total_lines=total_lines,
+ languages=languages,
+ architecture_assessment=architecture_assessment,
+ security_assessment=security_assessment,
+ code_quality_score=avg_quality,
+ file_analyses=file_analyses,
+ executive_summary=""
+ )
+
+ # Generate executive summary
+ print("Generating executive summary...")
+ repo_analysis.executive_summary = await self.generate_executive_summary(repo_analysis)
+
+ return repo_analysis
+
+ finally:
+ # Cleanup
+ if self.temp_dir and os.path.exists(self.temp_dir):
+ shutil.rmtree(self.temp_dir)
+ print("Temporary files cleaned up")
+
+async def main():
+ # Load environment variables
+ load_dotenv()
+
+ parser = argparse.ArgumentParser(description="Robust GitHub Repository AI Analysis")
+ parser.add_argument("repo_path", help="Repository path (local directory or Git URL)")
+ parser.add_argument("--output", "-o", default="repository_analysis.pdf",
+ help="Output PDF file path")
+ parser.add_argument("--max-files", type=int, default=50,
+ help="Maximum files to analyze")
+ parser.add_argument("--api-key", help="Anthropic API key (overrides .env)")
+
+ args = parser.parse_args()
+
+ # Get API key
+ api_key = args.api_key or os.getenv('ANTHROPIC_API_KEY')
+ if not api_key:
+        print("❌ Error: ANTHROPIC_API_KEY not found in .env file or command line")
+ print("Please create a .env file with: ANTHROPIC_API_KEY=your_key_here")
+ return 1
+
+ try:
+        print("🚀 Starting Repository Analysis")
+ print("=" * 60)
+ print(f"Repository: {args.repo_path}")
+ print(f"Max files: {args.max_files}")
+ print(f"Output: {args.output}")
+ print("=" * 60)
+
+ # Initialize analyzer
+ analyzer = RobustGitHubAnalyzer(api_key)
+
+ # Perform analysis
+ analysis = await analyzer.analyze_repository(args.repo_path, args.max_files)
+
+ # Generate PDF report
+ analyzer.create_pdf_report(analysis, args.output)
+
+ # Print summary to console
+ print("\n" + "=" * 60)
+        print("🎯 ANALYSIS COMPLETE")
+ print("=" * 60)
+        print(f"📊 Repository Statistics:")
+        print(f"   • Files Analyzed: {analysis.total_files}")
+        print(f"   • Lines of Code: {analysis.total_lines:,}")
+        print(f"   • Languages: {len(analysis.languages)}")
+        print(f"   • Code Quality: {analysis.code_quality_score:.1f}/10")
+
+ # Quality breakdown
+ high_quality = len([fa for fa in analysis.file_analyses if fa.severity_score >= 8])
+ low_quality = len([fa for fa in analysis.file_analyses if fa.severity_score < 5])
+
+        print(f"\n📈 Quality Breakdown:")
+        print(f"   • High Quality Files: {high_quality}")
+        print(f"   • Files Needing Attention: {low_quality}")
+        print(f"   • Total Issues Found: {sum(len(fa.issues_found) for fa in analysis.file_analyses)}")
+
+        print(f"\n📄 Detailed PDF Report: {args.output}")
+        print("\n✅ Analysis completed successfully!")
+
+ return 0
+
+ except Exception as e:
+        print(f"❌ Error during analysis: {e}")
+ return 1
+
+if __name__ == "__main__":
+ exit(asyncio.run(main()))
\ No newline at end of file
diff --git a/services/ai-analysis-service/ai-analysis/ai_blog_analysis.pdf b/services/ai-analysis-service/ai-analysis/ai_blog_analysis.pdf
new file mode 100644
index 0000000..f42992f
--- /dev/null
+++ b/services/ai-analysis-service/ai-analysis/ai_blog_analysis.pdf
@@ -0,0 +1,232 @@
+%PDF-1.4
+% ReportLab Generated PDF document http://www.reportlab.com
+1 0 obj
+<<
+/F1 2 0 R /F2 3 0 R /F3 9 0 R
+>>
+endobj
+2 0 obj
+<<
+/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
+>>
+endobj
+3 0 obj
+<<
+/BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding /Name /F2 /Subtype /Type1 /Type /Font
+>>
+endobj
+4 0 obj
+<<
+/Contents 17 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 16 0 R /Resources <<
+/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
+>> /Rotate 0 /Trans <<
+
+>>
+ /Type /Page
+>>
+endobj
+5 0 obj
+<<
+/Contents 18 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 16 0 R /Resources <<
+/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
+>> /Rotate 0 /Trans <<
+
+>>
+ /Type /Page
+>>
+endobj
+6 0 obj
+<<
+/Contents 19 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 16 0 R /Resources <<
+/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
+>> /Rotate 0 /Trans <<
+
+>>
+ /Type /Page
+>>
+endobj
+7 0 obj
+<<
+/Contents 20 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 16 0 R /Resources <<
+/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
+>> /Rotate 0 /Trans <<
+
+>>
+ /Type /Page
+>>
+endobj
+8 0 obj
+<<
+/Contents 21 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 16 0 R /Resources <<
+/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
+>> /Rotate 0 /Trans <<
+
+>>
+ /Type /Page
+>>
+endobj
+9 0 obj
+<<
+/BaseFont /Helvetica-BoldOblique /Encoding /WinAnsiEncoding /Name /F3 /Subtype /Type1 /Type /Font
+>>
+endobj
+10 0 obj
+<<
+/Contents 22 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 16 0 R /Resources <<
+/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
+>> /Rotate 0 /Trans <<
+
+>>
+ /Type /Page
+>>
+endobj
+11 0 obj
+<<
+/Contents 23 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 16 0 R /Resources <<
+/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
+>> /Rotate 0 /Trans <<
+
+>>
+ /Type /Page
+>>
+endobj
+12 0 obj
+<<
+/Contents 24 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 16 0 R /Resources <<
+/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
+>> /Rotate 0 /Trans <<
+
+>>
+ /Type /Page
+>>
+endobj
+13 0 obj
+<<
+/Contents 25 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 16 0 R /Resources <<
+/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
+>> /Rotate 0 /Trans <<
+
+>>
+ /Type /Page
+>>
+endobj
+14 0 obj
+<<
+/PageMode /UseNone /Pages 16 0 R /Type /Catalog
+>>
+endobj
+15 0 obj
+<<
+/Author (\(anonymous\)) /CreationDate (D:20250919123308+05'00') /Creator (\(unspecified\)) /Keywords () /ModDate (D:20250919123308+05'00') /Producer (ReportLab PDF Library - www.reportlab.com)
+ /Subject (\(unspecified\)) /Title (\(anonymous\)) /Trapped /False
+>>
+endobj
+16 0 obj
+<<
+/Count 9 /Kids [ 4 0 R 5 0 R 6 0 R 7 0 R 8 0 R 10 0 R 11 0 R 12 0 R 13 0 R ] /Type /Pages
+>>
+endobj
+17 0 obj
+<<
+/Filter [ /ASCII85Decode /FlateDecode ] /Length 367
+>>
+stream
+Gat>Ob>,r/&-^F/^>^aQ+qM;2mo!"Z,rU:'+DFN!-*UmX9fWY/Ec?M%jF#/Z\\ge'p)luOhIPLQ[I2NF=e"ji6TniD.=DH+Kt)n$GsIg"Wei,tr^>pN;0%8ZkRlCGNkJ`@0/m+gMd9CE2":C%X7.gS;0UgGA$4o>n6P`k2MG+p1deWfJ:Cu=FH'YR36n(uendstream
+endobj
+18 0 obj
+<<
+/Filter [ /ASCII85Decode /FlateDecode ] /Length 2039
+>>
+stream
+Gat%#?$"aY&:Dg-\;-rFFG?eDbDtmI7q"KL`h-_gFs\jr#uPA,J,qpglEBXt5Z*^1cEu!O1SKW:]t<)`32J&fC%tuB7.1N[n`Q.b)&4YokE@n@+"8^HI=%4hDn\<2GOs;*q>!hL3.WaXn`4e@3lM2*^I!Tq%#Q_j!mW2W$N\R6gmdY%QG$?=8^"hbL#'J>i_M%Qi'_ea*$m[,9b3C-76c&VkP,JZ@t[#,/CX*n2%okZ/NspFkDY_!Y-'DGs.G(F,i/-f;1;0q;^'>lEX++MHH]M"E9B@8,eb/ms&c3VsDZm#4l%b#&\6%lf;?P'S^%.60J81ZiG+dN1WOVX:0\JIJ:,#X#6NK\h2^k1A:,8bpp(jeAE$(;7*qKZi7=-eF-,%b6Gl7ZQHJk*cc>@hGD?kHicFiCYuCf1KRCWu0tt.:pKu)+/bE.q'r`gr7u>N6MDN;^IqTF2aH?2f4HYkW&ta%CTRi.u*D9idts<89Mf>80)0fG=oJHTlK`<=oI7R_GcJcq]gS3"9IY8j'%+Rlq]E,p6q+b7Z"*IOZJ'J+>r+-!E:<7"P"N_0]ps+6OkIXd<"5c77US33[UeBE*Ki]tYA/Z#AeD#,%[T_fj@[A$ucW^:0MaX"6PeN$%TiT=krA5J"LL1f2CQ.'"d`d?qj07PVAfo#0K!a!#\r%AH$_jA":#,tNUb[XP(6.bf?6Dus+8B)2fnJjH#cB8;LWaqhU63Q\Hp=g?E0%!Rlb7>kckrg&EX+)d=0>;:*sE+d@!B5_@!a!ScLo#;a!GDJ!.a2i_Ebn`bA@8(`lPLFO]m6s@TLO$(fkG)Z]\j+9s@Tll:ojniKhXUN91eQs7n&ALiR0NKtN"/9%1k-QfCaRf7.dk@Yh%.l/ZNM%`"Rl!UQqK.G2mH9e>/AQ(dmZorU4pRSOE2)CH#i`iKibBM]L`>$nQInMi8,9s?kqko>rnBZ%D!]12Aeh)a_9m_*8@g0\[p%C4D]:ZMi[\nZH-seQZNtjNNmDWF`qb4+9#V@=&^krFr'dUetY-PZrKuT/701G@&e2Qn(G-NU9T_;o<(r6-cu3$qk)o>DhlCR/<.cEBWP0d,'eU9Q4GA5.+%D4Db$s"kI['JUFRIS]66\-:S&U\$%7k,X>@N%H1g&J:H?\(<5d_O'*nM:<'07lq!nrfI5i9cTnrf'#(XVelQJB^qYl$ul+7Lf;7ZJnpbWHO7eC><;G]lg9\\S*V_Q5aTQ;[bq2JTR"bD>qF^,qfZIne5Y$SQ*f*B#f_eW*a[0lT:,CRRKJ)t4FVk:,K9QSf\h\R2"FjUQGoL4O]+$N_+L=2/C\_&$#$\:R%;\Y!rlH5e+^aq@bi)hnuJ18.BD:f0VnGZ;r?[:D=dVXp!c9#W$Y;U@>5qhkgkR9L@I?5X!dgLNYNkE:9GT140pL;Z_<4#a7BNIjZ?Wh?-6j/M$Cfg%URGaj>&I]Nci7+I0Tk+I477c0\ScaE7WoF):_lgUMP!9TmO`C-p/##-kDNW~>endstream
+endobj
+19 0 obj
+<<
+/Filter [ /ASCII85Decode /FlateDecode ] /Length 764
+>>
+stream
+GatU0?#SFN'Rf.GgrHR,WU2Z?o%8*NU^G[MU.K_(MF$Jn_En7-?b[P0OHe^U2FV$:ptTq#qjpH3i'[2;o+KtK"ul8j."c=GPQr26&U__*^BW1Cirig4"\Fk((kE&H*(2n5#h4b5.aWerat-DO!>SclC#uLhe>c^89i^Z@ENAAAY'07VH\(Op9f9bb9?6'XKU>\kU6dZl#YbJVit:mL(m_$1&H_E(%(1]_ocQd/M%^AS0bFhH(if.>KUFT>L!(kD,j&/"#S5D)01-T"qWFs6Q1uu@d]Ir4*KPTi,H]H2S1G#\)jkGPOZ3.rN_7?"$/X&.Bsm'uJA2nI=\k[[A`l[(WJ_'1"\^dC/4S?qP1NDP4OGFk'29Z5d3M%cPAoDh\c`H@!#HR!U&~>endstream
+endobj
+20 0 obj
+<<
+/Filter [ /ASCII85Decode /FlateDecode ] /Length 1610
+>>
+stream
+Gat=*968iG&AJ$Cln(tJaeIY;c-`=]3_AX,b,'4k+M":UK)c:0P1a4">u77:[Zl_@1Ro$XmOn3[/0a<*0+-%$!-l8/lX(ilqQS$`)Kpn?p^A5[(]Rf0S"5`l9ST>1FF#a>05,oDG=TPJO'^K:Jg*U":^U,t^ck0H&9,eN/oPU4PTCKF=bL#Bd('4cIg_/>=T$,%rhSF[b5UmBq";f\`^Jrj_A)dtXs;iFg4'rVH@-Bi_5EnEISS2UU&NHldA(u$AuTLU+F_(M5_D7n(N"Ef:KKo)cu;Of9%Q!C"0/Y9qSGB4+DdId=1MhWlo0_Z?*m[&r\r$;X6MYi#H-SfQVK+`if:C/Mi`(Y0)b*5::I%mMIm-h`[7"r)0ABMs@'T/@7[O)T_TG'sOM5#Gj1<<[JE_B+mI:*qiQCDm0c)(IRQE];O'Xf.j$'*A(W8t:E)bj(jG;OP%H1)1-jQA+r?Z@SqY9Y?OcEnif%h4CF5;,o#m-(Tu$IV*Y)4^J(VN$;;-s(8p*bd"Tp+Z`J_PjOmG;A8Y+q6TStbFtaBC>Z.8i&qrd\fl%#l'Wb?M\JQgNMDV4.5+?%F-3+7W_,'$c'Q72rC.e4mp,aF209Ucrb:diP?3dP6'k\@>l2G$6HfCto)P]ogW=Sfq6s:&r_ILMDdEXKgDV/R*cm6b3"/Y^agaK4:&BE?-76iNlJmK@p!<<8Vr=1J(j8H.8r@Rtd#^0qWVkendstream
+endobj
+21 0 obj
+<<
+/Filter [ /ASCII85Decode /FlateDecode ] /Length 1572
+>>
+stream
+Gat=*gJ[&k&:N^lqIu;LJkSij>i("!/Z9S2Z6W-2"##P5,T:L@/'3@dfC*E6EL`-+(6p?t>?5+Vl-nGp[IHoL?^VR5NTfu+#pgrURS_FLF_UK-^5`^&4\1lGSt=>D\(.7=Ou3f/kL4UE#VUTLbc!AgB0lqo9b"OMe&<\;>QVqF.6gX'C<-1'CNGWUhT:-;fdGlrKE9Vr?sIS_AMT4#H$Z&kMS>3?oT_\$sI36cYuGH`g7'Dk%m&K;/*Zs\FQ[$i6CKR)j"J0!mH&>:<Uj6f(a8@d?9DtX/p&[N)aJfe&K"*r:S?2p[Ql$-h$f(r_EI\=G%eG-KTRCE3)&a7Y@KjF5_tl>8F*CAX8K7@[nnD@YZ3q&/CkCbQ5-BX#fAUW)EhZJocT)[?1s)A2((M"GolUQ])[nP,T!s>?]0_W#!M[\@!f$-VXp,3Z#VZOS4jNO=&54\-'h[^GVT5eEO3dU<=2:fnc;+2+gO&O^-EjHQYWe/Tc-Y$#7g1pn!Rl]S2rP)4/c=Z@ORMJO^Y\`eE[V5^[X8S[_]>M];S7nN!SkR/3g^`ar5A-ktZ/th?2n&m[d*fS;sZ>.Wb8O+AK'b[QnNHfhU[]GIiR&=>gc*i^7OM[aE`Hr9^BNDe\Q:G*6*#nD!DLAYu<)qBs-3C"=Mj7b]N*lr49*\-GOer\k?anWmn996BHf=G-5;m\g5eRrhk.+)A3_uN;3ika"XEZl*mLV=7G76P'!d"D3e!jchp3+Joo)>MPFEb`MUB1$CXMk>h*;5Po34OjWHFSH2VJ/2_RWZDu8emc57MhT7KYjh+RO=1>.\`g/7jSCV7bFQA=ZD:kkfogXD=?Q>6VhEaCX4g1V1Z"h,AN9-RH`eiblG*EEt:cca-VFH@7RKBLKQ48lj8fQjn#s6iWCO\rJ_[G;endstream
+endobj
+22 0 obj
+<<
+/Filter [ /ASCII85Decode /FlateDecode ] /Length 1697
+>>
+stream
+GauHKgN)%,&:O:Slq:u]1dVK)PrCXA)Q1mOT6^tUC3"1eYj7d77kbO$?\P>#Al9(-Wqur(pdeKX>]>eIeaG2D>\K-k%4);(EZhVo1[.t(:"m,tHfp9r8Ns7jLJgN-*`HMF--T6(j+1:jd.A$G*.=`c]#,1@)SfN<=kFp(Ei9qil].Hs/$[ug]GEK`hB3(3PHas8pM7#A84S"4R]>rNGPblp#cGc?qf!;etcT,W52o2:smkAj3`nf58P>JM4Wbi,8POA9H<;Z1VU%_22n`@eS"j.Y)MFSH>%04_uG^MbpoQgKN00;l(c&4p'gCFm+aY`H_C.NeAI=B[`D:(l=r0mSc3\\)8o_$BCG&jqn;\"%'t0_.43>*Fa:VMRLrs6F^UDLTXNIG5ih>ElYCB[dGpX&83!TXD)jSo8\#-L->*h%$2o\m\jQ_ruhm(tX[SDL&*_NW8*OkF]APWR'_Sic=kYH:'N^;SKc+Mp4cCo*%h:NVHhX.P7N>;H;qE<#.Pa%%pqjCk,^$i1($XFj(_g7@=ZA)1Q/f.*m3Jr8:D=LWt0n*Ym-Bc2NIs3k75J+'jkd@];&=N:##AiB]_AUXA8R&\YsUI/0oea#Y=YG;mln-7G1:TL@kHd$9J<<7"UeKZY_BL9+;p(&5mJ85uT;Y0n.&[rk-G8<\e)DqV;*QTc=d'5)fIF4'89u'](X=I\j@pcKYP<,F">uK`kPI77EB5e9Z\Jr@p@l!U>L$^n`Sle':GLMM0t_6q&>QGhJh$D^18T:@1ceNrS9,kq`oBi>&d:D9$U$G"Ce:T4\!/qUdQ@!!M:!a8`'ec%lR\`6;2>O1S1'e(NX.]T#To^P!]k=V\4'XQ1r1[lK`We,N8_%`?PLfpe:Sl$lW[(&)\rDQct")"Q$kpr6MVI$[QX(>BS2R"7nI/f3YNnJV)R\[e4mOr]l^K.osZHUc,2o:DCDa,aAdmF9SL3PA25p"0IS0"^-J0l9)m^?$B=tj*3F=.4>4Z%endstream
+endobj
+23 0 obj
+<<
+/Filter [ /ASCII85Decode /FlateDecode ] /Length 1467
+>>
+stream
+GauHKgN)%,&:O:Sls$$@9aeUi;Voj?C#/R4=Nm841GB,,E(GK_5V(":;g"+*7/@ljI1_rCD*\>SX*"WtFLUcfc!r+@"PE,i;#h]n_*5mr0_eF;`cN.1^R>rCa82(sA7lUSU#&Z]N%WF&RKYmd)L5LKi>c?!R3fF0>C&XCC=E(17GQZV>AA?h$TCMM08X/S1KKMtL:;s^l2))%Mku4N$=q?/7;*bOPq_S85o)$]O[SlJIO!4"V;MK/a.'KK)YgDAJO%l&k%(oF#/6eWDC70+.TRYr%_bg:q[g4h=5T*q7>'!sq5OO#6!R0s:c/24T)]SX=0AU1AH\sCLCiWsE@"+i7dNm*"nB2+j.ed)hY;6gVC-&oOGNl981oU6\''p@!CnechBZG;&L!gdRDX9%=Mpoi[n$9:#bDA/X1627-M?9.^/2U?1s32`6nSl'jVN5j?X,Z8ef6+jAO6eiuG)^K8.\H4VOdYUKRs9e2.^,qGUp=&e+f$L6%OO?ULG5/EVmX03tiC18cVd:T1X6R"`A8!JiL:3d:mq:/@,c;u]_egjoYH7o&H7:,ip>^9?Qr$<5ND\T5mmA[hT(8!6qK4/+^;#\B27OrAj,pJ$0THtd(3GVd-[Od(XX>4%Ua#bfYI#iH6(@-Ea>4b5'UMZtJ=[=&Pc]DsqbCn0dF75iK@6gWbei3f^r1>!:dHRKm$]%($MR^VKRQ/PgM]p$Zp,i"ScqoNXkO*kof3839ic:'u_siqEcH)\$^Su]d..VZ01eB4SiecIm:FM-Oln7*FJendstream
+endobj
+24 0 obj
+<<
+/Filter [ /ASCII85Decode /FlateDecode ] /Length 1179
+>>
+stream
+GauHK9lo&I'YO<\;&;d5hqb,'4k&.)!#8V.,]Nn)74cNE:5%dSWZl"2as%07'Qf_;UT&odA.g@)*GGdDt?HNM-]E9P:G!bWBn6XNpLC9?VPRk]LQh&?ekD9;JXr#hZlk[@:U=oLMW9K=&>2?rDbpV/V1ghEpo.?UWNWg]c!aa;if-%p\fGnY7c6TSNI"i.@/\"tP:.9Fa!oh1-\h!p>oqA+m,;q880M3b_a6>#T/C^t>eoY9\;$t;%@XbU6$aLY"bs4Uhg7*&AZ+:S46\M+:KsO"g^5><>62@=&I`$>?%Z80>2>sc_?@U#Nm]TepC5[_k%[='7I.g_Y0gq4.HHoiS&s@6Cc8gd5KTd(QZKo@').NG"#t@c;P9o.I!3W#?(F_D-NBUm9MRd!]UE/=+QOR*QJ^+9deqHS01=LWp@qs5T^(.kLq^=mc$I&m`t)LKSmpaC%O9[J#,=%B1IKQ1o7(:!%2B@j.8ZjAN@Y-H^3NH#'%jC<(L3780C^W)PfA!O7;_!F>W:FA*9Of[FH/>%7(7T"$R#gK&2TrJKH_?7@J2"3c7Y*C?sc7Jm%Heo]Mr)^gq&p7>+fjAguX4@68\$]Vh]2$@)_S*b[B:@2lhsZW20O_YY3WDT=WEPX_AfKq+3#A[9O-KK\XS2(lcO4](M'oJE(ZE$FC5D\47[YE&UH7W2?t(2qCX0KX"qWIo%^\:-+)8Lh^oJooTS';6=PVca3EeXQIsX^:Bu4)N1,oVZg&0YX_aERgg+7V-@]amP7Nnm56mr+&"j]'p"sPs!c7Q*Lq*uBICi0:hnC7ZC'(S?e+j;fkBSl6b,nj0ZkSsA=(;/TIcg"p<\X;TkpWZbIP:KDkr77Q:`'l#efMY,oZ<'#7(9r0sdjYGtQ)Ftbf=e"6RLDk_\D3Xt[Df>YOF\=aI98oM^_m(1&Ndqk>MWc[_)ae&&51f+!$mdtP>#^CGa`;p^[a4A,;)f'[XO;PGMGgVsMX92Zs"dLd7aLL1H_Dj`r:SDSrF5rC->5[f8tP/7L#)DR&63066?9XE#u\=EEjVW3Pa%3\22;GATr'@1QDB&)c@N.11I*~>endstream
+endobj
+xref
+0 26
+0000000000 65535 f
+0000000073 00000 n
+0000000124 00000 n
+0000000231 00000 n
+0000000343 00000 n
+0000000548 00000 n
+0000000753 00000 n
+0000000958 00000 n
+0000001163 00000 n
+0000001368 00000 n
+0000001487 00000 n
+0000001693 00000 n
+0000001899 00000 n
+0000002105 00000 n
+0000002311 00000 n
+0000002381 00000 n
+0000002665 00000 n
+0000002777 00000 n
+0000003235 00000 n
+0000005366 00000 n
+0000006221 00000 n
+0000007923 00000 n
+0000009587 00000 n
+0000011376 00000 n
+0000012935 00000 n
+0000014206 00000 n
+trailer
+<<
+/ID
+[<18e7918b3296693e83634aaf57fa33ad><18e7918b3296693e83634aaf57fa33ad>]
+% ReportLab generated PDF document -- digest (http://www.reportlab.com)
+
+/Info 15 0 R
+/Root 14 0 R
+/Size 26
+>>
+startxref
+15590
+%%EOF
diff --git a/services/ai-analysis-service/ai-analysis/analysis_report.md b/services/ai-analysis-service/ai-analysis/analysis_report.md
new file mode 100644
index 0000000..d2da737
--- /dev/null
+++ b/services/ai-analysis-service/ai-analysis/analysis_report.md
@@ -0,0 +1,363 @@
+
+# GitHub Repository Analysis Report
+
+**Repository:** https://github.com/TejasTeju-dev/AI-Blog
+**Analysis Date:** 2025-09-19 11:09:14
+**Analyzed by:** Claude AI Assistant
+
+---
+
+## Executive Summary
+
+Let me provide a comprehensive analysis:
+
+1. **Project Type & Purpose**:
+This appears to be a modern web application built with Next.js, likely a blog or content platform with articles and topics sections. The extensive UI component library suggests it's a full-featured web application with a sophisticated user interface.
+
+2. **Technology Stack**:
+- Frontend Framework: Next.js (React)
+- Language: TypeScript
+- Styling: Tailwind CSS
+- Package Manager: pnpm
+- UI Components: Extensive component library (possibly using shadcn/ui)
+- State Management: Custom hooks
+- Animations: Multiple background animation components
+
+3. **Architecture Overview**:
+The project follows Next.js 13+ App Router structure:
+```
+app/ # Main application routes
+components/ # Reusable UI components
+hooks/ # Custom React hooks
+lib/ # Utility functions
+public/ # Static assets
+styles/ # Global styles
+```
+
+4. **Key Components**:
+- **UI Components**: Comprehensive set of 40+ UI components including:
+ - Basic elements (Button, Input, Form)
+ - Navigation (Navbar, Menu, Breadcrumb)
+ - Feedback (Toast, Alert, Dialog)
+ - Data display (Table, Chart, Card)
+ - Layout (Grid, Sidebar)
+- **Background Components**:
+ - AnimatedGrid
+ - FloatingElements
+ - ParticleField
+ - 3DBackground
+- **Core Pages**:
+ - Home (page.tsx)
+ - Articles
+ - Blog
+ - Topics
+ - About
+
+5. **Development Setup**:
+Required setup likely includes:
+```bash
+# Install dependencies
+pnpm install
+
+# Development server
+pnpm dev
+
+# Build
+pnpm build
+```
+
+Requirements:
+- Node.js
+- pnpm
+- TypeScript knowledge
+- Understanding of Next.js and React
+
+6. **Code Quality Assessment**:
+Strengths:
+- Well-organized directory structure
+- Consistent use of TypeScript
+- Modular component architecture
+- Separation of concerns (UI components, hooks, pages)
+- Comprehensive UI component library
+- Modern development practices (App Router, TypeScript)
+
+Areas for consideration:
+- Large number of UI components might indicate need for documentation
+- Multiple background components might need performance optimization
+- Could benefit from API documentation
+- Might need testing infrastructure (not visible in structure)
+
+Additional Observations:
+- The project uses modern React patterns (hooks)
+- Strong focus on UI/UX with multiple animation options
+- Built with scalability in mind (modular structure)
+- Follows Next.js best practices
+- Uses modern tooling (pnpm, TypeScript, Tailwind)
+
+This appears to be a well-structured, modern web application with a strong focus on UI components and user experience. The architecture suggests it's built for scalability and maintainability.
+
+---
+
+## Detailed Code Analysis
+
+I'll analyze each aspect of this Next.js project:
+
+1. **Code Quality**
+- Strong TypeScript usage with proper type definitions and configurations
+- Consistent code formatting and organization following Next.js 13+ conventions
+- Clean project structure with clear separation of concerns
+- Good use of modern React patterns and Next.js features
+- Well-structured configuration files (next.config.js, tailwind.config.js, etc.)
+- Follows React best practices with components organization
+
+2. **Design Patterns**
+- Component-based architecture following React principles
+- Server-side rendering approach using Next.js App Router
+- Atomic design pattern evident in UI components organization
+- Utility-first CSS approach with Tailwind
+- Singleton pattern for configuration management
+- Dependency injection through React context (seen in theme implementation)
+
+3. **Key Dependencies**
+- Core: Next.js 14.2, React 19, TypeScript
+- UI: Radix UI components, Tailwind CSS, shadcn/ui
+- 3D: Three.js, React Three Fiber
+- Forms: React Hook Form, Zod validation
+- Utilities: clsx, tailwind-merge
+- Development: PostCSS, TypeScript, ESLint
+
+4. **Potential Issues**
+- Build errors being ignored (typescript.ignoreBuildErrors, eslint.ignoreDuringBuilds)
+- Unoptimized images configuration could impact performance
+- Missing error boundaries and proper error handling
+- Security considerations for client-side rendering of 3D content
+- No explicit API rate limiting or security headers
+- Missing proper environment variable handling
+
+5. **Testing Strategy**
+- No visible testing setup (Jest, React Testing Library, etc.)
+- Missing unit tests, integration tests, and e2e tests
+- Should add testing framework and implement test coverage
+- Consider adding Cypress or Playwright for e2e testing (see the sketch below)
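+
+A minimal Playwright setup could look like the following sketch (the `e2e/` test directory is an assumption, not a confirmed project detail; `pnpm dev` matches the setup instructions above):
+
+```typescript
+// playwright.config.ts: minimal e2e sketch; directory and script names are assumptions
+import { defineConfig } from '@playwright/test'
+
+export default defineConfig({
+  testDir: './e2e',
+  use: { baseURL: 'http://localhost:3000' },
+  webServer: {
+    command: 'pnpm dev',
+    url: 'http://localhost:3000',
+    reuseExistingServer: true,
+  },
+})
+```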
+
+6. **Documentation**
+- Good README with clear project structure and setup instructions
+- Missing JSDoc comments for components and functions
+- Could benefit from more inline documentation
+- API documentation could be improved
+- Missing contribution guidelines and deployment docs
+
+7. **Maintainability**
+Strengths:
+- Clear project structure
+- Modern tooling and frameworks
+- Type safety with TypeScript
+- Component modularity
+- Consistent coding style
+
+Areas for Improvement:
+- Add comprehensive testing
+- Improve error handling
+- Better documentation
+- Implement proper CI/CD
+- Add proper logging system
+- Consider performance monitoring
+
+Additional Recommendations:
+
+1. Security:
+```typescript
+// Add security headers
+const securityHeaders = [
+ { key: 'X-XSS-Protection', value: '1; mode=block' },
+ { key: 'X-Frame-Options', value: 'SAMEORIGIN' },
+ { key: 'X-Content-Type-Options', value: 'nosniff' },
+]
+```
+
+2. Error Handling:
+```typescript
+// Add error boundary component
+class ErrorBoundary extends React.Component {
+ static getDerivedStateFromError(error) {
+ return { hasError: true };
+ }
+
+ componentDidCatch(error, errorInfo) {
+ // Log error to service
+ }
+}
+```
+
+3. Testing Setup:
+```json
+// Add to package.json
+{
+ "jest": {
+    "setupFilesAfterEnv": ["<rootDir>/jest.setup.js"],
+ "testEnvironment": "jsdom"
+ },
+ "scripts": {
+ "test": "jest",
+ "test:watch": "jest --watch",
+ "test:coverage": "jest --coverage"
+ }
+}
+```
+
+4. Performance Monitoring:
+```typescript
+// Add performance monitoring
+export function reportWebVitals(metric) {
+ if (metric.label === 'web-vital') {
+ console.log(metric); // Send to analytics
+ }
+}
+```
+
+The project has a solid foundation but would benefit from these improvements for production readiness.
+
+---
+
+## Security & Best Practices Analysis
+
+I'll analyze the repository based on the provided files and structure:
+
+1. **Security Issues**:
+- ⚠️ ESLint and TypeScript build errors are being ignored (`ignoreDuringBuilds: true` and `ignoreBuildErrors: true`), which could mask security-related issues
+- ⚠️ Image optimization is disabled (`unoptimized: true`), which could lead to performance and security concerns
+- ✅ Remote image patterns are properly restricted to specific domains (unsplash.com)
+- ⚠️ No explicit CSP (Content Security Policy) configuration visible
+
+2. **Secret Management**:
+- ✅ Uses environment variables (process.env)
+- ⚠️ No visible secret management solution or environment validation
+- 💡 Recommend implementing a secret management solution (e.g., Vault, AWS Secrets Manager); a sketch follows below
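+
+Since Zod is already listed as a project dependency, one lightweight option is schema-based environment validation at startup (a sketch only; the variable names are illustrative):
+
+```typescript
+// lib/env.ts: minimal sketch using the existing Zod dependency; variable names are assumptions
+import { z } from 'zod'
+
+const EnvSchema = z.object({
+  NODE_ENV: z.enum(['development', 'test', 'production']),
+  API_KEY: z.string().min(1),
+  DATABASE_URL: z.string().url(),
+})
+
+// Throws a readable error at startup if anything is missing or malformed
+export const env = EnvSchema.parse(process.env)
+```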
+
+3. **Dependencies**:
+- Cannot fully assess without package.json
+- Using Next.js and Tailwind CSS which are generally well-maintained
+- 💡 Recommend implementing dependency scanning (e.g., Snyk, OWASP Dependency-Check)
+
+4. **Best Practices**:
+✅ Good:
+- TypeScript implementation with strict mode enabled
+- Proper module resolution and ES6 target
+- Well-organized file structure
+- Using modern module systems
+- Proper tailwind configuration
+
+⚠️ Concerns:
+- Disabling TypeScript and ESLint checks in production
+- Multiple next.config files (both .js and .mjs)
+- No visible testing configuration
+
+5. **Configuration**:
+✅ Good:
+- Environment-based configuration for basePath
+- Proper TypeScript configuration
+- Well-structured Tailwind configuration
+
+⚠️ Concerns:
+- Duplicate next.config files might cause confusion
+- Some hardcoded values could be externalized
+- No visible staging/production environment separation
+
+6. **Error Handling**:
+- Cannot fully assess without application code
+- ⚠️ Disabling TypeScript and ESLint checks could mask error handling issues
+- 💡 Recommend implementing proper error boundaries and logging
+
+7. **Recommendations**:
+
+Security:
+```typescript
+// Enable TypeScript and ESLint checks
+const nextConfig = {
+ eslint: {
+ ignoreDuringBuilds: false,
+ },
+ typescript: {
+ ignoreBuildErrors: false,
+ }
+}
+```
+
+Configuration:
+```javascript
+// Consolidate next.config files
+// Add proper environment validation
+const validateEnv = () => {
+ const required = ['API_KEY', 'DATABASE_URL'];
+ required.forEach(key => {
+ if (!process.env[key]) throw new Error(`Missing ${key}`);
+ });
+}
+```
+
+Best Practices:
+1. Implement proper CSP:
+```javascript
+// next.config.js
+{
+ async headers() {
+ return [
+ {
+ source: '/:path*',
+ headers: [
+ {
+ key: 'Content-Security-Policy',
+ value: "default-src 'self';"
+ }
+ ]
+ }
+ ]
+ }
+}
+```
+
+2. Enable image optimization:
+```javascript
+images: {
+ unoptimized: false,
+ domains: ['images.unsplash.com'],
+}
+```
+
+Additional Recommendations:
+1. Implement security headers
+2. Add input validation
+3. Set up proper error boundaries
+4. Add proper testing configuration
+5. Implement API rate limiting (see the sketch after this list)
+6. Add security scanning in CI/CD
+7. Implement proper logging
+8. Add environment validation
+9. Consider implementing authentication/authorization
+10. Add proper CORS configuration
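+
+For item 5, a minimal per-IP rate limiter in Next.js middleware might look like the sketch below (illustrative only; the limits, the matcher, and the in-memory store are assumptions, and production setups would normally back this with Redis or another shared store):
+
+```typescript
+// middleware.ts: minimal per-IP rate limit sketch; limits and matcher are assumptions
+import { NextResponse } from 'next/server'
+import type { NextRequest } from 'next/server'
+
+const hits = new Map<string, { count: number; reset: number }>()
+const LIMIT = 60         // requests per window
+const WINDOW_MS = 60_000 // one minute
+
+export function middleware(req: NextRequest) {
+  const ip = req.headers.get('x-forwarded-for') ?? 'unknown'
+  const now = Date.now()
+  const entry = hits.get(ip)
+
+  if (!entry || now > entry.reset) {
+    hits.set(ip, { count: 1, reset: now + WINDOW_MS })
+  } else if (++entry.count > LIMIT) {
+    return new NextResponse('Too Many Requests', { status: 429 })
+  }
+  return NextResponse.next()
+}
+
+export const config = { matcher: '/api/:path*' }
+```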
+
+Environment Setup:
+```bash
+# .env.example
+NODE_ENV=development
+API_KEY=
+DATABASE_URL=
+```
+
+This analysis is based on the configuration files provided. For a more comprehensive security assessment, access to the actual application code, API endpoints, and authentication mechanisms would be needed.
+
+---
+
+## Recommendations Summary
+
+Based on the analysis, here are the key recommendations for this repository:
+
+1. **Immediate Actions**: Critical issues that should be addressed promptly
+2. **Code Quality Improvements**: Suggestions for better maintainability
+3. **Security Enhancements**: Steps to improve security posture
+4. **Documentation**: Areas where documentation could be enhanced
+5. **Architecture**: Potential architectural improvements
+
+---
+
+*This analysis was generated using AI and should be reviewed by human developers for accuracy and context.*
diff --git a/services/ai-analysis-service/ai-analysis/app.py b/services/ai-analysis-service/ai-analysis/app.py
new file mode 100644
index 0000000..58a89d9
--- /dev/null
+++ b/services/ai-analysis-service/ai-analysis/app.py
@@ -0,0 +1,391 @@
+#!/usr/bin/env python3
+"""
+GitHub Repository AI Analysis Tool
+Analyzes GitHub repositories using Claude API for comprehensive code insights.
+"""
+
+import os
+import git
+import json
+import requests
+import tempfile
+import shutil
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+import argparse
+from datetime import datetime
+import mimetypes
+import base64
+
+class GitHubRepoAnalyzer:
+ def __init__(self, anthropic_api_key: str):
+ self.api_key = anthropic_api_key
+ self.api_url = "https://api.anthropic.com/v1/messages"
+ self.temp_dir = None
+
+ # File extensions to analyze
+ self.code_extensions = {
+ '.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.cpp', '.c', '.h',
+ '.cs', '.php', '.rb', '.go', '.rs', '.swift', '.kt', '.scala',
+ '.html', '.css', '.scss', '.sass', '.less', '.vue', '.svelte',
+ '.sql', '.sh', '.bash', '.yml', '.yaml', '.json', '.xml',
+ '.dockerfile', '.md', '.rst', '.txt'
+ }
+
+ # Files to always include in analysis
+ self.important_files = {
+ 'README.md', 'readme.md', 'README.txt', 'readme.txt',
+ 'package.json', 'requirements.txt', 'Cargo.toml', 'pom.xml',
+ 'build.gradle', 'Makefile', 'dockerfile', 'Dockerfile',
+ 'docker-compose.yml', '.gitignore', 'setup.py', 'pyproject.toml'
+ }
+
+ def clone_repository(self, repo_url: str) -> str:
+ """Clone GitHub repository to temporary directory."""
+ print(f"Cloning repository: {repo_url}")
+
+ self.temp_dir = tempfile.mkdtemp(prefix="github_analysis_")
+
+ try:
+ git.Repo.clone_from(repo_url, self.temp_dir)
+ print(f"Repository cloned to: {self.temp_dir}")
+ return self.temp_dir
+ except git.exc.GitCommandError as e:
+ raise Exception(f"Failed to clone repository: {e}")
+
+ def get_file_info(self, file_path: Path) -> Dict:
+ """Get file information and content."""
+ try:
+ # Check file size (skip files larger than 1MB)
+ if file_path.stat().st_size > 1024 * 1024:
+ return {
+ 'path': str(file_path.relative_to(self.temp_dir)),
+ 'size': file_path.stat().st_size,
+ 'content': '[File too large to analyze]',
+ 'encoding': 'skipped'
+ }
+
+ # Try to read as text
+ try:
+ with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+ content = f.read()
+ encoding = 'utf-8'
+ except UnicodeDecodeError:
+ # If text fails, try binary for certain file types
+ with open(file_path, 'rb') as f:
+ raw_content = f.read()
+ if len(raw_content) < 10000: # Only encode small binary files
+ content = base64.b64encode(raw_content).decode('ascii')
+ encoding = 'base64'
+ else:
+ content = '[Binary file - content not included]'
+ encoding = 'binary'
+
+ return {
+ 'path': str(file_path.relative_to(self.temp_dir)),
+ 'size': file_path.stat().st_size,
+ 'content': content,
+ 'encoding': encoding,
+ 'mime_type': mimetypes.guess_type(str(file_path))[0]
+ }
+ except Exception as e:
+ return {
+ 'path': str(file_path.relative_to(self.temp_dir)),
+ 'error': str(e),
+ 'content': '[Error reading file]'
+ }
+
+ def scan_repository(self, max_files: int = 50) -> Dict:
+ """Scan repository and collect file information."""
+ print("Scanning repository structure...")
+
+ repo_data = {
+ 'structure': [],
+ 'files': [],
+ 'stats': {
+ 'total_files': 0,
+ 'analyzed_files': 0,
+ 'total_size': 0,
+ 'languages': {}
+ }
+ }
+
+ # Get directory structure
+ for root, dirs, files in os.walk(self.temp_dir):
+ # Skip hidden directories and common build/cache directories
+ dirs[:] = [d for d in dirs if not d.startswith('.') and
+ d not in {'node_modules', '__pycache__', 'build', 'dist', 'target', 'venv', 'env'}]
+
+ level = root.replace(self.temp_dir, '').count(os.sep)
+ indent = ' ' * level
+ folder_name = os.path.basename(root) if root != self.temp_dir else '.'
+ repo_data['structure'].append(f"{indent}{folder_name}/")
+
+ # Process files
+ for file in files:
+ if file.startswith('.'):
+ continue
+
+ file_path = Path(root) / file
+ repo_data['stats']['total_files'] += 1
+ repo_data['stats']['total_size'] += file_path.stat().st_size
+
+ # Track languages
+ ext = file_path.suffix.lower()
+ if ext:
+ repo_data['stats']['languages'][ext] = repo_data['stats']['languages'].get(ext, 0) + 1
+
+ # Add to structure
+ repo_data['structure'].append(f"{indent} {file}")
+
+ # Decide if we should analyze this file
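+                # Note: the final clause below is true until the cap is reached, so every
+                # non-hidden file encountered is analyzed until max_files is hit; the
+                # extension and important-file checks are effectively bypassed under the cap.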
+ should_analyze = (
+ file.lower() in self.important_files or
+ ext in self.code_extensions or
+ repo_data['stats']['analyzed_files'] < max_files
+ )
+
+ if should_analyze and repo_data['stats']['analyzed_files'] < max_files:
+ file_info = self.get_file_info(file_path)
+ repo_data['files'].append(file_info)
+ repo_data['stats']['analyzed_files'] += 1
+
+ return repo_data
+
+ def call_claude_api(self, prompt: str, max_tokens: int = 4000) -> str:
+ """Make API call to Claude."""
+ headers = {
+ "Content-Type": "application/json",
+ "x-api-key": self.api_key,
+ "anthropic-version": "2023-06-01"
+ }
+
+ data = {
+ "model": "claude-3-5-sonnet-20241022",
+ "max_tokens": max_tokens,
+ "messages": [
+ {"role": "user", "content": prompt}
+ ]
+ }
+
+ try:
+ response = requests.post(self.api_url, headers=headers, json=data)
+ response.raise_for_status()
+
+ result = response.json()
+ return result['content'][0]['text']
+ except requests.exceptions.RequestException as e:
+ raise Exception(f"API request failed: {e}")
+
+ def analyze_repository_overview(self, repo_data: Dict) -> str:
+ """Get high-level repository analysis."""
+ print("Analyzing repository overview...")
+
+ structure_summary = "\n".join(repo_data['structure'][:100]) # Limit structure size
+
+ prompt = f"""
+Analyze this GitHub repository and provide a comprehensive overview:
+
+REPOSITORY STRUCTURE:
+{structure_summary}
+
+STATISTICS:
+- Total files: {repo_data['stats']['total_files']}
+- Files analyzed: {repo_data['stats']['analyzed_files']}
+- Total size: {repo_data['stats']['total_size']} bytes
+- Languages found: {dict(list(repo_data['stats']['languages'].items())[:10])}
+
+Please provide:
+1. **Project Type & Purpose**: What kind of project is this?
+2. **Technology Stack**: What technologies, frameworks, and languages are used?
+3. **Architecture Overview**: How is the project structured?
+4. **Key Components**: What are the main modules/components?
+5. **Development Setup**: What's needed to run this project?
+6. **Code Quality Assessment**: Initial observations about code organization
+"""
+
+ return self.call_claude_api(prompt)
+
+ def analyze_code_files(self, repo_data: Dict) -> str:
+ """Analyze individual code files."""
+ print("Analyzing code files...")
+
+ # Prepare file contents for analysis
+ files_content = []
+ for file_info in repo_data['files'][:20]: # Limit to first 20 files
+ if file_info.get('encoding') == 'utf-8' and len(file_info.get('content', '')) < 5000:
+ files_content.append(f"=== {file_info['path']} ===\n{file_info['content']}\n")
+
+ files_text = "\n".join(files_content)
+
+ prompt = f"""
+Analyze these key files from the repository:
+
+{files_text}
+
+Please provide detailed analysis covering:
+1. **Code Quality**: Code style, organization, and best practices
+2. **Design Patterns**: What patterns and architectural approaches are used?
+3. **Dependencies & Libraries**: Key external dependencies identified
+4. **Potential Issues**: Any code smells, security concerns, or improvements needed
+5. **Testing Strategy**: How is testing implemented (if at all)?
+6. **Documentation**: Quality of inline documentation and comments
+7. **Maintainability**: How maintainable and extensible is this code?
+"""
+
+ return self.call_claude_api(prompt, max_tokens=6000)
+
+ def analyze_security_and_best_practices(self, repo_data: Dict) -> str:
+ """Analyze security and best practices."""
+ print("Analyzing security and best practices...")
+
+ # Look for security-sensitive files
+ security_files = []
+ for file_info in repo_data['files']:
+ path_lower = file_info['path'].lower()
+ if any(term in path_lower for term in ['config', 'env', 'secret', 'key', 'auth', 'security']):
+ if file_info.get('encoding') == 'utf-8':
+ security_files.append(f"=== {file_info['path']} ===\n{file_info['content'][:2000]}\n")
+
+ security_content = "\n".join(security_files[:10])
+
+ prompt = f"""
+Analyze this repository for security and best practices:
+
+SECURITY-RELEVANT FILES:
+{security_content}
+
+FILE STRUCTURE ANALYSIS:
+{json.dumps(repo_data['stats'], indent=2)}
+
+Please analyze:
+1. **Security Issues**: Potential security vulnerabilities or concerns
+2. **Secret Management**: How are secrets/credentials handled?
+3. **Dependencies**: Are there any vulnerable dependencies?
+4. **Best Practices**: Adherence to language/framework best practices
+5. **Configuration**: Are configurations properly externalized?
+6. **Error Handling**: How are errors handled throughout the codebase?
+7. **Recommendations**: Specific suggestions for improvement
+"""
+
+ return self.call_claude_api(prompt, max_tokens=5000)
+
+ def generate_comprehensive_report(self, repo_url: str, overview: str, code_analysis: str, security_analysis: str) -> str:
+ """Generate final comprehensive report."""
+ print("Generating comprehensive report...")
+
+ timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+ report = f"""
+# GitHub Repository Analysis Report
+
+**Repository:** {repo_url}
+**Analysis Date:** {timestamp}
+**Analyzed by:** Claude AI Assistant
+
+---
+
+## Executive Summary
+
+{overview}
+
+---
+
+## Detailed Code Analysis
+
+{code_analysis}
+
+---
+
+## Security & Best Practices Analysis
+
+{security_analysis}
+
+---
+
+## Recommendations Summary
+
+Based on the analysis, here are the key recommendations for this repository:
+
+1. **Immediate Actions**: Critical issues that should be addressed promptly
+2. **Code Quality Improvements**: Suggestions for better maintainability
+3. **Security Enhancements**: Steps to improve security posture
+4. **Documentation**: Areas where documentation could be enhanced
+5. **Architecture**: Potential architectural improvements
+
+---
+
+*This analysis was generated using AI and should be reviewed by human developers for accuracy and context.*
+"""
+
+ return report
+
+ def analyze(self, repo_url: str, output_file: Optional[str] = None) -> str:
+ """Main analysis function."""
+ try:
+ # Clone repository
+ self.clone_repository(repo_url)
+
+ # Scan repository structure and files
+ repo_data = self.scan_repository()
+
+ # Perform different types of analysis
+ overview = self.analyze_repository_overview(repo_data)
+ code_analysis = self.analyze_code_files(repo_data)
+ security_analysis = self.analyze_security_and_best_practices(repo_data)
+
+ # Generate comprehensive report
+ final_report = self.generate_comprehensive_report(
+ repo_url, overview, code_analysis, security_analysis
+ )
+
+ # Save report if output file specified
+ if output_file:
+ with open(output_file, 'w', encoding='utf-8') as f:
+ f.write(final_report)
+ print(f"Report saved to: {output_file}")
+
+ return final_report
+
+ finally:
+ # Cleanup temporary directory
+ if self.temp_dir and os.path.exists(self.temp_dir):
+ shutil.rmtree(self.temp_dir)
+ print("Temporary files cleaned up")
+
+def main():
+ parser = argparse.ArgumentParser(description="Analyze GitHub repository using Claude AI")
+ parser.add_argument("repo_url", help="GitHub repository URL")
+ parser.add_argument("--api-key", required=True, help="Anthropic API key")
+ parser.add_argument("--output", "-o", help="Output file path (optional)")
+ parser.add_argument("--max-files", type=int, default=50, help="Maximum files to analyze")
+
+ args = parser.parse_args()
+
+ # Initialize analyzer
+ analyzer = GitHubRepoAnalyzer(args.api_key)
+
+ try:
+ print("Starting GitHub repository analysis...")
+ print("=" * 50)
+
+ # Perform analysis
+ report = analyzer.analyze(args.repo_url, args.output)
+
+ # Print report if no output file specified
+ if not args.output:
+ print("\n" + "=" * 50)
+ print("ANALYSIS REPORT")
+ print("=" * 50)
+ print(report)
+
+ print("\nAnalysis completed successfully!")
+
+ except Exception as e:
+ print(f"Error during analysis: {e}")
+ return 1
+
+ return 0
+
+if __name__ == "__main__":
+ exit(main())
\ No newline at end of file
diff --git a/services/ai-analysis-service/ai-analysis/github_analyzer.py b/services/ai-analysis-service/ai-analysis/github_analyzer.py
new file mode 100644
index 0000000..3eadf50
--- /dev/null
+++ b/services/ai-analysis-service/ai-analysis/github_analyzer.py
@@ -0,0 +1,391 @@
+#!/usr/bin/env python3
+"""
+GitHub Repository AI Analysis Tool
+Analyzes GitHub repositories using Claude API for comprehensive code insights.
+"""
+
+import os
+import git
+import json
+import tempfile
+import shutil
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+import argparse
+from datetime import datetime
+import mimetypes
+import base64
+from dotenv import load_dotenv
+import anthropic
+
+class GitHubRepoAnalyzer:
+ def __init__(self, anthropic_api_key: str = None):
+ # Load environment variables
+ load_dotenv()
+
+ # Get API key from parameter or environment
+ self.api_key = anthropic_api_key or os.getenv('ANTHROPIC_API_KEY')
+ if not self.api_key:
+ raise ValueError("Anthropic API key not found. Please set ANTHROPIC_API_KEY in .env file or pass as parameter.")
+
+ # Initialize Anthropic client
+ self.client = anthropic.Anthropic(api_key=self.api_key)
+ self.temp_dir = None
+
+ # File extensions to analyze
+ self.code_extensions = {
+ '.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.cpp', '.c', '.h',
+ '.cs', '.php', '.rb', '.go', '.rs', '.swift', '.kt', '.scala',
+ '.html', '.css', '.scss', '.sass', '.less', '.vue', '.svelte',
+ '.sql', '.sh', '.bash', '.yml', '.yaml', '.json', '.xml',
+ '.dockerfile', '.md', '.rst', '.txt'
+ }
+
+ # Files to always include in analysis
+ self.important_files = {
+ 'README.md', 'readme.md', 'README.txt', 'readme.txt',
+ 'package.json', 'requirements.txt', 'Cargo.toml', 'pom.xml',
+ 'build.gradle', 'Makefile', 'dockerfile', 'Dockerfile',
+ 'docker-compose.yml', '.gitignore', 'setup.py', 'pyproject.toml'
+ }
+
+ def clone_repository(self, repo_url: str) -> str:
+ """Clone GitHub repository to temporary directory."""
+ print(f"Cloning repository: {repo_url}")
+
+ self.temp_dir = tempfile.mkdtemp(prefix="github_analysis_")
+
+ try:
+ git.Repo.clone_from(repo_url, self.temp_dir)
+ print(f"Repository cloned to: {self.temp_dir}")
+ return self.temp_dir
+ except git.exc.GitCommandError as e:
+ raise Exception(f"Failed to clone repository: {e}")
+
+ def get_file_info(self, file_path: Path) -> Dict:
+ """Get file information and content."""
+ try:
+ # Check file size (skip files larger than 1MB)
+ if file_path.stat().st_size > 1024 * 1024:
+ return {
+ 'path': str(file_path.relative_to(self.temp_dir)),
+ 'size': file_path.stat().st_size,
+ 'content': '[File too large to analyze]',
+ 'encoding': 'skipped'
+ }
+
+ # Try to read as text
+ try:
+ with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+ content = f.read()
+ encoding = 'utf-8'
+ except UnicodeDecodeError:
+ # If text fails, try binary for certain file types
+ with open(file_path, 'rb') as f:
+ raw_content = f.read()
+ if len(raw_content) < 10000: # Only encode small binary files
+ content = base64.b64encode(raw_content).decode('ascii')
+ encoding = 'base64'
+ else:
+ content = '[Binary file - content not included]'
+ encoding = 'binary'
+
+ return {
+ 'path': str(file_path.relative_to(self.temp_dir)),
+ 'size': file_path.stat().st_size,
+ 'content': content,
+ 'encoding': encoding,
+ 'mime_type': mimetypes.guess_type(str(file_path))[0]
+ }
+ except Exception as e:
+ return {
+ 'path': str(file_path.relative_to(self.temp_dir)),
+ 'error': str(e),
+ 'content': '[Error reading file]'
+ }
+
+ def scan_repository(self, max_files: int = 50) -> Dict:
+ """Scan repository and collect file information."""
+ print("Scanning repository structure...")
+
+ repo_data = {
+ 'structure': [],
+ 'files': [],
+ 'stats': {
+ 'total_files': 0,
+ 'analyzed_files': 0,
+ 'total_size': 0,
+ 'languages': {}
+ }
+ }
+
+ # Get directory structure
+ for root, dirs, files in os.walk(self.temp_dir):
+ # Skip hidden directories and common build/cache directories
+ dirs[:] = [d for d in dirs if not d.startswith('.') and
+ d not in {'node_modules', '__pycache__', 'build', 'dist', 'target', 'venv', 'env'}]
+
+ level = root.replace(self.temp_dir, '').count(os.sep)
+ indent = ' ' * level
+ folder_name = os.path.basename(root) if root != self.temp_dir else '.'
+ repo_data['structure'].append(f"{indent}{folder_name}/")
+
+ # Process files
+ for file in files:
+ if file.startswith('.'):
+ continue
+
+ file_path = Path(root) / file
+ repo_data['stats']['total_files'] += 1
+ repo_data['stats']['total_size'] += file_path.stat().st_size
+
+ # Track languages
+ ext = file_path.suffix.lower()
+ if ext:
+ repo_data['stats']['languages'][ext] = repo_data['stats']['languages'].get(ext, 0) + 1
+
+ # Add to structure
+ repo_data['structure'].append(f"{indent} {file}")
+
+ # Decide if we should analyze this file
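+                # Note: the final clause below is true until the cap is reached, so every
+                # non-hidden file encountered is analyzed until max_files is hit; the
+                # extension and important-file checks are effectively bypassed under the cap.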
+ should_analyze = (
+ file.lower() in self.important_files or
+ ext in self.code_extensions or
+ repo_data['stats']['analyzed_files'] < max_files
+ )
+
+ if should_analyze and repo_data['stats']['analyzed_files'] < max_files:
+ file_info = self.get_file_info(file_path)
+ repo_data['files'].append(file_info)
+ repo_data['stats']['analyzed_files'] += 1
+
+ return repo_data
+
+ def call_claude_api(self, prompt: str, max_tokens: int = 4000) -> str:
+ """Make API call to Claude using official Anthropic client."""
+ try:
+ message = self.client.messages.create(
+ model="claude-3-sonnet-20240229",
+ max_tokens=max_tokens,
+ messages=[
+ {"role": "user", "content": prompt}
+ ]
+ )
+
+ return message.content[0].text
+
+ except Exception as e:
+ raise Exception(f"Claude API call failed: {e}")
+
+ def analyze_repository_overview(self, repo_data: Dict) -> str:
+ """Get high-level repository analysis."""
+ print("Analyzing repository overview...")
+
+ structure_summary = "\n".join(repo_data['structure'][:100]) # Limit structure size
+
+ prompt = f"""
+Analyze this GitHub repository and provide a comprehensive overview:
+
+REPOSITORY STRUCTURE:
+{structure_summary}
+
+STATISTICS:
+- Total files: {repo_data['stats']['total_files']}
+- Files analyzed: {repo_data['stats']['analyzed_files']}
+- Total size: {repo_data['stats']['total_size']} bytes
+- Languages found: {dict(list(repo_data['stats']['languages'].items())[:10])}
+
+Please provide:
+1. **Project Type & Purpose**: What kind of project is this?
+2. **Technology Stack**: What technologies, frameworks, and languages are used?
+3. **Architecture Overview**: How is the project structured?
+4. **Key Components**: What are the main modules/components?
+5. **Development Setup**: What's needed to run this project?
+6. **Code Quality Assessment**: Initial observations about code organization
+"""
+
+ return self.call_claude_api(prompt)
+
+ def analyze_code_files(self, repo_data: Dict) -> str:
+ """Analyze individual code files."""
+ print("Analyzing code files...")
+
+ # Prepare file contents for analysis
+ files_content = []
+ for file_info in repo_data['files'][:20]: # Limit to first 20 files
+ if file_info.get('encoding') == 'utf-8' and len(file_info.get('content', '')) < 5000:
+ files_content.append(f"=== {file_info['path']} ===\n{file_info['content']}\n")
+
+ files_text = "\n".join(files_content)
+
+ prompt = f"""
+Analyze these key files from the repository:
+
+{files_text}
+
+Please provide detailed analysis covering:
+1. **Code Quality**: Code style, organization, and best practices
+2. **Design Patterns**: What patterns and architectural approaches are used?
+3. **Dependencies & Libraries**: Key external dependencies identified
+4. **Potential Issues**: Any code smells, security concerns, or improvements needed
+5. **Testing Strategy**: How is testing implemented (if at all)?
+6. **Documentation**: Quality of inline documentation and comments
+7. **Maintainability**: How maintainable and extensible is this code?
+"""
+
+ return self.call_claude_api(prompt, max_tokens=6000)
+
+ def analyze_security_and_best_practices(self, repo_data: Dict) -> str:
+ """Analyze security and best practices."""
+ print("Analyzing security and best practices...")
+
+ # Look for security-sensitive files
+ security_files = []
+ for file_info in repo_data['files']:
+ path_lower = file_info['path'].lower()
+ if any(term in path_lower for term in ['config', 'env', 'secret', 'key', 'auth', 'security']):
+ if file_info.get('encoding') == 'utf-8':
+ security_files.append(f"=== {file_info['path']} ===\n{file_info['content'][:2000]}\n")
+
+ security_content = "\n".join(security_files[:10])
+
+ prompt = f"""
+Analyze this repository for security and best practices:
+
+SECURITY-RELEVANT FILES:
+{security_content}
+
+FILE STRUCTURE ANALYSIS:
+{json.dumps(repo_data['stats'], indent=2)}
+
+Please analyze:
+1. **Security Issues**: Potential security vulnerabilities or concerns
+2. **Secret Management**: How are secrets/credentials handled?
+3. **Dependencies**: Are there any vulnerable dependencies?
+4. **Best Practices**: Adherence to language/framework best practices
+5. **Configuration**: Are configurations properly externalized?
+6. **Error Handling**: How are errors handled throughout the codebase?
+7. **Recommendations**: Specific suggestions for improvement
+"""
+
+ return self.call_claude_api(prompt, max_tokens=5000)
+
+ def generate_comprehensive_report(self, repo_url: str, overview: str, code_analysis: str, security_analysis: str) -> str:
+ """Generate final comprehensive report."""
+ print("Generating comprehensive report...")
+
+ timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+ report = f"""
+# GitHub Repository Analysis Report
+
+**Repository:** {repo_url}
+**Analysis Date:** {timestamp}
+**Analyzed by:** Claude AI Assistant
+
+---
+
+## Executive Summary
+
+{overview}
+
+---
+
+## Detailed Code Analysis
+
+{code_analysis}
+
+---
+
+## Security & Best Practices Analysis
+
+{security_analysis}
+
+---
+
+## Recommendations Summary
+
+Based on the analysis, here are the key recommendations for this repository:
+
+1. **Immediate Actions**: Critical issues that should be addressed promptly
+2. **Code Quality Improvements**: Suggestions for better maintainability
+3. **Security Enhancements**: Steps to improve security posture
+4. **Documentation**: Areas where documentation could be enhanced
+5. **Architecture**: Potential architectural improvements
+
+---
+
+*This analysis was generated using AI and should be reviewed by human developers for accuracy and context.*
+"""
+
+ return report
+
+ def analyze(self, repo_url: str, output_file: Optional[str] = None) -> str:
+ """Main analysis function."""
+ try:
+ # Clone repository
+ self.clone_repository(repo_url)
+
+ # Scan repository structure and files
+ repo_data = self.scan_repository()
+
+ # Perform different types of analysis
+ overview = self.analyze_repository_overview(repo_data)
+ code_analysis = self.analyze_code_files(repo_data)
+ security_analysis = self.analyze_security_and_best_practices(repo_data)
+
+ # Generate comprehensive report
+ final_report = self.generate_comprehensive_report(
+ repo_url, overview, code_analysis, security_analysis
+ )
+
+ # Save report if output file specified
+ if output_file:
+ with open(output_file, 'w', encoding='utf-8') as f:
+ f.write(final_report)
+ print(f"Report saved to: {output_file}")
+
+ return final_report
+
+ finally:
+ # Cleanup temporary directory
+ if self.temp_dir and os.path.exists(self.temp_dir):
+ shutil.rmtree(self.temp_dir)
+ print("Temporary files cleaned up")
+
+def main():
+ parser = argparse.ArgumentParser(description="Analyze GitHub repository using Claude AI")
+ parser.add_argument("repo_url", help="GitHub repository URL")
+ parser.add_argument("--api-key", help="Anthropic API key (optional if set in .env)")
+ parser.add_argument("--output", "-o", help="Output file path (optional)")
+ parser.add_argument("--max-files", type=int, default=50, help="Maximum files to analyze")
+
+ args = parser.parse_args()
+
+ try:
+ # Initialize analyzer
+ analyzer = GitHubRepoAnalyzer(args.api_key)
+
+ print("Starting GitHub repository analysis...")
+ print("=" * 50)
+
+ # Perform analysis
+ report = analyzer.analyze(args.repo_url, args.output)
+
+ # Print report if no output file specified
+ if not args.output:
+ print("\n" + "=" * 50)
+ print("ANALYSIS REPORT")
+ print("=" * 50)
+ print(report)
+
+ print("\nAnalysis completed successfully!")
+
+ except Exception as e:
+ print(f"Error during analysis: {e}")
+ return 1
+
+ return 0
+
+if __name__ == "__main__":
+ exit(main())
\ No newline at end of file
diff --git a/services/ai-analysis-service/ai-analysis/requirements.txt b/services/ai-analysis-service/ai-analysis/requirements.txt
new file mode 100644
index 0000000..50994fd
--- /dev/null
+++ b/services/ai-analysis-service/ai-analysis/requirements.txt
@@ -0,0 +1,69 @@
+# Core AI and API
+anthropic>=0.7.0
+openai>=1.0.0
+
+# Environment management
+python-dotenv>=1.0.0
+
+# Git operations
+GitPython>=3.1.0
+
+# PDF generation
+reportlab>=4.0.0
+matplotlib>=3.7.0
+pillow>=10.0.0
+
+# Code analysis and parsing
+ast-comments>=1.1.0
+astroid>=3.0.0
+pygments>=2.15.0
+radon>=6.0.1
+bandit>=1.7.5
+flake8>=6.0.0
+pylint>=3.0.0
+
+# File operations and utilities
+pathlib2>=2.3.7
+chardet>=5.2.0
+python-magic>=0.4.27
+
+# Async operations
+aiohttp>=3.8.0
+aiofiles>=23.0.0
+asyncio-throttle>=1.0.2
+
+# Data processing
+pandas>=2.0.0
+numpy>=1.24.0
+python-dateutil>=2.8.0
+
+# Web scraping (for additional repo info)
+requests>=2.31.0
+beautifulsoup4>=4.12.0
+
+# Testing and code quality
+pytest>=7.4.0
+pytest-asyncio>=0.21.0
+coverage>=7.3.0
+
+# Additional utilities for advanced analysis
+networkx>=3.1.0 # For dependency graph analysis
+graphviz>=0.20.0 # For visualization
+jinja2>=3.1.0 # For report templating
+markdown>=3.4.0 # For markdown processing
+pyyaml>=6.0.0 # For YAML config files
+toml>=0.10.2 # For TOML config files
+xmltodict>=0.13.0 # For XML processing
+
+# Performance monitoring
+psutil>=5.9.0
+memory-profiler>=0.61.0
+
+# Progress bars and UI
+tqdm>=4.65.0
+rich>=13.5.0
+click>=8.1.0
+
+# Security scanning
+safety>=2.3.0
+pip-audit>=2.6.0
\ No newline at end of file
diff --git a/services/ai-analysis-service/ai-analyze.py b/services/ai-analysis-service/ai-analyze.py
new file mode 100644
index 0000000..7a9ac4b
--- /dev/null
+++ b/services/ai-analysis-service/ai-analyze.py
@@ -0,0 +1,1570 @@
+#!/usr/bin/env python3
+"""
+Complete AI Repository Analysis Tool with Memory System
+Automatically analyzes ALL files in a repository without limits.
+
+Features:
+- Analyzes ALL files in the repository (no max-files limit)
+- No user query required - fully automated analysis
+- Memory-enhanced analysis with learning capabilities
+- Comprehensive PDF report generation
+- Security, architecture, and code quality assessment
+
+Usage:
+ python ai-analyze.py /path/to/repo --output analysis.pdf
+
+Example:
+ python ai-analyze.py ./my-project --output complete_analysis.pdf
+"""
+
+import os
+import asyncio
+import hashlib
+import json
+import uuid
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple, Any
+from datetime import datetime, timedelta
+from dataclasses import dataclass, asdict
+from collections import defaultdict, Counter
+import logging
+import tempfile
+import shutil
+import re
+
+# Core packages
+import anthropic
+from dotenv import load_dotenv
+import git
+import redis
+import pymongo
+import psycopg2
+from psycopg2.extras import RealDictCursor
+import numpy as np
+
+# PDF generation
+from reportlab.lib.pagesizes import A4
+from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
+from reportlab.lib.enums import TA_CENTER, TA_LEFT
+from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak, Table, TableStyle
+from reportlab.lib import colors
+
+# Enhanced dataclasses for memory system
+@dataclass
+class MemoryRecord:
+ id: str
+ timestamp: datetime
+ memory_type: str # 'episodic', 'persistent', 'working'
+ content: Dict[str, Any]
+ embeddings: Optional[List[float]] = None
+ metadata: Optional[Dict[str, Any]] = None
+ expiry: Optional[datetime] = None
+
+@dataclass
+class CodeAnalysisMemory:
+ repo_id: str
+ file_path: str
+ analysis_hash: str
+ analysis_data: Dict[str, Any]
+ embedding: List[float]
+ last_updated: datetime
+ access_count: int = 0
+ relevance_score: float = 1.0
+
+@dataclass
+class EpisodicMemory:
+ session_id: str
+ user_query: str
+ ai_response: str
+ repo_context: str
+ timestamp: datetime
+ embedding: List[float]
+ metadata: Dict[str, Any]
+
+@dataclass
+class PersistentMemory:
+ fact_id: str
+ content: str
+ category: str # 'code_pattern', 'best_practice', 'vulnerability', 'architecture'
+ confidence: float
+ embedding: List[float]
+ source_repos: List[str]
+ created_at: datetime
+ last_accessed: datetime
+ access_frequency: int = 0
+
+@dataclass
+class FileAnalysis:
+ path: str
+ language: str
+ lines_of_code: int
+ complexity_score: float
+ issues_found: List[str]
+ recommendations: List[str]
+ detailed_analysis: str
+ severity_score: float
+
+@dataclass
+class RepositoryAnalysis:
+ repo_path: str
+ total_files: int
+ total_lines: int
+ languages: Dict[str, int]
+ architecture_assessment: str
+ security_assessment: str
+ code_quality_score: float
+ file_analyses: List[FileAnalysis]
+ executive_summary: str
+
+class MemoryManager:
+ """Advanced memory management system for AI repository analysis."""
+
+ def __init__(self, config: Dict[str, Any]):
+ self.config = config
+ self.setup_logging()
+
+ # Initialize Claude client for embeddings
+ self.claude_client = anthropic.Anthropic(api_key=config.get('anthropic_api_key', ''))
+
+ # Initialize database connections
+ self.setup_databases()
+
+ # Memory configuration
+ self.working_memory_ttl = 3600 # 1 hour
+ self.episodic_retention_days = 365 # 1 year
+ self.persistent_memory_threshold = 0.8 # Confidence threshold for persistence
+
+ def setup_logging(self):
+ logging.basicConfig(level=logging.INFO)
+ self.logger = logging.getLogger(__name__)
+
+ def setup_databases(self):
+ """Initialize all database connections."""
+ try:
+ # Redis for working memory (temporary, fast access)
+ self.redis_client = redis.Redis(
+ host=self.config.get('redis_host', 'localhost'),
+ port=self.config.get('redis_port', 6379),
+ db=self.config.get('redis_db', 0),
+ decode_responses=True
+ )
+
+ # MongoDB for documents and episodic memory
+ self.mongo_client = pymongo.MongoClient(
+ self.config.get('mongodb_url', 'mongodb://localhost:27017/')
+ )
+ self.mongo_db = self.mongo_client[self.config.get('mongodb_name', 'repo_analyzer')]
+
+ # Collections
+ self.episodic_collection = self.mongo_db['episodic_memories']
+ self.analysis_collection = self.mongo_db['code_analyses']
+ self.persistent_collection = self.mongo_db['persistent_memories']
+ self.repo_metadata_collection = self.mongo_db['repository_metadata']
+
+ # PostgreSQL with pgvector for vector operations
+ self.pg_conn = psycopg2.connect(
+ host=self.config.get('postgres_host', 'localhost'),
+ port=self.config.get('postgres_port', 5432),
+ database=self.config.get('postgres_db', 'dev_pipeline'),
+ user=self.config.get('postgres_user', 'pipeline_admin'),
+ password=self.config.get('postgres_password', 'secure_pipeline_2024')
+ )
+
+ # Check if pgvector is available
+ try:
+ with self.pg_conn.cursor() as cur:
+ cur.execute("SELECT 1 FROM pg_extension WHERE extname = 'vector';")
+ self.has_vector = cur.fetchone() is not None
+            except Exception:
+ self.has_vector = False
+
+ self.logger.info("All database connections established successfully")
+
+ except Exception as e:
+ self.logger.error(f"Database setup failed: {e}")
+ raise
+
+ def generate_embedding(self, text: str) -> List[float]:
+ """Generate embedding for text using Claude API."""
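+        # Note: the Anthropic API has no dedicated embeddings endpoint, so this prompt-based
+        # approach is best-effort; the deterministic hash fallback below is the dependable path
+        # whenever the model response cannot be parsed into a 384-dimensional vector.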
+ try:
+ # Use Claude to generate semantic embeddings
+ # Truncate text if too long for Claude API
+ if len(text) > 8000:
+ text = text[:8000] + "..."
+
+ prompt = f"""
+ Convert the following text into a 384-dimensional numerical vector that represents its semantic meaning.
+ The vector should be suitable for similarity search and clustering.
+
+ Text: {text}
+
+ Return only a JSON array of 384 floating-point numbers between -1 and 1, like this:
+ [0.123, -0.456, 0.789, ...]
+ """
+
+ message = self.claude_client.messages.create(
+ model="claude-3-5-sonnet-20240620",
+ max_tokens=2000,
+ temperature=0.1,
+ messages=[{"role": "user", "content": prompt}]
+ )
+
+ response_text = message.content[0].text.strip()
+
+ # Extract JSON array from response
+ import json
+ import re
+
+ # Find JSON array in response
+ json_match = re.search(r'\[[\d\.,\s-]+\]', response_text)
+ if json_match:
+ embedding = json.loads(json_match.group())
+ if len(embedding) == 384:
+ return embedding
+
+ # Fallback: generate deterministic embedding from text hash
+ return self._generate_fallback_embedding(text)
+
+ except Exception as e:
+ self.logger.error(f"Claude embedding generation failed: {e}")
+ return self._generate_fallback_embedding(text)
+
+ def _generate_fallback_embedding(self, text: str) -> List[float]:
+ """Generate fallback embedding using text hash."""
+ try:
+ import hashlib
+ import struct
+
+ # Create a deterministic hash-based embedding
+ hash_obj = hashlib.sha256(text.encode('utf-8'))
+ hash_bytes = hash_obj.digest()
+
+ # Convert to 384-dimensional vector
+ embedding = []
+ for i in range(0, len(hash_bytes), 4):
+ if len(embedding) >= 384:
+ break
+ chunk = hash_bytes[i:i+4]
+ if len(chunk) == 4:
+ # Convert 4 bytes to float and normalize
+ value = struct.unpack('>I', chunk)[0] / (2**32 - 1) # Normalize to 0-1
+ embedding.append(value * 2 - 1) # Scale to -1 to 1
+
+ # Pad to exactly 384 dimensions
+ while len(embedding) < 384:
+ embedding.append(0.0)
+
+ return embedding[:384]
+
+ except Exception as e:
+ self.logger.error(f"Fallback embedding generation failed: {e}")
+ return [0.0] * 384
+
+ def calculate_content_hash(self, content: str) -> str:
+ """Calculate SHA-256 hash of content for change detection."""
+ return hashlib.sha256(content.encode()).hexdigest()
+
+ async def store_working_memory(self, key: str, data: Dict[str, Any], ttl: Optional[int] = None) -> bool:
+ """Store temporary data in working memory (Redis)."""
+ try:
+ ttl = ttl or self.working_memory_ttl
+ serialized_data = json.dumps(data, default=str)
+ self.redis_client.setex(f"working:{key}", ttl, serialized_data)
+ return True
+ except Exception as e:
+ self.logger.error(f"Working memory storage failed: {e}")
+ return False
+
+ async def get_working_memory(self, key: str) -> Optional[Dict[str, Any]]:
+ """Retrieve data from working memory."""
+ try:
+ data = self.redis_client.get(f"working:{key}")
+ return json.loads(data) if data else None
+ except Exception as e:
+ self.logger.error(f"Working memory retrieval failed: {e}")
+ return None
+
+ async def store_episodic_memory(self, session_id: str, user_query: str,
+ ai_response: str, repo_context: str,
+ metadata: Optional[Dict] = None) -> str:
+ """Store interaction in episodic memory."""
+ try:
+ memory_id = str(uuid.uuid4())
+
+ # Generate embeddings
+ query_embedding = self.generate_embedding(user_query)
+ response_embedding = self.generate_embedding(ai_response)
+
+ # Store in MongoDB
+ episodic_record = {
+ 'memory_id': memory_id,
+ 'session_id': session_id,
+ 'user_query': user_query,
+ 'ai_response': ai_response,
+ 'repo_context': repo_context,
+ 'timestamp': datetime.utcnow(),
+ 'metadata': metadata or {}
+ }
+ self.episodic_collection.insert_one(episodic_record)
+
+ # Store embeddings in PostgreSQL for similarity search
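+            # Assumption: psycopg2 adapts these Python lists to SQL arrays; depending on the
+            # pgvector version and column types, registering pgvector's adapter (register_vector)
+            # or passing a vector literal string may be required instead.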
+ with self.pg_conn.cursor() as cur:
+ cur.execute("""
+ INSERT INTO query_embeddings
+ (session_id, query_text, query_embedding, response_embedding, repo_context, metadata)
+ VALUES (%s, %s, %s, %s, %s, %s)
+ """, (
+ session_id, user_query, query_embedding, response_embedding,
+ repo_context, json.dumps(metadata or {})
+ ))
+ self.pg_conn.commit()
+
+ self.logger.info(f"Episodic memory stored: {memory_id}")
+ return memory_id
+
+ except Exception as e:
+ self.logger.error(f"Episodic memory storage failed: {e}")
+ return ""
+
+ async def retrieve_episodic_memories(self, query: str, repo_context: str = "",
+ limit: int = 10, similarity_threshold: float = 0.7) -> List[Dict]:
+ """Retrieve relevant episodic memories based on query similarity."""
+ try:
+ query_embedding = self.generate_embedding(query)
+
+ with self.pg_conn.cursor(cursor_factory=RealDictCursor) as cur:
+ # Find similar queries using cosine similarity
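+                # (<=> is pgvector's cosine distance operator, so 1 - distance gives cosine similarity)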
+ cur.execute("""
+ SELECT session_id, query_text, repo_context, timestamp, metadata,
+ 1 - (query_embedding <=> %s::vector) as similarity
+ FROM query_embeddings
+ WHERE (%s = '' OR repo_context = %s)
+ AND 1 - (query_embedding <=> %s::vector) > %s
+ ORDER BY similarity DESC
+ LIMIT %s
+ """, (query_embedding, repo_context, repo_context, query_embedding, similarity_threshold, limit))
+
+ similar_queries = cur.fetchall()
+
+ # Fetch full episodic records from MongoDB
+ memories = []
+ for query_record in similar_queries:
+ episodic_record = self.episodic_collection.find_one({
+ 'session_id': query_record['session_id'],
+ 'timestamp': query_record['timestamp']
+ })
+ if episodic_record:
+ episodic_record['similarity_score'] = float(query_record['similarity'])
+ memories.append(episodic_record)
+
+ return memories
+
+ except Exception as e:
+ self.logger.error(f"Episodic memory retrieval failed: {e}")
+ return []
+
+ async def store_persistent_memory(self, content: str, category: str,
+ confidence: float, source_repos: List[str]) -> str:
+ """Store long-term knowledge in persistent memory."""
+ try:
+ fact_id = str(uuid.uuid4())
+ embedding = self.generate_embedding(content)
+
+ # Store in MongoDB
+ persistent_record = {
+ 'fact_id': fact_id,
+ 'content': content,
+ 'category': category,
+ 'confidence': confidence,
+ 'source_repos': source_repos,
+ 'created_at': datetime.utcnow(),
+ 'last_accessed': datetime.utcnow(),
+ 'access_frequency': 1
+ }
+ self.persistent_collection.insert_one(persistent_record)
+
+ # Store embedding in PostgreSQL
+ with self.pg_conn.cursor() as cur:
+ if self.has_vector:
+ cur.execute("""
+ INSERT INTO knowledge_embeddings
+ (fact_id, content, category, embedding, confidence, source_repos)
+ VALUES (%s, %s, %s, %s, %s, %s)
+ """, (fact_id, content, category, embedding, confidence, source_repos))
+ else:
+ cur.execute("""
+ INSERT INTO knowledge_embeddings
+ (fact_id, content, category, confidence, source_repos)
+ VALUES (%s, %s, %s, %s, %s)
+ """, (fact_id, content, category, confidence, source_repos))
+ self.pg_conn.commit()
+
+ self.logger.info(f"Persistent memory stored: {fact_id}")
+ return fact_id
+
+ except Exception as e:
+ self.logger.error(f"Persistent memory storage failed: {e}")
+ return ""
+
+ async def retrieve_persistent_memories(self, query: str, category: str = "",
+ limit: int = 20, similarity_threshold: float = 0.6) -> List[Dict]:
+ """Retrieve relevant persistent knowledge."""
+ try:
+ query_embedding = self.generate_embedding(query)
+
+ with self.pg_conn.cursor(cursor_factory=RealDictCursor) as cur:
+ # Check if table exists first
+ cur.execute("""
+ SELECT EXISTS (
+ SELECT FROM information_schema.tables
+ WHERE table_name = 'knowledge_embeddings'
+                    ) AS table_exists;
+                """)
+                table_exists = cur.fetchone()['table_exists']  # RealDictCursor rows are dicts, not tuples
+
+ if not table_exists:
+ self.logger.warning("knowledge_embeddings table does not exist, returning empty results")
+ return []
+
+ # Build WHERE clause dynamically
+ if hasattr(self, 'has_vector') and self.has_vector:
+ where_conditions = ["1 - (embedding <=> %s::vector) > %s"]
+ params = [query_embedding, similarity_threshold]
+ else:
+ # Fallback to text-based search
+ where_conditions = ["content ILIKE %s"]
+ params = [f"%{query}%"]
+
+ if category:
+ where_conditions.append("category = %s")
+ params.append(category)
+
+ where_clause = " AND ".join(where_conditions)
+ params.extend([limit])
+
+ if hasattr(self, 'has_vector') and self.has_vector:
+ cur.execute(f"""
+ SELECT fact_id, content, category, confidence, source_repos,
+ 1 - (embedding <=> %s::vector) as similarity,
+ created_at, last_accessed, access_frequency
+ FROM knowledge_embeddings
+ WHERE {where_clause}
+ ORDER BY similarity DESC, confidence DESC, access_frequency DESC
+ LIMIT %s
+                    """, [query_embedding] + params)  # extra leading embedding for the similarity expression in SELECT
+ else:
+ cur.execute(f"""
+ SELECT fact_id, content, category, confidence, source_repos,
+ 0.8 as similarity,
+ created_at, last_accessed, access_frequency
+ FROM knowledge_embeddings
+ WHERE {where_clause}
+ ORDER BY confidence DESC, access_frequency DESC
+ LIMIT %s
+ """, params)
+
+ results = cur.fetchall()
+
+ # Update access frequency
+ for result in results:
+ cur.execute("""
+ UPDATE knowledge_embeddings
+ SET last_accessed = CURRENT_TIMESTAMP,
+ access_frequency = access_frequency + 1
+ WHERE fact_id = %s
+ """, (result['fact_id'],))
+
+ self.pg_conn.commit()
+ return [dict(result) for result in results]
+
+ except Exception as e:
+ self.logger.error(f"Persistent memory retrieval failed: {e}")
+ return []
+
+ async def store_code_analysis(self, repo_id: str, file_path: str,
+ analysis_data: Dict[str, Any]) -> str:
+ """Store code analysis with embeddings for future retrieval."""
+ try:
+ content_hash = self.calculate_content_hash(json.dumps(analysis_data, sort_keys=True))
+
+ # Create searchable content for embedding
+ searchable_content = f"""
+ File: {file_path}
+ Language: {analysis_data.get('language', 'Unknown')}
+ Issues: {' '.join(analysis_data.get('issues_found', []))}
+ Recommendations: {' '.join(analysis_data.get('recommendations', []))}
+ Analysis: {analysis_data.get('detailed_analysis', '')}
+ """
+
+ embedding = self.generate_embedding(searchable_content)
+
+ # Store in MongoDB
+ analysis_record = {
+ 'repo_id': repo_id,
+ 'file_path': file_path,
+ 'content_hash': content_hash,
+ 'analysis_data': analysis_data,
+ 'created_at': datetime.utcnow(),
+ 'last_accessed': datetime.utcnow(),
+ 'access_count': 1
+ }
+
+ # Upsert to handle updates
+ self.analysis_collection.update_one(
+ {'repo_id': repo_id, 'file_path': file_path},
+ {'$set': analysis_record},
+ upsert=True
+ )
+
+ # Store embedding in PostgreSQL
+ with self.pg_conn.cursor() as cur:
+ if self.has_vector:
+ cur.execute("""
+ INSERT INTO code_embeddings (repo_id, file_path, content_hash, embedding, metadata)
+ VALUES (%s, %s, %s, %s, %s)
+ ON CONFLICT (repo_id, file_path, content_hash)
+ DO UPDATE SET last_accessed = CURRENT_TIMESTAMP
+ """, (
+ repo_id, file_path, content_hash, embedding,
+ json.dumps({
+ 'language': analysis_data.get('language'),
+ 'lines_of_code': analysis_data.get('lines_of_code', 0),
+ 'severity_score': analysis_data.get('severity_score', 5.0)
+ })
+ ))
+ else:
+ cur.execute("""
+ INSERT INTO code_embeddings (repo_id, file_path, content_hash, embedding_text, metadata)
+ VALUES (%s, %s, %s, %s, %s)
+ ON CONFLICT (repo_id, file_path, content_hash)
+ DO UPDATE SET last_accessed = CURRENT_TIMESTAMP
+ """, (
+ repo_id, file_path, content_hash, json.dumps(embedding),
+ json.dumps({
+ 'language': analysis_data.get('language'),
+ 'lines_of_code': analysis_data.get('lines_of_code', 0),
+ 'severity_score': analysis_data.get('severity_score', 5.0)
+ })
+ ))
+ self.pg_conn.commit()
+
+ return content_hash
+
+ except Exception as e:
+ self.logger.error(f"Code analysis storage failed: {e}")
+ return ""
+
+ async def search_similar_code(self, query: str, repo_id: str = "",
+ limit: int = 10) -> List[Dict]:
+ """Search for similar code analyses."""
+ try:
+ query_embedding = self.generate_embedding(query)
+
+ with self.pg_conn.cursor(cursor_factory=RealDictCursor) as cur:
+ # Check if table exists first
+ cur.execute("""
+ SELECT EXISTS (
+ SELECT FROM information_schema.tables
+ WHERE table_name = 'code_embeddings'
+                    ) AS table_exists;
+                """)
+                table_exists = cur.fetchone()['table_exists']  # RealDictCursor rows are dicts, not tuples
+
+ if not table_exists:
+ self.logger.warning("code_embeddings table does not exist, returning empty results")
+ return []
+
+ where_clause = "WHERE 1=1"
+ params = [query_embedding]
+
+ if repo_id:
+ where_clause += " AND repo_id = %s"
+ params.append(repo_id)
+
+ params.append(limit)
+
+ cur.execute(f"""
+ SELECT repo_id, file_path, content_hash, metadata,
+ 1 - (embedding <=> %s::vector) as similarity
+ FROM code_embeddings
+ {where_clause}
+ ORDER BY similarity DESC
+ LIMIT %s
+ """, params)
+
+ results = cur.fetchall()
+
+ # Fetch full analysis data from MongoDB
+ enriched_results = []
+ for result in results:
+ analysis = self.analysis_collection.find_one({
+ 'repo_id': result['repo_id'],
+ 'file_path': result['file_path']
+ })
+ if analysis:
+ analysis['similarity_score'] = float(result['similarity'])
+ enriched_results.append(analysis)
+
+ return enriched_results
+
+ except Exception as e:
+ self.logger.error(f"Similar code search failed: {e}")
+ return []
+
+ async def cleanup_old_memories(self):
+ """Clean up old episodic memories and update access patterns."""
+ try:
+ cutoff_date = datetime.utcnow() - timedelta(days=self.episodic_retention_days)
+
+ # Clean up old episodic memories
+ result = self.episodic_collection.delete_many({
+ 'timestamp': {'$lt': cutoff_date}
+ })
+ self.logger.info(f"Cleaned up {result.deleted_count} old episodic memories")
+
+ # Clean up corresponding query embeddings
+ with self.pg_conn.cursor() as cur:
+ cur.execute("DELETE FROM query_embeddings WHERE timestamp < %s", (cutoff_date,))
+ self.pg_conn.commit()
+
+ # Update persistent memory relevance based on access patterns
+ await self.update_persistent_memory_relevance()
+
+ except Exception as e:
+ self.logger.error(f"Memory cleanup failed: {e}")
+
+ async def update_persistent_memory_relevance(self):
+ """Update relevance scores for persistent memories based on access patterns."""
+ try:
+ with self.pg_conn.cursor() as cur:
+ # Calculate relevance based on recency and frequency
+ cur.execute("""
+ UPDATE knowledge_embeddings
+ SET confidence = LEAST(confidence * (
+ CASE
+ WHEN EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - last_accessed)) / 86400 < 30
+ THEN 1.1
+ ELSE 0.95
+ END *
+ (1.0 + LOG(access_frequency + 1) / 10.0)
+ ), 1.0)
+ """)
+ self.pg_conn.commit()
+
+ except Exception as e:
+ self.logger.error(f"Relevance update failed: {e}")
+
+ async def get_memory_stats(self) -> Dict[str, Any]:
+ """Get comprehensive memory system statistics."""
+ try:
+ stats = {}
+
+ # Working memory stats (Redis)
+ working_keys = self.redis_client.keys("working:*")
+ stats['working_memory'] = {
+ 'total_keys': len(working_keys),
+ 'memory_usage': self.redis_client.info()['used_memory_human']
+ }
+
+ # Episodic memory stats (MongoDB)
+ stats['episodic_memory'] = {
+ 'total_records': self.episodic_collection.count_documents({}),
+ 'recent_interactions': self.episodic_collection.count_documents({
+ 'timestamp': {'$gte': datetime.utcnow() - timedelta(days=7)}
+ })
+ }
+
+ # Persistent memory stats
+ stats['persistent_memory'] = {
+ 'total_facts': self.persistent_collection.count_documents({}),
+ 'high_confidence_facts': self.persistent_collection.count_documents({
+ 'confidence': {'$gte': 0.8}
+ })
+ }
+
+ # Code analysis stats
+ stats['code_analysis'] = {
+ 'total_analyses': self.analysis_collection.count_documents({}),
+ 'unique_repositories': len(self.analysis_collection.distinct('repo_id'))
+ }
+
+ # Vector database stats (PostgreSQL)
+ with self.pg_conn.cursor(cursor_factory=RealDictCursor) as cur:
+ cur.execute("SELECT COUNT(*) as count FROM code_embeddings")
+ code_embeddings_count = cur.fetchone()['count']
+
+ cur.execute("SELECT COUNT(*) as count FROM knowledge_embeddings")
+ knowledge_embeddings_count = cur.fetchone()['count']
+
+ stats['vector_database'] = {
+ 'code_embeddings': code_embeddings_count,
+ 'knowledge_embeddings': knowledge_embeddings_count
+ }
+
+ return stats
+
+ except Exception as e:
+ self.logger.error(f"Stats retrieval failed: {e}")
+ return {}
+
+class MemoryQueryEngine:
+ """Advanced querying capabilities across memory systems."""
+
+ def __init__(self, memory_manager: MemoryManager):
+ self.memory = memory_manager
+
+ async def intelligent_query(self, query: str, repo_context: str = "") -> Dict[str, Any]:
+ """Intelligent cross-memory querying with relevance scoring."""
+ try:
+ # Multi-source memory retrieval
+ results = await asyncio.gather(
+ self.memory.retrieve_episodic_memories(query, repo_context, limit=5),
+ self.memory.retrieve_persistent_memories(query, limit=10),
+ self.memory.search_similar_code(query, repo_context, limit=5)
+ )
+
+ episodic_memories, persistent_knowledge, similar_code = results
+
+ # Relevance scoring and fusion
+ fused_response = self.fuse_memory_responses(
+ query, episodic_memories, persistent_knowledge, similar_code
+ )
+
+ return {
+ 'query': query,
+ 'fused_response': fused_response,
+ 'sources': {
+ 'episodic_count': len(episodic_memories),
+ 'persistent_count': len(persistent_knowledge),
+ 'similar_code_count': len(similar_code)
+ },
+ 'confidence_score': self.calculate_response_confidence(fused_response),
+ 'timestamp': datetime.utcnow()
+ }
+
+ except Exception as e:
+ self.memory.logger.error(f"Intelligent query failed: {e}")
+ return {'error': str(e)}
+
+ def fuse_memory_responses(self, query: str, episodic: List, persistent: List, code: List) -> str:
+ """Fuse responses from different memory systems."""
+ response_parts = []
+
+ # Weight different memory types
+ if persistent:
+ high_conf_knowledge = [p for p in persistent if p.get('confidence', 0) > 0.8]
+ if high_conf_knowledge:
+ response_parts.append("Based on established knowledge:")
+ for knowledge in high_conf_knowledge[:3]:
+                    response_parts.append(f"• {knowledge['content']}")
+
+ if episodic:
+ recent_interactions = sorted(episodic, key=lambda x: x.get('timestamp', datetime.min), reverse=True)[:2]
+ if recent_interactions:
+ response_parts.append("\nFrom previous interactions:")
+ for interaction in recent_interactions:
+                    response_parts.append(f"• {interaction.get('ai_response', '')[:200]}...")
+
+ if code:
+ similar_patterns = [c for c in code if c.get('similarity_score', 0) > 0.7]
+ if similar_patterns:
+ response_parts.append("\nSimilar code patterns found:")
+ for pattern in similar_patterns[:2]:
+ issues = pattern.get('analysis_data', {}).get('issues_found', [])
+ if issues:
+                        response_parts.append(f"• {pattern['file_path']}: {issues[0]}")
+
+ return '\n'.join(response_parts) if response_parts else "No relevant memories found."
+
+ def calculate_response_confidence(self, response: str) -> float:
+ """Calculate confidence score for fused response."""
+ if not response or response == "No relevant memories found.":
+ return 0.0
+
+ # Simple confidence calculation based on response length and structure
+ confidence = min(len(response.split()) / 100.0, 1.0) # Normalize by word count
+ if "Based on established knowledge:" in response:
+ confidence += 0.2
+ if "From previous interactions:" in response:
+ confidence += 0.1
+ if "Similar code patterns found:" in response:
+ confidence += 0.15
+
+ return min(confidence, 1.0)
+
+class EnhancedGitHubAnalyzer:
+ """Enhanced repository analyzer with memory capabilities."""
+
+ def __init__(self, api_key: str, memory_config: Dict[str, Any]):
+ self.client = anthropic.Anthropic(api_key=api_key)
+ self.memory_manager = MemoryManager(memory_config)
+ self.query_engine = MemoryQueryEngine(self.memory_manager)
+ self.session_id = str(uuid.uuid4())
+ self.temp_dir = None
+
+ # Language mapping for file detection
+ self.language_map = {
+ '.py': 'Python', '.js': 'JavaScript', '.ts': 'TypeScript',
+ '.tsx': 'TypeScript', '.jsx': 'JavaScript', '.java': 'Java',
+ '.cpp': 'C++', '.c': 'C', '.cs': 'C#', '.go': 'Go', '.rs': 'Rust',
+ '.php': 'PHP', '.rb': 'Ruby', '.swift': 'Swift', '.kt': 'Kotlin',
+ '.html': 'HTML', '.css': 'CSS', '.scss': 'SCSS', '.sass': 'SASS',
+ '.sql': 'SQL', '.yaml': 'YAML', '.yml': 'YAML', '.json': 'JSON',
+ '.xml': 'XML', '.sh': 'Shell', '.dockerfile': 'Docker',
+ '.md': 'Markdown', '.txt': 'Text'
+ }
+
+ # Code file extensions to analyze
+ self.code_extensions = set(self.language_map.keys())
+
+ def clone_repository(self, repo_path: str) -> str:
+ """Clone repository or use existing path."""
+ if os.path.exists(repo_path):
+ print(f"Using existing repository: {repo_path}")
+ return repo_path
+ else:
+ print(f"Cloning repository: {repo_path}")
+ self.temp_dir = tempfile.mkdtemp(prefix="repo_analysis_")
+ try:
+ git.Repo.clone_from(repo_path, self.temp_dir)
+ return self.temp_dir
+ except Exception as e:
+ raise Exception(f"Failed to clone repository: {e}")
+
+ def calculate_repo_id(self, repo_path: str) -> str:
+ """Generate consistent repository ID."""
+ return hashlib.sha256(repo_path.encode()).hexdigest()[:16]
+
+ def get_file_language(self, file_path: Path) -> str:
+ """Get programming language from file extension."""
+ return self.language_map.get(file_path.suffix.lower(), 'Unknown')
+
+ def calculate_complexity_score(self, content: str) -> float:
+ """Calculate basic complexity score based on code patterns."""
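+        # Rough proxy for cyclomatic complexity: count branching keywords and normalize by file length to a 1-10 scale.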
+ lines = content.split('\n')
+ complexity_indicators = ['if', 'else', 'elif', 'for', 'while', 'try', 'except', 'catch', 'switch']
+
+ complexity = 1
+ for line in lines:
+ line_lower = line.lower().strip()
+ for indicator in complexity_indicators:
+ if indicator in line_lower:
+ complexity += 1
+
+ # Normalize to 1-10 scale
+ return min(complexity / max(len(lines), 1) * 100, 10.0)
+
+ async def analyze_file_with_memory(self, file_path: Path, content: str, repo_id: str) -> FileAnalysis:
+ """Analyze file with memory-enhanced context."""
+ language = self.get_file_language(file_path)
+ lines_of_code = len([line for line in content.split('\n') if line.strip()])
+ complexity_score = self.calculate_complexity_score(content)
+
+ # Check for similar code patterns in memory
+ similar_analyses = await self.memory_manager.search_similar_code(
+ f"{language} {file_path.name}", repo_id, limit=3
+ )
+
+ # Get relevant knowledge from persistent memory
+ persistent_knowledge = await self.memory_manager.retrieve_persistent_memories(
+ f"{language} code quality security", category="", limit=5
+ )
+
+ # Build enhanced context for analysis
+ context_info = ""
+ if similar_analyses:
+ context_info += f"\nSimilar files previously analyzed:\n"
+ for similar in similar_analyses[:2]:
+ context_info += f"- {similar['file_path']}: Found {len(similar.get('analysis_data', {}).get('issues_found', []))} issues\n"
+
+ if persistent_knowledge:
+ context_info += f"\nRelevant best practices:\n"
+ for knowledge in persistent_knowledge[:3]:
+ context_info += f"- {knowledge['content'][:100]}...\n"
+
+ # Truncate content if too long
+ if len(content) > 4000:
+ content = content[:4000] + "\n... [truncated for analysis]"
+
+ print(f" Analyzing {file_path.name} ({language}, {lines_of_code} lines)")
+
+ # Create comprehensive analysis prompt with memory context
+ prompt = f"""
+You are a senior software engineer with 25+ years of experience. Analyze this {language} code file with context from previous analyses.
+
+FILENAME: {file_path.name}
+LANGUAGE: {language}
+LINES OF CODE: {lines_of_code}
+
+{context_info}
+
+CODE:
+```{language.lower()}
+{content}
+```
+
+Provide a comprehensive analysis covering:
+
+1. ISSUES FOUND: List specific problems, bugs, security vulnerabilities, or code smells
+2. RECOMMENDATIONS: Actionable suggestions for improvement
+3. CODE QUALITY: Overall assessment of code quality and maintainability
+4. SECURITY: Any security concerns or vulnerabilities
+5. PERFORMANCE: Potential performance issues or optimizations
+6. BEST PRACTICES: Adherence to coding standards and best practices
+
+Rate the overall code quality from 1-10 where 10 is excellent.
+
+ANALYSIS:
+"""
+
+ try:
+ message = self.client.messages.create(
+ model="claude-3-5-sonnet-20240620",
+ max_tokens=3000,
+ temperature=0.1,
+ messages=[{"role": "user", "content": prompt}]
+ )
+
+ analysis_text = message.content[0].text.strip()
+
+ # Extract severity score from analysis
+ severity_match = re.search(r'(\d+(?:\.\d+)?)/10', analysis_text)
+ severity_score = float(severity_match.group(1)) if severity_match else 5.0
+
+ # Parse issues and recommendations from the text
+ issues = self.extract_issues_from_analysis(analysis_text)
+ recommendations = self.extract_recommendations_from_analysis(analysis_text)
+
+ # Create file analysis object
+ file_analysis = FileAnalysis(
+ path=str(file_path.relative_to(Path(self.temp_dir or '.'))),
+ language=language,
+ lines_of_code=lines_of_code,
+ complexity_score=complexity_score,
+ issues_found=issues,
+ recommendations=recommendations,
+ detailed_analysis=analysis_text,
+ severity_score=severity_score
+ )
+
+ # Store analysis in memory for future reference
+ await self.memory_manager.store_code_analysis(
+ repo_id, str(file_analysis.path), asdict(file_analysis)
+ )
+
+ # Extract knowledge for persistent memory
+ await self.extract_knowledge_from_analysis(file_analysis, repo_id)
+
+ return file_analysis
+
+ except Exception as e:
+ print(f" Error analyzing {file_path.name}: {e}")
+ return FileAnalysis(
+ path=str(file_path),
+ language=language,
+ lines_of_code=lines_of_code,
+ complexity_score=complexity_score,
+ issues_found=[f"Analysis failed: {str(e)}"],
+ recommendations=["Review file manually due to analysis error"],
+ detailed_analysis=f"Analysis failed due to error: {str(e)}",
+ severity_score=5.0
+ )
+
+ def extract_issues_from_analysis(self, analysis_text: str) -> List[str]:
+ """Extract issues from analysis text."""
+ issues = []
+ lines = analysis_text.split('\n')
+
+ # Look for common issue indicators
+ issue_keywords = ['issue', 'problem', 'bug', 'vulnerability', 'error', 'warning', 'concern']
+
+ for line in lines:
+ line_lower = line.lower().strip()
+ if any(keyword in line_lower for keyword in issue_keywords):
+ if line.strip() and not line.strip().startswith('#'):
+ issues.append(line.strip())
+
+ return issues[:10] # Limit to top 10 issues
+
+ def extract_recommendations_from_analysis(self, analysis_text: str) -> List[str]:
+ """Extract recommendations from analysis text."""
+ recommendations = []
+ lines = analysis_text.split('\n')
+
+ # Look for recommendation indicators
+ rec_keywords = ['recommend', 'suggest', 'should', 'consider', 'improve']
+
+ for line in lines:
+ line_lower = line.lower().strip()
+ if any(keyword in line_lower for keyword in rec_keywords):
+ if line.strip() and not line.strip().startswith('#'):
+ recommendations.append(line.strip())
+
+ return recommendations[:10] # Limit to top 10 recommendations
+
+ async def extract_knowledge_from_analysis(self, file_analysis: FileAnalysis, repo_id: str):
+ """Extract valuable knowledge from analysis for persistent storage."""
+ try:
+ # Extract security-related knowledge
+ security_issues = [issue for issue in file_analysis.issues_found
+ if any(sec in issue.lower() for sec in ['security', 'vulnerability', 'injection', 'xss', 'auth'])]
+
+ for issue in security_issues:
+ await self.memory_manager.store_persistent_memory(
+ content=f"Security issue in {file_analysis.language}: {issue}",
+ category='security_vulnerability',
+ confidence=0.8,
+ source_repos=[repo_id]
+ )
+
+ # Extract best practices
+ best_practices = [rec for rec in file_analysis.recommendations
+ if any(bp in rec.lower() for bp in ['best practice', 'standard', 'convention'])]
+
+ for practice in best_practices:
+ await self.memory_manager.store_persistent_memory(
+ content=f"{file_analysis.language} best practice: {practice}",
+ category='best_practice',
+ confidence=0.7,
+ source_repos=[repo_id]
+ )
+
+ # Extract code patterns
+ if file_analysis.severity_score < 5:
+ await self.memory_manager.store_persistent_memory(
+ content=f"Low quality {file_analysis.language} pattern: {file_analysis.detailed_analysis[:200]}",
+ category='code_pattern',
+ confidence=0.6,
+ source_repos=[repo_id]
+ )
+
+ except Exception as e:
+ self.memory_manager.logger.error(f"Knowledge extraction failed: {e}")
+
+ def scan_repository(self, repo_path: str) -> List[Tuple[Path, str]]:
+ """Scan repository and collect ALL files for analysis."""
+ print(f"Scanning repository: {repo_path}")
+
+ files_to_analyze = []
+
+ # Important files to always include
+ important_files = {
+ 'README.md', 'package.json', 'requirements.txt', 'Dockerfile',
+ 'docker-compose.yml', 'tsconfig.json', 'next.config.js',
+ 'tailwind.config.js', 'webpack.config.js', '.env.example',
+ 'Cargo.toml', 'pom.xml', 'build.gradle', 'composer.json',
+ 'Gemfile', 'go.mod', 'yarn.lock', 'pnpm-lock.yaml'
+ }
+
+ for root, dirs, files in os.walk(repo_path):
+ # Skip common build/cache directories
+ dirs[:] = [d for d in dirs if not d.startswith('.') and
+ d not in {'node_modules', '__pycache__', 'build', 'dist', 'target',
+ 'venv', 'env', '.git', '.next', 'coverage', 'vendor',
+ 'bower_components', '.gradle', '.m2', '.cargo'}]
+
+ for file in files:
+ file_path = Path(root) / file
+
+ # Skip large files (increased limit for comprehensive analysis)
+ try:
+ if file_path.stat().st_size > 2000000: # 2MB limit
+ print(f" Skipping large file: {file_path.name} ({file_path.stat().st_size / 1024 / 1024:.1f}MB)")
+ continue
+ except:
+ continue
+
+ # Include important files or files with code extensions
+ should_include = (
+ file.lower() in important_files or
+ file_path.suffix.lower() in self.code_extensions or
+ file.lower().startswith('dockerfile') or
+ file.lower().startswith('makefile') or
+ file.lower().startswith('cmake')
+ )
+
+ if should_include:
+ try:
+ with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+ content = f.read()
+ if content.strip(): # Only non-empty files
+ files_to_analyze.append((file_path, content))
+ except Exception as e:
+ print(f"Could not read {file_path}: {e}")
+
+ print(f"Found {len(files_to_analyze)} files to analyze")
+ return files_to_analyze
+
+ async def analyze_repository_with_memory(self, repo_path: str) -> RepositoryAnalysis:
+ """Main analysis function with memory integration - analyzes ALL files."""
+ try:
+ # Generate repo ID and check for cached analysis
+ repo_id = self.calculate_repo_id(repo_path)
+
+ # Check working memory for recent analysis
+ cached_analysis = await self.memory_manager.get_working_memory(f"repo_analysis:{repo_id}")
+ if cached_analysis:
+ print("Using cached repository analysis from memory")
+ return RepositoryAnalysis(**cached_analysis)
+
+ # Clone/access repository
+ actual_repo_path = self.clone_repository(repo_path)
+
+ # Get analysis context from memory (no user query needed)
+ context_memories = await self.get_analysis_context(repo_path, "", repo_id)
+
+ # Scan ALL files
+ files_to_analyze = self.scan_repository(actual_repo_path)
+
+ if not files_to_analyze:
+ raise Exception("No files found to analyze")
+
+ # Analyze each file with memory context
+ print(f"Starting comprehensive analysis of {len(files_to_analyze)} files...")
+ file_analyses = []
+
+ for i, (file_path, content) in enumerate(files_to_analyze):
+ print(f"Analyzing file {i+1}/{len(files_to_analyze)}: {file_path.name}")
+ analysis = await self.analyze_file_with_memory(file_path, content, repo_id)
+ file_analyses.append(analysis)
+
+ # Small delay to avoid rate limiting
+ await asyncio.sleep(0.1)
+
+ # Repository-level analyses with memory context
+ print("Performing repository-level analysis with memory context...")
+ architecture_assessment, security_assessment = await self.analyze_repository_overview_with_memory(
+ actual_repo_path, file_analyses, context_memories, repo_id
+ )
+
+ # Calculate overall quality score
+ avg_quality = sum(fa.severity_score for fa in file_analyses) / len(file_analyses)
+
+ # Generate statistics
+ languages = dict(Counter(fa.language for fa in file_analyses))
+ total_lines = sum(fa.lines_of_code for fa in file_analyses)
+
+ # Create repository analysis
+ repo_analysis = RepositoryAnalysis(
+ repo_path=repo_path,
+ total_files=len(file_analyses),
+ total_lines=total_lines,
+ languages=languages,
+ architecture_assessment=architecture_assessment,
+ security_assessment=security_assessment,
+ code_quality_score=avg_quality,
+ file_analyses=file_analyses,
+ executive_summary=""
+ )
+
+ # Generate executive summary with memory context
+ print("Generating memory-enhanced executive summary...")
+ repo_analysis.executive_summary = await self.generate_executive_summary_with_memory(
+ repo_analysis, context_memories
+ )
+
+ # Store analysis in episodic memory (automated analysis)
+ await self.memory_manager.store_episodic_memory(
+ self.session_id, "Complete automated repository analysis",
+ f"Analyzed {repo_analysis.total_files} files, found {sum(len(fa.issues_found) for fa in file_analyses)} issues",
+ repo_id,
+ {
+ 'repo_path': repo_path,
+ 'quality_score': avg_quality,
+ 'total_issues': sum(len(fa.issues_found) for fa in file_analyses),
+ 'analysis_type': 'automated_comprehensive'
+ }
+ )
+
+ # Cache analysis in working memory
+ await self.memory_manager.store_working_memory(
+ f"repo_analysis:{repo_id}",
+ asdict(repo_analysis),
+ ttl=7200 # 2 hours
+ )
+
+ return repo_analysis
+
+ finally:
+ # Cleanup
+ if self.temp_dir and os.path.exists(self.temp_dir):
+ shutil.rmtree(self.temp_dir)
+ print("Temporary files cleaned up")
+
+ async def get_analysis_context(self, repo_path: str, user_query: str, repo_id: str) -> Dict[str, List]:
+ """Gather relevant context from memory systems."""
+ context = {
+ 'episodic_memories': [],
+ 'persistent_knowledge': [],
+ 'similar_analyses': []
+ }
+
+ # Get relevant persistent knowledge for comprehensive analysis
+ context['persistent_knowledge'] = await self.memory_manager.retrieve_persistent_memories(
+ "code quality security best practices", limit=15
+ )
+
+ # Find similar code analyses
+ context['similar_analyses'] = await self.memory_manager.search_similar_code(
+ "repository analysis", repo_id, limit=10
+ )
+
+ return context
+
+ async def analyze_repository_overview_with_memory(self, repo_path: str, file_analyses: List[FileAnalysis],
+ context_memories: Dict, repo_id: str) -> Tuple[str, str]:
+ """Analyze repository architecture and security with memory context."""
+ print("Analyzing repository overview with memory context...")
+
+ # Prepare summary data
+ languages = dict(Counter(fa.language for fa in file_analyses))
+ total_lines = sum(fa.lines_of_code for fa in file_analyses)
+ avg_quality = sum(fa.severity_score for fa in file_analyses) / len(file_analyses) if file_analyses else 5.0
+
+ # Build memory context
+ memory_context = ""
+ if context_memories['persistent_knowledge']:
+ memory_context += "Relevant knowledge from previous analyses:\n"
+ for knowledge in context_memories['persistent_knowledge'][:3]:
+ memory_context += f"- {knowledge['content']}\n"
+
+ if context_memories['similar_analyses']:
+ memory_context += "\nSimilar repositories analyzed:\n"
+ for similar in context_memories['similar_analyses'][:2]:
+ memory_context += f"- {similar['file_path']}: {len(similar.get('analysis_data', {}).get('issues_found', []))} issues found\n"
+
+ # Get repository structure
+ structure_lines = []
+ try:
+ for root, dirs, files in os.walk(repo_path):
+ dirs[:] = [d for d in dirs if not d.startswith('.') and d not in {'node_modules', '__pycache__'}]
+ level = root.replace(repo_path, '').count(os.sep)
+ indent = ' ' * level
+ structure_lines.append(f"{indent}{os.path.basename(root)}/")
+ for file in files[:3]: # Limit files shown per directory
+ structure_lines.append(f"{indent} {file}")
+ if len(structure_lines) > 50: # Limit total structure size
+ break
+ except Exception as e:
+ structure_lines = [f"Error reading structure: {e}"]
+
+ # Architecture analysis with memory context
+ arch_prompt = f"""
+You are a Senior Software Architect with 25+ years of experience.
+
+{memory_context}
+
+Analyze this repository:
+
+REPOSITORY STRUCTURE:
+{chr(10).join(structure_lines[:30])}
+
+STATISTICS:
+- Total files analyzed: {len(file_analyses)}
+- Total lines of code: {total_lines:,}
+- Languages: {languages}
+- Average code quality: {avg_quality:.1f}/10
+
+TOP FILE ISSUES:
+{chr(10).join([f"- {fa.path}: {len(fa.issues_found)} issues" for fa in file_analyses[:10]])}
+
+Provide an architectural assessment covering:
+1. Project type and purpose
+2. Technology stack evaluation
+3. Code organization and structure
+4. Scalability and maintainability concerns
+5. Key recommendations for improvement
+
+Incorporate insights from the memory context provided above.
+Keep response under 1500 words and focus on actionable insights.
+"""
+
+ # Security analysis with memory context
+ security_issues = []
+ for fa in file_analyses:
+ security_issues.extend([issue for issue in fa.issues_found if
+ any(keyword in issue.lower() for keyword in
+ ['security', 'vulnerability', 'injection', 'xss', 'auth', 'password'])])
+
+ sec_prompt = f"""
+You are a Senior Security Engineer with 20+ years of experience.
+
+{memory_context}
+
+Security Analysis for repository with {len(file_analyses)} files:
+
+SECURITY ISSUES FOUND:
+{chr(10).join(security_issues[:20]) if security_issues else "No obvious security issues detected"}
+
+HIGH-RISK FILE TYPES PRESENT:
+{[lang for lang, count in languages.items() if lang in ['JavaScript', 'TypeScript', 'Python', 'PHP', 'SQL']]}
+
+Provide security assessment covering:
+1. Overall security posture
+2. Main security risks and vulnerabilities
+3. Authentication and authorization concerns
+4. Data protection and privacy issues
+5. Immediate security priorities
+
+Incorporate insights from the memory context provided above.
+Keep response under 1000 words and focus on actionable security recommendations.
+"""
+
+ try:
+ # Run both analyses
+ arch_task = self.client.messages.create(
+ model="claude-3-5-sonnet-20240620",
+ max_tokens=2000,
+ temperature=0.1,
+ messages=[{"role": "user", "content": arch_prompt}]
+ )
+
+ sec_task = self.client.messages.create(
+ model="claude-3-5-sonnet-20240620",
+ max_tokens=1500,
+ temperature=0.1,
+ messages=[{"role": "user", "content": sec_prompt}]
+ )
+
+ architecture_assessment = arch_task.content[0].text
+ security_assessment = sec_task.content[0].text
+
+ # Store insights as persistent knowledge
+ await self.memory_manager.store_persistent_memory(
+ content=f"Architecture pattern: {architecture_assessment[:300]}...",
+ category='architecture',
+ confidence=0.7,
+ source_repos=[repo_id]
+ )
+
+ return architecture_assessment, security_assessment
+
+ except Exception as e:
+ return f"Architecture analysis failed: {e}", f"Security analysis failed: {e}"
+
+ async def generate_executive_summary_with_memory(self, analysis: RepositoryAnalysis, context_memories: Dict) -> str:
+ """Generate executive summary with memory context."""
+ print("Generating executive summary with memory context...")
+
+ # Build memory context for executive summary
+ executive_context = ""
+ if context_memories['episodic_memories']:
+ executive_context += "Previous executive discussions:\n"
+ for memory in context_memories['episodic_memories'][:2]:
+ if 'executive' in memory.get('ai_response', '').lower():
+ executive_context += f"- {memory['ai_response'][:200]}...\n"
+
+ prompt = f"""
+You are presenting to C-level executives. Create an executive summary of this technical analysis.
+
+{executive_context}
+
+REPOSITORY METRICS:
+- Total Files: {analysis.total_files}
+- Lines of Code: {analysis.total_lines:,}
+- Languages: {analysis.languages}
+- Code Quality Score: {analysis.code_quality_score:.1f}/10
+
+KEY FINDINGS:
+- Total issues identified: {sum(len(fa.issues_found) for fa in analysis.file_analyses)}
+- Files needing attention: {len([fa for fa in analysis.file_analyses if fa.severity_score < 7])}
+- High-quality files: {len([fa for fa in analysis.file_analyses if fa.severity_score >= 8])}
+
+Create an executive summary for non-technical leadership covering:
+1. Business impact of code quality findings
+2. Risk assessment and implications
+3. Investment priorities and recommendations
+4. Expected ROI from addressing technical debt
+5. Competitive implications
+
+Focus on business outcomes, not technical details. Keep under 800 words.
+"""
+
+ try:
+ message = self.client.messages.create(
+ model="claude-3-5-sonnet-20240620",
+ max_tokens=1200,
+ temperature=0.1,
+ messages=[{"role": "user", "content": prompt}]
+ )
+ return message.content[0].text
+ except Exception as e:
+ return f"Executive summary generation failed: {e}"
+
+ def create_pdf_report(self, analysis: RepositoryAnalysis, output_path: str):
+ """Generate comprehensive PDF report."""
+ print(f"Generating PDF report: {output_path}")
+
+ doc = SimpleDocTemplate(output_path, pagesize=A4,
+ leftMargin=72, rightMargin=72,
+ topMargin=72, bottomMargin=72)
+ styles = getSampleStyleSheet()
+ story = []
+
+ # Custom styles
+ title_style = ParagraphStyle(
+ 'CustomTitle',
+ parent=styles['Heading1'],
+ fontSize=24,
+ textColor=colors.darkblue,
+ spaceAfter=30,
+ alignment=TA_CENTER
+ )
+
+ heading_style = ParagraphStyle(
+ 'CustomHeading',
+ parent=styles['Heading2'],
+ fontSize=16,
+ textColor=colors.darkblue,
+ spaceBefore=20,
+ spaceAfter=10
+ )
+
+ # Title Page
+ story.append(Paragraph("AI-Enhanced Repository Analysis Report", title_style))
+ story.append(Spacer(1, 20))
+ story.append(Paragraph(f"Repository: {analysis.repo_path}", styles['Normal']))
+ story.append(Paragraph(f"Analysis Date: {datetime.now().strftime('%B %d, %Y at %H:%M')}", styles['Normal']))
+ story.append(Paragraph("Generated by: Enhanced AI Analysis System with Memory", styles['Normal']))
+ story.append(PageBreak())
+
+ # Executive Summary
+ story.append(Paragraph("Executive Summary", heading_style))
+ story.append(Paragraph(analysis.executive_summary, styles['Normal']))
+ story.append(PageBreak())
+
+ # Repository Overview
+ story.append(Paragraph("Repository Overview", heading_style))
+
+ overview_data = [
+ ['Metric', 'Value'],
+ ['Total Files Analyzed', str(analysis.total_files)],
+ ['Total Lines of Code', f"{analysis.total_lines:,}"],
+ ['Primary Languages', ', '.join(list(analysis.languages.keys())[:5])],
+ ['Overall Code Quality', f"{analysis.code_quality_score:.1f}/10"],
+ ]
+
+ overview_table = Table(overview_data, colWidths=[200, 300])
+ overview_table.setStyle(TableStyle([
+ ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
+ ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
+ ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
+ ('FONTSIZE', (0, 0), (-1, 0), 12),
+ ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
+ ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
+ ('GRID', (0, 0), (-1, -1), 1, colors.black)
+ ]))
+
+ story.append(overview_table)
+ story.append(Spacer(1, 20))
+
+ # Build PDF
+ try:
+ doc.build(story)
+            print(f"✅ PDF report generated successfully: {output_path}")
+ except Exception as e:
+            print(f"❌ Error generating PDF: {e}")
+
+ async def query_memory(self, query: str, repo_context: str = "") -> Dict[str, Any]:
+ """Query the memory system directly."""
+ return await self.query_engine.intelligent_query(query, repo_context)
+
+def get_memory_config() -> Dict[str, Any]:
+ """Get memory system configuration from environment variables."""
+ return {
+ 'anthropic_api_key': os.getenv('ANTHROPIC_API_KEY', ''),
+ 'redis_host': os.getenv('REDIS_HOST', 'localhost'),
+ 'redis_port': int(os.getenv('REDIS_PORT', 6379)),
+ 'redis_db': int(os.getenv('REDIS_DB', 0)),
+ 'mongodb_url': os.getenv('MONGODB_URL', 'mongodb://localhost:27017/'),
+ 'mongodb_name': os.getenv('MONGODB_DB', 'repo_analyzer'),
+ 'postgres_host': os.getenv('POSTGRES_HOST', 'localhost'),
+ 'postgres_port': int(os.getenv('POSTGRES_PORT', 5432)),
+ 'postgres_db': os.getenv('POSTGRES_DB', 'repo_vectors'),
+ 'postgres_user': os.getenv('POSTGRES_USER', 'postgres'),
+ 'postgres_password': os.getenv('POSTGRES_PASSWORD', '')
+ }
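+
+# Example usage (a minimal sketch, assuming a .env file provides the variables above):
+#   config = get_memory_config()
+#   analyzer = EnhancedGitHubAnalyzer(config['anthropic_api_key'], config)
+#   analysis = asyncio.run(analyzer.analyze_repository_with_memory("/path/to/repo"))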
+
+async def main():
+ """Main function to run the enhanced repository analyzer."""
+ load_dotenv()
+
+ import argparse
+ parser = argparse.ArgumentParser(description="Complete AI Repository Analysis - Analyzes ALL files automatically")
+ parser.add_argument("repo_path", help="Repository path (local directory or Git URL)")
+ parser.add_argument("--output", "-o", default="complete_repository_analysis.pdf",
+ help="Output PDF file path")
+ parser.add_argument("--api-key", help="Anthropic API key (overrides .env)")
+
+ args = parser.parse_args()
+
+ # Get API key
+ api_key = args.api_key or os.getenv('ANTHROPIC_API_KEY')
+ if not api_key:
+        print("❌ Error: ANTHROPIC_API_KEY not found in .env file or command line")
+ return 1
+
+ try:
+        print("Starting Complete AI Repository Analysis")
+ print("=" * 60)
+ print(f"Repository: {args.repo_path}")
+ print(f"Output: {args.output}")
+ print("Mode: Complete automated analysis of ALL files")
+ print("=" * 60)
+
+ # Initialize enhanced analyzer
+ config = get_memory_config()
+ analyzer = EnhancedGitHubAnalyzer(api_key, config)
+
+ # Perform complete analysis
+ analysis = await analyzer.analyze_repository_with_memory(args.repo_path)
+
+ # Generate PDF report
+ analyzer.create_pdf_report(analysis, args.output)
+
+ # Print summary to console
+ print("\n" + "=" * 60)
+        print("COMPLETE ANALYSIS FINISHED")
+        print("=" * 60)
+        print("Repository Statistics:")
+        print(f"   • Files Analyzed: {analysis.total_files}")
+        print(f"   • Lines of Code: {analysis.total_lines:,}")
+        print(f"   • Languages: {len(analysis.languages)}")
+        print(f"   • Code Quality: {analysis.code_quality_score:.1f}/10")
+
+ # Quality breakdown
+ high_quality = len([fa for fa in analysis.file_analyses if fa.severity_score >= 8])
+ medium_quality = len([fa for fa in analysis.file_analyses if 5 <= fa.severity_score < 8])
+ low_quality = len([fa for fa in analysis.file_analyses if fa.severity_score < 5])
+
+        print("\nQuality Breakdown:")
+        print(f"   • High Quality Files (8-10): {high_quality}")
+        print(f"   • Medium Quality Files (5-7): {medium_quality}")
+        print(f"   • Low Quality Files (1-4): {low_quality}")
+        print(f"   • Total Issues Found: {sum(len(fa.issues_found) for fa in analysis.file_analyses)}")
+
+ # Language breakdown
+        print("\nLanguage Distribution:")
+        for lang, count in sorted(analysis.languages.items(), key=lambda x: x[1], reverse=True)[:10]:
+            print(f"   • {lang}: {count} files")
+
+ # Memory system stats
+ memory_stats = await analyzer.memory_manager.get_memory_stats()
+        print("\nMemory System Statistics:")
+        for category, data in memory_stats.items():
+            print(f"   • {category.replace('_', ' ').title()}: {data}")
+
+        print(f"\nComplete PDF Report: {args.output}")
+        print("\n✅ Complete analysis finished successfully!")
+
+ return 0
+
+ except Exception as e:
+        print(f"❌ Error during analysis: {e}")
+ import traceback
+ traceback.print_exc()
+ return 1
+
+if __name__ == "__main__":
+ exit(asyncio.run(main()))
\ No newline at end of file
diff --git a/services/ai-analysis-service/env.example b/services/ai-analysis-service/env.example
new file mode 100644
index 0000000..dc3beee
--- /dev/null
+++ b/services/ai-analysis-service/env.example
@@ -0,0 +1,46 @@
+# AI Analysis Service Environment Configuration
+
+# Service Configuration
+PORT=8022
+HOST=0.0.0.0
+NODE_ENV=development
+
+# AI API Keys
+ANTHROPIC_API_KEY=your_anthropic_api_key_here
+
+# Database Configuration
+POSTGRES_HOST=localhost
+POSTGRES_PORT=5432
+POSTGRES_DB=dev_pipeline
+POSTGRES_USER=pipeline_admin
+POSTGRES_PASSWORD=secure_pipeline_2024
+
+# Redis Configuration
+REDIS_HOST=localhost
+REDIS_PORT=6379
+REDIS_PASSWORD=redis_secure_2024
+REDIS_DB=0
+
+# MongoDB Configuration
+MONGODB_URL=mongodb://pipeline_admin:mongo_secure_2024@localhost:27017/
+MONGODB_DB=repo_analyzer
+
+# JWT Configuration
+JWT_ACCESS_SECRET=access-secret-key-2024-tech4biz-secure_pipeline_2024
+
+# Service URLs
+USER_AUTH_SERVICE_URL=http://localhost:8011
+
+# Analysis Configuration
+MAX_FILES_PER_ANALYSIS=100
+MAX_FILE_SIZE_MB=2
+ANALYSIS_TIMEOUT_SECONDS=300
+
+# Memory System Configuration
+WORKING_MEMORY_TTL=3600
+EPISODIC_RETENTION_DAYS=365
+PERSISTENT_MEMORY_THRESHOLD=0.8
+
+# Logging Configuration
+LOG_LEVEL=INFO
+LOG_FILE_PATH=/app/logs/ai-analysis.log
diff --git a/services/ai-analysis-service/migrate.sh b/services/ai-analysis-service/migrate.sh
new file mode 100755
index 0000000..0c21c31
--- /dev/null
+++ b/services/ai-analysis-service/migrate.sh
@@ -0,0 +1,104 @@
+#!/bin/bash
+
+# Database Migration Script using psql
+# Executes the complete 001-schema.sql file
+
+set -e # Exit on any error
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+# Load environment variables
+if [ -f .env ]; then
+ export $(cat .env | grep -v '^#' | xargs)
+fi
+
+# Database connection parameters
+DB_HOST=${POSTGRES_HOST:-localhost}
+DB_PORT=${POSTGRES_PORT:-5432}
+DB_NAME=${POSTGRES_DB:-dev_pipeline}
+DB_USER=${POSTGRES_USER:-pipeline_admin}
+DB_PASSWORD=${POSTGRES_PASSWORD:-secure_pipeline_2024}
+
+# Schema file
+SCHEMA_FILE="001-schema.sql"
+
+echo -e "${BLUE}š§ AI Repository Analysis Database Migration${NC}"
+echo "=================================================="
+echo -e "Database: ${YELLOW}${DB_NAME}@${DB_HOST}:${DB_PORT}${NC}"
+echo -e "User: ${YELLOW}${DB_USER}${NC}"
+echo -e "Schema file: ${YELLOW}${SCHEMA_FILE}${NC}"
+echo ""
+
+# Check if psql is available
+if ! command -v psql &> /dev/null; then
+    echo -e "${RED}❌ psql command not found!${NC}"
+ echo "Please install PostgreSQL client tools:"
+ echo " Ubuntu/Debian: sudo apt-get install postgresql-client"
+ echo " CentOS/RHEL: sudo yum install postgresql"
+ echo " macOS: brew install postgresql"
+ exit 1
+fi
+
+# Check if schema file exists
+if [ ! -f "$SCHEMA_FILE" ]; then
+    echo -e "${RED}❌ Schema file not found: ${SCHEMA_FILE}${NC}"
+ exit 1
+fi
+
+echo -e "${BLUE}⢠Executing migration...${NC}"
+
+# Set password for psql
+export PGPASSWORD="$DB_PASSWORD"
+
+# Run migration
+if psql -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" -d "$DB_NAME" \
+ -f "$SCHEMA_FILE" \
+ -v ON_ERROR_STOP=1 \
+ --echo-errors \
+ --echo-queries; then
+
+    echo -e "${GREEN}✅ Migration completed successfully!${NC}"
+
+ # Verify migration
+    echo -e "${BLUE}• Verifying migration...${NC}"
+
+ TABLES=$(psql -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" -d "$DB_NAME" -t -c "
+ SELECT table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ AND table_name IN ('code_embeddings', 'query_embeddings', 'knowledge_embeddings',
+ 'repository_metadata', 'analysis_sessions', 'file_analysis_history')
+ ORDER BY table_name;
+ " | tr -d ' ')
+
+ if [ -n "$TABLES" ]; then
+ TABLE_COUNT=$(echo "$TABLES" | wc -l)
+        echo -e "${GREEN}✓ Found ${TABLE_COUNT} core tables: ${TABLES}${NC}"
+ else
+        echo -e "${YELLOW}⚠️ Could not verify table creation${NC}"
+ fi
+
+ # Check for pgvector extension
+ VECTOR_AVAILABLE=$(psql -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" -d "$DB_NAME" -t -c "
+ SELECT EXISTS(SELECT 1 FROM pg_extension WHERE extname = 'vector');
+ " | tr -d ' ')
+
+ if [ "$VECTOR_AVAILABLE" = "t" ]; then
+        echo -e "${GREEN}✓ pgvector extension is available${NC}"
+ else
+        echo -e "${YELLOW}⚠️ pgvector extension not available - vector operations will be limited${NC}"
+ fi
+
+ echo ""
+    echo -e "${GREEN}Database migration completed successfully!${NC}"
+    echo -e "${GREEN}Production-level database ready for AI repository analysis${NC}"
+
+else
+    echo -e "${RED}❌ Migration failed!${NC}"
+ exit 1
+fi
diff --git a/services/ai-analysis-service/migrate_database.py b/services/ai-analysis-service/migrate_database.py
new file mode 100644
index 0000000..694d6db
--- /dev/null
+++ b/services/ai-analysis-service/migrate_database.py
@@ -0,0 +1,203 @@
+#!/usr/bin/env python3
+"""
+Database Migration Script using psql command
+Executes the complete 001-schema.sql file using PostgreSQL's psql command
+"""
+
+import os
+import subprocess
+import sys
+from dotenv import load_dotenv
+import logging
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
+logger = logging.getLogger(__name__)
+
+def run_migration():
+ """Run the database migration using psql command."""
+ load_dotenv()
+
+ # Database connection parameters
+ db_config = {
+ 'host': os.getenv('POSTGRES_HOST', 'localhost'),
+ 'port': os.getenv('POSTGRES_PORT', 5432),
+ 'database': os.getenv('POSTGRES_DB', 'dev_pipeline'),
+ 'user': os.getenv('POSTGRES_USER', 'pipeline_admin'),
+ 'password': os.getenv('POSTGRES_PASSWORD', 'secure_pipeline_2024')
+ }
+
+ # Schema file path
+ schema_file = os.path.join(os.path.dirname(__file__), '001-schema.sql')
+
+ if not os.path.exists(schema_file):
+        logger.error(f"❌ Schema file not found: {schema_file}")
+ return False
+
+ try:
+        logger.info("Starting database migration with psql...")
+        logger.info(f"   • Database: {db_config['database']}@{db_config['host']}:{db_config['port']}")
+        logger.info(f"   • User: {db_config['user']}")
+        logger.info(f"   • Schema file: {schema_file}")
+
+ # Set PGPASSWORD environment variable for psql
+ env = os.environ.copy()
+ env['PGPASSWORD'] = db_config['password']
+
+ # Build psql command
+ psql_cmd = [
+ 'psql',
+ '-h', db_config['host'],
+ '-p', str(db_config['port']),
+ '-U', db_config['user'],
+ '-d', db_config['database'],
+ '-f', schema_file,
+ '-v', 'ON_ERROR_STOP=1', # Stop on first error
+ '--echo-errors', # Show errors
+ '--echo-queries' # Show queries being executed
+ ]
+
+        logger.info("   • Executing migration...")
+        logger.info(f"   • Command: {' '.join(psql_cmd)}")
+
+ # Run psql command
+ result = subprocess.run(
+ psql_cmd,
+ env=env,
+ capture_output=True,
+ text=True,
+ timeout=300 # 5 minute timeout
+ )
+
+ # Check if psql command exists
+ if result.returncode == 127:
+            logger.error("❌ psql command not found. Please install PostgreSQL client tools.")
+ logger.error(" On Ubuntu/Debian: sudo apt-get install postgresql-client")
+ logger.error(" On CentOS/RHEL: sudo yum install postgresql")
+ return False
+
+ # Check for errors
+ if result.returncode != 0:
+            logger.error(f"❌ Migration failed with return code: {result.returncode}")
+ if result.stderr:
+ logger.error("STDERR:")
+ logger.error(result.stderr)
+ if result.stdout:
+ logger.error("STDOUT:")
+ logger.error(result.stdout)
+ return False
+
+ # Log success
+        logger.info("✅ Migration completed successfully!")
+
+ if result.stdout:
+ logger.info("Migration output:")
+ # Filter out common psql output noise
+ lines = result.stdout.split('\n')
+ for line in lines:
+ if line.strip() and not line.startswith('SET') and not line.startswith('NOTICE'):
+ logger.info(f" {line}")
+
+ # Verify migration by checking if key tables exist
+        logger.info("   • Verifying migration...")
+
+ verify_cmd = [
+ 'psql',
+ '-h', db_config['host'],
+ '-p', str(db_config['port']),
+ '-U', db_config['user'],
+ '-d', db_config['database'],
+ '-t', # tuples only
+ '-c', """
+ SELECT table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ AND table_name IN ('code_embeddings', 'query_embeddings', 'knowledge_embeddings',
+ 'repository_metadata', 'analysis_sessions', 'file_analysis_history')
+ ORDER BY table_name;
+ """
+ ]
+
+ verify_result = subprocess.run(
+ verify_cmd,
+ env=env,
+ capture_output=True,
+ text=True,
+ timeout=30
+ )
+
+ if verify_result.returncode == 0:
+ tables = [line.strip() for line in verify_result.stdout.split('\n') if line.strip()]
+            logger.info(f"   ✓ Found {len(tables)} core tables: {', '.join(tables)}")
+ else:
+            logger.warning("   ⚠️ Could not verify table creation")
+
+ # Check for pgvector extension
+ vector_cmd = [
+ 'psql',
+ '-h', db_config['host'],
+ '-p', str(db_config['port']),
+ '-U', db_config['user'],
+ '-d', db_config['database'],
+ '-t',
+ '-c', "SELECT EXISTS(SELECT 1 FROM pg_extension WHERE extname = 'vector');"
+ ]
+
+ vector_result = subprocess.run(
+ vector_cmd,
+ env=env,
+ capture_output=True,
+ text=True,
+ timeout=30
+ )
+
+ if vector_result.returncode == 0:
+ has_vector = vector_result.stdout.strip() == 't'
+ if has_vector:
+ logger.info(" ā pgvector extension is available")
+ else:
+ logger.warning(" ā pgvector extension not available - vector operations will be limited")
+
+ logger.info("š Database migration completed successfully!")
+ logger.info("š Production-level database ready for AI repository analysis")
+
+ return True
+
+ except subprocess.TimeoutExpired:
+ logger.error("ā Migration timed out after 5 minutes")
+ return False
+ except FileNotFoundError:
+ logger.error("ā psql command not found. Please install PostgreSQL client tools.")
+ return False
+ except Exception as e:
+ logger.error(f"ā Migration failed: {e}")
+ return False
+
+def check_psql_available():
+ """Check if psql command is available."""
+ try:
+ result = subprocess.run(['psql', '--version'], capture_output=True, text=True)
+ if result.returncode == 0:
+ logger.info(f"ā Found psql: {result.stdout.strip()}")
+ return True
+ else:
+ return False
+ except FileNotFoundError:
+ return False
+
+if __name__ == "__main__":
+ logger.info("š§ AI Repository Analysis Database Migration")
+ logger.info("=" * 50)
+
+ # Check if psql is available
+ if not check_psql_available():
+ logger.error("ā psql command not found!")
+ logger.error("Please install PostgreSQL client tools:")
+ logger.error(" Ubuntu/Debian: sudo apt-get install postgresql-client")
+ logger.error(" CentOS/RHEL: sudo yum install postgresql")
+ logger.error(" macOS: brew install postgresql")
+ sys.exit(1)
+
+ # Run migration
+ success = run_migration()
+ sys.exit(0 if success else 1)
diff --git a/services/ai-analysis-service/requirements.txt b/services/ai-analysis-service/requirements.txt
new file mode 100644
index 0000000..78e4a11
--- /dev/null
+++ b/services/ai-analysis-service/requirements.txt
@@ -0,0 +1,25 @@
+# Core AI and API dependencies
+anthropic>=0.7.0
+python-dotenv>=1.0.0
+
+# Web framework
+fastapi>=0.104.1
+uvicorn>=0.24.0
+pydantic>=2.5.0
+
+# Git operations
+GitPython>=3.1.40
+
+# Database dependencies
+redis>=4.5.0
+pymongo>=4.5.0
+psycopg2-binary>=2.9.7
+
+# Data processing
+numpy>=1.24.0
+
+# PDF generation
+reportlab>=4.0.0
+
+# Optional: For better performance (if needed)
+# sentence-transformers>=2.2.2 # Commented out - using Claude API instead
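+
+# Typical install step (an assumption — the service's Dockerfile is not shown in this hunk):
+#   pip install --no-cache-dir -r requirements.txt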
diff --git a/services/ai-analysis-service/run_migration.py b/services/ai-analysis-service/run_migration.py
new file mode 100644
index 0000000..595fe47
--- /dev/null
+++ b/services/ai-analysis-service/run_migration.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+"""
+AI Analysis Service Database Migration Runner
+Runs the database migration for AI Analysis Service during container startup.
+"""
+
+import os
+import sys
+import subprocess
+import time
+from pathlib import Path
+
+def log(message):
+ """Log with timestamp."""
+ print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] {message}")
+
+def check_database_connection():
+ """Check if database is available."""
+ try:
+ import psycopg2
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ conn = psycopg2.connect(
+ host=os.getenv('POSTGRES_HOST', 'localhost'),
+ port=os.getenv('POSTGRES_PORT', 5432),
+ database=os.getenv('POSTGRES_DB', 'dev_pipeline'),
+ user=os.getenv('POSTGRES_USER', 'pipeline_admin'),
+ password=os.getenv('POSTGRES_PASSWORD', 'secure_pipeline_2024')
+ )
+ conn.close()
+ return True
+ except Exception as e:
+ log(f"Database connection failed: {e}")
+ return False
+
+def run_migration():
+ """Run the database migration."""
+ try:
+ log("Starting AI Analysis Service database migration...")
+
+ # Check if database is available
+ max_retries = 30
+ retry_count = 0
+
+ while retry_count < max_retries:
+ if check_database_connection():
+ log("Database connection successful")
+ break
+ else:
+ retry_count += 1
+ log(f"Database not ready, retrying in 2 seconds... ({retry_count}/{max_retries})")
+ time.sleep(2)
+ else:
+ log("ERROR: Could not connect to database after 60 seconds")
+ return False
+
+ # Run the migration script
+ schema_file = Path(__file__).parent / "001-schema.sql"
+ if not schema_file.exists():
+ log("ERROR: Schema file not found")
+ return False
+
+ log(f"Running migration from {schema_file}")
+
+ # Use psql to run the migration
+ env = os.environ.copy()
+ env['PGPASSWORD'] = os.getenv('POSTGRES_PASSWORD', 'secure_pipeline_2024')
+
+ result = subprocess.run([
+ 'psql',
+ '-h', os.getenv('POSTGRES_HOST', 'localhost'),
+ '-p', os.getenv('POSTGRES_PORT', '5432'),
+ '-U', os.getenv('POSTGRES_USER', 'pipeline_admin'),
+ '-d', os.getenv('POSTGRES_DB', 'dev_pipeline'),
+ '-f', str(schema_file),
+ '-v', 'ON_ERROR_STOP=1'
+ ], env=env, capture_output=True, text=True)
+
+ if result.returncode == 0:
+ log("ā
AI Analysis Service database migration completed successfully")
+ return True
+ else:
+ log(f"ā Migration failed: {result.stderr}")
+ return False
+
+ except Exception as e:
+ log(f"ā Migration error: {e}")
+ return False
+
+if __name__ == "__main__":
+ success = run_migration()
+ sys.exit(0 if success else 1)
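+
+# Typical container startup sequence (an assumption — the actual entrypoint is defined
+# outside this hunk). The script exits non-zero on failure, so chaining with && keeps
+# the API from starting against an unmigrated database:
+#   python run_migration.py && python server.py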
diff --git a/services/ai-analysis-service/server.py b/services/ai-analysis-service/server.py
new file mode 100644
index 0000000..3de8039
--- /dev/null
+++ b/services/ai-analysis-service/server.py
@@ -0,0 +1,230 @@
+#!/usr/bin/env python3
+"""
+AI Analysis Service HTTP Server
+Provides REST API endpoints for repository analysis.
+"""
+
+import os
+import asyncio
+import json
+import tempfile
+import shutil
+from pathlib import Path
+from typing import Dict, Any, Optional
+from datetime import datetime
+
+from fastapi import FastAPI, HTTPException, BackgroundTasks
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import FileResponse
+from pydantic import BaseModel
+import uvicorn
+
+# Import the AI analysis components
+# Note: ai-analyze.py has a hyphen, so we need to handle the import specially
+import sys
+import importlib.util
+
+# Load the ai-analyze.py module
+spec = importlib.util.spec_from_file_location("ai_analyze", "/app/ai-analyze.py")
+ai_analyze_module = importlib.util.module_from_spec(spec)
+sys.modules["ai_analyze"] = ai_analyze_module
+spec.loader.exec_module(ai_analyze_module)
+
+# Now import the classes
+from ai_analyze import EnhancedGitHubAnalyzer, get_memory_config
+
+app = FastAPI(
+ title="AI Analysis Service",
+ description="AI-powered repository analysis with memory system",
+ version="1.0.0"
+)
+
+# CORS middleware
+app.add_middleware(
+ CORSMiddleware,
+ allow_origins=["*"],
+ allow_credentials=True,
+ allow_methods=["*"],
+ allow_headers=["*"],
+)
+
+# Global analyzer instance
+analyzer = None
+
+class AnalysisRequest(BaseModel):
+ repo_path: str
+ output_format: str = "pdf" # pdf, json
+ max_files: int = 50
+
+class AnalysisResponse(BaseModel):
+ success: bool
+ message: str
+ analysis_id: Optional[str] = None
+ report_path: Optional[str] = None
+ stats: Optional[Dict[str, Any]] = None
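+
+# Illustrative request/response payloads for POST /analyze (hypothetical values,
+# included only to document the models above; stats truncated):
+#   request:  {"repo_path": "/app/temp/my-repo", "output_format": "json", "max_files": 25}
+#   response: {"success": true, "message": "Analysis completed successfully",
+#              "analysis_id": "analysis_20240101_120000",
+#              "report_path": "/app/reports/analysis_20240101_120000_analysis.json",
+#              "stats": {"total_files": 25, "total_lines": 4200}}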
+
+@app.on_event("startup")
+async def startup_event():
+ """Initialize the analyzer on startup."""
+ global analyzer
+ try:
+ # Load environment variables
+ from dotenv import load_dotenv
+ load_dotenv()
+
+ # Get API key
+ api_key = os.getenv('ANTHROPIC_API_KEY')
+ if not api_key:
+ raise Exception("ANTHROPIC_API_KEY not found in environment")
+
+ # Initialize analyzer
+ config = get_memory_config()
+ analyzer = EnhancedGitHubAnalyzer(api_key, config)
+
+ print("ā
AI Analysis Service initialized successfully")
+ except Exception as e:
+ print(f"ā Failed to initialize AI Analysis Service: {e}")
+ raise
+
+@app.get("/health")
+async def health_check():
+ """Health check endpoint."""
+ return {
+ "status": "healthy",
+ "service": "ai-analysis-service",
+ "timestamp": datetime.now().isoformat(),
+ "version": "1.0.0"
+ }
+
+@app.post("/analyze", response_model=AnalysisResponse)
+async def analyze_repository(request: AnalysisRequest, background_tasks: BackgroundTasks):
+ """Analyze a repository."""
+ try:
+ if not analyzer:
+ raise HTTPException(status_code=500, detail="Analyzer not initialized")
+
+ # Generate unique analysis ID
+ analysis_id = f"analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+
+ # Create temporary directory for this analysis
+ temp_dir = tempfile.mkdtemp(prefix=f"ai_analysis_{analysis_id}_")
+
+ try:
+ # Run analysis
+ analysis = await analyzer.analyze_repository_with_memory(
+ request.repo_path,
+ max_files=request.max_files
+ )
+
+ # Generate report
+ if request.output_format == "pdf":
+ report_path = f"/app/reports/{analysis_id}_analysis.pdf"
+ analyzer.create_pdf_report(analysis, report_path)
+ else:
+ report_path = f"/app/reports/{analysis_id}_analysis.json"
+ with open(report_path, 'w') as f:
+ json.dump({
+ "repo_path": analysis.repo_path,
+ "total_files": analysis.total_files,
+ "total_lines": analysis.total_lines,
+ "languages": analysis.languages,
+ "code_quality_score": analysis.code_quality_score,
+ "architecture_assessment": analysis.architecture_assessment,
+ "security_assessment": analysis.security_assessment,
+ "executive_summary": analysis.executive_summary,
+ "file_analyses": [
+ {
+ "path": fa.path,
+ "language": fa.language,
+ "lines_of_code": fa.lines_of_code,
+ "severity_score": fa.severity_score,
+ "issues_found": fa.issues_found,
+ "recommendations": fa.recommendations
+ } for fa in analysis.file_analyses
+ ]
+ }, f, indent=2)
+
+ # Calculate stats
+ stats = {
+ "total_files": analysis.total_files,
+ "total_lines": analysis.total_lines,
+ "languages": analysis.languages,
+ "code_quality_score": analysis.code_quality_score,
+ "high_quality_files": len([fa for fa in analysis.file_analyses if fa.severity_score >= 8]),
+ "medium_quality_files": len([fa for fa in analysis.file_analyses if 5 <= fa.severity_score < 8]),
+ "low_quality_files": len([fa for fa in analysis.file_analyses if fa.severity_score < 5]),
+ "total_issues": sum(len(fa.issues_found) for fa in analysis.file_analyses)
+ }
+
+ return AnalysisResponse(
+ success=True,
+ message="Analysis completed successfully",
+ analysis_id=analysis_id,
+ report_path=report_path,
+ stats=stats
+ )
+
+ finally:
+ # Cleanup temporary directory
+ if os.path.exists(temp_dir):
+ shutil.rmtree(temp_dir)
+
+ except Exception as e:
+ return AnalysisResponse(
+ success=False,
+ message=f"Analysis failed: {str(e)}",
+ analysis_id=None,
+ report_path=None,
+ stats=None
+ )
+
+@app.get("/reports/{filename}")
+async def download_report(filename: str):
+ """Download analysis report."""
+ report_path = f"/app/reports/{filename}"
+ if not os.path.exists(report_path):
+ raise HTTPException(status_code=404, detail="Report not found")
+
+ return FileResponse(
+ report_path,
+ media_type='application/octet-stream',
+ filename=filename
+ )
+
+@app.get("/memory/stats")
+async def get_memory_stats():
+ """Get memory system statistics."""
+ try:
+ if not analyzer:
+ raise HTTPException(status_code=500, detail="Analyzer not initialized")
+
+ stats = await analyzer.memory_manager.get_memory_stats()
+ return {
+ "success": True,
+ "memory_stats": stats
+ }
+ except Exception as e:
+ raise HTTPException(status_code=500, detail=f"Failed to get memory stats: {str(e)}")
+
+@app.post("/memory/query")
+async def query_memory(query: str, repo_context: str = ""):
+ """Query the memory system."""
+ try:
+ if not analyzer:
+ raise HTTPException(status_code=500, detail="Analyzer not initialized")
+
+ result = await analyzer.query_memory(query, repo_context)
+ return {
+ "success": True,
+ "query": query,
+ "result": result
+ }
+ except Exception as e:
+ raise HTTPException(status_code=500, detail=f"Memory query failed: {str(e)}")
+
+if __name__ == "__main__":
+ port = int(os.getenv('PORT', 8022))
+ host = os.getenv('HOST', '0.0.0.0')
+
+ print(f"š Starting AI Analysis Service on {host}:{port}")
+ uvicorn.run(app, host=host, port=port)
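+
+# Quick smoke test against the service directly (assuming port 8022 is published on the host):
+#   curl http://localhost:8022/health
+#   curl -X POST http://localhost:8022/analyze \
+#     -H "Content-Type: application/json" \
+#     -d '{"repo_path": "/app/temp/my-repo", "output_format": "json", "max_files": 10}'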
diff --git a/services/api-gateway/src/server.js b/services/api-gateway/src/server.js
index 4c64e39..2a57093 100644
--- a/services/api-gateway/src/server.js
+++ b/services/api-gateway/src/server.js
@@ -68,6 +68,7 @@ const serviceTargets = {
DASHBOARD_URL: process.env.DASHBOARD_URL || 'http://localhost:8008',
SELF_IMPROVING_GENERATOR_URL: process.env.SELF_IMPROVING_GENERATOR_URL || 'http://localhost:8007',
AI_MOCKUP_URL: process.env.AI_MOCKUP_URL || 'http://localhost:8021',
+ AI_ANALYSIS_URL: process.env.AI_ANALYSIS_URL || 'http://localhost:8022',
};
// Log service targets for debugging
@@ -1984,6 +1985,76 @@ app.use('/api/mockup',
}
);
+// AI Analysis Service - Direct HTTP forwarding
+console.log('🔧 Registering /api/ai-analysis proxy route...');
+app.use('/api/ai-analysis',
+ createServiceLimiter(200),
+ // Allow unauthenticated access for AI analysis (public feature)
+ (req, res, next) => {
+ console.log(`🤖 [AI ANALYSIS PROXY] ${req.method} ${req.originalUrl}`);
+ return next();
+ },
+ (req, res, next) => {
+ const aiAnalysisServiceUrl = serviceTargets.AI_ANALYSIS_URL;
+ // Strip the /api/ai-analysis prefix so /api/ai-analysis/analyze -> /analyze at target
+ const rewrittenPath = (req.originalUrl || '').replace(/^\/api\/ai-analysis/, '');
+ const targetUrl = `${aiAnalysisServiceUrl}${rewrittenPath}`;
+ console.log(`📥 [AI ANALYSIS PROXY] ${req.method} ${req.originalUrl} → ${targetUrl}`);
+
+ res.setTimeout(300000, () => { // 5 minutes timeout for analysis
+ console.error('❌ [AI ANALYSIS PROXY] Response timeout');
+ if (!res.headersSent) {
+ res.status(504).json({ error: 'Gateway timeout', service: 'ai-analysis' });
+ }
+ });
+
+ const options = {
+ method: req.method,
+ url: targetUrl,
+ headers: {
+ 'Content-Type': 'application/json',
+ 'User-Agent': 'API-Gateway/1.0',
+ 'Connection': 'keep-alive',
+ 'Authorization': req.headers.authorization,
+ 'X-User-ID': req.user?.id || req.user?.userId,
+ ...(req.user?.role && { 'X-User-Role': req.user.role })
+ },
+ timeout: 240000, // 4 minutes timeout
+ validateStatus: () => true,
+ maxRedirects: 0,
+ maxContentLength: 100 * 1024 * 1024, // 100MB max content length
+ maxBodyLength: 100 * 1024 * 1024 // 100MB max body length
+ };
+
+ if (req.method === 'POST' || req.method === 'PUT' || req.method === 'PATCH') {
+ options.data = req.body || {};
+ console.log(`📦 [AI ANALYSIS PROXY] Request body:`, JSON.stringify(req.body));
+ }
+
+ axios(options)
+ .then(response => {
+ console.log(`✅ [AI ANALYSIS PROXY] Response: ${response.status} for ${req.method} ${req.originalUrl}`);
+ if (!res.headersSent) {
+ res.status(response.status).json(response.data);
+ }
+ })
+ .catch(error => {
+ console.error(`❌ [AI ANALYSIS PROXY ERROR]:`, error.message);
+ if (!res.headersSent) {
+ if (error.response) {
+ res.status(error.response.status).json(error.response.data);
+ } else {
+ res.status(502).json({
+ error: 'AI Analysis service unavailable',
+ message: error.code || error.message,
+ service: 'ai-analysis'
+ });
+ }
+ }
+ });
+ }
+);
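+
+// Example call through the gateway (GATEWAY_URL is a placeholder for your deployment):
+//   curl "$GATEWAY_URL/api/ai-analysis/health"
+// The /api/ai-analysis prefix is stripped before forwarding, so this reaches GET /health
+// on the AI analysis service.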
+
// Gateway management endpoints
app.get('/api/gateway/info', authMiddleware.verifyToken, (req, res) => {
res.json({
@@ -2041,9 +2112,10 @@ app.get('/', (req, res) => {
deploy: '/api/deploy',
dashboard: '/api/dashboard',
self_improving: '/api/self-improving',
- mockup: '/api/mockup',
- unison: '/api/unison',
- unified: '/api/recommendations'
+ mockup: '/api/mockup',
+ ai_analysis: '/api/ai-analysis',
+ unison: '/api/unison',
+ unified: '/api/recommendations'
},
websocket: {
endpoint: '/socket.io/',