-- ================================================
-- Repository Analyzer Memory System Database Migration
-- Version: 1.0
-- Description: Complete database setup for AI memory system
-- ================================================

-- Enable required extensions
CREATE EXTENSION IF NOT EXISTS vector;
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";

-- ================================================
-- CORE TABLES
-- ================================================

-- Code embeddings table for semantic search of analyzed code
CREATE TABLE IF NOT EXISTS code_embeddings (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    repo_id VARCHAR(255) NOT NULL,
    file_path TEXT NOT NULL,
    content_hash VARCHAR(64) NOT NULL,
    embedding vector(384) NOT NULL,
    metadata JSONB DEFAULT '{}',
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    last_accessed TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    access_count INTEGER DEFAULT 0,
    -- Ensure uniqueness per repo/file/hash combination
    CONSTRAINT unique_code_analysis UNIQUE (repo_id, file_path, content_hash)
);

-- Query embeddings for episodic memory (user interactions)
CREATE TABLE IF NOT EXISTS query_embeddings (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    session_id VARCHAR(255) NOT NULL,
    query_text TEXT NOT NULL,
    query_embedding vector(384) NOT NULL,
    response_embedding vector(384),
    repo_context VARCHAR(255),
    timestamp TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    metadata JSONB DEFAULT '{}',
    -- Reject empty session identifiers
    CONSTRAINT valid_session_id CHECK (LENGTH(session_id) > 0)
);

-- Persistent knowledge embeddings for long-term learning
CREATE TABLE IF NOT EXISTS knowledge_embeddings (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    fact_id VARCHAR(255) UNIQUE NOT NULL,
    content TEXT NOT NULL,
    category VARCHAR(100) NOT NULL,
    embedding vector(384) NOT NULL,
    confidence REAL DEFAULT 1.0 CHECK (confidence >= 0.0 AND confidence <= 1.0),
    source_repos TEXT[] DEFAULT '{}',
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    last_accessed TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    access_frequency INTEGER DEFAULT 0,
    -- Ensure valid categories
    CONSTRAINT valid_category CHECK (category IN ('code_pattern', 'best_practice', 'vulnerability', 'architecture', 'security_vulnerability', 'performance'))
);

-- Repository metadata for tracking analyzed repositories
CREATE TABLE IF NOT EXISTS repository_metadata (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    repo_id VARCHAR(255) UNIQUE NOT NULL,
    repo_path TEXT NOT NULL,
    repo_name VARCHAR(500),
    primary_language VARCHAR(100),
    total_files INTEGER DEFAULT 0,
    total_lines INTEGER DEFAULT 0,
    last_analyzed TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    analysis_count INTEGER DEFAULT 0,
    quality_score REAL DEFAULT 5.0 CHECK (quality_score >= 0.0 AND quality_score <= 10.0),
    metadata JSONB DEFAULT '{}'
);

-- Session tracking for episodic memory correlation
CREATE TABLE IF NOT EXISTS analysis_sessions (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    session_id VARCHAR(255) UNIQUE NOT NULL,
    user_identifier VARCHAR(255),
    start_time TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    end_time TIMESTAMP WITH TIME ZONE,
    total_queries INTEGER DEFAULT 0,
    repositories_analyzed TEXT[] DEFAULT '{}',
    session_metadata JSONB DEFAULT '{}'
);
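-- Illustrative usage (commented out; not executed by this migration).
-- A sketch of how an analyzer process might record one file's embedding. The
-- repo_id, file_path, and metadata keys are hypothetical examples, and the
-- zero placeholder vector stands in for a real 384-dimension model embedding.
-- INSERT INTO code_embeddings (repo_id, file_path, content_hash, embedding, metadata)
-- VALUES (
--     'example-org/example-repo',                          -- hypothetical repository identifier
--     'src/app/main.py',                                    -- hypothetical file path
--     encode(sha256('file contents here'::bytea), 'hex'),   -- 64-char hex SHA-256, matches VARCHAR(64)
--     array_fill(0.0, ARRAY[384])::vector(384),             -- placeholder; supply the real embedding
--     '{"language": "python", "chunk": 0}'::jsonb           -- illustrative metadata keys
-- )
-- ON CONFLICT ON CONSTRAINT unique_code_analysis DO NOTHING;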
-- File analysis history for change tracking
CREATE TABLE IF NOT EXISTS file_analysis_history (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    repo_id VARCHAR(255) NOT NULL,
    file_path TEXT NOT NULL,
    content_hash VARCHAR(64) NOT NULL,
    language VARCHAR(100),
    lines_of_code INTEGER DEFAULT 0,
    complexity_score REAL DEFAULT 0.0,
    severity_score REAL DEFAULT 5.0 CHECK (severity_score >= 0.0 AND severity_score <= 10.0),
    issues_count INTEGER DEFAULT 0,
    analyzed_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    analysis_version VARCHAR(50) DEFAULT '1.0'
);

-- Memory consolidation log for tracking knowledge extraction
CREATE TABLE IF NOT EXISTS memory_consolidation_log (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    source_type VARCHAR(50) NOT NULL,        -- 'episodic', 'code_analysis', 'manual'
    source_id VARCHAR(255) NOT NULL,
    target_memory_type VARCHAR(50) NOT NULL, -- 'persistent', 'working'
    target_id VARCHAR(255),
    consolidation_confidence REAL DEFAULT 0.5,
    consolidation_timestamp TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    consolidation_metadata JSONB DEFAULT '{}'
);

-- ================================================
-- PERFORMANCE INDEXES
-- ================================================

-- Code embeddings indexes
CREATE INDEX IF NOT EXISTS idx_code_embeddings_repo_id ON code_embeddings(repo_id);
CREATE INDEX IF NOT EXISTS idx_code_embeddings_file_path ON code_embeddings(file_path);
CREATE INDEX IF NOT EXISTS idx_code_embeddings_accessed ON code_embeddings(last_accessed DESC);
CREATE INDEX IF NOT EXISTS idx_code_embeddings_metadata ON code_embeddings USING gin(metadata);

-- Vector similarity indexes (using IVFFlat for better performance)
CREATE INDEX IF NOT EXISTS idx_code_embeddings_vector ON code_embeddings
    USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);

-- Query embeddings indexes
CREATE INDEX IF NOT EXISTS idx_query_embeddings_session ON query_embeddings(session_id);
CREATE INDEX IF NOT EXISTS idx_query_embeddings_timestamp ON query_embeddings(timestamp DESC);
CREATE INDEX IF NOT EXISTS idx_query_embeddings_repo_context ON query_embeddings(repo_context);
CREATE INDEX IF NOT EXISTS idx_query_embeddings_vector ON query_embeddings
    USING ivfflat (query_embedding vector_cosine_ops) WITH (lists = 100);

-- Knowledge embeddings indexes
CREATE INDEX IF NOT EXISTS idx_knowledge_embeddings_category ON knowledge_embeddings(category);
CREATE INDEX IF NOT EXISTS idx_knowledge_embeddings_confidence ON knowledge_embeddings(confidence DESC);
CREATE INDEX IF NOT EXISTS idx_knowledge_embeddings_access_freq ON knowledge_embeddings(access_frequency DESC);
CREATE INDEX IF NOT EXISTS idx_knowledge_embeddings_vector ON knowledge_embeddings
    USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);
CREATE INDEX IF NOT EXISTS idx_knowledge_source_repos ON knowledge_embeddings USING gin(source_repos);

-- Repository metadata indexes
CREATE INDEX IF NOT EXISTS idx_repository_metadata_repo_id ON repository_metadata(repo_id);
CREATE INDEX IF NOT EXISTS idx_repository_metadata_analyzed ON repository_metadata(last_analyzed DESC);
CREATE INDEX IF NOT EXISTS idx_repository_metadata_language ON repository_metadata(primary_language);

-- File history indexes
CREATE INDEX IF NOT EXISTS idx_file_history_repo_file ON file_analysis_history(repo_id, file_path);
CREATE INDEX IF NOT EXISTS idx_file_history_analyzed ON file_analysis_history(analyzed_at DESC);
CREATE INDEX IF NOT EXISTS idx_file_history_severity ON file_analysis_history(severity_score);
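-- Illustrative usage (commented out; not executed by this migration).
-- The kind of nearest-neighbour lookup the IVFFlat cosine indexes above are
-- meant to accelerate. The probes value of 10 is an assumed tuning choice, and
-- the subquery simply reuses the most recently stored query embedding so the
-- example is self-contained.
-- SET ivfflat.probes = 10;  -- higher values trade speed for recall
-- SELECT c.id, c.repo_id, c.file_path,
--        1 - (c.embedding <=> q.query_embedding) AS cosine_similarity
-- FROM code_embeddings c,
--      (SELECT query_embedding FROM query_embeddings ORDER BY timestamp DESC LIMIT 1) q
-- ORDER BY c.embedding <=> q.query_embedding
-- LIMIT 10;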
-- ================================================
-- MATERIALIZED VIEWS FOR COMMON QUERIES
-- ================================================

-- High confidence knowledge view
CREATE MATERIALIZED VIEW IF NOT EXISTS high_confidence_knowledge AS
SELECT fact_id, content, category, confidence, source_repos,
       created_at, last_accessed, access_frequency
FROM knowledge_embeddings
WHERE confidence > 0.8
ORDER BY confidence DESC, access_frequency DESC;

-- A unique index is required so the view can be refreshed CONCURRENTLY
CREATE UNIQUE INDEX ON high_confidence_knowledge (fact_id);
CREATE INDEX ON high_confidence_knowledge (category);
CREATE INDEX ON high_confidence_knowledge (confidence DESC);

-- Repository quality summary view
CREATE MATERIALIZED VIEW IF NOT EXISTS repository_quality_summary AS
SELECT
    rm.repo_id,
    rm.repo_path,
    rm.repo_name,
    rm.primary_language,
    rm.total_files,
    rm.total_lines,
    rm.quality_score,
    rm.last_analyzed,
    COUNT(ce.id) as total_embeddings,
    AVG(fah.severity_score) as avg_file_quality,
    COUNT(DISTINCT fah.file_path) as analyzed_files_count
FROM repository_metadata rm
LEFT JOIN code_embeddings ce ON rm.repo_id = ce.repo_id
LEFT JOIN file_analysis_history fah ON rm.repo_id = fah.repo_id
GROUP BY rm.repo_id, rm.repo_path, rm.repo_name, rm.primary_language,
         rm.total_files, rm.total_lines, rm.quality_score, rm.last_analyzed;

-- A unique index is required so the view can be refreshed CONCURRENTLY
CREATE UNIQUE INDEX ON repository_quality_summary (repo_id);
CREATE INDEX ON repository_quality_summary (quality_score DESC);
CREATE INDEX ON repository_quality_summary (last_analyzed DESC);

-- Recent activity view
CREATE MATERIALIZED VIEW IF NOT EXISTS recent_activity AS
SELECT
    'query' as activity_type,
    session_id as identifier,
    query_text as description,
    timestamp as activity_time,
    repo_context
FROM query_embeddings
WHERE timestamp >= CURRENT_TIMESTAMP - INTERVAL '7 days'
UNION ALL
SELECT
    'analysis' as activity_type,
    repo_id as identifier,
    file_path as description,
    analyzed_at as activity_time,
    repo_id as repo_context
FROM file_analysis_history
WHERE analyzed_at >= CURRENT_TIMESTAMP - INTERVAL '7 days'
ORDER BY activity_time DESC;

CREATE INDEX ON recent_activity (activity_time DESC);
CREATE INDEX ON recent_activity (activity_type);

-- ================================================
-- STORED FUNCTIONS AND PROCEDURES
-- ================================================

-- Function to refresh all materialized views
CREATE OR REPLACE FUNCTION refresh_memory_views()
RETURNS void AS $$
BEGIN
    REFRESH MATERIALIZED VIEW CONCURRENTLY high_confidence_knowledge;
    REFRESH MATERIALIZED VIEW CONCURRENTLY repository_quality_summary;
    -- recent_activity has no natural unique key, so it cannot be refreshed CONCURRENTLY
    REFRESH MATERIALIZED VIEW recent_activity;

    -- Log the refresh
    INSERT INTO memory_consolidation_log (
        source_type, source_id, target_memory_type, target_id,
        consolidation_confidence, consolidation_metadata
    ) VALUES (
        'system', 'materialized_views', 'system', 'view_refresh', 1.0,
        jsonb_build_object('refresh_time', CURRENT_TIMESTAMP)
    );
END;
$$ LANGUAGE plpgsql;

-- Function to calculate semantic similarity between two embeddings
CREATE OR REPLACE FUNCTION calculate_similarity(embedding1 vector(384), embedding2 vector(384))
RETURNS real AS $$
BEGIN
    RETURN 1 - (embedding1 <=> embedding2);
END;
$$ LANGUAGE plpgsql IMMUTABLE STRICT;

-- Function to update access patterns
CREATE OR REPLACE FUNCTION update_access_pattern(table_name text, id_column text, id_value text)
RETURNS void AS $$
BEGIN
    CASE table_name
        WHEN 'knowledge_embeddings' THEN
            EXECUTE 'UPDATE knowledge_embeddings
                     SET last_accessed = CURRENT_TIMESTAMP,
                         access_frequency = access_frequency + 1
                     WHERE fact_id = $1'
            USING id_value;
        WHEN 'code_embeddings' THEN
            EXECUTE 'UPDATE code_embeddings
                     SET last_accessed = CURRENT_TIMESTAMP,
                         access_count = access_count + 1
                     WHERE id = $1::uuid'
            USING id_value;
        ELSE
            RAISE EXCEPTION 'Unsupported table: %', table_name;
    END CASE;
END;
$$ LANGUAGE plpgsql;
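-- Illustrative usage (commented out; not executed by this migration).
-- Scoring stored knowledge against the most recently accessed code embedding,
-- then recording the access. 'init_security_001' refers to a seed fact created
-- below; meaningful similarity scores only appear once the placeholder seed
-- vectors are replaced with real embeddings.
-- SELECT ke.fact_id,
--        calculate_similarity(ke.embedding, ce.embedding) AS similarity
-- FROM knowledge_embeddings ke
-- CROSS JOIN LATERAL (SELECT embedding FROM code_embeddings ORDER BY last_accessed DESC LIMIT 1) ce
-- ORDER BY similarity DESC
-- LIMIT 5;
-- SELECT update_access_pattern('knowledge_embeddings', 'fact_id', 'init_security_001');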
-- Function to cleanup old memories
CREATE OR REPLACE FUNCTION cleanup_old_memories(retention_days integer DEFAULT 365)
RETURNS integer AS $$
DECLARE
    deleted_count integer := 0;
    cutoff_date timestamp;
BEGIN
    cutoff_date := CURRENT_TIMESTAMP - (retention_days || ' days')::interval;

    -- Delete old query embeddings (episodic memories)
    DELETE FROM query_embeddings WHERE timestamp < cutoff_date;
    GET DIAGNOSTICS deleted_count = ROW_COUNT;

    -- Update knowledge confidence based on access patterns
    UPDATE knowledge_embeddings
    SET confidence = LEAST(confidence * (
        CASE
            WHEN EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - last_accessed)) / 86400 < 30 THEN 1.05
            ELSE 0.98
        END * (1.0 + LOG(access_frequency + 1) / 20.0)
    ), 1.0);

    -- Log cleanup activity
    INSERT INTO memory_consolidation_log (
        source_type, source_id, target_memory_type, target_id,
        consolidation_confidence, consolidation_metadata
    ) VALUES (
        'system', 'cleanup_function', 'system', 'memory_cleanup', 1.0,
        jsonb_build_object('deleted_records', deleted_count, 'cutoff_date', cutoff_date)
    );

    RETURN deleted_count;
END;
$$ LANGUAGE plpgsql;

-- Function to find similar code patterns
CREATE OR REPLACE FUNCTION find_similar_code(
    query_embedding vector(384),
    repo_filter text DEFAULT NULL,
    similarity_threshold real DEFAULT 0.7,
    max_results integer DEFAULT 10
)
RETURNS TABLE (
    id uuid,
    repo_id varchar(255),
    file_path text,
    similarity real,
    metadata jsonb
) AS $$
BEGIN
    RETURN QUERY
    SELECT
        ce.id,
        ce.repo_id,
        ce.file_path,
        (1 - (ce.embedding <=> query_embedding))::real as similarity,
        ce.metadata
    FROM code_embeddings ce
    WHERE (repo_filter IS NULL OR ce.repo_id = repo_filter)
      AND (1 - (ce.embedding <=> query_embedding)) > similarity_threshold
    -- Order by distance ascending (equivalent to similarity descending); this avoids
    -- an ambiguous reference to the 'similarity' output column inside plpgsql
    ORDER BY ce.embedding <=> query_embedding
    LIMIT max_results;
END;
$$ LANGUAGE plpgsql;

-- Function to get knowledge by category
CREATE OR REPLACE FUNCTION get_knowledge_by_category(
    category_filter varchar(100),
    min_confidence real DEFAULT 0.5,
    max_results integer DEFAULT 20
)
RETURNS TABLE (
    fact_id varchar(255),
    content text,
    confidence real,
    access_frequency integer,
    source_repos text[]
) AS $$
BEGIN
    RETURN QUERY
    SELECT
        ke.fact_id,
        ke.content,
        ke.confidence,
        ke.access_frequency,
        ke.source_repos
    FROM knowledge_embeddings ke
    WHERE ke.category = category_filter
      AND ke.confidence >= min_confidence
    ORDER BY ke.confidence DESC, ke.access_frequency DESC
    LIMIT max_results;
END;
$$ LANGUAGE plpgsql;

-- ================================================
-- TRIGGERS FOR AUTOMATIC MAINTENANCE
-- ================================================

-- Trigger function to update repository metadata when embeddings are added
CREATE OR REPLACE FUNCTION update_repository_stats()
RETURNS trigger AS $$
BEGIN
    -- Update or insert repository metadata
    INSERT INTO repository_metadata (repo_id, repo_path, analysis_count, last_analyzed)
    VALUES (NEW.repo_id, NEW.repo_id, 1, CURRENT_TIMESTAMP)
    ON CONFLICT (repo_id) DO UPDATE SET
        analysis_count = repository_metadata.analysis_count + 1,
        last_analyzed = CURRENT_TIMESTAMP;
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

-- Create triggers
DROP TRIGGER IF EXISTS trigger_update_repo_stats ON code_embeddings;
CREATE TRIGGER trigger_update_repo_stats
    AFTER INSERT ON code_embeddings
    FOR EACH ROW
    EXECUTE FUNCTION update_repository_stats();

-- Trigger to automatically update access patterns
CREATE OR REPLACE FUNCTION auto_update_access()
RETURNS trigger AS $$
BEGIN
    NEW.last_accessed = CURRENT_TIMESTAMP;
    NEW.access_count = COALESCE(OLD.access_count, 0) + 1;
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

DROP TRIGGER IF EXISTS trigger_auto_access_update ON code_embeddings;
CREATE TRIGGER trigger_auto_access_update
    BEFORE UPDATE ON code_embeddings
    FOR EACH ROW
    EXECUTE FUNCTION auto_update_access();
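-- Illustrative usage (commented out; not executed by this migration).
-- Typical retrieval calls. The first reuses the most recently stored query
-- embedding so it is self-contained; the 0.75 threshold and limit of 5 are
-- example parameter values, not defaults.
-- SELECT f.*
-- FROM (SELECT query_embedding FROM query_embeddings ORDER BY timestamp DESC LIMIT 1) q,
--      LATERAL find_similar_code(q.query_embedding, NULL, 0.75, 5) f;
-- SELECT * FROM get_knowledge_by_category('security_vulnerability', 0.8, 10);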
-- ================================================
-- SECURITY AND PERMISSIONS
-- ================================================

-- Create roles for different access levels
DO $$
BEGIN
    IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'repo_analyzer_read') THEN
        CREATE ROLE repo_analyzer_read;
    END IF;
    IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'repo_analyzer_write') THEN
        CREATE ROLE repo_analyzer_write;
    END IF;
    IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'repo_analyzer_admin') THEN
        CREATE ROLE repo_analyzer_admin;
    END IF;
END $$;

-- Grant permissions
GRANT SELECT ON ALL TABLES IN SCHEMA public TO repo_analyzer_read;
GRANT SELECT ON high_confidence_knowledge TO repo_analyzer_read;
GRANT SELECT ON repository_quality_summary TO repo_analyzer_read;
GRANT SELECT ON recent_activity TO repo_analyzer_read;

GRANT SELECT, INSERT, UPDATE ON ALL TABLES IN SCHEMA public TO repo_analyzer_write;
GRANT SELECT ON high_confidence_knowledge TO repo_analyzer_write;
GRANT SELECT ON repository_quality_summary TO repo_analyzer_write;
GRANT SELECT ON recent_activity TO repo_analyzer_write;
GRANT USAGE, SELECT ON ALL SEQUENCES IN SCHEMA public TO repo_analyzer_write;

GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO repo_analyzer_admin;
GRANT ALL PRIVILEGES ON high_confidence_knowledge TO repo_analyzer_admin;
GRANT ALL PRIVILEGES ON repository_quality_summary TO repo_analyzer_admin;
GRANT ALL PRIVILEGES ON recent_activity TO repo_analyzer_admin;
GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA public TO repo_analyzer_admin;
GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA public TO repo_analyzer_admin;
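-- Illustrative usage (commented out; not executed by this migration).
-- Application login roles would be attached to these group roles separately;
-- the login role names below (analyzer_app, analyzer_dashboard) are hypothetical.
-- GRANT repo_analyzer_write TO analyzer_app;
-- GRANT repo_analyzer_read TO analyzer_dashboard;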
-- ================================================
-- DATA VALIDATION AND CONSTRAINTS
-- ================================================

-- Add check constraints for data quality
-- Note: Vector dimensions are validated at insertion time, no need for runtime checks

-- Add constraints for reasonable data ranges
DO $$
BEGIN
    IF NOT EXISTS (SELECT 1 FROM pg_constraint WHERE conname = 'reasonable_lines_of_code') THEN
        ALTER TABLE file_analysis_history
            ADD CONSTRAINT reasonable_lines_of_code
            CHECK (lines_of_code >= 0 AND lines_of_code <= 1000000);
    END IF;
    IF NOT EXISTS (SELECT 1 FROM pg_constraint WHERE conname = 'reasonable_complexity') THEN
        ALTER TABLE file_analysis_history
            ADD CONSTRAINT reasonable_complexity
            CHECK (complexity_score >= 0.0 AND complexity_score <= 100.0);
    END IF;
END $$;

-- ================================================
-- INITIAL DATA AND CONFIGURATION
-- ================================================

-- Insert initial system configuration
INSERT INTO memory_consolidation_log (
    source_type, source_id, target_memory_type, target_id,
    consolidation_confidence, consolidation_metadata
) VALUES (
    'system', 'database_migration', 'system', 'initial_setup', 1.0,
    jsonb_build_object('migration_version', '1.0', 'setup_time', CURRENT_TIMESTAMP)
) ON CONFLICT DO NOTHING;

-- Create initial knowledge categories
-- NOTE: the seed rows use placeholder zero vectors; replace them with real
-- embeddings before relying on similarity search against these facts
INSERT INTO knowledge_embeddings (
    fact_id, content, category, embedding, confidence, source_repos
) VALUES
(
    'init_security_001',
    'Always validate and sanitize user input to prevent injection attacks',
    'security_vulnerability',
    array_fill(0.0, ARRAY[384])::vector(384),
    0.95,
    ARRAY[]::text[]
),
(
    'init_performance_001',
    'Use appropriate data structures and algorithms for better performance',
    'performance',
    array_fill(0.0, ARRAY[384])::vector(384),
    0.9,
    ARRAY[]::text[]
),
(
    'init_best_practice_001',
    'Follow consistent naming conventions and code formatting standards',
    'best_practice',
    array_fill(0.0, ARRAY[384])::vector(384),
    0.85,
    ARRAY[]::text[]
)
ON CONFLICT (fact_id) DO NOTHING;

-- ================================================
-- BACKUP AND MAINTENANCE PROCEDURES
-- ================================================

-- Function to create backup of critical memory data
CREATE OR REPLACE FUNCTION backup_memory_data(backup_path text DEFAULT '/tmp/memory_backup')
RETURNS text AS $$
DECLARE
    backup_file text;
    result_message text;
BEGIN
    backup_file := backup_path || '_' || to_char(CURRENT_TIMESTAMP, 'YYYY-MM-DD_HH24-MI-SS') || '.sql';

    -- This would need to be implemented with actual backup logic
    -- For now, just return the intended backup file name
    result_message := 'Backup would be created at: ' || backup_file;

    -- Log backup activity
    INSERT INTO memory_consolidation_log (
        source_type, source_id, target_memory_type, target_id,
        consolidation_confidence, consolidation_metadata
    ) VALUES (
        'system', 'backup_function', 'system', 'backup_created', 1.0,
        jsonb_build_object('backup_file', backup_file)
    );

    RETURN result_message;
END;
$$ LANGUAGE plpgsql;

-- ================================================
-- MONITORING AND ANALYTICS
-- ================================================

-- View for system health monitoring
CREATE OR REPLACE VIEW system_health_monitor AS
SELECT
    'code_embeddings' as table_name,
    COUNT(*) as record_count,
    MAX(created_at) as latest_record,
    AVG(access_count) as avg_access_count
FROM code_embeddings
UNION ALL
SELECT
    'query_embeddings' as table_name,
    COUNT(*) as record_count,
    MAX(timestamp) as latest_record,
    NULL as avg_access_count
FROM query_embeddings
UNION ALL
SELECT
    'knowledge_embeddings' as table_name,
    COUNT(*) as record_count,
    MAX(created_at) as latest_record,
    AVG(access_frequency) as avg_access_count
FROM knowledge_embeddings;

-- Function to get comprehensive system statistics
CREATE OR REPLACE FUNCTION get_system_statistics()
RETURNS jsonb AS $$
DECLARE
    stats jsonb;
BEGIN
    SELECT jsonb_build_object(
        'total_code_embeddings', (SELECT COUNT(*) FROM code_embeddings),
        'total_query_embeddings', (SELECT COUNT(*) FROM query_embeddings),
        'total_knowledge_embeddings', (SELECT COUNT(*) FROM knowledge_embeddings),
        'unique_repositories', (SELECT COUNT(DISTINCT repo_id) FROM code_embeddings),
        'high_confidence_knowledge', (SELECT COUNT(*) FROM knowledge_embeddings WHERE confidence > 0.8),
        'recent_activity_7d', (SELECT COUNT(*) FROM query_embeddings WHERE timestamp >= CURRENT_TIMESTAMP - INTERVAL '7 days'),
        'average_code_quality', (SELECT AVG(quality_score) FROM repository_metadata),
        'last_updated', CURRENT_TIMESTAMP
    ) INTO stats;

    RETURN stats;
END;
$$ LANGUAGE plpgsql;
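-- Illustrative maintenance schedule (commented out; not executed by this migration).
-- The refresh and cleanup functions above are intended to run periodically. This
-- sketch assumes the pg_cron extension is installed, which this migration does not
-- require; any external scheduler invoking the same functions would work equally well.
-- SELECT cron.schedule('refresh-memory-views', '*/30 * * * *', 'SELECT refresh_memory_views()');
-- SELECT cron.schedule('cleanup-old-memories', '0 3 * * 0', 'SELECT cleanup_old_memories(365)');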
-- ================================================
-- COMPLETION MESSAGE
-- ================================================

DO $$
BEGIN
    RAISE NOTICE '================================================';
    RAISE NOTICE 'Repository Analyzer Memory System Database Setup Complete';
    RAISE NOTICE '================================================';
    RAISE NOTICE 'Tables created: code_embeddings, query_embeddings, knowledge_embeddings, and supporting metadata tables';
    RAISE NOTICE 'Indexes created: Vector similarity indexes with IVFFlat';
    RAISE NOTICE 'Functions created: Similarity search, cleanup, statistics';
    RAISE NOTICE 'Materialized views created: High confidence knowledge, repository summary';
    RAISE NOTICE 'Triggers created: Auto-update repository stats and access patterns';
    RAISE NOTICE '================================================';
    RAISE NOTICE 'Ready for AI-enhanced repository analysis with persistent memory';
    RAISE NOTICE '================================================';
END $$;
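-- Illustrative post-migration checks (commented out; not executed by this migration).
-- Quick queries to confirm the monitoring and maintenance objects defined above respond.
-- SELECT * FROM system_health_monitor;
-- SELECT get_system_statistics();
-- SELECT refresh_memory_views();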