-- File: codenuk_backend_mine/services/ai-analysis-service/001-schema.sql
-- Last modified: 2025-10-17 10:33:14 +05:30
-- 604 lines, 22 KiB, PL/pgSQL
-- ================================================
-- Repository Analyzer Memory System Database Migration
-- Version: 1.0
-- Description: Complete database setup for AI memory system
-- ================================================
-- Enable required extensions
CREATE EXTENSION IF NOT EXISTS vector;
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
-- ================================================
-- CORE TABLES
-- ================================================
-- Code embeddings table for semantic search of analyzed code
CREATE TABLE IF NOT EXISTS code_embeddings (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
-- Logical repository identifier; other tables join on this value (no FK declared)
repo_id VARCHAR(255) NOT NULL,
file_path TEXT NOT NULL,
-- Hash of the analyzed file content; presumably SHA-256 hex (64 chars) -- TODO confirm
content_hash VARCHAR(64) NOT NULL,
-- 384-dimension vector; same dimension is used by every embedding column in this schema
embedding vector(384) NOT NULL,
metadata JSONB DEFAULT '{}',
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
-- Maintained by the auto_update_access trigger and update_access_pattern()
last_accessed TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
access_count INTEGER DEFAULT 0,
-- Ensure uniqueness per repo/file/hash combination
CONSTRAINT unique_code_analysis UNIQUE(repo_id, file_path, content_hash)
);
-- Query embeddings for episodic memory (user interactions)
CREATE TABLE IF NOT EXISTS query_embeddings (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
session_id VARCHAR(255) NOT NULL,
query_text TEXT NOT NULL,
query_embedding vector(384) NOT NULL,
-- Embedding of the generated response; nullable (a response may not be embedded)
response_embedding vector(384),
-- repo_id of the repository the query was about, if any
repo_context VARCHAR(255),
timestamp TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
metadata JSONB DEFAULT '{}',
-- Reject empty session ids (NOTE: this is a CHECK constraint, not an index;
-- session lookups are served by idx_query_embeddings_session below)
CONSTRAINT valid_session_id CHECK (LENGTH(session_id) > 0)
);
-- Persistent knowledge embeddings for long-term learning
CREATE TABLE IF NOT EXISTS knowledge_embeddings (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
-- Stable external identifier; used by seed upserts and update_access_pattern()
fact_id VARCHAR(255) UNIQUE NOT NULL,
content TEXT NOT NULL,
category VARCHAR(100) NOT NULL,
embedding vector(384) NOT NULL,
-- 0.0 .. 1.0; re-weighted over time by cleanup_old_memories()
confidence REAL DEFAULT 1.0 CHECK (confidence >= 0.0 AND confidence <= 1.0),
source_repos TEXT[] DEFAULT '{}',
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
last_accessed TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
access_frequency INTEGER DEFAULT 0,
-- Ensure valid categories
CONSTRAINT valid_category CHECK (category IN ('code_pattern', 'best_practice', 'vulnerability', 'architecture', 'security_vulnerability', 'performance'))
);
-- Repository metadata for tracking analyzed repositories
CREATE TABLE IF NOT EXISTS repository_metadata (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
repo_id VARCHAR(255) UNIQUE NOT NULL,
-- NOTE(review): update_repository_stats() seeds this column with repo_id as a
-- placeholder -- confirm the real path is written by the application later.
repo_path TEXT NOT NULL,
repo_name VARCHAR(500),
primary_language VARCHAR(100),
total_files INTEGER DEFAULT 0,
total_lines INTEGER DEFAULT 0,
-- Bumped by the AFTER INSERT trigger on code_embeddings
last_analyzed TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
analysis_count INTEGER DEFAULT 0,
-- 0.0 .. 10.0 scale, default middle-of-range
quality_score REAL DEFAULT 5.0 CHECK (quality_score >= 0.0 AND quality_score <= 10.0),
metadata JSONB DEFAULT '{}'
);
-- Session tracking for episodic memory correlation
CREATE TABLE IF NOT EXISTS analysis_sessions (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
-- Correlates with query_embeddings.session_id (no FK declared)
session_id VARCHAR(255) UNIQUE NOT NULL,
user_identifier VARCHAR(255),
start_time TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
-- NULL while the session is still open
end_time TIMESTAMP WITH TIME ZONE,
total_queries INTEGER DEFAULT 0,
repositories_analyzed TEXT[] DEFAULT '{}',
session_metadata JSONB DEFAULT '{}'
);
-- File analysis history for change tracking
CREATE TABLE IF NOT EXISTS file_analysis_history (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
repo_id VARCHAR(255) NOT NULL,
file_path TEXT NOT NULL,
content_hash VARCHAR(64) NOT NULL,
language VARCHAR(100),
lines_of_code INTEGER DEFAULT 0,
complexity_score REAL DEFAULT 0.0,
-- 0.0 .. 10.0; averaged per-repo by the repository_quality_summary view
severity_score REAL DEFAULT 5.0 CHECK (severity_score >= 0.0 AND severity_score <= 10.0),
issues_count INTEGER DEFAULT 0,
analyzed_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
-- Deliberately no unique constraint: this is append-only history, so the same
-- repo/file/hash may appear once per analysis run.
analysis_version VARCHAR(50) DEFAULT '1.0'
);
-- Memory consolidation log for tracking knowledge extraction
CREATE TABLE IF NOT EXISTS memory_consolidation_log (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
source_type VARCHAR(50) NOT NULL, -- 'episodic', 'code_analysis', 'manual', or 'system' (written by the maintenance functions in this file)
source_id VARCHAR(255) NOT NULL,
target_memory_type VARCHAR(50) NOT NULL, -- 'persistent', 'working', or 'system'
target_id VARCHAR(255),
consolidation_confidence REAL DEFAULT 0.5,
consolidation_timestamp TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
-- No unique constraint on this table: every logging insert appends a new row.
consolidation_metadata JSONB DEFAULT '{}'
);
-- ================================================
-- PERFORMANCE INDEXES
-- ================================================
-- Code embeddings indexes
CREATE INDEX IF NOT EXISTS idx_code_embeddings_repo_id ON code_embeddings(repo_id);
CREATE INDEX IF NOT EXISTS idx_code_embeddings_file_path ON code_embeddings(file_path);
CREATE INDEX IF NOT EXISTS idx_code_embeddings_accessed ON code_embeddings(last_accessed DESC);
CREATE INDEX IF NOT EXISTS idx_code_embeddings_metadata ON code_embeddings USING gin(metadata);
-- Vector similarity indexes (using IVFFlat for better performance)
-- NOTE(review): IVFFlat clusters are computed from the rows present at build
-- time; building on an empty table gives poor recall -- consider REINDEX after
-- bulk-loading embeddings.
CREATE INDEX IF NOT EXISTS idx_code_embeddings_vector
ON code_embeddings USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);
-- Query embeddings indexes
CREATE INDEX IF NOT EXISTS idx_query_embeddings_session ON query_embeddings(session_id);
CREATE INDEX IF NOT EXISTS idx_query_embeddings_timestamp ON query_embeddings(timestamp DESC);
CREATE INDEX IF NOT EXISTS idx_query_embeddings_repo_context ON query_embeddings(repo_context);
CREATE INDEX IF NOT EXISTS idx_query_embeddings_vector
ON query_embeddings USING ivfflat (query_embedding vector_cosine_ops) WITH (lists = 100);
-- Knowledge embeddings indexes
CREATE INDEX IF NOT EXISTS idx_knowledge_embeddings_category ON knowledge_embeddings(category);
CREATE INDEX IF NOT EXISTS idx_knowledge_embeddings_confidence ON knowledge_embeddings(confidence DESC);
CREATE INDEX IF NOT EXISTS idx_knowledge_embeddings_access_freq ON knowledge_embeddings(access_frequency DESC);
CREATE INDEX IF NOT EXISTS idx_knowledge_embeddings_vector
ON knowledge_embeddings USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);
CREATE INDEX IF NOT EXISTS idx_knowledge_source_repos ON knowledge_embeddings USING gin(source_repos);
-- Repository metadata indexes
-- (repo_id is already UNIQUE on the table; this secondary index is redundant
-- with the unique constraint's index -- harmless but duplicative.)
CREATE INDEX IF NOT EXISTS idx_repository_metadata_repo_id ON repository_metadata(repo_id);
CREATE INDEX IF NOT EXISTS idx_repository_metadata_analyzed ON repository_metadata(last_analyzed DESC);
CREATE INDEX IF NOT EXISTS idx_repository_metadata_language ON repository_metadata(primary_language);
-- File history indexes
CREATE INDEX IF NOT EXISTS idx_file_history_repo_file ON file_analysis_history(repo_id, file_path);
CREATE INDEX IF NOT EXISTS idx_file_history_analyzed ON file_analysis_history(analyzed_at DESC);
CREATE INDEX IF NOT EXISTS idx_file_history_severity ON file_analysis_history(severity_score);
-- ================================================
-- MATERIALIZED VIEWS FOR COMMON QUERIES
-- ================================================
-- High confidence knowledge view: facts with confidence above 0.8,
-- best-first by confidence then usage.
CREATE MATERIALIZED VIEW IF NOT EXISTS high_confidence_knowledge AS
SELECT
fact_id,
content,
category,
confidence,
source_repos,
created_at,
last_accessed,
access_frequency
FROM knowledge_embeddings
WHERE confidence > 0.8
ORDER BY confidence DESC, access_frequency DESC;
-- A UNIQUE index is required for REFRESH MATERIALIZED VIEW CONCURRENTLY
-- (fact_id is unique in the source table, so it is unique here too).
-- All indexes are named and IF NOT EXISTS so re-running this migration is
-- idempotent; the previous unnamed CREATE INDEX statements created a new
-- duplicate index on every run.
CREATE UNIQUE INDEX IF NOT EXISTS idx_hck_fact_id ON high_confidence_knowledge (fact_id);
CREATE INDEX IF NOT EXISTS idx_hck_category ON high_confidence_knowledge (category);
CREATE INDEX IF NOT EXISTS idx_hck_confidence ON high_confidence_knowledge (confidence DESC);
-- Repository quality summary view.
-- Embedding counts and file-quality stats are pre-aggregated per repo BEFORE
-- joining: the previous version joined code_embeddings and
-- file_analysis_history directly, so each repo produced (embeddings x history)
-- rows -- inflating COUNT(ce.id) and weighting AVG(severity_score) by the
-- number of embeddings (join fan-out).
CREATE MATERIALIZED VIEW IF NOT EXISTS repository_quality_summary AS
SELECT
rm.repo_id,
rm.repo_path,
rm.repo_name,
rm.primary_language,
rm.total_files,
rm.total_lines,
rm.quality_score,
rm.last_analyzed,
COALESCE(ce.embedding_count, 0) AS total_embeddings,
fah.avg_file_quality,
COALESCE(fah.file_count, 0) AS analyzed_files_count
FROM repository_metadata rm
LEFT JOIN (
    -- one row per repo: number of stored embeddings
    SELECT repo_id, COUNT(*) AS embedding_count
    FROM code_embeddings
    GROUP BY repo_id
) ce ON ce.repo_id = rm.repo_id
LEFT JOIN (
    -- one row per repo: average severity and distinct analyzed files
    SELECT repo_id,
           AVG(severity_score) AS avg_file_quality,
           COUNT(DISTINCT file_path) AS file_count
    FROM file_analysis_history
    GROUP BY repo_id
) fah ON fah.repo_id = rm.repo_id;
-- UNIQUE index enables REFRESH ... CONCURRENTLY; named + IF NOT EXISTS keeps
-- the migration idempotent (unnamed CREATE INDEX duplicated on every run).
CREATE UNIQUE INDEX IF NOT EXISTS idx_rqs_repo_id ON repository_quality_summary (repo_id);
CREATE INDEX IF NOT EXISTS idx_rqs_quality ON repository_quality_summary (quality_score DESC);
CREATE INDEX IF NOT EXISTS idx_rqs_analyzed ON repository_quality_summary (last_analyzed DESC);
-- Recent activity view: user queries and file analyses from the last 7 days.
-- NOTE: this view has no natural unique key, so it CANNOT be refreshed with
-- REFRESH MATERIALIZED VIEW CONCURRENTLY; use a plain (locking) refresh.
CREATE MATERIALIZED VIEW IF NOT EXISTS recent_activity AS
SELECT
'query' as activity_type,
session_id as identifier,
query_text as description,
timestamp as activity_time,
repo_context
FROM query_embeddings
WHERE timestamp >= CURRENT_TIMESTAMP - INTERVAL '7 days'
UNION ALL
SELECT
'analysis' as activity_type,
repo_id as identifier,
file_path as description,
analyzed_at as activity_time,
repo_id as repo_context
FROM file_analysis_history
WHERE analyzed_at >= CURRENT_TIMESTAMP - INTERVAL '7 days'
ORDER BY activity_time DESC;
-- Named + IF NOT EXISTS so re-running the migration does not create duplicate
-- indexes (the previous unnamed CREATE INDEX statements did).
CREATE INDEX IF NOT EXISTS idx_recent_activity_time ON recent_activity (activity_time DESC);
CREATE INDEX IF NOT EXISTS idx_recent_activity_type ON recent_activity (activity_type);
-- ================================================
-- STORED FUNCTIONS AND PROCEDURES
-- ================================================
-- Function to refresh all materialized views.
-- REFRESH ... CONCURRENTLY requires a UNIQUE index on the view and errors out
-- otherwise (recent_activity has no unique key, so it always lacks one).
-- Each view therefore tries a concurrent refresh first and falls back to a
-- plain, locking refresh; a genuine refresh failure still surfaces from the
-- fallback attempt.
CREATE OR REPLACE FUNCTION refresh_memory_views()
RETURNS void AS $$
DECLARE
view_name text;
BEGIN
FOREACH view_name IN ARRAY
ARRAY['high_confidence_knowledge', 'repository_quality_summary', 'recent_activity']
LOOP
BEGIN
EXECUTE format('REFRESH MATERIALIZED VIEW CONCURRENTLY %I', view_name);
EXCEPTION WHEN OTHERS THEN
-- no unique index (or other CONCURRENTLY restriction): refresh with a lock
EXECUTE format('REFRESH MATERIALIZED VIEW %I', view_name);
END;
END LOOP;
-- Log the refresh. jsonb_build_object handles quoting/escaping, unlike the
-- previous string-concatenated JSON whose timestamp rendering depended on
-- the session's DateStyle setting.
INSERT INTO memory_consolidation_log (
source_type, source_id, target_memory_type, target_id,
consolidation_confidence, consolidation_metadata
) VALUES (
'system', 'materialized_views', 'system', 'view_refresh',
1.0, jsonb_build_object('refresh_time', CURRENT_TIMESTAMP)
);
END;
$$ LANGUAGE plpgsql;
-- Cosine similarity between two embeddings: 1 - cosine distance (<=>).
-- Declared as a plain SQL function so the planner can inline it into queries;
-- IMMUTABLE STRICT PARALLEL SAFE: pure arithmetic, NULL in -> NULL out.
CREATE OR REPLACE FUNCTION calculate_similarity(embedding1 vector(384), embedding2 vector(384))
RETURNS real AS $$
SELECT (1 - (embedding1 <=> embedding2))::real;
$$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE;
-- Bump the access-tracking columns for a single row of one of the embedding
-- tables. Only knowledge_embeddings (keyed by fact_id) and code_embeddings
-- (keyed by uuid id) are supported; any other table name raises an exception.
-- id_column is currently unused: the key column is fixed per table.
CREATE OR REPLACE FUNCTION update_access_pattern(table_name text, id_column text, id_value text)
RETURNS void AS $$
BEGIN
IF table_name = 'knowledge_embeddings' THEN
EXECUTE 'UPDATE knowledge_embeddings SET last_accessed = CURRENT_TIMESTAMP, access_frequency = access_frequency + 1 WHERE fact_id = $1'
USING id_value;
ELSIF table_name = 'code_embeddings' THEN
EXECUTE 'UPDATE code_embeddings SET last_accessed = CURRENT_TIMESTAMP, access_count = access_count + 1 WHERE id = $1::uuid'
USING id_value;
ELSE
RAISE EXCEPTION 'Unsupported table: %', table_name;
END IF;
END;
$$ LANGUAGE plpgsql;
-- Cleanup of old memories.
-- Deletes episodic memories (query_embeddings) older than retention_days and
-- re-weights knowledge confidence from access recency/frequency.
-- Returns the number of query_embeddings rows deleted.
CREATE OR REPLACE FUNCTION cleanup_old_memories(retention_days integer DEFAULT 365)
RETURNS integer AS $$
DECLARE
deleted_count integer := 0;
-- timestamptz (not naive timestamp) so the cutoff compares against the
-- timestamptz columns without an implicit timezone conversion
cutoff_date timestamptz;
BEGIN
-- make_interval is clearer and safer than concatenating ' days' into a string
cutoff_date := CURRENT_TIMESTAMP - make_interval(days => retention_days);
-- Delete old query embeddings (episodic memories)
DELETE FROM query_embeddings WHERE timestamp < cutoff_date;
GET DIAGNOSTICS deleted_count = ROW_COUNT;
-- Re-weight EVERY knowledge row (deliberately no WHERE clause):
-- facts accessed within 30 days gain 5%, stale facts decay 2%, and frequent
-- access adds a logarithmic boost; the result is capped at 1.0 to satisfy
-- the confidence CHECK constraint.
UPDATE knowledge_embeddings
SET confidence = LEAST(confidence * (
CASE
WHEN EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - last_accessed)) / 86400 < 30
THEN 1.05
ELSE 0.98
END *
(1.0 + LOG(access_frequency + 1) / 20.0)
), 1.0);
-- Log cleanup activity. jsonb_build_object handles quoting/escaping, unlike
-- the previous string-concatenated JSON whose timestamp rendering depended
-- on the session's DateStyle setting.
INSERT INTO memory_consolidation_log (
source_type, source_id, target_memory_type, target_id,
consolidation_confidence, consolidation_metadata
) VALUES (
'system', 'cleanup_function', 'system', 'memory_cleanup',
1.0, jsonb_build_object('deleted_records', deleted_count, 'cutoff_date', cutoff_date)
);
RETURN deleted_count;
END;
$$ LANGUAGE plpgsql;
-- Find code files semantically similar to a query embedding.
-- Ordering by the raw distance operator (embedding <=> query) instead of the
-- derived "similarity DESC" alias lets the planner drive the scan through the
-- ivfflat index (ascending distance == descending similarity, same order).
-- STABLE: reads tables only, no writes.
CREATE OR REPLACE FUNCTION find_similar_code(
query_embedding vector(384),
repo_filter text DEFAULT NULL,
similarity_threshold real DEFAULT 0.7,
max_results integer DEFAULT 10
)
RETURNS TABLE (
id uuid,
repo_id varchar(255),
file_path text,
similarity real,
metadata jsonb
) AS $$
BEGIN
RETURN QUERY
SELECT
ce.id,
ce.repo_id,
ce.file_path,
(1 - (ce.embedding <=> query_embedding))::real as similarity,
ce.metadata
FROM code_embeddings ce
WHERE (repo_filter IS NULL OR ce.repo_id = repo_filter)
AND (1 - (ce.embedding <=> query_embedding)) > similarity_threshold
ORDER BY ce.embedding <=> query_embedding
LIMIT max_results;
END;
$$ LANGUAGE plpgsql STABLE;
-- Fetch the highest-confidence knowledge facts for one category.
-- Implemented as a plain SQL function (the previous plpgsql body was a single
-- RETURN QUERY wrapper) so the planner can inline it; STABLE because it only
-- reads from knowledge_embeddings.
CREATE OR REPLACE FUNCTION get_knowledge_by_category(
category_filter varchar(100),
min_confidence real DEFAULT 0.5,
max_results integer DEFAULT 20
)
RETURNS TABLE (
fact_id varchar(255),
content text,
confidence real,
access_frequency integer,
source_repos text[]
) AS $$
SELECT
ke.fact_id,
ke.content,
ke.confidence,
ke.access_frequency,
ke.source_repos
FROM knowledge_embeddings ke
WHERE ke.category = category_filter
AND ke.confidence >= min_confidence
ORDER BY ke.confidence DESC, ke.access_frequency DESC
LIMIT max_results;
$$ LANGUAGE sql STABLE;
-- ================================================
-- TRIGGERS FOR AUTOMATIC MAINTENANCE
-- ================================================
-- Trigger function to update repository metadata when embeddings are added
CREATE OR REPLACE FUNCTION update_repository_stats()
RETURNS trigger AS $$
BEGIN
-- Update or insert repository metadata
-- NOTE(review): repo_id is also inserted as the repo_path placeholder;
-- confirm the application fills in the real path afterwards.
INSERT INTO repository_metadata (repo_id, repo_path, analysis_count, last_analyzed)
VALUES (NEW.repo_id, NEW.repo_id, 1, CURRENT_TIMESTAMP)
ON CONFLICT (repo_id)
DO UPDATE SET
analysis_count = repository_metadata.analysis_count + 1,
last_analyzed = CURRENT_TIMESTAMP;
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
-- Create triggers
-- DROP-then-CREATE keeps the migration re-runnable (no CREATE TRIGGER IF NOT
-- EXISTS before PostgreSQL 14+ CREATE OR REPLACE TRIGGER).
DROP TRIGGER IF EXISTS trigger_update_repo_stats ON code_embeddings;
CREATE TRIGGER trigger_update_repo_stats
AFTER INSERT ON code_embeddings
FOR EACH ROW
EXECUTE FUNCTION update_repository_stats();
-- Trigger to automatically update access patterns
CREATE OR REPLACE FUNCTION auto_update_access()
RETURNS trigger AS $$
BEGIN
-- NOTE(review): this fires on EVERY update of code_embeddings, so any
-- metadata edit also counts as an "access", and it overwrites whatever
-- last_accessed/access_count the triggering UPDATE tried to set
-- (e.g. from update_access_pattern) -- confirm this is intended.
NEW.last_accessed = CURRENT_TIMESTAMP;
NEW.access_count = COALESCE(OLD.access_count, 0) + 1;
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
-- Re-runnable: drop any existing trigger before recreating it
DROP TRIGGER IF EXISTS trigger_auto_access_update ON code_embeddings;
CREATE TRIGGER trigger_auto_access_update
BEFORE UPDATE ON code_embeddings
FOR EACH ROW
EXECUTE FUNCTION auto_update_access();
-- ================================================
-- SECURITY AND PERMISSIONS
-- ================================================
-- Create roles for different access levels.
-- CREATE ROLE has no IF NOT EXISTS clause, so each role is created only
-- after a pg_roles existence check; %I quotes the identifier safely.
DO $$
DECLARE
role_name text;
BEGIN
FOREACH role_name IN ARRAY
ARRAY['repo_analyzer_read', 'repo_analyzer_write', 'repo_analyzer_admin']
LOOP
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = role_name) THEN
EXECUTE format('CREATE ROLE %I', role_name);
END IF;
END LOOP;
END
$$;
-- Grant permissions
-- NOTE(review): ON ALL TABLES/SEQUENCES covers only objects existing at
-- migration time; tables created later need a re-grant (or ALTER DEFAULT
-- PRIVILEGES) for these roles to see them.
GRANT SELECT ON ALL TABLES IN SCHEMA public TO repo_analyzer_read;
GRANT SELECT, INSERT, UPDATE ON ALL TABLES IN SCHEMA public TO repo_analyzer_write;
GRANT USAGE, SELECT ON ALL SEQUENCES IN SCHEMA public TO repo_analyzer_write;
GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO repo_analyzer_admin;
GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA public TO repo_analyzer_admin;
-- NOTE(review): functions are EXECUTEable by PUBLIC by default in PostgreSQL,
-- so this grant only matters if that default is revoked.
GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA public TO repo_analyzer_admin;
-- ================================================
-- DATA VALIDATION AND CONSTRAINTS
-- ================================================
-- Add check constraints for data quality
-- Note: Vector dimensions are validated at insertion time, no need for runtime checks
-- Add constraints for reasonable data ranges.
-- The pg_constraint lookup is scoped by conrelid: constraint names are only
-- unique per table in PostgreSQL, so checking conname alone could be satisfied
-- by a same-named constraint on a different table and silently skip the ADD.
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM pg_constraint
WHERE conname = 'reasonable_lines_of_code'
AND conrelid = 'file_analysis_history'::regclass
) THEN
ALTER TABLE file_analysis_history ADD CONSTRAINT reasonable_lines_of_code
CHECK (lines_of_code >= 0 AND lines_of_code <= 1000000);
END IF;
IF NOT EXISTS (
SELECT 1 FROM pg_constraint
WHERE conname = 'reasonable_complexity'
AND conrelid = 'file_analysis_history'::regclass
) THEN
ALTER TABLE file_analysis_history ADD CONSTRAINT reasonable_complexity
CHECK (complexity_score >= 0.0 AND complexity_score <= 100.0);
END IF;
END
$$;
-- ================================================
-- INITIAL DATA AND CONFIGURATION
-- ================================================
-- Insert initial system configuration
-- NOTE(review): memory_consolidation_log has no unique constraint, so this
-- ON CONFLICT DO NOTHING can never fire -- a fresh row is appended on every
-- migration run.
INSERT INTO memory_consolidation_log (
source_type, source_id, target_memory_type, target_id,
consolidation_confidence, consolidation_metadata
) VALUES (
'system', 'database_migration', 'system', 'initial_setup',
1.0, ('{"migration_version": "1.0", "setup_time": "' || CURRENT_TIMESTAMP || '"}')::jsonb
) ON CONFLICT DO NOTHING;
-- Create initial knowledge categories
-- NOTE(review): the seed facts use all-zero placeholder embeddings
-- (array_fill(0.0, ...)), so they will not rank meaningfully in
-- cosine-similarity search until re-embedded with real vectors.
INSERT INTO knowledge_embeddings (
fact_id, content, category, embedding, confidence, source_repos
) VALUES
(
'init_security_001',
'Always validate and sanitize user input to prevent injection attacks',
'security_vulnerability',
array_fill(0.0, ARRAY[384])::vector(384),
0.95,
ARRAY[]::text[]
),
(
'init_performance_001',
'Use appropriate data structures and algorithms for better performance',
'performance',
array_fill(0.0, ARRAY[384])::vector(384),
0.9,
ARRAY[]::text[]
),
(
'init_best_practice_001',
'Follow consistent naming conventions and code formatting standards',
'best_practice',
array_fill(0.0, ARRAY[384])::vector(384),
0.85,
ARRAY[]::text[]
)
-- fact_id is UNIQUE, so re-running the migration leaves existing facts alone
ON CONFLICT (fact_id) DO NOTHING;
-- ================================================
-- BACKUP AND MAINTENANCE PROCEDURES
-- ================================================
-- Function to create backup of critical memory data.
-- Currently a stub: it computes and logs the intended backup file name but
-- performs no actual dump (pg_dump cannot be invoked from server-side SQL).
-- Returns a human-readable message containing the intended file path.
CREATE OR REPLACE FUNCTION backup_memory_data(backup_path text DEFAULT '/tmp/memory_backup')
RETURNS text AS $$
DECLARE
backup_file text;
result_message text;
BEGIN
backup_file := backup_path || '_' || to_char(CURRENT_TIMESTAMP, 'YYYY-MM-DD_HH24-MI-SS') || '.sql';
-- This would need to be implemented with actual backup logic
-- For now, just return the intended backup file name
result_message := 'Backup would be created at: ' || backup_file;
-- Log backup activity. jsonb_build_object escapes the value properly; the
-- previous string-concatenated JSON produced invalid jsonb (a cast error)
-- whenever backup_path contained a double quote or backslash.
INSERT INTO memory_consolidation_log (
source_type, source_id, target_memory_type, target_id,
consolidation_confidence, consolidation_metadata
) VALUES (
'system', 'backup_function', 'system', 'backup_created',
1.0, jsonb_build_object('backup_file', backup_file)
);
RETURN result_message;
END;
$$ LANGUAGE plpgsql;
-- ================================================
-- MONITORING AND ANALYTICS
-- ================================================
-- View for system health monitoring
-- One row per core table: total rows, newest record timestamp, and the mean
-- of the table's access counter (NULL for query_embeddings, which has none).
CREATE OR REPLACE VIEW system_health_monitor AS
SELECT
'code_embeddings' as table_name,
COUNT(*) as record_count,
MAX(created_at) as latest_record,
AVG(access_count) as avg_access_count
FROM code_embeddings
UNION ALL
SELECT
'query_embeddings' as table_name,
COUNT(*) as record_count,
MAX(timestamp) as latest_record,
-- no per-row access counter on this table
NULL as avg_access_count
FROM query_embeddings
UNION ALL
SELECT
'knowledge_embeddings' as table_name,
COUNT(*) as record_count,
MAX(created_at) as latest_record,
AVG(access_frequency) as avg_access_count
FROM knowledge_embeddings;
-- Function to get comprehensive system statistics as one jsonb document.
-- Each entry is an independent scalar subquery; the document is built and
-- returned directly (no intermediate variable needed).
CREATE OR REPLACE FUNCTION get_system_statistics()
RETURNS jsonb AS $$
BEGIN
RETURN jsonb_build_object(
'total_code_embeddings', (SELECT COUNT(*) FROM code_embeddings),
'total_query_embeddings', (SELECT COUNT(*) FROM query_embeddings),
'total_knowledge_embeddings', (SELECT COUNT(*) FROM knowledge_embeddings),
'unique_repositories', (SELECT COUNT(DISTINCT repo_id) FROM code_embeddings),
'high_confidence_knowledge', (SELECT COUNT(*) FROM knowledge_embeddings WHERE confidence > 0.8),
'recent_activity_7d', (SELECT COUNT(*) FROM query_embeddings WHERE timestamp >= CURRENT_TIMESTAMP - INTERVAL '7 days'),
'average_code_quality', (SELECT AVG(quality_score) FROM repository_metadata),
'last_updated', CURRENT_TIMESTAMP
);
END;
$$ LANGUAGE plpgsql;
-- ================================================
-- COMPLETION MESSAGE
-- ================================================
-- Emit a human-readable summary once the migration script has run to the end.
DO $$
BEGIN
RAISE NOTICE '================================================';
RAISE NOTICE 'Repository Analyzer Memory System Database Setup Complete';
RAISE NOTICE '================================================';
RAISE NOTICE 'Tables created: code_embeddings, query_embeddings, knowledge_embeddings';
RAISE NOTICE 'Indexes created: Vector similarity indexes with IVFFlat';
RAISE NOTICE 'Functions created: Similarity search, cleanup, statistics';
RAISE NOTICE 'Materialized views created: High confidence knowledge, repository summary';
RAISE NOTICE 'Triggers created: Auto-update repository stats and access patterns';
RAISE NOTICE '================================================';
RAISE NOTICE 'Ready for AI-enhanced repository analysis with persistent memory';
RAISE NOTICE '================================================';
END
$$;