added qdrant db in multi doc service
This commit is contained in:
parent 603e9b4b20
commit 72fea0dee8
@@ -196,27 +196,45 @@ services:
# retries: 5
# start_period: 60s

chromadb:
image: chromadb/chroma:latest
container_name: pipeline_chromadb
# chromadb:
# image: chromadb/chroma:latest
# container_name: pipeline_chromadb
# ports:
# - "8010:8000"
# environment:
# - CHROMA_SERVER_HOST=0.0.0.0
# - CHROMA_SERVER_HTTP_PORT=8000
# - IS_PERSISTENT=TRUE
# - PERSIST_DIRECTORY=/chroma/chroma
# - ANONYMIZED_TELEMETRY=TRUE
# volumes:
# - chromadb_data:/chroma/chroma
# networks:
# - pipeline_network
# healthcheck:
# test: ["CMD-SHELL", "timeout 5 bash -c '</dev/tcp/127.0.0.1/8000' || exit 1"]
# interval: 15s
# timeout: 10s
# retries: 3
# start_period: 30s

qdrant:
image: qdrant/qdrant:latest
container_name: pipeline_qdrant
ports:
- "8010:8000"
environment:
- CHROMA_SERVER_HOST=0.0.0.0
- CHROMA_SERVER_HTTP_PORT=8000
- IS_PERSISTENT=TRUE
- PERSIST_DIRECTORY=/chroma/chroma
- ANONYMIZED_TELEMETRY=TRUE
- "6333:6333"
- "6334:6334"
volumes:
- chromadb_data:/chroma/chroma
- qdrant_data:/qdrant/storage
networks:
- pipeline_network
healthcheck:
test: ["CMD-SHELL", "timeout 5 bash -c '</dev/tcp/127.0.0.1/8000' || exit 1"]
interval: 15s
test: ["CMD-SHELL", "timeout 2 bash -c '</dev/tcp/127.0.0.1/6333' || exit 1"]
interval: 30s
timeout: 10s
retries: 3
retries: 5
start_period: 30s
restart: unless-stopped

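A quick way to confirm the new qdrant container is reachable from the host is a short check like the sketch below. This is an illustrative example, not part of the commit; it assumes the qdrant-client Python package is installed and that port 6333 is published on localhost as in the service definition above.

# Minimal sketch: verify the Qdrant container defined above is reachable.
# Assumes `pip install qdrant-client` and the 6333 port mapping from docker-compose.
from qdrant_client import QdrantClient

client = QdrantClient(url="http://localhost:6333")

# List collections; an empty list is expected on a fresh container.
collections = client.get_collections()
print("Qdrant is up, collections:", [c.name for c in collections.collections])
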
@ -294,97 +312,97 @@ services:
|
||||
start_period: 40s
|
||||
restart: unless-stopped
|
||||
|
||||
requirement-processor:
|
||||
build: ./services/requirement-processor
|
||||
container_name: pipeline_requirement_processor
|
||||
ports:
|
||||
- "8001:8001"
|
||||
environment:
|
||||
- POSTGRES_HOST=postgres
|
||||
- POSTGRES_PORT=5432
|
||||
- POSTGRES_DB=dev_pipeline
|
||||
- POSTGRES_USER=pipeline_admin
|
||||
- POSTGRES_PASSWORD=secure_pipeline_2024
|
||||
- DATABASE_URL=postgresql://pipeline_admin:secure_pipeline_2024@postgres:5432/dev_pipeline
|
||||
- REDIS_HOST=redis
|
||||
- REDIS_PORT=6379
|
||||
- REDIS_PASSWORD=redis_secure_2024
|
||||
- MONGODB_HOST=mongodb
|
||||
- MONGODB_PORT=27017
|
||||
- NEO4J_URI=bolt://neo4j:7687
|
||||
- NEO4J_USER=neo4j
|
||||
- NEO4J_PASSWORD=password
|
||||
- CHROMA_HOST=chromadb
|
||||
- CHROMA_PORT=8000
|
||||
- REDIS_URL=redis://:redis_secure_2024@redis:6379
|
||||
networks:
|
||||
- pipeline_network
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
redis:
|
||||
condition: service_healthy
|
||||
mongodb:
|
||||
condition: service_started
|
||||
migrations:
|
||||
condition: service_completed_successfully
|
||||
# requirement-processor:
|
||||
# build: ./services/requirement-processor
|
||||
# container_name: pipeline_requirement_processor
|
||||
# ports:
|
||||
# - "8001:8001"
|
||||
# environment:
|
||||
# - POSTGRES_HOST=postgres
|
||||
# - POSTGRES_PORT=5432
|
||||
# - POSTGRES_DB=dev_pipeline
|
||||
# - POSTGRES_USER=pipeline_admin
|
||||
# - POSTGRES_PASSWORD=secure_pipeline_2024
|
||||
# - DATABASE_URL=postgresql://pipeline_admin:secure_pipeline_2024@postgres:5432/dev_pipeline
|
||||
# - REDIS_HOST=redis
|
||||
# - REDIS_PORT=6379
|
||||
# - REDIS_PASSWORD=redis_secure_2024
|
||||
# - MONGODB_HOST=mongodb
|
||||
# - MONGODB_PORT=27017
|
||||
# - NEO4J_URI=bolt://neo4j:7687
|
||||
# - NEO4J_USER=neo4j
|
||||
# - NEO4J_PASSWORD=password
|
||||
# - CHROMA_HOST=chromadb
|
||||
# - CHROMA_PORT=8000
|
||||
# - REDIS_URL=redis://:redis_secure_2024@redis:6379
|
||||
# networks:
|
||||
# - pipeline_network
|
||||
# depends_on:
|
||||
# postgres:
|
||||
# condition: service_healthy
|
||||
# redis:
|
||||
# condition: service_healthy
|
||||
# mongodb:
|
||||
# condition: service_started
|
||||
# migrations:
|
||||
# condition: service_completed_successfully
|
||||
|
||||
tech-stack-selector:
|
||||
build: ./services/tech-stack-selector
|
||||
container_name: pipeline_tech_stack_selector
|
||||
ports:
|
||||
- "8002:8002"
|
||||
environment:
|
||||
- POSTGRES_HOST=postgres
|
||||
- POSTGRES_PORT=5432
|
||||
- POSTGRES_DB=dev_pipeline
|
||||
- POSTGRES_USER=pipeline_admin
|
||||
- POSTGRES_PASSWORD=secure_pipeline_2024
|
||||
- REDIS_HOST=redis
|
||||
- REDIS_PORT=6379
|
||||
- REDIS_PASSWORD=redis_secure_2024
|
||||
- CLAUDE_API_KEY=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
|
||||
networks:
|
||||
- pipeline_network
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
redis:
|
||||
condition: service_healthy
|
||||
migrations:
|
||||
condition: service_completed_successfully
|
||||
# tech-stack-selector:
|
||||
# build: ./services/tech-stack-selector
|
||||
# container_name: pipeline_tech_stack_selector
|
||||
# ports:
|
||||
# - "8002:8002"
|
||||
# environment:
|
||||
# - POSTGRES_HOST=postgres
|
||||
# - POSTGRES_PORT=5432
|
||||
# - POSTGRES_DB=dev_pipeline
|
||||
# - POSTGRES_USER=pipeline_admin
|
||||
# - POSTGRES_PASSWORD=secure_pipeline_2024
|
||||
# - REDIS_HOST=redis
|
||||
# - REDIS_PORT=6379
|
||||
# - REDIS_PASSWORD=redis_secure_2024
|
||||
# - CLAUDE_API_KEY=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
|
||||
# networks:
|
||||
# - pipeline_network
|
||||
# depends_on:
|
||||
# postgres:
|
||||
# condition: service_healthy
|
||||
# redis:
|
||||
# condition: service_healthy
|
||||
# migrations:
|
||||
# condition: service_completed_successfully
|
||||
|
||||
architecture-designer:
|
||||
build: ./services/architecture-designer
|
||||
container_name: pipeline_architecture_designer
|
||||
ports:
|
||||
- "8003:8003"
|
||||
environment:
|
||||
- PORT=8003
|
||||
- HOST=0.0.0.0
|
||||
- CLAUDE_API_KEY=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
|
||||
- ANTHROPIC_API_KEY=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
|
||||
- POSTGRES_HOST=postgres
|
||||
- POSTGRES_PORT=5432
|
||||
- POSTGRES_DB=dev_pipeline
|
||||
- POSTGRES_USER=pipeline_admin
|
||||
- POSTGRES_PASSWORD=secure_pipeline_2024
|
||||
- MONGODB_HOST=mongodb
|
||||
- MONGODB_PORT=27017
|
||||
networks:
|
||||
- pipeline_network
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
mongodb:
|
||||
condition: service_started
|
||||
migrations:
|
||||
condition: service_completed_successfully
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:8003/health"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
# architecture-designer:
|
||||
# build: ./services/architecture-designer
|
||||
# container_name: pipeline_architecture_designer
|
||||
# ports:
|
||||
# - "8003:8003"
|
||||
# environment:
|
||||
# - PORT=8003
|
||||
# - HOST=0.0.0.0
|
||||
# - CLAUDE_API_KEY=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
|
||||
# - ANTHROPIC_API_KEY=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
|
||||
# - POSTGRES_HOST=postgres
|
||||
# - POSTGRES_PORT=5432
|
||||
# - POSTGRES_DB=dev_pipeline
|
||||
# - POSTGRES_USER=pipeline_admin
|
||||
# - POSTGRES_PASSWORD=secure_pipeline_2024
|
||||
# - MONGODB_HOST=mongodb
|
||||
# - MONGODB_PORT=27017
|
||||
# networks:
|
||||
# - pipeline_network
|
||||
# depends_on:
|
||||
# postgres:
|
||||
# condition: service_healthy
|
||||
# mongodb:
|
||||
# condition: service_started
|
||||
# migrations:
|
||||
# condition: service_completed_successfully
|
||||
# healthcheck:
|
||||
# test: ["CMD", "curl", "-f", "http://localhost:8003/health"]
|
||||
# interval: 30s
|
||||
# timeout: 10s
|
||||
# retries: 3
|
||||
|
||||
# code-generator:
|
||||
# build: ./services/code-generator
|
||||
@ -461,34 +479,34 @@ services:
|
||||
migrations:
|
||||
condition: service_completed_successfully
|
||||
|
||||
deployment-manager:
|
||||
build: ./services/deployment-manager
|
||||
container_name: pipeline_deployment_manager
|
||||
ports:
|
||||
- "8006:8006"
|
||||
environment:
|
||||
- POSTGRES_HOST=postgres
|
||||
- POSTGRES_PORT=5432
|
||||
- POSTGRES_DB=dev_pipeline
|
||||
- POSTGRES_USER=pipeline_admin
|
||||
- POSTGRES_PASSWORD=secure_pipeline_2024
|
||||
- MONGODB_HOST=mongodb
|
||||
- MONGODB_PORT=27017
|
||||
- RABBITMQ_HOST=rabbitmq
|
||||
- RABBITMQ_PORT=5672
|
||||
- RABBITMQ_USER=pipeline_admin
|
||||
- RABBITMQ_PASSWORD=rabbit_secure_2024
|
||||
networks:
|
||||
- pipeline_network
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
rabbitmq:
|
||||
condition: service_healthy
|
||||
mongodb:
|
||||
condition: service_started
|
||||
migrations:
|
||||
condition: service_completed_successfully
|
||||
# deployment-manager:
|
||||
# build: ./services/deployment-manager
|
||||
# container_name: pipeline_deployment_manager
|
||||
# ports:
|
||||
# - "8006:8006"
|
||||
# environment:
|
||||
# - POSTGRES_HOST=postgres
|
||||
# - POSTGRES_PORT=5432
|
||||
# - POSTGRES_DB=dev_pipeline
|
||||
# - POSTGRES_USER=pipeline_admin
|
||||
# - POSTGRES_PASSWORD=secure_pipeline_2024
|
||||
# - MONGODB_HOST=mongodb
|
||||
# - MONGODB_PORT=27017
|
||||
# - RABBITMQ_HOST=rabbitmq
|
||||
# - RABBITMQ_PORT=5672
|
||||
# - RABBITMQ_USER=pipeline_admin
|
||||
# - RABBITMQ_PASSWORD=rabbit_secure_2024
|
||||
# networks:
|
||||
# - pipeline_network
|
||||
# depends_on:
|
||||
# postgres:
|
||||
# condition: service_healthy
|
||||
# rabbitmq:
|
||||
# condition: service_healthy
|
||||
# mongodb:
|
||||
# condition: service_started
|
||||
# migrations:
|
||||
# condition: service_completed_successfully
|
||||
|
||||
user-auth:
|
||||
build: ./services/user-auth
|
||||
@ -583,38 +601,38 @@ services:
|
||||
restart: unless-stopped
|
||||
|
||||
# AI Mockup / Wireframe Generation Service
|
||||
ai-mockup-service:
|
||||
build: ./services/ai-mockup-service
|
||||
container_name: pipeline_ai_mockup_service
|
||||
ports:
|
||||
- "8021:8021"
|
||||
environment:
|
||||
- PORT=8021
|
||||
- HOST=0.0.0.0
|
||||
- CLAUDE_API_KEY=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
|
||||
- POSTGRES_HOST=postgres
|
||||
- POSTGRES_PORT=5432
|
||||
- POSTGRES_DB=dev_pipeline
|
||||
- POSTGRES_USER=pipeline_admin
|
||||
- POSTGRES_PASSWORD=secure_pipeline_2024
|
||||
- REDIS_HOST=redis
|
||||
- REDIS_PORT=6379
|
||||
- REDIS_PASSWORD=redis_secure_2024
|
||||
- JWT_ACCESS_SECRET=access-secret-key-2024-tech4biz-secure_pipeline_2024
|
||||
- USER_AUTH_SERVICE_URL=http://user-auth:8011
|
||||
- FLASK_ENV=development
|
||||
networks:
|
||||
- pipeline_network
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
user-auth:
|
||||
condition: service_healthy
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:8021/health"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
# ai-mockup-service:
|
||||
# build: ./services/ai-mockup-service
|
||||
# container_name: pipeline_ai_mockup_service
|
||||
# ports:
|
||||
# - "8021:8021"
|
||||
# environment:
|
||||
# - PORT=8021
|
||||
# - HOST=0.0.0.0
|
||||
# - CLAUDE_API_KEY=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
|
||||
# - POSTGRES_HOST=postgres
|
||||
# - POSTGRES_PORT=5432
|
||||
# - POSTGRES_DB=dev_pipeline
|
||||
# - POSTGRES_USER=pipeline_admin
|
||||
# - POSTGRES_PASSWORD=secure_pipeline_2024
|
||||
# - REDIS_HOST=redis
|
||||
# - REDIS_PORT=6379
|
||||
# - REDIS_PASSWORD=redis_secure_2024
|
||||
# - JWT_ACCESS_SECRET=access-secret-key-2024-tech4biz-secure_pipeline_2024
|
||||
# - USER_AUTH_SERVICE_URL=http://user-auth:8011
|
||||
# - FLASK_ENV=development
|
||||
# networks:
|
||||
# - pipeline_network
|
||||
# depends_on:
|
||||
# postgres:
|
||||
# condition: service_healthy
|
||||
# user-auth:
|
||||
# condition: service_healthy
|
||||
# healthcheck:
|
||||
# test: ["CMD", "curl", "-f", "http://localhost:8021/health"]
|
||||
# interval: 30s
|
||||
# timeout: 10s
|
||||
# retries: 3
|
||||
|
||||
git-integration:
|
||||
build: ./services/git-integration
|
||||
@@ -731,7 +749,7 @@ services:
environment:
- PORT=8022
- HOST=0.0.0.0
- ANTHROPIC_API_KEY=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
- ANTHROPIC_API_KEY=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

# Neo4j Configuration
- USE_NEO4J_KG=true
@@ -790,17 +808,37 @@ services:
environment:
- PORT=8024
- HOST=0.0.0.0
- ANTHROPIC_API_KEY=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

# Claude/Anthropic Configuration
- ANTHROPIC_API_KEY=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
- MULTI_DOC_CLAUDE_MODEL=claude-3-5-haiku-latest
- CLAUDE_MODEL=claude-3-5-haiku-latest

# Qwen2.5-VL API Configuration
- QWEN_API_KEY=${QWEN_API_KEY:-}
- QWEN_API_URL=${QWEN_API_URL:-https://api.example.com/v1/chat/completions}
- QWEN_MODEL=qwen2.5-vl

# Neo4j Configuration
- NEO4J_URI=bolt://neo4j:7687
- NEO4J_USER=neo4j
- NEO4J_PASSWORD=password
- NEO4J_DATABASE=neo4j

# Qdrant Configuration
- QDRANT_URL=http://qdrant:6333
- QDRANT_COLLECTION_NAME=kg_embeddings

# DoWhy Configuration
- DOWHY_ENABLED=true
- DOWHY_CONFIDENCE_THRESHOLD=0.05

# Embedding Configuration
- EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
- EMBEDDING_DIMENSION=384

# Storage Configuration
- STORAGE_DIR=/app/storage
- MULTI_DOC_STORAGE_ROOT=/app/storage

# Database configurations (optional, for job tracking)
- POSTGRES_HOST=pipeline_postgres
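As a rough illustration of how the Qdrant and embedding settings above are typically consumed by a service, here is a minimal sketch. It is not taken from the multi-document service code; it assumes qdrant-client and sentence-transformers are available in the image, and that collection_exists is provided by the installed qdrant-client version.

# Illustrative sketch (not from the service code) of using the settings above.
import os
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from sentence_transformers import SentenceTransformer

client = QdrantClient(url=os.getenv("QDRANT_URL", "http://qdrant:6333"))
collection = os.getenv("QDRANT_COLLECTION_NAME", "kg_embeddings")
model = SentenceTransformer(os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2"))
dim = int(os.getenv("EMBEDDING_DIMENSION", "384"))

# Create the collection on first use; all-MiniLM-L6-v2 produces 384-dim vectors.
if not client.collection_exists(collection):
    client.create_collection(
        collection_name=collection,
        vectors_config=VectorParams(size=dim, distance=Distance.COSINE),
    )

# Embed a document chunk and upsert it as a point.
vector = model.encode("Example document chunk").tolist()
client.upsert(
    collection_name=collection,
    points=[PointStruct(id=1, vector=vector, payload={"source": "example.md"})],
)
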
@@ -817,6 +855,8 @@ services:
depends_on:
neo4j:
condition: service_healthy
qdrant:
condition: service_healthy
postgres:
condition: service_healthy
redis:
@@ -958,6 +998,8 @@ volumes:
driver: local
multi_document_storage:
driver: local
qdrant_data:
driver: local

# =====================================
# Networks

@ -7094,8 +7094,29 @@ async def main():
|
||||
js_files = [fa for fa in frontend_files if fa.path.lower().endswith(('.js', '.jsx', '.mjs', '.cjs'))]
|
||||
ts_files = [fa for fa in frontend_files if fa.path.lower().endswith(('.ts', '.tsx'))]
|
||||
|
||||
# Allocate frontend persona
|
||||
from persona_system import allocate_code_persona, build_code_analysis_persona_prompt
|
||||
|
||||
# Determine if it's UI or state management focused
|
||||
has_state_files = len(state_files) > 0
|
||||
sample_file = frontend_files[0] if frontend_files else None
|
||||
sample_path = sample_file.path if sample_file else ""
|
||||
sample_content = getattr(sample_file, 'content', '')[:1000] if sample_file else ""
|
||||
|
||||
# Allocate persona - prefer state management if state files exist
|
||||
if has_state_files:
|
||||
# Try to get state management persona
|
||||
persona = allocate_code_persona("store/state.ts", sample_content, "frontend_state")
|
||||
if "state" not in persona.get("role", "").lower():
|
||||
# Fallback to UI persona
|
||||
persona = allocate_code_persona(sample_path, sample_content, "frontend_ui")
|
||||
else:
|
||||
persona = allocate_code_persona(sample_path, sample_content, "frontend_ui")
|
||||
|
||||
assignment_context = f"CTO has assigned you to analyze the frontend codebase for this project. You are analyzing {len(frontend_files)} frontend files including components, routing, state management, and configuration."
|
||||
|
||||
front_end_prompt = f"""
|
||||
You are a Senior Frontend Architect and Technical Writer with 20+ years of experience. Analyze this frontend codebase and produce a comprehensive, technically precise report. The audience includes senior engineers and stakeholders who expect evidence-based, objective findings.
|
||||
Analyze this frontend codebase and produce a comprehensive, technically precise report. The audience includes senior engineers and stakeholders who expect evidence-based, objective findings.
|
||||
|
||||
STRICT STYLE RULES:
|
||||
- Use professional, technical language only. Do not use analogies, metaphors, storytelling, or colloquial comparisons.
|
||||
@ -7211,6 +7232,9 @@ FINAL REQUIREMENTS:
|
||||
- Ensure total length between 2000-3000 words.
|
||||
"""
|
||||
|
||||
# Enhance prompt with persona
|
||||
enhanced_prompt = build_code_analysis_persona_prompt(front_end_prompt, persona, assignment_context)
|
||||
|
||||
try:
|
||||
print(f"🤖 [FRONTEND AI] Calling Claude API for comprehensive frontend analysis...")
|
||||
print(f"🤖 [FRONTEND AI] Analyzing {len(frontend_files)} frontend files...")
|
||||
@ -7220,7 +7244,7 @@ FINAL REQUIREMENTS:
|
||||
model=os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest"),
|
||||
max_tokens=8000, # Increased from 6000 to 8000 for more detailed analysis
|
||||
temperature=0.1,
|
||||
messages=[{"role": "user", "content": front_end_prompt}]
|
||||
messages=[{"role": "user", "content": enhanced_prompt}]
|
||||
)
|
||||
|
||||
ai_analysis = message.content[0].text.strip()
|
||||
@ -7230,7 +7254,7 @@ FINAL REQUIREMENTS:
|
||||
if not ai_analysis or len(ai_analysis) < 100:
|
||||
print("⚠️ [FRONTEND AI] AI analysis too short, regenerating...")
|
||||
# Retry with more emphasis on detail
|
||||
retry_prompt = front_end_prompt + "\n\nIMPORTANT: Provide a VERY DETAILED analysis. The previous response was too short. Please provide at least 2000 words of detailed explanation."
|
||||
retry_prompt = enhanced_prompt + "\n\nIMPORTANT: Provide a VERY DETAILED analysis. The previous response was too short. Please provide at least 2000 words of detailed explanation."
|
||||
message = self.client.messages.create(
|
||||
model=os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest"),
|
||||
max_tokens=8000,
|
||||
|
||||
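The state-versus-UI persona fallback used above can be exercised on its own; a hypothetical check (file paths and content are invented for illustration):

# Hypothetical check of the frontend persona fallback used above.
from persona_system import allocate_code_persona

# A Redux-style store file should match the frontend_state detection keywords.
persona = allocate_code_persona("src/store/state.ts", "import { createSlice } from '@reduxjs/toolkit'", "frontend_state")
if "state" not in persona.get("role", "").lower():
    # Same fallback as in the service: fall back to the UI persona.
    persona = allocate_code_persona("src/components/App.tsx", "", "frontend_ui")
print(persona.get("role"))  # e.g. "Senior Frontend State Management Architect"
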
@ -524,7 +524,11 @@ class ChunkAnalyzer:
|
||||
def _build_chunk_analysis_prompt(self, file_path: str, chunk: ChunkInfo,
|
||||
chunk_index: int, total_chunks: int,
|
||||
context_memories: Dict[str, Any]) -> str:
|
||||
"""Build comprehensive analysis prompt for a chunk."""
|
||||
"""Build comprehensive analysis prompt for a chunk with persona."""
|
||||
from persona_system import allocate_code_persona, build_code_analysis_persona_prompt
|
||||
|
||||
# Allocate persona based on file path and chunk content
|
||||
persona = allocate_code_persona(file_path, chunk.content, chunk.chunk_type)
|
||||
|
||||
# Build context information
|
||||
context_info = ""
|
||||
@ -538,8 +542,10 @@ class ChunkAnalyzer:
|
||||
for practice in context_memories['best_practices'][:3]:
|
||||
context_info += f"- {practice['content'][:100]}...\n"
|
||||
|
||||
assignment_context = f"CTO has assigned you to analyze chunk {chunk_index + 1} of {total_chunks} from file: {file_path}. This is a {chunk.chunk_type} chunk covering lines {chunk.start_line}-{chunk.end_line}."
|
||||
|
||||
prompt = f"""
|
||||
You are a senior software engineer analyzing chunk {chunk_index + 1} of {total_chunks} from file: {file_path}
|
||||
Analyzing chunk {chunk_index + 1} of {total_chunks} from file: {file_path}
|
||||
|
||||
CHUNK INFORMATION:
|
||||
- Chunk Type: {chunk.chunk_type}
|
||||
@ -564,7 +570,10 @@ Provide a focused analysis of this specific chunk, considering:
|
||||
|
||||
Focus on actionable insights for this specific code section.
|
||||
"""
|
||||
return prompt
|
||||
|
||||
# Enhance with persona
|
||||
enhanced_prompt = build_code_analysis_persona_prompt(prompt, persona, assignment_context)
|
||||
return enhanced_prompt
|
||||
|
||||
def _detect_language_from_path(self, file_path: str) -> str:
|
||||
"""Detect language from file path."""
|
||||
|
||||
755 services/ai-analysis-service/persona_system.py (new file)
@@ -0,0 +1,755 @@
"""
|
||||
World-Class Persona System for AI Analysis
|
||||
Simulates real-world team allocation with domain-specific experts from top companies.
|
||||
"""
|
||||
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
import re
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# CODE ANALYSIS PERSONAS (for AI Analysis Service)
|
||||
# ============================================================================
|
||||
|
||||
CODE_ANALYSIS_PERSONAS = {
|
||||
# BACKEND DOMAINS
|
||||
"backend_api": {
|
||||
"role": "Senior Backend API Architect",
|
||||
"companies": ["Google", "Amazon", "Stripe"],
|
||||
"expertise": ["REST APIs", "GraphQL", "gRPC", "API Gateway", "Microservices"],
|
||||
"experience_years": "18+",
|
||||
"achievements": [
|
||||
"Designed APIs at Google Cloud Platform handling 10M+ requests/day",
|
||||
"Built scalable API infrastructure at Amazon AWS serving millions of customers",
|
||||
"Led API architecture at Stripe processing billions in transactions"
|
||||
],
|
||||
"detection_keywords": ["api", "controller", "route", "endpoint", "service", "rest", "graphql"],
|
||||
"focus_areas": [
|
||||
"API design patterns and best practices",
|
||||
"API versioning and backward compatibility",
|
||||
"Rate limiting and throttling strategies",
|
||||
"API documentation quality",
|
||||
"Security vulnerabilities in API endpoints"
|
||||
]
|
||||
},
|
||||
|
||||
"backend_database": {
|
||||
"role": "Senior Database Architect",
|
||||
"companies": ["Amazon", "Oracle", "MongoDB"],
|
||||
"expertise": ["SQL", "NoSQL", "Database Design", "Query Optimization", "Data Modeling"],
|
||||
"experience_years": "20+",
|
||||
"achievements": [
|
||||
"Designed database systems at Amazon handling petabytes of data",
|
||||
"Optimized databases at Oracle for enterprise-scale applications",
|
||||
"Built distributed databases at MongoDB for global scale"
|
||||
],
|
||||
"detection_keywords": ["database", "db", "model", "schema", "migration", "repository", "orm", "query"],
|
||||
"focus_areas": [
|
||||
"Database schema design and normalization",
|
||||
"Query performance and optimization",
|
||||
"Data integrity and constraints",
|
||||
"Indexing strategies",
|
||||
"Transaction management"
|
||||
]
|
||||
},
|
||||
|
||||
"backend_business": {
|
||||
"role": "Senior Backend Business Logic Architect",
|
||||
"companies": ["Microsoft", "Salesforce", "SAP"],
|
||||
"expertise": ["Business Logic", "Domain Modeling", "Design Patterns", "Service Layer"],
|
||||
"experience_years": "17+",
|
||||
"achievements": [
|
||||
"Architected business logic systems at Microsoft for enterprise applications",
|
||||
"Designed domain models at Salesforce for CRM platforms",
|
||||
"Built service layers at SAP for ERP systems"
|
||||
],
|
||||
"detection_keywords": ["service", "business", "logic", "domain", "entity", "dto", "handler"],
|
||||
"focus_areas": [
|
||||
"Code organization and structure",
|
||||
"Design patterns implementation",
|
||||
"Business logic maintainability",
|
||||
"Domain modeling quality",
|
||||
"Service layer architecture"
|
||||
]
|
||||
},
|
||||
|
||||
# FRONTEND DOMAINS
|
||||
"frontend_ui": {
|
||||
"role": "Senior Frontend UI Architect",
|
||||
"companies": ["Apple", "Meta", "Netflix"],
|
||||
"expertise": ["React", "Vue", "Angular", "Component Design", "UI/UX"],
|
||||
"experience_years": "15+",
|
||||
"achievements": [
|
||||
"Built user interfaces at Apple used by millions daily",
|
||||
"Led React architecture at Meta (Facebook) for large-scale applications",
|
||||
"Designed performance-optimized UIs at Netflix for 200M+ users"
|
||||
],
|
||||
"detection_keywords": ["component", "ui", "view", "page", "jsx", "tsx", "vue", "template"],
|
||||
"focus_areas": [
|
||||
"Component architecture and reusability",
|
||||
"User experience and accessibility",
|
||||
"UI performance optimization",
|
||||
"Design system consistency",
|
||||
"Responsive design implementation"
|
||||
]
|
||||
},
|
||||
|
||||
"frontend_state": {
|
||||
"role": "Senior Frontend State Management Architect",
|
||||
"companies": ["Meta", "Netflix", "Airbnb"],
|
||||
"expertise": ["Redux", "Zustand", "Context API", "State Management", "Data Flow"],
|
||||
"experience_years": "14+",
|
||||
"achievements": [
|
||||
"Architected state management at Meta for complex applications",
|
||||
"Designed data flow patterns at Netflix for real-time updates",
|
||||
"Built state systems at Airbnb for booking platforms"
|
||||
],
|
||||
"detection_keywords": ["store", "state", "redux", "context", "recoil", "zustand", "mobx"],
|
||||
"focus_areas": [
|
||||
"State architecture and patterns",
|
||||
"Data flow optimization",
|
||||
"State synchronization",
|
||||
"Performance in state updates",
|
||||
"State management best practices"
|
||||
]
|
||||
},
|
||||
|
||||
# DEVOPS DOMAINS
|
||||
"devops_ci_cd": {
|
||||
"role": "Senior DevOps CI/CD Architect",
|
||||
"companies": ["Google", "Netflix", "Uber"],
|
||||
"expertise": ["CI/CD", "Jenkins", "GitHub Actions", "GitLab CI", "Deployment Automation"],
|
||||
"experience_years": "12+",
|
||||
"achievements": [
|
||||
"Built CI/CD pipelines at Google handling 50K+ deployments/day",
|
||||
"Designed deployment systems at Netflix for zero-downtime releases",
|
||||
"Architected automation at Uber for global scale"
|
||||
],
|
||||
"detection_keywords": ["ci", "cd", "pipeline", "jenkins", "github-actions", "gitlab", "deploy"],
|
||||
"focus_areas": [
|
||||
"CI/CD pipeline efficiency",
|
||||
"Deployment strategy and automation",
|
||||
"Quality gates and testing",
|
||||
"Rollback strategies",
|
||||
"Build optimization"
|
||||
]
|
||||
},
|
||||
|
||||
"devops_infrastructure": {
|
||||
"role": "Senior Infrastructure Architect",
|
||||
"companies": ["Amazon", "Google", "Microsoft"],
|
||||
"expertise": ["Kubernetes", "Docker", "Terraform", "Cloud Infrastructure", "Scalability"],
|
||||
"experience_years": "16+",
|
||||
"achievements": [
|
||||
"Designed infrastructure at Amazon AWS for global scale",
|
||||
"Built container orchestration at Google for millions of containers",
|
||||
"Architected cloud systems at Microsoft Azure with 99.99% uptime"
|
||||
],
|
||||
"detection_keywords": ["docker", "kubernetes", "terraform", "infrastructure", "cloud", "aws", "gcp", "azure"],
|
||||
"focus_areas": [
|
||||
"Infrastructure scalability",
|
||||
"System reliability and uptime",
|
||||
"Cost optimization",
|
||||
"Security in infrastructure",
|
||||
"Monitoring and observability"
|
||||
]
|
||||
},
|
||||
|
||||
# SECURITY DOMAINS
|
||||
"security_engineer": {
|
||||
"role": "Senior Security Engineer",
|
||||
"companies": ["Google", "Microsoft", "Cloudflare"],
|
||||
"expertise": ["Security", "Vulnerability Assessment", "Penetration Testing", "Security Architecture"],
|
||||
"experience_years": "15+",
|
||||
"achievements": [
|
||||
"Led security initiatives at Google protecting billions of users",
|
||||
"Designed security systems at Microsoft for enterprise applications",
|
||||
"Built security infrastructure at Cloudflare for DDoS protection"
|
||||
],
|
||||
"detection_keywords": ["security", "auth", "encryption", "jwt", "oauth", "ssl", "tls", "cors"],
|
||||
"focus_areas": [
|
||||
"Security vulnerabilities and threats",
|
||||
"Authentication and authorization",
|
||||
"Data encryption and protection",
|
||||
"Security best practices",
|
||||
"Compliance and regulations"
|
||||
]
|
||||
},
|
||||
|
||||
# DATA DOMAINS
|
||||
"data_engineer": {
|
||||
"role": "Senior Data Engineer",
|
||||
"companies": ["Google", "Netflix", "Uber"],
|
||||
"expertise": ["Data Pipelines", "ETL", "Big Data", "Data Warehousing", "Spark"],
|
||||
"experience_years": "13+",
|
||||
"achievements": [
|
||||
"Built data pipelines at Google processing petabytes daily",
|
||||
"Designed ETL systems at Netflix for real-time analytics",
|
||||
"Architected data infrastructure at Uber for millions of rides"
|
||||
],
|
||||
"detection_keywords": ["data", "pipeline", "etl", "warehouse", "spark", "hadoop", "kafka"],
|
||||
"focus_areas": [
|
||||
"Data architecture and pipelines",
|
||||
"ETL performance and optimization",
|
||||
"Data quality and validation",
|
||||
"Scalability in data processing",
|
||||
"Data governance"
|
||||
]
|
||||
},
|
||||
|
||||
"ml_engineer": {
|
||||
"role": "Senior ML/AI Engineer",
|
||||
"companies": ["OpenAI", "Anthropic", "Google DeepMind"],
|
||||
"expertise": ["Machine Learning", "Deep Learning", "AI Systems", "Model Training"],
|
||||
"experience_years": "12+",
|
||||
"achievements": [
|
||||
"Developed ML models at OpenAI for language understanding",
|
||||
"Built AI systems at Anthropic for safety-critical applications",
|
||||
"Designed training pipelines at Google DeepMind for large-scale models"
|
||||
],
|
||||
"detection_keywords": ["ml", "ai", "model", "training", "neural", "tensorflow", "pytorch", "learning"],
|
||||
"focus_areas": [
|
||||
"ML model architecture",
|
||||
"Training pipeline optimization",
|
||||
"Model performance and accuracy",
|
||||
"Scalability in ML systems",
|
||||
"AI safety and ethics"
|
||||
]
|
||||
},
|
||||
|
||||
# TESTING DOMAINS
|
||||
"qa_automation": {
|
||||
"role": "Senior QA Automation Architect",
|
||||
"companies": ["Google", "Microsoft", "Amazon"],
|
||||
"expertise": ["Test Automation", "Selenium", "Cypress", "Jest", "Testing Strategy"],
|
||||
"experience_years": "14+",
|
||||
"achievements": [
|
||||
"Built test automation at Google for thousands of test cases",
|
||||
"Designed testing frameworks at Microsoft for enterprise software",
|
||||
"Architected QA systems at Amazon for e-commerce platforms"
|
||||
],
|
||||
"detection_keywords": ["test", "spec", "jest", "cypress", "selenium", "pytest", "testing"],
|
||||
"focus_areas": [
|
||||
"Test coverage and quality",
|
||||
"Automation strategy",
|
||||
"Test maintainability",
|
||||
"Performance testing",
|
||||
"Testing best practices"
|
||||
]
|
||||
},
|
||||
|
||||
"performance_engineer": {
|
||||
"role": "Senior Performance Engineer",
|
||||
"companies": ["Google", "Netflix", "Amazon"],
|
||||
"expertise": ["Performance Optimization", "Load Testing", "Profiling", "Scalability"],
|
||||
"experience_years": "16+",
|
||||
"achievements": [
|
||||
"Optimized systems at Google handling billions of requests",
|
||||
"Designed performance solutions at Netflix for streaming at scale",
|
||||
"Built performance infrastructure at Amazon for peak traffic"
|
||||
],
|
||||
"detection_keywords": ["performance", "load", "stress", "benchmark", "profiling", "optimization"],
|
||||
"focus_areas": [
|
||||
"Performance bottlenecks",
|
||||
"Optimization strategies",
|
||||
"Scalability concerns",
|
||||
"Resource utilization",
|
||||
"Performance testing"
|
||||
]
|
||||
},
|
||||
|
||||
# CTO (for synthesis)
|
||||
"cto": {
|
||||
"role": "Chief Technology Officer",
|
||||
"companies": ["Google", "Microsoft", "Amazon"],
|
||||
"expertise": ["Strategic Planning", "System Architecture", "Team Leadership", "Technology Strategy"],
|
||||
"experience_years": "25+",
|
||||
"achievements": [
|
||||
"Former VP of Engineering at Google, leading teams of 500+ engineers",
|
||||
"CTO at Microsoft Azure, responsible for cloud infrastructure strategy",
|
||||
"Strategic advisor at Amazon Web Services for enterprise architecture"
|
||||
],
|
||||
"focus_areas": [
|
||||
"Strategic technology insights",
|
||||
"System-wide risk assessment",
|
||||
"Architectural recommendations",
|
||||
"Cross-domain synthesis",
|
||||
"Executive-level analysis"
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# DOCUMENT ANALYSIS PERSONAS (for Multi-Document Upload Service)
|
||||
# ============================================================================
|
||||
|
||||
DOCUMENT_ANALYSIS_PERSONAS = {
|
||||
"technical_doc_analyst": {
|
||||
"role": "Senior Technical Documentation Analyst",
|
||||
"companies": ["Google", "Stripe", "Microsoft"],
|
||||
"expertise_domain": "technical documentation and API specifications",
|
||||
"document_types": ["API docs", "technical specs", "developer guides"],
|
||||
"experience_years": "15+",
|
||||
"achievements": [
|
||||
"Analyzed technical documentation at Google for millions of API integrations",
|
||||
"Led documentation analysis at Stripe for developer experience",
|
||||
"Mapped technical relationships at Microsoft for enterprise systems"
|
||||
],
|
||||
"focus_areas": [
|
||||
"Technical dependencies and relationships",
|
||||
"System integration points",
|
||||
"API contract relationships",
|
||||
"Technical process flows",
|
||||
"Code-to-documentation mappings"
|
||||
],
|
||||
"visual_focus_areas": [
|
||||
"API flow diagrams",
|
||||
"System integration diagrams",
|
||||
"Technical architecture flows"
|
||||
],
|
||||
"detection_keywords": ["api", "technical", "specification", "documentation", "guide", "reference", "developer"]
|
||||
},
|
||||
|
||||
"business_process_analyst": {
|
||||
"role": "Senior Business Process Analyst",
|
||||
"companies": ["McKinsey", "Deloitte", "Accenture"],
|
||||
"expertise_domain": "business processes and stakeholder requirements",
|
||||
"document_types": ["business requirements", "user stories", "business plans"],
|
||||
"experience_years": "18+",
|
||||
"achievements": [
|
||||
"Analyzed business processes at McKinsey for Fortune 500 companies",
|
||||
"Led process mapping at Deloitte for enterprise transformations",
|
||||
"Mapped stakeholder relationships at Accenture for global projects"
|
||||
],
|
||||
"focus_areas": [
|
||||
"Business process flows",
|
||||
"Requirement dependencies",
|
||||
"Stakeholder impact chains",
|
||||
"Business decision consequences",
|
||||
"Organizational impact analysis"
|
||||
],
|
||||
"visual_focus_areas": [
|
||||
"Business process diagrams",
|
||||
"Stakeholder impact maps",
|
||||
"Decision flowcharts"
|
||||
],
|
||||
"detection_keywords": ["business", "requirement", "stakeholder", "user story", "process", "workflow", "business plan"]
|
||||
},
|
||||
|
||||
"system_architecture_analyst": {
|
||||
"role": "Senior System Architecture Document Analyst",
|
||||
"companies": ["Google", "Amazon", "Microsoft"],
|
||||
"expertise_domain": "system architecture and design documents",
|
||||
"document_types": ["architecture docs", "design documents", "system designs"],
|
||||
"experience_years": "20+",
|
||||
"achievements": [
|
||||
"Analyzed architecture documents at Google for large-scale distributed systems",
|
||||
"Mapped system relationships at Amazon for cloud infrastructure",
|
||||
"Led architecture analysis at Microsoft for enterprise solutions"
|
||||
],
|
||||
"focus_areas": [
|
||||
"Architecture relationships",
|
||||
"Component dependencies",
|
||||
"System interaction flows",
|
||||
"Design decision impacts",
|
||||
"Scalability relationships"
|
||||
],
|
||||
"visual_focus_areas": [
|
||||
"Architecture diagrams",
|
||||
"Component interaction diagrams",
|
||||
"System dependency maps"
|
||||
],
|
||||
"detection_keywords": ["architecture", "design", "system", "component", "diagram", "architectural"]
|
||||
},
|
||||
|
||||
"requirements_analyst": {
|
||||
"role": "Senior Requirements & Specification Analyst",
|
||||
"companies": ["IBM", "Oracle", "SAP"],
|
||||
"expertise_domain": "requirements and functional specifications",
|
||||
"document_types": ["requirements docs", "functional specs", "feature specs"],
|
||||
"experience_years": "17+",
|
||||
"achievements": [
|
||||
"Analyzed requirements at IBM for enterprise software implementations",
|
||||
"Mapped specifications at Oracle for database systems",
|
||||
"Led requirement analysis at SAP for ERP platforms"
|
||||
],
|
||||
"focus_areas": [
|
||||
"Requirement dependencies",
|
||||
"Feature relationships",
|
||||
"Specification impacts",
|
||||
"Change propagation",
|
||||
"Implementation dependencies"
|
||||
],
|
||||
"visual_focus_areas": [
|
||||
"Requirement traceability diagrams",
|
||||
"Feature dependency maps",
|
||||
"Impact analysis charts"
|
||||
],
|
||||
"detection_keywords": ["requirement", "specification", "feature", "functional", "traceability", "spec"]
|
||||
},
|
||||
|
||||
"process_flow_analyst": {
|
||||
"role": "Senior Process Flow Analyst",
|
||||
"companies": ["Amazon", "Netflix", "Uber"],
|
||||
"expertise_domain": "operational processes and workflows",
|
||||
"document_types": ["process docs", "workflows", "operational manuals"],
|
||||
"experience_years": "14+",
|
||||
"achievements": [
|
||||
"Analyzed processes at Amazon for fulfillment operations",
|
||||
"Mapped workflows at Netflix for content delivery",
|
||||
"Led process analysis at Uber for ride-sharing operations"
|
||||
],
|
||||
"focus_areas": [
|
||||
"Process step relationships",
|
||||
"Workflow dependencies",
|
||||
"Sequential cause-effects",
|
||||
"Decision impacts",
|
||||
"Operational dependencies"
|
||||
],
|
||||
"visual_focus_areas": [
|
||||
"Process flowcharts",
|
||||
"Workflow diagrams",
|
||||
"Decision trees",
|
||||
"Operational flow maps"
|
||||
],
|
||||
"detection_keywords": ["process", "workflow", "procedure", "operational", "manual", "step", "flow"]
|
||||
},
|
||||
|
||||
"visual_architecture_analyst": {
|
||||
"role": "Senior Visual Architecture Analyst",
|
||||
"companies": ["Google", "Microsoft", "Apple"],
|
||||
"expertise_domain": "visual diagrams and architecture drawings",
|
||||
"document_types": ["diagrams", "flowcharts", "architecture drawings"],
|
||||
"experience_years": "16+",
|
||||
"achievements": [
|
||||
"Analyzed visual diagrams at Google for complex system mappings",
|
||||
"Mapped architecture drawings at Microsoft for enterprise solutions",
|
||||
"Led visual analysis at Apple for product architecture"
|
||||
],
|
||||
"focus_areas": [
|
||||
"Visual relationship extraction",
|
||||
"Diagram dependency mapping",
|
||||
"Flow analysis",
|
||||
"Component interactions",
|
||||
"Visual pattern recognition"
|
||||
],
|
||||
"visual_focus_areas": [
|
||||
"All types of visual diagrams",
|
||||
"Architecture drawings",
|
||||
"Flowcharts and process diagrams",
|
||||
"Component and sequence diagrams"
|
||||
],
|
||||
"detection_keywords": ["diagram", "flowchart", "visual", "drawing", "chart", "map", "image"]
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# DOCUMENT TYPE MAPPING
|
||||
# ============================================================================
|
||||
|
||||
DOCUMENT_PERSONA_MAPPING = {
|
||||
# Technical Documents
|
||||
"api_documentation": "technical_doc_analyst",
|
||||
"technical_specification": "technical_doc_analyst",
|
||||
"code_documentation": "technical_doc_analyst",
|
||||
"developer_guide": "technical_doc_analyst",
|
||||
|
||||
# Business Documents
|
||||
"business_requirements": "business_process_analyst",
|
||||
"user_stories": "business_process_analyst",
|
||||
"business_plan": "business_process_analyst",
|
||||
"product_specification": "business_process_analyst",
|
||||
"stakeholder_document": "business_process_analyst",
|
||||
|
||||
# Architecture Documents
|
||||
"architecture_document": "system_architecture_analyst",
|
||||
"system_design": "system_architecture_analyst",
|
||||
"design_document": "system_architecture_analyst",
|
||||
"technical_design": "system_architecture_analyst",
|
||||
|
||||
# Requirements Documents
|
||||
"requirements_document": "requirements_analyst",
|
||||
"functional_specification": "requirements_analyst",
|
||||
"feature_specification": "requirements_analyst",
|
||||
|
||||
# Process Documents
|
||||
"process_document": "process_flow_analyst",
|
||||
"workflow_document": "process_flow_analyst",
|
||||
"procedure_guide": "process_flow_analyst",
|
||||
"operational_manual": "process_flow_analyst",
|
||||
|
||||
# Visual/Diagram Documents
|
||||
"architecture_diagram": "visual_architecture_analyst",
|
||||
"flowchart": "visual_architecture_analyst",
|
||||
"sequence_diagram": "visual_architecture_analyst",
|
||||
"component_diagram": "visual_architecture_analyst",
|
||||
"process_diagram": "visual_architecture_analyst",
|
||||
"system_diagram": "visual_architecture_analyst",
|
||||
}
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# PERSONA ALLOCATION FUNCTIONS
|
||||
# ============================================================================
|
||||
|
||||
def allocate_code_persona(file_path: str, content: str, chunk_type: str = "module") -> Dict:
|
||||
"""
|
||||
Intelligently allocates code analysis persona based on file path, content, and type.
|
||||
Returns persona config with prompt context.
|
||||
"""
|
||||
file_lower = file_path.lower()
|
||||
content_lower = content.lower()[:2000] if content else "" # Sample content
|
||||
|
||||
# Score each persona based on detection rules
|
||||
persona_scores = {}
|
||||
|
||||
for persona_id, persona_config in CODE_ANALYSIS_PERSONAS.items():
|
||||
if persona_id == "cto": # Skip CTO for individual analysis
|
||||
continue
|
||||
|
||||
score = 0
|
||||
detection_keywords = persona_config.get("detection_keywords", [])
|
||||
|
||||
# Check file path (higher weight)
|
||||
for keyword in detection_keywords:
|
||||
if keyword in file_lower:
|
||||
score += 15
|
||||
|
||||
# Check content (medium weight)
|
||||
for keyword in detection_keywords:
|
||||
if keyword in content_lower:
|
||||
score += 8
|
||||
|
||||
# Check chunk type
|
||||
if chunk_type and chunk_type.lower() in detection_keywords:
|
||||
score += 10
|
||||
|
||||
# Domain-specific boosts
|
||||
if "test" in file_lower and "qa" in persona_id:
|
||||
score += 20
|
||||
if "security" in file_lower and "security" in persona_id:
|
||||
score += 20
|
||||
if "performance" in file_lower and "performance" in persona_id:
|
||||
score += 20
|
||||
|
||||
if score > 0:
|
||||
persona_scores[persona_id] = score
|
||||
|
||||
# Select top persona
|
||||
if persona_scores:
|
||||
selected_id = max(persona_scores, key=persona_scores.get)
|
||||
return CODE_ANALYSIS_PERSONAS[selected_id]
|
||||
|
||||
# Default fallback to backend business logic
|
||||
return CODE_ANALYSIS_PERSONAS.get("backend_business", {})
|
||||
|
||||
|
||||
def allocate_document_persona(file_path: str, content: str, file_type: str = "text") -> Dict:
|
||||
"""
|
||||
Intelligently allocates document analysis persona based on file path, content, and type.
|
||||
Returns persona config for document analysis.
|
||||
"""
|
||||
file_lower = file_path.lower()
|
||||
content_lower = content.lower()[:2000] if content else ""
|
||||
|
||||
# Check if it's an image/diagram
|
||||
if file_type == "image" or any(ext in file_lower for ext in [".png", ".jpg", ".jpeg", ".gif", ".svg", ".pdf"]):
|
||||
return DOCUMENT_ANALYSIS_PERSONAS.get("visual_architecture_analyst", {})
|
||||
|
||||
# Score each persona based on detection rules
|
||||
persona_scores = {}
|
||||
|
||||
for persona_id, persona_config in DOCUMENT_ANALYSIS_PERSONAS.items():
|
||||
score = 0
|
||||
detection_keywords = persona_config.get("detection_keywords", [])
|
||||
|
||||
# Check file path (higher weight)
|
||||
for keyword in detection_keywords:
|
||||
if keyword in file_lower:
|
||||
score += 15
|
||||
|
||||
# Check content (medium weight)
|
||||
for keyword in detection_keywords:
|
||||
if keyword in content_lower:
|
||||
score += 8
|
||||
|
||||
# Check document type mapping
|
||||
for doc_type, mapped_persona in DOCUMENT_PERSONA_MAPPING.items():
|
||||
if doc_type in file_lower and mapped_persona == persona_id:
|
||||
score += 20
|
||||
|
||||
if score > 0:
|
||||
persona_scores[persona_id] = score
|
||||
|
||||
# Select top persona
|
||||
if persona_scores:
|
||||
selected_id = max(persona_scores, key=persona_scores.get)
|
||||
return DOCUMENT_ANALYSIS_PERSONAS[selected_id]
|
||||
|
||||
# Default fallback to technical doc analyst
|
||||
return DOCUMENT_ANALYSIS_PERSONAS.get("technical_doc_analyst", {})
|
||||
|
||||
|
||||
def get_cto_persona() -> Dict:
|
||||
"""Returns CTO persona for synthesis and high-level analysis."""
|
||||
return CODE_ANALYSIS_PERSONAS.get("cto", {})
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# PROMPT BUILDING FUNCTIONS
|
||||
# ============================================================================
|
||||
|
||||
def build_persona_intro(persona: Dict, assignment_context: str = "", analysis_type: str = "code") -> str:
|
||||
"""
|
||||
Builds persona introduction section for prompts.
|
||||
Works for both code and document analysis.
|
||||
"""
|
||||
if not persona:
|
||||
return ""
|
||||
|
||||
role = persona.get("role", "Senior Engineer")
|
||||
companies = persona.get("companies", [])
|
||||
experience = persona.get("experience_years", "15+")
|
||||
achievements = persona.get("achievements", [])
|
||||
focus_areas = persona.get("focus_areas", [])
|
||||
|
||||
# Build company background
|
||||
company_bg = ""
|
||||
if companies:
|
||||
company_bg = f"- Previously worked at {', '.join(companies[:2])}"
|
||||
if len(companies) > 2:
|
||||
company_bg += f" and {companies[2]}"
|
||||
|
||||
# Build achievements section
|
||||
achievements_text = ""
|
||||
if achievements:
|
||||
achievements_text = "\n".join([f"- {achievement}" for achievement in achievements[:2]])
|
||||
|
||||
# Build focus areas
|
||||
focus_text = ""
|
||||
if focus_areas:
|
||||
focus_text = "\n".join([f"- {focus}" for focus in focus_areas[:5]])
|
||||
|
||||
intro = f"""You are {role} with {experience} years of experience.
|
||||
|
||||
COMPANY BACKGROUND:
|
||||
{company_bg}
|
||||
|
||||
KEY ACHIEVEMENTS:
|
||||
{achievements_text}
|
||||
|
||||
YOUR ASSIGNMENT:
|
||||
{assignment_context if assignment_context else 'Analyze the provided code/document for quality, issues, and recommendations.'}
|
||||
|
||||
YOUR FOCUS AREAS:
|
||||
{focus_text}
|
||||
|
||||
---
|
||||
"""
|
||||
return intro
|
||||
|
||||
|
||||
def build_code_analysis_persona_prompt(base_prompt: str, persona: Dict,
|
||||
assignment_context: str = "") -> str:
|
||||
"""
|
||||
Enhances code analysis prompt with persona context.
|
||||
"""
|
||||
if not persona:
|
||||
return base_prompt
|
||||
|
||||
persona_intro = build_persona_intro(persona, assignment_context, "code")
|
||||
return persona_intro + base_prompt
|
||||
|
||||
|
||||
def build_document_analysis_persona_prompt(base_prompt: str, persona: Dict,
|
||||
document_type: str = "document",
|
||||
assignment_context: str = "") -> str:
|
||||
"""
|
||||
Enhances document analysis prompt with persona context.
|
||||
"""
|
||||
if not persona:
|
||||
return base_prompt
|
||||
|
||||
role = persona.get("role", "Senior Analyst")
|
||||
companies = persona.get("companies", [])
|
||||
expertise_domain = persona.get("expertise_domain", "document analysis")
|
||||
experience = persona.get("experience_years", "15+")
|
||||
achievements = persona.get("achievements", [])
|
||||
focus_areas = persona.get("focus_areas", [])
|
||||
|
||||
company_bg = f"- Previously worked at {', '.join(companies[:2])}" if companies else ""
|
||||
achievements_text = "\n".join([f"- {achievement}" for achievement in achievements[:2]]) if achievements else ""
|
||||
focus_text = "\n".join([f"- {focus}" for focus in focus_areas[:5]]) if focus_areas else ""
|
||||
|
||||
intro = f"""You are {role}, a specialist in analyzing {expertise_domain} with {experience} years of experience.
|
||||
|
||||
COMPANY BACKGROUND:
|
||||
{company_bg}
|
||||
|
||||
KEY ACHIEVEMENTS:
|
||||
{achievements_text}
|
||||
|
||||
YOUR SPECIALIZATION:
|
||||
You excel at identifying:
|
||||
{focus_text}
|
||||
|
||||
YOUR ASSIGNMENT:
|
||||
{assignment_context if assignment_context else f'Analyze this {document_type} to extract causal relationships and dependencies.'}
|
||||
|
||||
---
|
||||
"""
|
||||
return intro + base_prompt
|
||||
|
||||
|
||||
def build_cto_synthesis_prompt(base_prompt: str, team_findings: List[Dict] = None) -> str:
|
||||
"""
|
||||
Builds CTO-level synthesis prompt with team allocation context.
|
||||
"""
|
||||
cto_persona = get_cto_persona()
|
||||
|
||||
if not cto_persona:
|
||||
return base_prompt
|
||||
|
||||
role = cto_persona.get("role", "Chief Technology Officer")
|
||||
companies = cto_persona.get("companies", [])
|
||||
experience = cto_persona.get("experience_years", "25+")
|
||||
achievements = cto_persona.get("achievements", [])
|
||||
focus_areas = cto_persona.get("focus_areas", [])
|
||||
|
||||
company_bg = f"- Former VP of Engineering at {companies[0] if companies else 'Google'}, leading teams of 500+ engineers"
|
||||
if len(companies) > 1:
|
||||
company_bg += f"\n- CTO at {companies[1]}, responsible for cloud infrastructure strategy"
|
||||
|
||||
achievements_text = "\n".join([f"- {achievement}" for achievement in achievements[:2]]) if achievements else ""
|
||||
focus_text = "\n".join([f"- {focus}" for focus in focus_areas[:5]]) if focus_areas else ""
|
||||
|
||||
team_allocation = ""
|
||||
if team_findings:
|
||||
team_allocation = "\n\nTEAM ALLOCATION:\n"
|
||||
team_allocation += "You have allocated your expert team to analyze different domains:\n"
|
||||
for finding in team_findings[:5]:
|
||||
domain = finding.get("domain", "unknown")
|
||||
team_allocation += f"- {domain}: Expert analysis completed\n"
|
||||
|
||||
intro = f"""You are {role} with {experience} years of experience.
|
||||
|
||||
COMPANY BACKGROUND:
|
||||
{company_bg}
|
||||
|
||||
KEY ACHIEVEMENTS:
|
||||
{achievements_text}
|
||||
{team_allocation}
|
||||
|
||||
YOUR ROLE:
|
||||
You have received this project and allocated your expert team to analyze different domains.
|
||||
Now, synthesize all team findings into strategic recommendations.
|
||||
|
||||
YOUR FOCUS AREAS:
|
||||
{focus_text}
|
||||
|
||||
---
|
||||
"""
|
||||
return intro + base_prompt
|
||||
|
||||
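To show how the module's two main entry points are meant to be combined, a short usage sketch follows (the file path, source snippet, and base prompt are invented; the function names and signatures are the ones defined above):

# Hypothetical usage of the persona system defined above.
from persona_system import allocate_code_persona, build_code_analysis_persona_prompt

# A controller-style path plus API keywords in the content should score highest
# for the "backend_api" persona via its detection_keywords.
source = "from fastapi import APIRouter\nrouter = APIRouter()\n"
persona = allocate_code_persona("services/api/user_controller.py", source, "module")
print(persona.get("role"))  # e.g. "Senior Backend API Architect"

assignment = "CTO has assigned you to analyze the user API module."
prompt = build_code_analysis_persona_prompt("Analyze this code for issues.", persona, assignment)
print(prompt[:200])  # persona intro followed by the base prompt
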
@ -2673,8 +2673,10 @@ def build_intelligent_chunk_prompt(chunk: Dict, analysis_state: Optional[Dict] =
|
||||
"""
|
||||
Build comprehensive prompt for analyzing a semantically grouped chunk.
|
||||
Generates detailed module-level analysis with context awareness.
|
||||
Now includes progressive context from previous chunks.
|
||||
Now includes progressive context from previous chunks and world-class persona.
|
||||
"""
|
||||
from persona_system import allocate_code_persona, build_code_analysis_persona_prompt
|
||||
|
||||
chunk_name = chunk.get('name', 'unknown')
|
||||
chunk_type = chunk.get('chunk_type', 'module')
|
||||
files_batch = chunk.get('files', [])
|
||||
@ -2694,15 +2696,22 @@ def build_intelligent_chunk_prompt(chunk: Dict, analysis_state: Optional[Dict] =
|
||||
|
||||
optimized_files.append((file_path, optimized_content))
|
||||
|
||||
# Allocate appropriate persona based on files in chunk
|
||||
# Use the first file to determine persona (or combine if multiple domains)
|
||||
primary_file_path = optimized_files[0][0] if optimized_files else ""
|
||||
primary_content = optimized_files[0][1] if optimized_files else ""
|
||||
persona = allocate_code_persona(primary_file_path, primary_content, chunk_type)
|
||||
|
||||
# Build context from previous analyses (progressive learning)
|
||||
context_section = build_context_from_state(analysis_state, chunk)
|
||||
|
||||
# Build assignment context
|
||||
assignment_context = f"CTO has assigned you to analyze the '{chunk_name}' module/chunk for this project. This is a {chunk_type} type chunk containing {len(optimized_files)} files."
|
||||
|
||||
# Build comprehensive prompt with module context
|
||||
prompt_parts = [
|
||||
f"# COMPREHENSIVE ANALYSIS: {chunk_name.upper()}",
|
||||
f"Chunk Type: {chunk_type}",
|
||||
"",
|
||||
"You are a senior software architect with 30+ years of experience. Analyze this module/chunk comprehensively.",
|
||||
""
|
||||
]
|
||||
|
||||
@ -2794,7 +2803,12 @@ def build_intelligent_chunk_prompt(chunk: Dict, analysis_state: Optional[Dict] =
|
||||
"Focus on providing detailed, actionable insights that help understand the complete module context."
|
||||
])
|
||||
|
||||
return "\n".join(prompt_parts)
|
||||
base_prompt = "\n".join(prompt_parts)
|
||||
|
||||
# Enhance with persona
|
||||
enhanced_prompt = build_code_analysis_persona_prompt(base_prompt, persona, assignment_context)
|
||||
|
||||
return enhanced_prompt
|
||||
|
||||
def build_smart_batch_prompt(files_batch: List[Tuple[str, str]]) -> str:
|
||||
"""Legacy function: Build prompt for simple batch (backward compatibility)."""
|
||||
@ -4719,13 +4733,13 @@ def build_synthesis_prompt(analysis_state: Dict, all_chunk_analyses: List[Dict]
|
||||
"""
|
||||
Build comprehensive prompt for cross-module synthesis analysis.
|
||||
Synthesizes all individual module analyses into system-level insights.
|
||||
Uses CTO persona for executive-level synthesis.
|
||||
"""
|
||||
from persona_system import get_cto_persona, build_cto_synthesis_prompt
|
||||
|
||||
prompt_parts = [
|
||||
"# CROSS-MODULE SYNTHESIS ANALYSIS",
|
||||
"",
|
||||
"You are a senior software architect with 30+ years of experience. Your task is to synthesize",
|
||||
"findings from multiple module-level analyses into comprehensive system-level insights.",
|
||||
"",
|
||||
"## CONTEXT: PREVIOUSLY ANALYZED MODULES",
|
||||
""
|
||||
]
|
||||
@ -4842,7 +4856,19 @@ def build_synthesis_prompt(analysis_state: Dict, all_chunk_analyses: List[Dict]
|
||||
"across all analyzed modules, not just repeating individual module findings."
|
||||
])
|
||||
|
||||
return "\n".join(prompt_parts)
|
||||
base_prompt = "\n".join(prompt_parts)
|
||||
|
||||
# Get team findings for CTO context
|
||||
team_findings = []
|
||||
if all_chunk_analyses:
|
||||
for chunk_analysis in all_chunk_analyses:
|
||||
module_name = chunk_analysis.get('module_name', 'unknown')
|
||||
team_findings.append({"domain": module_name, "analysis": chunk_analysis})
|
||||
|
||||
# Enhance with CTO persona
|
||||
enhanced_prompt = build_cto_synthesis_prompt(base_prompt, team_findings)
|
||||
|
||||
return enhanced_prompt
|
||||
|
||||
def parse_synthesis_response(response_text: str) -> Dict:
|
||||
"""Parse synthesis response from Claude API."""
|
||||
|
||||
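A minimal sketch of the synthesis wiring introduced above: per-module analyses are folded into team_findings and passed to the CTO persona builder (the module names and summaries are invented examples):

# Hypothetical sketch of the synthesis step wired up above.
from persona_system import build_cto_synthesis_prompt

all_chunk_analyses = [
    {"module_name": "auth", "summary": "JWT handling reviewed"},
    {"module_name": "payments", "summary": "Billing integration reviewed"},
]

team_findings = [
    {"domain": c.get("module_name", "unknown"), "analysis": c} for c in all_chunk_analyses
]

base_prompt = "# CROSS-MODULE SYNTHESIS ANALYSIS\n..."
enhanced = build_cto_synthesis_prompt(base_prompt, team_findings)
print(enhanced.splitlines()[0])  # "You are Chief Technology Officer with 25+ years of experience."
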
@ -141,17 +141,19 @@ router.get('/auth/github/callback', async (req, res) => {
|
||||
setImmediate(async () => {
|
||||
try {
|
||||
console.log('[GitHub OAuth] Starting background repository attachment for:', repoContext.repoUrl);
|
||||
console.log('[GitHub OAuth] Using newly stored token for user:', user_id);
|
||||
const GitHubIntegrationService = require('../services/github-integration.service');
|
||||
const database = require('../config/database');
|
||||
const githubService = new GitHubIntegrationService();
|
||||
const { owner, repo, branch } = githubService.parseGitHubUrl(repoContext.repoUrl);
|
||||
|
||||
// Get metadata using authenticated Octokit
|
||||
const repositoryData = await githubService.fetchRepositoryMetadata(owner, repo);
|
||||
// Get metadata using authenticated Octokit with the specific user's token
|
||||
// Pass userId to ensure we use the newly stored token
|
||||
const repositoryData = await githubService.fetchRepositoryMetadata(owner, repo, false, user_id);
|
||||
let actualBranch = repoContext.branchName || branch || repositoryData.default_branch || 'main';
|
||||
|
||||
// Attempt analysis and sync with fallback
|
||||
const codebaseAnalysis = await githubService.analyzeCodebase(owner, repo, actualBranch, false);
|
||||
// Attempt analysis and sync with fallback - use userId to ensure correct token
|
||||
const codebaseAnalysis = await githubService.analyzeCodebase(owner, repo, actualBranch, false, user_id);
|
||||
const insertQuery = `
|
||||
INSERT INTO all_repositories (
|
||||
repository_url, repository_name, owner_name,
|
||||
@ -170,14 +172,14 @@ router.get('/auth/github/callback', async (req, res) => {
|
||||
JSON.stringify(codebaseAnalysis),
|
||||
'syncing',
|
||||
repositoryData.visibility === 'private',
|
||||
repoContext.userId || null,
|
||||
user_id || repoContext.userId || null, // Use user_id from OAuth callback (most reliable)
|
||||
'github' // This is GitHub OAuth callback, so provider is always github
|
||||
];
|
||||
const insertResult = await database.query(insertQuery, insertValues);
|
||||
const repositoryRecord = insertResult.rows[0];
|
||||
|
||||
// Clone repository
|
||||
const downloadResult = await githubService.syncRepositoryWithFallback(owner, repo, actualBranch, repositoryRecord.id, repositoryData.visibility !== 'private');
|
||||
// Clone repository - use userId to ensure correct token
|
||||
const downloadResult = await githubService.syncRepositoryWithFallback(owner, repo, actualBranch, repositoryRecord.id, repositoryData.visibility !== 'private', user_id);
|
||||
const finalSyncStatus = downloadResult.success ? 'synced' : 'error';
|
||||
await database.query('UPDATE all_repositories SET sync_status = $1, updated_at = NOW() WHERE id = $2', [finalSyncStatus, repositoryRecord.id]);
|
||||
|
||||
|
||||
@ -163,12 +163,28 @@ router.post('/:provider/attach-repository', async (req, res) => {
|
||||
const { template_id, repository_url, branch_name } = req.body;
|
||||
const userId = req.headers['x-user-id'] || req.query.user_id || req.body.user_id || (req.user && (req.user.id || req.user.userId));
|
||||
|
||||
console.log(`[VCS Attach] Extracted userId:`, userId, `from headers:`, req.headers['x-user-id'], `query:`, req.query.user_id, `body:`, req.body.user_id);
|
||||
|
||||
// Validate input - only repository_url is required (like GitHub)
|
||||
if (!repository_url) {
|
||||
return res.status(400).json({ success: false, message: 'Repository URL is required' });
|
||||
}
|
||||
|
||||
const { owner, repo, branch } = provider.parseRepoUrl(repository_url);
|
||||
// Clean and normalize the repository URL (trim whitespace, decode URL encoding)
|
||||
let cleanedUrl = repository_url.trim();
|
||||
// Decode URL-encoded characters (like %20 for spaces)
|
||||
try {
|
||||
cleanedUrl = decodeURIComponent(cleanedUrl);
|
||||
} catch (e) {
|
||||
// If decoding fails, use original URL
|
||||
console.warn(`[VCS Attach] Failed to decode URL, using original: ${cleanedUrl}`);
|
||||
}
|
||||
// Trim again after decoding
|
||||
cleanedUrl = cleanedUrl.trim();
|
||||
|
||||
console.log(`[VCS Attach] Original URL: ${repository_url}, Cleaned URL: ${cleanedUrl}`);
|
||||
|
||||
const { owner, repo, branch } = provider.parseRepoUrl(cleanedUrl);
|
||||
|
||||
// Enhanced flow: Detect private repos and redirect to OAuth immediately
|
||||
const providerKey = (req.params.provider || '').toLowerCase();
|
||||
@ -248,7 +264,44 @@ router.post('/:provider/attach-repository', async (req, res) => {
|
||||
// For public repos or authenticated private repos, proceed with normal flow
|
||||
const accessCheck = await provider.checkRepositoryAccess(owner, repo, userId);
|
||||
|
||||
console.log(`[VCS Attach] Access check result for ${owner}/${repo}:`, {
|
||||
hasAccess: accessCheck.hasAccess,
|
||||
requiresAuth: accessCheck.requiresAuth,
|
||||
authError: accessCheck.authError,
|
||||
error: accessCheck.error,
|
||||
exists: accessCheck.exists,
|
||||
github_username: accessCheck.github_username
|
||||
});
|
||||
|
||||
if (!accessCheck.hasAccess) {
|
||||
// If access check failed but requires auth, trigger OAuth flow
|
||||
if (accessCheck.requiresAuth || accessCheck.authError) {
|
||||
const oauthService = getOAuthService(providerKey);
|
||||
if (oauthService) {
|
||||
console.log(`🔒 [VCS Attach] Token exists but cannot access repository (or no valid token), redirecting to OAuth: ${repository_url}`);
|
||||
console.log(`🔒 [VCS Attach] Reason: ${accessCheck.error || 'Authentication required'}, userId: ${userId}`);
|
||||
|
||||
// Generate OAuth URL with repository context in state
|
||||
const stateBase = Math.random().toString(36).substring(7);
|
||||
const state = `${stateBase}|uid=${userId || 'unknown'}|repo=${encodeURIComponent(repository_url)}|branch=${encodeURIComponent(branch_name || 'main')}|private_repo=true`;
|
||||
|
||||
const authUrl = oauthService.getAuthUrl(state, userId);
|
||||
|
||||
console.log(`🔒 [VCS Attach] Generated OAuth URL for ${providerKey}, returning requires_auth response`);
|
||||
|
||||
return res.json({
|
||||
success: false,
|
||||
message: `${providerKey.charAt(0).toUpperCase() + providerKey.slice(1)} authentication required for private repository`,
|
||||
requires_auth: true,
|
||||
is_private_repo: true,
|
||||
auth_url: authUrl,
|
||||
state: state
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// If it's not an auth issue, return 404
|
||||
console.log(`[VCS Attach] Access check failed without auth requirement, returning 404`);
|
||||
return res.status(404).json({ success: false, message: accessCheck.error || 'Repository not accessible' });
|
||||
}
|
||||
|
||||
|
||||
@ -21,8 +21,8 @@ class GitHubIntegrationService {
|
||||
}
|
||||
|
||||
// Get authenticated Octokit instance
|
||||
async getAuthenticatedOctokit() {
|
||||
return await this.oauthService.getAuthenticatedOctokit();
|
||||
async getAuthenticatedOctokit(userId = null) {
|
||||
return await this.oauthService.getAuthenticatedOctokit(userId);
|
||||
}
|
||||
|
||||
// Extract owner, repo, and branch from GitHub URL using parse-github-url library
|
||||
@ -31,8 +31,15 @@ class GitHubIntegrationService {
|
||||
throw new Error('URL must be a non-empty string');
|
||||
}
|
||||
|
||||
// Normalize the URL first
|
||||
// Normalize the URL first - trim and decode URL encoding
|
||||
let normalizedUrl = url.trim();
|
||||
// Decode URL-encoded characters (like %20 for spaces)
|
||||
try {
|
||||
normalizedUrl = decodeURIComponent(normalizedUrl).trim();
|
||||
} catch (e) {
|
||||
// If decoding fails, just trim
|
||||
normalizedUrl = normalizedUrl.trim();
|
||||
}
|
||||
|
||||
// Remove trailing slashes and .git extensions
|
||||
normalizedUrl = normalizedUrl.replace(/\/+$/, '').replace(/\.git$/, '');
|
||||
@ -216,7 +223,7 @@ class GitHubIntegrationService {
|
||||
};
|
||||
}
|
||||
|
||||
// No token found - try unauthenticated access first to check if it's public
|
||||
// No token found that can access this repo - try unauthenticated access to check if it's public
|
||||
try {
|
||||
const unauthenticatedOctokit = new Octokit({
|
||||
userAgent: 'CodeNuk-GitIntegration/1.0.0',
|
||||
@ -234,13 +241,18 @@ class GitHubIntegrationService {
|
||||
};
|
||||
} catch (unauthenticatedError) {
|
||||
if (unauthenticatedError.status === 404) {
|
||||
// Repository truly doesn't exist
|
||||
// 404 from unauthenticated access could mean:
|
||||
// 1. Repository truly doesn't exist
|
||||
// 2. Repository is private and requires authentication
|
||||
// Since we already tried to find a token and none could access it,
|
||||
// and we're being called from a private repo flow, assume it requires auth
|
||||
console.log(`🔒 [GitHub] 404 from unauthenticated access - assuming private repo requires authentication`);
|
||||
return {
|
||||
exists: false,
|
||||
exists: null, // Unknown - could be missing or private
|
||||
isPrivate: null,
|
||||
hasAccess: false,
|
||||
requiresAuth: false,
|
||||
error: 'Repository not found'
|
||||
requiresAuth: true, // Changed from false to true - trigger OAuth
|
||||
error: 'Repository not found or requires authentication'
|
||||
};
|
||||
} else if (unauthenticatedError.status === 401 || unauthenticatedError.status === 403) {
|
||||
// Repository exists but requires authentication (private) - generate auth URL
|
||||
@ -289,13 +301,13 @@ class GitHubIntegrationService {
|
||||
}
|
||||
|
||||
// Get repository information from GitHub
|
||||
async fetchRepositoryMetadata(owner, repo, skipAuth = false) {
|
||||
async fetchRepositoryMetadata(owner, repo, skipAuth = false, userId = null) {
|
||||
// If skipAuth is true, try with unauthenticated octokit first to check visibility
|
||||
let octokit;
|
||||
if (skipAuth) {
|
||||
octokit = this.octokit; // Use unauthenticated instance
|
||||
} else {
|
||||
octokit = await this.getAuthenticatedOctokit();
|
||||
octokit = await this.getAuthenticatedOctokit(userId);
|
||||
}
|
||||
|
||||
const safe = async (fn, fallback) => {
|
||||
@ -309,26 +321,41 @@ class GitHubIntegrationService {
|
||||
|
||||
let repoData;
|
||||
try {
|
||||
console.log(`🔍 [GitHub] fetchRepositoryMetadata: skipAuth=${skipAuth}, calling octokit.repos.get for ${owner}/${repo}`);
|
||||
const response = await octokit.repos.get({ owner, repo });
|
||||
if (skipAuth) {
|
||||
if (response.status === 401 || response.status === 403) {
|
||||
throw new Error('Authentication required to access repository');
|
||||
} else if (response.status === 404) {
|
||||
throw new Error('Repository not found');
|
||||
}
|
||||
}
|
||||
repoData = response.data;
|
||||
console.log(`✅ [GitHub] Successfully fetched repository data: ${repoData?.full_name || 'no full_name'}`);
|
||||
|
||||
// Validate we got real data
|
||||
if (!repoData || !repoData.full_name) {
|
||||
console.log(`❌ [GitHub] Invalid repository data received, throwing error`);
|
||||
throw new Error('Invalid repository data received');
|
||||
}
|
||||
} catch (error) {
|
||||
console.log(`🔍 [GitHub] Error in fetchRepositoryMetadata:`, error.message, error.status);
|
||||
// Check error status from various possible locations
|
||||
const status = error.status || error.response?.status || error.code;
|
||||
const errorMessage = error.message || '';
|
||||
const is404 = status === 404 || status === '404' || errorMessage.includes('404') || errorMessage.includes('Not Found');
|
||||
const isAuthError = status === 401 || status === 403 || status === '401' || status === '403';
|
||||
|
||||
console.log(`🔍 [GitHub] Error in fetchRepositoryMetadata CATCH BLOCK:`, errorMessage, `Status: ${status || 'unknown'}`, `is404: ${is404}`, `isAuthError: ${isAuthError}`, `skipAuth: ${skipAuth}`);
|
||||
console.log(`🔍 [GitHub] Error object:`, JSON.stringify({
|
||||
status: error.status,
|
||||
responseStatus: error.response?.status,
|
||||
code: error.code,
|
||||
message: error.message,
|
||||
name: error.name
|
||||
}));
|
||||
|
||||
if (skipAuth) {
|
||||
// For GitHub, any error when skipAuth=true likely means private repo
|
||||
if (error.status === 401 || error.status === 403 || error.status === 404) {
|
||||
// For GitHub, any error when skipAuth=true means private repo or doesn't exist
|
||||
// Always throw authentication required - let the caller decide if it's truly missing or private
|
||||
console.log(`🔒 [GitHub] skipAuth=true, THROWING authentication required error - NOT using safe fallback`);
|
||||
throw new Error('Authentication required to access repository');
|
||||
}
|
||||
// For other errors, also assume private repo
|
||||
throw new Error('Authentication required to access repository');
|
||||
}
|
||||
// For other errors, use safe fallback
|
||||
|
||||
// For authenticated requests, use safe fallback (but only if skipAuth is false)
|
||||
console.log(`⚠️ [GitHub] skipAuth=false, using safe fallback`);
|
||||
repoData = await safe(
|
||||
async () => {
|
||||
const response = await octokit.repos.get({ owner, repo });
|
||||
@ -336,6 +363,12 @@ class GitHubIntegrationService {
|
||||
},
|
||||
{}
|
||||
);
|
||||
|
||||
// If safe fallback also failed, throw
|
||||
if (!repoData || !repoData.full_name) {
|
||||
console.log(`❌ [GitHub] Safe fallback also failed, throwing Repository not found`);
|
||||
throw new Error('Repository not found');
|
||||
}
|
||||
}
|
||||
|
||||
const languages = await safe(
|
||||
@ -364,7 +397,7 @@ class GitHubIntegrationService {
|
||||
}
|
||||
|
||||
// Analyze codebase structure
|
||||
async analyzeCodebase(owner, repo, branch, isPublicRepo = false) {
|
||||
async analyzeCodebase(owner, repo, branch, isPublicRepo = false, userId = null) {
|
||||
try {
|
||||
// Use appropriate octokit instance based on repository type
|
||||
let octokit;
|
||||
@ -374,8 +407,8 @@ class GitHubIntegrationService {
|
||||
userAgent: 'CodeNuk-GitIntegration/1.0.0',
|
||||
});
|
||||
} else {
|
||||
// For private repos, use authenticated octokit
|
||||
octokit = await this.getAuthenticatedOctokit();
|
||||
// For private repos, use authenticated octokit with userId
|
||||
octokit = await this.getAuthenticatedOctokit(userId);
|
||||
}
|
||||
|
||||
// Get the commit SHA for the branch
|
||||
@ -519,7 +552,7 @@ class GitHubIntegrationService {
|
||||
}
|
||||
|
||||
// Git-based: clone or update local repo and re-index into DB
|
||||
async syncRepositoryWithGit(owner, repo, branch, repositoryId, isPublicRepo = false) {
|
||||
async syncRepositoryWithGit(owner, repo, branch, repositoryId, isPublicRepo = false, userId = null) {
|
||||
const database = require('../config/database');
|
||||
const localPath = this.gitRepoService.getLocalRepoPath(owner, repo, branch);
|
||||
let storageRecord = null;
|
||||
@ -544,7 +577,7 @@ class GitHubIntegrationService {
|
||||
console.warn(`Failed to clone public repo without auth: ${error.message}`);
|
||||
// Fallback to authenticated clone if available
|
||||
try {
|
||||
const tokenRecord = await this.oauthService.getToken();
|
||||
const tokenRecord = userId ? await this.oauthService.getTokenForUser(userId) : await this.oauthService.getToken();
|
||||
if (tokenRecord?.access_token) {
|
||||
repoPath = await this.gitRepoService.cloneIfMissingWithAuth(
|
||||
owner,
|
||||
@ -560,7 +593,7 @@ class GitHubIntegrationService {
|
||||
} else {
|
||||
// For private repos, try authenticated clone first
|
||||
try {
|
||||
const tokenRecord = await this.oauthService.getToken();
|
||||
const tokenRecord = userId ? await this.oauthService.getTokenForUser(userId) : await this.oauthService.getToken();
|
||||
if (tokenRecord?.access_token) {
|
||||
repoPath = await this.gitRepoService.cloneIfMissingWithAuth(
|
||||
owner,
|
||||
@ -628,7 +661,7 @@ class GitHubIntegrationService {
|
||||
try {
|
||||
// Try to ensure repo exists for the preferred branch
|
||||
try {
|
||||
const tokenRecord = await this.oauthService.getToken().catch(() => null);
|
||||
const tokenRecord = userId ? await this.oauthService.getTokenForUser(userId).catch(() => null) : await this.oauthService.getToken().catch(() => null);
|
||||
if (tokenRecord?.access_token) {
|
||||
repoPath = await this.gitRepoService.cloneIfMissingWithAuth(owner, repo, preferredBranch, 'github.com', tokenRecord.access_token, 'oauth2');
|
||||
} else {
|
||||
@ -637,7 +670,7 @@ class GitHubIntegrationService {
|
||||
} catch (cloneErr) {
|
||||
// If the branch doesn't exist (e.g., refs/heads not found), try the alternate branch
|
||||
try {
|
||||
const tokenRecordAlt = await this.oauthService.getToken().catch(() => null);
|
||||
const tokenRecordAlt = userId ? await this.oauthService.getTokenForUser(userId).catch(() => null) : await this.oauthService.getToken().catch(() => null);
|
||||
repoPath = tokenRecordAlt?.access_token
|
||||
? await this.gitRepoService.cloneIfMissingWithAuth(owner, repo, alternateBranch, 'github.com', tokenRecordAlt.access_token, 'oauth2')
|
||||
: await this.gitRepoService.cloneIfMissing(owner, repo, alternateBranch);
|
||||
@ -679,7 +712,7 @@ class GitHubIntegrationService {
|
||||
try {
|
||||
// Ensure repo exists similarly to diff flow
|
||||
try {
|
||||
const tokenRecord = await this.oauthService.getToken().catch(() => null);
|
||||
const tokenRecord = userId ? await this.oauthService.getTokenForUser(userId).catch(() => null) : await this.oauthService.getToken().catch(() => null);
|
||||
if (tokenRecord?.access_token) {
|
||||
repoPath = await this.gitRepoService.cloneIfMissingWithAuth(owner, repo, preferredBranch, 'github.com', tokenRecord.access_token, 'oauth2');
|
||||
} else {
|
||||
@ -687,7 +720,7 @@ class GitHubIntegrationService {
|
||||
}
|
||||
} catch (_) {
|
||||
try {
|
||||
const tokenRecordAlt = await this.oauthService.getToken().catch(() => null);
|
||||
const tokenRecordAlt = userId ? await this.oauthService.getTokenForUser(userId).catch(() => null) : await this.oauthService.getToken().catch(() => null);
|
||||
repoPath = tokenRecordAlt?.access_token
|
||||
? await this.gitRepoService.cloneIfMissingWithAuth(owner, repo, alternateBranch, 'github.com', tokenRecordAlt.access_token, 'oauth2')
|
||||
: await this.gitRepoService.cloneIfMissing(owner, repo, alternateBranch);
|
||||
@ -720,15 +753,15 @@ class GitHubIntegrationService {
|
||||
}
|
||||
|
||||
// Try git-based sync first, fall back to GitHub API download on failure
|
||||
async syncRepositoryWithFallback(owner, repo, branch, repositoryId, isPublicRepo = false) {
|
||||
async syncRepositoryWithFallback(owner, repo, branch, repositoryId, isPublicRepo = false, userId = null) {
|
||||
// First attempt: full git clone/fetch and index
|
||||
const gitResult = await this.syncRepositoryWithGit(owner, repo, branch, repositoryId, isPublicRepo);
|
||||
const gitResult = await this.syncRepositoryWithGit(owner, repo, branch, repositoryId, isPublicRepo, userId);
|
||||
if (gitResult && gitResult.success) {
|
||||
return { method: 'git', ...gitResult };
|
||||
}
|
||||
|
||||
// Fallback: API-based download and storage
|
||||
const apiResult = await this.downloadRepositoryWithStorage(owner, repo, branch, repositoryId, isPublicRepo);
|
||||
const apiResult = await this.downloadRepositoryWithStorage(owner, repo, branch, repositoryId, isPublicRepo, userId);
|
||||
if (apiResult && apiResult.success) {
|
||||
return { method: 'api', ...apiResult, git_error: gitResult?.error };
|
||||
}
|
||||
@ -737,7 +770,7 @@ class GitHubIntegrationService {
|
||||
}
|
||||
|
||||
// Download repository files locally and store in database
|
||||
async downloadRepositoryWithStorage(owner, repo, branch, repositoryId, isPublicRepo = false) {
|
||||
async downloadRepositoryWithStorage(owner, repo, branch, repositoryId, isPublicRepo = false, userId = null) {
|
||||
const targetDir = path.join(
|
||||
process.env.ATTACHED_REPOS_DIR,
|
||||
`${owner}__${repo}__${branch}`
|
||||
@ -765,8 +798,8 @@ class GitHubIntegrationService {
|
||||
userAgent: 'CodeNuk-GitIntegration/1.0.0',
|
||||
});
|
||||
} else {
|
||||
// For private repos, use authenticated octokit
|
||||
octokit = await this.getAuthenticatedOctokit();
|
||||
// For private repos, use authenticated octokit with userId
|
||||
octokit = await this.getAuthenticatedOctokit(userId);
|
||||
}
|
||||
|
||||
// Get the commit SHA for the branch
|
||||
|
||||
@ -199,8 +199,16 @@ class GitHubOAuthService {
|
||||
}
|
||||
|
||||
// Create authenticated Octokit instance
|
||||
async getAuthenticatedOctokit() {
|
||||
const tokenRecord = await this.getToken();
|
||||
async getAuthenticatedOctokit(userId = null) {
|
||||
// If userId is provided, get the newest token for that user
|
||||
// Otherwise, get the newest token overall
|
||||
let tokenRecord;
|
||||
if (userId) {
|
||||
tokenRecord = await this.getTokenForUser(userId);
|
||||
console.log(`[GitHub OAuth] Using token for user ${userId}: ${tokenRecord?.github_username || 'none'}`);
|
||||
} else {
|
||||
tokenRecord = await this.getToken();
|
||||
}
|
||||
|
||||
if (!tokenRecord) {
|
||||
throw new Error('No GitHub token found. Please authenticate with GitHub first.');
|
||||
|
||||
@ -15,7 +15,11 @@ class GithubAdapter {
|
||||
return this.impl.parseGitHubUrl(url);
|
||||
}
|
||||
|
||||
async checkRepositoryAccess(owner, repo) {
|
||||
async checkRepositoryAccess(owner, repo, userId = null) {
|
||||
// Use user-specific method if userId is provided
|
||||
if (userId) {
|
||||
return await this.impl.checkRepositoryAccessWithUser(owner, repo, userId);
|
||||
}
|
||||
return await this.impl.checkRepositoryAccess(owner, repo);
|
||||
}
|
||||
|
||||
|
||||
58
services/multi-document-upload-service/.dockerignore
Normal file
@ -0,0 +1,58 @@
|
||||
# Python
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
*.so
|
||||
.Python
|
||||
*.egg-info/
|
||||
dist/
|
||||
build/
|
||||
*.egg
|
||||
|
||||
# Virtual environments
|
||||
venv/
|
||||
env/
|
||||
ENV/
|
||||
.venv
|
||||
|
||||
# IDE
|
||||
.vscode/
|
||||
.idea/
|
||||
*.swp
|
||||
*.swo
|
||||
*~
|
||||
|
||||
# Documentation
|
||||
*.md
|
||||
!README.md
|
||||
|
||||
# Testing
|
||||
.pytest_cache/
|
||||
.coverage
|
||||
htmlcov/
|
||||
*.log
|
||||
|
||||
# Storage and temporary files
|
||||
storage/
|
||||
*.tmp
|
||||
*.temp
|
||||
|
||||
# Git
|
||||
.git/
|
||||
.gitignore
|
||||
|
||||
# Docker
|
||||
Dockerfile*
|
||||
docker-compose*.yml
|
||||
.dockerignore
|
||||
|
||||
# Environment files
|
||||
.env
|
||||
.env.local
|
||||
*.env
|
||||
|
||||
# OS
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
|
||||
|
||||
@ -1,29 +1,60 @@
|
||||
FROM python:3.11-slim
|
||||
# Build stage - install dependencies that require compilation
|
||||
FROM python:3.11-slim as builder
|
||||
|
||||
ENV PYTHONDONTWRITEBYTECODE=1 \
|
||||
PYTHONUNBUFFERED=1
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Install build dependencies only
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
curl \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Copy and install Python dependencies
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir --user -r requirements.txt && \
|
||||
pip cache purge
|
||||
|
||||
# Download SpaCy English model
|
||||
RUN python -m spacy download en_core_web_sm
|
||||
|
||||
# Runtime stage - minimal image with only runtime dependencies
|
||||
FROM python:3.11-slim
|
||||
|
||||
ENV PYTHONDONTWRITEBYTECODE=1 \
|
||||
PYTHONUNBUFFERED=1 \
|
||||
PYTHONPATH=/app/src \
|
||||
PATH=/root/.local/bin:$PATH \
|
||||
MULTI_DOC_STORAGE_ROOT=/app/storage \
|
||||
MULTI_DOC_CLAUDE_MODEL=claude-3-5-haiku-latest \
|
||||
CLAUDE_MODEL=claude-3-5-haiku-latest \
|
||||
PORT=8024
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Install only runtime dependencies (no build tools)
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
poppler-utils \
|
||||
tesseract-ocr \
|
||||
ffmpeg \
|
||||
libmagic1 \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
curl \
|
||||
# Required for some Python packages at runtime
|
||||
libgomp1 \
|
||||
libglib2.0-0 \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& apt-get clean
|
||||
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
# Copy Python packages from builder stage (includes spacy model)
|
||||
COPY --from=builder /root/.local /root/.local
|
||||
|
||||
# Copy application code
|
||||
COPY src ./src
|
||||
|
||||
ENV PYTHONPATH=/app/src \
|
||||
MULTI_DOC_STORAGE_ROOT=/app/storage \
|
||||
MULTI_DOC_CLAUDE_MODEL=claude-3-5-sonnet-20241022 \
|
||||
PORT=8024
|
||||
|
||||
EXPOSE 8024
|
||||
|
||||
CMD ["sh", "-c", "uvicorn multi_document_upload_service.main:app --host 0.0.0.0 --port ${PORT:-8024}"]
|
||||
|
||||
@ -1,144 +0,0 @@
|
||||
# Fix: Empty Graph in Neo4j (No Relationships Found)
|
||||
|
||||
## Problem
|
||||
|
||||
When querying Neo4j for `CAUSES` relationships, you get "(no changes, no records)" because:
|
||||
|
||||
1. **PDF extraction failed** - Missing dependencies (`unstructured[pdf]`)
|
||||
2. **0 relations extracted** - No text was extracted, so no analysis happened
|
||||
3. **0 relations written** - Nothing was written to Neo4j (correct behavior)
|
||||
|
||||
## Root Cause
|
||||
|
||||
The service completed with 0 relations because:
|
||||
- PDF file extraction failed: `partition_pdf() is not available because one or more dependencies are not installed`
|
||||
- No text was extracted from the PDF
|
||||
- No chunks were created
|
||||
- No Claude analysis happened
|
||||
- 0 relations were extracted
|
||||
- 0 relations were written to Neo4j
|
||||
|
||||
## Solution
|
||||
|
||||
### Step 1: Update Dependencies
|
||||
|
||||
The `requirements.txt` has been updated to include:
|
||||
```
|
||||
unstructured[pdf]>=0.15.0
|
||||
unstructured[docx]>=0.15.0
|
||||
unstructured[pptx]>=0.15.0
|
||||
unstructured[xlsx]>=0.15.0
|
||||
```
|
||||
|
||||
### Step 2: Rebuild the Service
|
||||
|
||||
```bash
|
||||
cd /home/tech4biz/Desktop/prakash/codenuk/backend_new1/codenuk_backend_mine
|
||||
|
||||
# Rebuild the service with new dependencies
|
||||
docker-compose build multi-document-upload-service
|
||||
|
||||
# Restart the service
|
||||
docker-compose restart multi-document-upload-service
|
||||
|
||||
# Check logs to verify it's working
|
||||
docker-compose logs -f multi-document-upload-service
|
||||
```
|
||||
|
||||
### Step 3: Verify Dependencies
|
||||
|
||||
```bash
|
||||
# Check if unstructured[pdf] is installed
|
||||
docker-compose exec multi-document-upload-service pip list | grep unstructured
|
||||
```
|
||||
|
||||
### Step 4: Re-upload Documents
|
||||
|
||||
1. Go to Project Builder in the frontend
|
||||
2. Click on "Upload Documents for Knowledge Graph"
|
||||
3. Upload a PDF or other document
|
||||
4. Wait for processing to complete
|
||||
5. Check Neo4j for relationships
|
||||
|
||||
### Step 5: Check Neo4j
|
||||
|
||||
Run these queries in Neo4j Browser:
|
||||
|
||||
```cypher
|
||||
// Check if any nodes exist
|
||||
MATCH (n)
|
||||
RETURN count(n) as node_count
|
||||
|
||||
// Check for CAUSES relationships
|
||||
MATCH (n:Concept)-[r:CAUSES]->(m:Concept)
|
||||
RETURN n.name as cause, m.name as effect, r.confidence as confidence
|
||||
LIMIT 50
|
||||
```
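
If you prefer to verify from Python rather than the Neo4j Browser, here is a minimal sketch using the official `neo4j` driver. The URI and the `neo4j`/`password` credentials are the defaults assumed elsewhere in this guide; adjust them if your compose file differs.

```python
from neo4j import GraphDatabase  # pip install neo4j

URI = "bolt://localhost:7687"   # bolt port exposed by the neo4j container
AUTH = ("neo4j", "password")    # credentials used throughout this guide

def count_causes_relations() -> int:
    """Count CAUSES relationships between Concept nodes."""
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        records, _, _ = driver.execute_query(
            "MATCH (:Concept)-[r:CAUSES]->(:Concept) RETURN count(r) AS relation_count"
        )
        return records[0]["relation_count"]

if __name__ == "__main__":
    print(f"CAUSES relationships in Neo4j: {count_causes_relations()}")
```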
|
||||
|
||||
## Expected Behavior After Fix
|
||||
|
||||
1. **PDF extraction succeeds** - Text is extracted from PDF files
|
||||
2. **Text is chunked** - Document is split into manageable chunks
|
||||
3. **Claude analyzes** - Causal relationships are extracted
|
||||
4. **Relations are written** - Relationships are stored in Neo4j
|
||||
5. **Query returns results** - Neo4j query shows relationships
|
||||
|
||||
## Verification Steps
|
||||
|
||||
1. **Check service logs**:
|
||||
```bash
|
||||
docker-compose logs multi-document-upload-service | grep -i "extracted\|relation\|neo4j"
|
||||
```
|
||||
|
||||
2. **Check job status**:
|
||||
```bash
|
||||
curl http://localhost:8000/api/multi-docs/jobs/{job_id}
|
||||
```
|
||||
Should show: `"processed_files": 1` and relations count > 0
|
||||
|
||||
3. **Check Neo4j**:
|
||||
```cypher
|
||||
MATCH (n:Concept)-[r:CAUSES]->(m:Concept)
|
||||
RETURN count(r) as relation_count
|
||||
```
|
||||
|
||||
## Improvements Made
|
||||
|
||||
1. ✅ **Added PDF dependencies** - `unstructured[pdf]`, `unstructured[docx]`, etc.
|
||||
2. ✅ **Added fallback extractors** - Uses `pdfplumber` if unstructured fails (see the sketch after this list)
|
||||
3. ✅ **Better error handling** - Shows actual errors in job status
|
||||
4. ✅ **Improved logging** - More detailed logs for debugging
|
||||
5. ✅ **Better Neo4j query** - Validates data before writing
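
For reference, the fallback-extractor idea above can be sketched as follows. This is an illustrative outline only, not the service's exact code; it assumes `unstructured` and `pdfplumber` are installed as listed in `requirements.txt`.

```python
from pathlib import Path

def extract_pdf_text(path: Path) -> str:
    """Try unstructured first; fall back to pdfplumber if it fails."""
    try:
        from unstructured.partition.pdf import partition_pdf
        elements = partition_pdf(filename=str(path))
        return "\n".join(el.text for el in elements if getattr(el, "text", ""))
    except Exception as exc:  # missing extras, parser errors, etc.
        print(f"unstructured failed ({exc}); falling back to pdfplumber")
        import pdfplumber
        with pdfplumber.open(path) as pdf:
            return "\n".join(page.extract_text() or "" for page in pdf.pages)
```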
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
If you still see 0 relations after rebuilding:
|
||||
|
||||
1. **Check extraction logs**:
|
||||
```bash
|
||||
docker-compose logs multi-document-upload-service | grep -i "extract"
|
||||
```
|
||||
|
||||
2. **Check Claude analysis**:
|
||||
```bash
|
||||
docker-compose logs multi-document-upload-service | grep -i "claude\|analyze"
|
||||
```
|
||||
|
||||
3. **Check Neo4j connection**:
|
||||
```bash
|
||||
docker-compose logs multi-document-upload-service | grep -i "neo4j\|graph"
|
||||
```
|
||||
|
||||
4. **Verify document has causal language**:
|
||||
- Not all documents contain causal relationships
|
||||
- Try uploading a document with clear cause-effect statements
|
||||
- Example: "Smoking causes lung cancer" or "Rain causes flooding"
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. Rebuild the service with new dependencies
|
||||
2. Re-upload documents
|
||||
3. Check Neo4j for relationships
|
||||
4. If still no results, check service logs for errors
|
||||
5. Verify the document contains causal language
|
||||
|
||||
@ -1,176 +0,0 @@
|
||||
# Neo4j Diagnostic Queries
|
||||
|
||||
## Issue: No relationships found in Neo4j
|
||||
|
||||
If you're seeing "(no changes, no records)" when querying for `CAUSES` relationships, here are diagnostic queries to check what's actually in the database.
|
||||
|
||||
## Diagnostic Queries
|
||||
|
||||
### 1. Check if any nodes exist
|
||||
```cypher
|
||||
MATCH (n)
|
||||
RETURN count(n) as node_count
|
||||
LIMIT 1
|
||||
```
|
||||
|
||||
### 2. Check if Concept nodes exist
|
||||
```cypher
|
||||
MATCH (n:Concept)
|
||||
RETURN count(n) as concept_count,
|
||||
collect(DISTINCT labels(n)) as labels,
|
||||
collect(DISTINCT keys(n)) as properties
|
||||
LIMIT 10
|
||||
```
|
||||
|
||||
### 3. Check all relationship types
|
||||
```cypher
|
||||
CALL db.relationshipTypes() YIELD relationshipType
|
||||
RETURN relationshipType
|
||||
```
|
||||
|
||||
### 4. Check all node labels
|
||||
```cypher
|
||||
CALL db.labels() YIELD label
|
||||
RETURN label
|
||||
```
|
||||
|
||||
### 5. Check all relationships (any type)
|
||||
```cypher
|
||||
MATCH (n)-[r]->(m)
|
||||
RETURN type(r) as relationship_type,
|
||||
count(r) as count,
|
||||
labels(n) as from_labels,
|
||||
labels(m) as to_labels
|
||||
LIMIT 50
|
||||
```
|
||||
|
||||
### 6. Check for CAUSES relationships specifically
|
||||
```cypher
|
||||
MATCH (n)-[r:CAUSES]->(m)
|
||||
RETURN n, r, m
|
||||
LIMIT 50
|
||||
```
|
||||
|
||||
### 7. Check for relationships with lowercase "causes"
|
||||
```cypher
|
||||
MATCH (n)-[r]->(m)
|
||||
WHERE type(r) =~ '(?i)causes'
|
||||
RETURN type(r) as relationship_type, n, r, m
|
||||
LIMIT 50
|
||||
```
|
||||
|
||||
### 8. Check all nodes and their relationships
|
||||
```cypher
|
||||
MATCH (n)
|
||||
OPTIONAL MATCH (n)-[r]->(m)
|
||||
RETURN n, labels(n) as node_labels,
|
||||
type(r) as relationship_type,
|
||||
m, labels(m) as target_labels
|
||||
LIMIT 50
|
||||
```
|
||||
|
||||
### 9. Check for nodes created by the service (by job_id property)
|
||||
```cypher
|
||||
MATCH (n)-[r]->(m)
|
||||
WHERE r.job_id IS NOT NULL
|
||||
RETURN n, r, m, r.job_id as job_id
|
||||
LIMIT 50
|
||||
```
|
||||
|
||||
### 10. Check database statistics
|
||||
```cypher
|
||||
MATCH (n)
OPTIONAL MATCH (n)-[r]->()
RETURN count(DISTINCT n) as total_nodes,
       count(r) as total_relationships
|
||||
```
|
||||
|
||||
## Common Issues and Solutions
|
||||
|
||||
### Issue 1: No nodes at all
|
||||
**Symptom**: Query 1 returns 0 nodes
|
||||
**Cause**: Service hasn't written anything to Neo4j, or connection failed
|
||||
**Solution**:
|
||||
- Check service logs: `docker-compose logs multi-document-upload-service`
|
||||
- Verify Neo4j connection in service configuration
|
||||
- Check if job completed with 0 relations (extraction failed)
|
||||
|
||||
### Issue 2: Nodes exist but no relationships
|
||||
**Symptom**: Query 1 returns nodes, but Query 6 returns no relationships
|
||||
**Cause**: Relationships weren't created, or different relationship type
|
||||
**Solution**:
|
||||
- Check Query 5 to see what relationship types actually exist
|
||||
- Check service logs for graph writing errors
|
||||
- Verify the job actually extracted relations (check job status)
|
||||
|
||||
### Issue 3: Different relationship type
|
||||
**Symptom**: Query 5 shows relationships but not `CAUSES`
|
||||
**Cause**: Service might be using a different relationship type
|
||||
**Solution**:
|
||||
- Check Query 3 to see all relationship types
|
||||
- Update query to use the correct relationship type
|
||||
|
||||
### Issue 4: Different node labels
|
||||
**Symptom**: Query 6 returns no results, but Query 2 shows different labels
|
||||
**Cause**: Service might be using different node labels
|
||||
**Solution**:
|
||||
- Check Query 2 to see what labels exist
|
||||
- Update query to match actual labels
|
||||
|
||||
## Expected Structure
|
||||
|
||||
After a successful upload, you should see:
|
||||
|
||||
### Nodes
|
||||
- **Label**: `Concept`
|
||||
- **Properties**: `name`, `lastSeen`
|
||||
|
||||
### Relationships
|
||||
- **Type**: `CAUSES`
|
||||
- **Properties**: `confidence`, `explanation`, `source_file_id`, `source_snippet`, `job_id`, `model`, `updated_at`
|
||||
|
||||
### Example Query
|
||||
```cypher
|
||||
MATCH (cause:Concept)-[r:CAUSES]->(effect:Concept)
|
||||
RETURN cause.name as cause,
|
||||
effect.name as effect,
|
||||
r.confidence as confidence,
|
||||
r.job_id as job_id,
|
||||
r.source_file_id as source_file
|
||||
LIMIT 50
|
||||
```
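
To see how data with this structure can be written, here is a hedged sketch of a MERGE using the Python `neo4j` driver. It mirrors the label, relationship type, and property names listed above, but it is an illustration, not necessarily the exact query the service runs.

```python
from neo4j import GraphDatabase  # pip install neo4j

def write_relation(driver, cause: str, effect: str, confidence: float, job_id: str) -> None:
    """MERGE two Concept nodes and a CAUSES relationship between them."""
    driver.execute_query(
        """
        MERGE (c:Concept {name: $cause})
        MERGE (e:Concept {name: $effect})
        MERGE (c)-[r:CAUSES]->(e)
        SET r.confidence = $confidence, r.job_id = $job_id, r.updated_at = datetime()
        """,
        cause=cause, effect=effect, confidence=confidence, job_id=job_id,
    )

with GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password")) as driver:
    write_relation(driver, "Heavy rain", "Flooding", 0.9, "example-job-id")
```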
|
||||
|
||||
## Troubleshooting Steps
|
||||
|
||||
1. **Check service logs**:
|
||||
```bash
|
||||
docker-compose logs -f multi-document-upload-service
|
||||
```
|
||||
|
||||
2. **Check if job completed successfully**:
|
||||
```bash
|
||||
curl http://localhost:8000/api/multi-docs/jobs/{job_id}
|
||||
```
|
||||
|
||||
3. **Check Neo4j connection**:
|
||||
```bash
|
||||
docker-compose logs neo4j | grep -i error
|
||||
```
|
||||
|
||||
4. **Verify Neo4j is running**:
|
||||
```bash
|
||||
docker-compose ps neo4j
|
||||
```
|
||||
|
||||
5. **Test Neo4j connection manually**:
|
||||
```bash
|
||||
docker-compose exec neo4j cypher-shell -u neo4j -p password "MATCH (n) RETURN count(n)"
|
||||
```
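
The same check can be scripted; a short sketch with the Python `neo4j` driver and the same credentials:

```python
from neo4j import GraphDatabase  # pip install neo4j

with GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password")) as driver:
    driver.verify_connectivity()  # raises if Neo4j is unreachable or auth fails
    records, _, _ = driver.execute_query("CALL db.labels() YIELD label RETURN label")
    print("Labels:", [r["label"] for r in records])
    records, _, _ = driver.execute_query(
        "CALL db.relationshipTypes() YIELD relationshipType RETURN relationshipType"
    )
    print("Relationship types:", [r["relationshipType"] for r in records])
```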
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. Run the diagnostic queries above
|
||||
2. Check the service logs for errors
|
||||
3. Verify the job status via API
|
||||
4. Re-upload documents after fixing dependencies
|
||||
5. Check if relations were actually extracted (job status should show relation count)
|
||||
|
||||
@ -1,85 +0,0 @@
|
||||
# Quick Testing Guide - Multi-Document Upload
|
||||
|
||||
## 🚀 Quick Start Testing
|
||||
|
||||
### 1. Start Services
|
||||
```bash
|
||||
cd /home/tech4biz/Desktop/prakash/codenuk/backend_new1/codenuk_backend_mine
|
||||
docker-compose up -d multi-document-upload-service neo4j redis postgres api-gateway
|
||||
```
|
||||
|
||||
### 2. Verify Services
|
||||
```bash
|
||||
# Check health
|
||||
curl http://localhost:8024/health
|
||||
curl http://localhost:8000/api/multi-docs/health
|
||||
```
|
||||
|
||||
### 3. Test via Frontend
|
||||
|
||||
1. **Open Frontend**: `http://localhost:3001`
|
||||
2. **Login** (if required)
|
||||
3. **Go to Project Builder**
|
||||
4. **Complete Steps 1-2** (Project Type & Features)
|
||||
5. **Step 3: Multi Docs Upload** appears
|
||||
6. **Upload files**:
|
||||
- Click upload area
|
||||
- Select multiple files (PDF, DOCX, etc.)
|
||||
- Click "Start Upload"
|
||||
7. **Watch Progress**:
|
||||
- Progress bar updates
|
||||
- Status messages appear
|
||||
- Polls every 4 seconds
|
||||
8. **Auto-proceeds** when completed
|
||||
|
||||
### 4. Verify in Neo4j
|
||||
|
||||
```bash
|
||||
# Open Neo4j Browser: http://localhost:7474
|
||||
# Login: neo4j / password
|
||||
|
||||
# Query causal relationships:
|
||||
MATCH (n)-[r:CAUSES]->(m)
|
||||
RETURN n, r, m
|
||||
LIMIT 50
|
||||
```
|
||||
|
||||
## 📝 Test Checklist
|
||||
|
||||
- [ ] Service starts successfully
|
||||
- [ ] Health endpoint works
|
||||
- [ ] Frontend component renders
|
||||
- [ ] File upload works
|
||||
- [ ] Progress updates correctly
|
||||
- [ ] Job completes successfully
|
||||
- [ ] Neo4j graph contains relationships
|
||||
- [ ] Error handling works
|
||||
- [ ] Skip button works
|
||||
|
||||
## 🔍 Debug Commands
|
||||
|
||||
```bash
|
||||
# View service logs
|
||||
docker-compose logs -f multi-document-upload-service
|
||||
|
||||
# Check job status (replace {job_id})
|
||||
curl http://localhost:8000/api/multi-docs/jobs/{job_id}
|
||||
|
||||
# Check graph summary
|
||||
curl http://localhost:8000/api/multi-docs/jobs/{job_id}/graph
|
||||
```
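
These checks can also be scripted. Below is a minimal polling sketch using `requests` against the gateway endpoints shown above; the terminal stage names are assumptions based on the status messages in these docs, so adjust them to match your service.

```python
import time
import requests  # pip install requests

GATEWAY = "http://localhost:8000"
JOB_ID = "replace-with-your-job-id"  # returned by POST /api/multi-docs/jobs

while True:
    job = requests.get(f"{GATEWAY}/api/multi-docs/jobs/{JOB_ID}", timeout=10).json()
    print(job.get("stage"), "-", job.get("status_message"),
          f"({job.get('processed_files')}/{job.get('total_files')} files)")
    if job.get("error") or job.get("stage") in ("completed", "failed"):  # assumed terminal stages
        break
    time.sleep(4)  # same 4-second interval the frontend uses

graph = requests.get(f"{GATEWAY}/api/multi-docs/jobs/{JOB_ID}/graph", timeout=10).json()
print("Graph summary:", graph)
```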
|
||||
|
||||
## ⚠️ Common Issues
|
||||
|
||||
1. **502 Bad Gateway**: Service not running → `docker-compose ps`
|
||||
2. **413 Too Large**: File too big → Reduce file size
|
||||
3. **No progress**: Check browser console → Check network tab
|
||||
4. **No relationships**: Check Claude API key → Check service logs
|
||||
|
||||
## 🎯 Expected Flow
|
||||
|
||||
```
|
||||
Upload Files → Job Created → Files Saved → Content Extracted →
|
||||
Claude Analysis → Graph Built → Completed → Auto-proceed to Next Step
|
||||
```
|
||||
|
||||
File diff suppressed because it is too large
@ -1,152 +0,0 @@
|
||||
# Rebuild Instructions - Multi-Document Upload Service
|
||||
|
||||
## Issue: Empty Graph in Neo4j
|
||||
|
||||
**Problem**: Query returns "(no changes, no records)" because the job completed with 0 relations.
|
||||
|
||||
**Root Cause**: PDF extraction failed due to missing dependencies (`unstructured[pdf]`).
|
||||
|
||||
## Fixes Applied
|
||||
|
||||
1. ✅ Added PDF dependencies (`unstructured[pdf]`, `unstructured[docx]`, etc.)
|
||||
2. ✅ Added fallback extractors (pdfplumber, python-docx, python-pptx)
|
||||
3. ✅ Improved error handling and logging
|
||||
4. ✅ Fixed Neo4j query syntax
|
||||
5. ✅ Better status messages
|
||||
|
||||
## Rebuild Steps
|
||||
|
||||
### Step 1: Rebuild the Service
|
||||
|
||||
```bash
|
||||
cd /home/tech4biz/Desktop/prakash/codenuk/backend_new1/codenuk_backend_mine
|
||||
|
||||
# Stop the service
|
||||
docker-compose stop multi-document-upload-service
|
||||
|
||||
# Rebuild with new dependencies
|
||||
docker-compose build --no-cache multi-document-upload-service
|
||||
|
||||
# Start the service
|
||||
docker-compose up -d multi-document-upload-service
|
||||
|
||||
# Check logs to verify it's starting correctly
|
||||
docker-compose logs -f multi-document-upload-service
|
||||
```
|
||||
|
||||
### Step 2: Verify Dependencies
|
||||
|
||||
```bash
|
||||
# Check if unstructured[pdf] is installed
|
||||
docker-compose exec multi-document-upload-service pip list | grep unstructured
|
||||
|
||||
# You should see unstructured listed, along with the extra
# packages pulled in by the pdf/docx extras
|
||||
```
|
||||
|
||||
### Step 3: Test the Service
|
||||
|
||||
```bash
|
||||
# Check health endpoint
|
||||
curl http://localhost:8024/health
|
||||
|
||||
# Should return:
|
||||
# {
|
||||
# "status": "ok",
|
||||
# "claude_model": "claude-3-5-haiku-latest",
|
||||
# ...
|
||||
# }
|
||||
```
|
||||
|
||||
### Step 4: Re-upload Documents
|
||||
|
||||
1. Open frontend: `http://localhost:3001/project-builder`
|
||||
2. Go to Step 1: Project Type
|
||||
3. Find "Upload Documents for Knowledge Graph" section
|
||||
4. Upload a PDF or other document
|
||||
5. Wait for processing to complete
|
||||
6. Check status - should show relation count > 0
|
||||
|
||||
### Step 5: Verify in Neo4j
|
||||
|
||||
Run these queries in Neo4j Browser (`http://localhost:7474`):
|
||||
|
||||
```cypher
|
||||
// Check if any nodes exist
|
||||
MATCH (n)
|
||||
RETURN count(n) as node_count
|
||||
|
||||
// Check for CAUSES relationships
|
||||
MATCH (n:Concept)-[r:CAUSES]->(m:Concept)
|
||||
RETURN n.name as cause,
|
||||
m.name as effect,
|
||||
r.confidence as confidence,
|
||||
r.job_id as job_id
|
||||
LIMIT 50
|
||||
```
|
||||
|
||||
## Expected Results
|
||||
|
||||
After rebuilding and re-uploading:
|
||||
|
||||
1. **PDF extraction succeeds** ✅
|
||||
2. **Text is extracted** ✅
|
||||
3. **Relations are extracted** ✅
|
||||
4. **Relations are written to Neo4j** ✅
|
||||
5. **Query returns results** ✅
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
If you still see 0 relations:
|
||||
|
||||
1. **Check service logs**:
|
||||
```bash
|
||||
docker-compose logs multi-document-upload-service | tail -50
|
||||
```
|
||||
|
||||
2. **Check extraction logs**:
|
||||
```bash
|
||||
docker-compose logs multi-document-upload-service | grep -i "extract\|pdf"
|
||||
```
|
||||
|
||||
3. **Check Claude analysis**:
|
||||
```bash
|
||||
docker-compose logs multi-document-upload-service | grep -i "claude\|analyze\|relation"
|
||||
```
|
||||
|
||||
4. **Check Neo4j connection**:
|
||||
```bash
|
||||
docker-compose logs multi-document-upload-service | grep -i "neo4j\|graph\|write"
|
||||
```
|
||||
|
||||
5. **Verify document has causal language**:
|
||||
- Not all documents contain causal relationships
|
||||
- Try uploading a document with clear cause-effect statements
|
||||
- Example: "Smoking causes lung cancer"
|
||||
|
||||
## Quick Test
|
||||
|
||||
Test with a simple text file:
|
||||
|
||||
1. Create a test file `test_causal.txt`:
|
||||
```
|
||||
Smoking cigarettes causes lung cancer.
|
||||
Heavy rain causes flooding.
|
||||
Exercise improves health.
|
||||
```
|
||||
|
||||
2. Upload it via the frontend
|
||||
3. Check Neo4j for relationships
|
||||
4. Should see 3 causal relationships
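
If you prefer to run this quick test without the frontend, a minimal upload sketch with `requests` is shown below; it posts the file to the gateway endpoint used throughout these docs (the optional `job_name` field follows the API reference in the testing guide).

```python
import requests  # pip install requests

GATEWAY = "http://localhost:8000"

# Write the sample causal sentences to a file, then upload it.
with open("test_causal.txt", "w") as fh:
    fh.write("Smoking cigarettes causes lung cancer.\n"
             "Heavy rain causes flooding.\n"
             "Exercise improves health.\n")

with open("test_causal.txt", "rb") as fh:
    resp = requests.post(
        f"{GATEWAY}/api/multi-docs/jobs",
        files=[("files", ("test_causal.txt", fh, "text/plain"))],
        data={"job_name": "quick-causal-test"},
        timeout=60,
    )
resp.raise_for_status()
print("Job created:", resp.json().get("job_id"))
```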
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. Rebuild the service
|
||||
2. Re-upload documents
|
||||
3. Check Neo4j for relationships
|
||||
4. If still no results, check service logs
|
||||
5. Verify the document contains causal language
|
||||
|
||||
@ -1,300 +0,0 @@
|
||||
# Multi-Document Upload Service - Frontend Testing Guide
|
||||
|
||||
## Prerequisites
|
||||
|
||||
1. **Backend Services Running**:
|
||||
```bash
|
||||
cd /home/tech4biz/Desktop/prakash/codenuk/backend_new1/codenuk_backend_mine
|
||||
docker-compose up -d
|
||||
```
|
||||
|
||||
2. **Verify Services are Running**:
|
||||
- API Gateway: `http://localhost:8000/health`
|
||||
- Multi-Document Upload Service: `http://localhost:8024/health`
|
||||
- Neo4j: `http://localhost:7474` (Browser interface)
|
||||
- Frontend: `http://localhost:3001` (or your frontend port)
|
||||
|
||||
3. **Check Service Health**:
|
||||
```bash
|
||||
# Check API Gateway
|
||||
curl http://localhost:8000/health
|
||||
|
||||
# Check Multi-Document Upload Service directly
|
||||
curl http://localhost:8024/health
|
||||
|
||||
# Check via API Gateway proxy
|
||||
curl http://localhost:8000/api/multi-docs/health
|
||||
```
|
||||
|
||||
## Frontend Testing Steps
|
||||
|
||||
### Step 1: Navigate to Project Builder
|
||||
|
||||
1. Open your browser and go to: `http://localhost:3001` (or your frontend URL)
|
||||
2. Log in if required
|
||||
3. Click on **"Project Builder"** in the navigation
|
||||
|
||||
### Step 2: Go to Multi Docs Upload Step
|
||||
|
||||
1. In the Project Builder, you should see the workflow steps:
|
||||
- **Step 1**: Project Type
|
||||
- **Step 2**: Features
|
||||
- **Step 3**: Multi Docs Upload ← **This is the new step**
|
||||
- **Step 4**: Business Context
|
||||
- **Step 5**: Generate
|
||||
- **Step 6**: Architecture
|
||||
|
||||
2. Complete Steps 1 and 2 (Project Type and Features selection)
|
||||
3. You will automatically be taken to **Step 3: Multi Docs Upload**
|
||||
|
||||
### Step 3: Upload Documents
|
||||
|
||||
1. **Click on the upload area** or **drag and drop files**
|
||||
2. **Select multiple files** (you can mix different formats):
|
||||
- PDF files (`.pdf`)
|
||||
- Word documents (`.doc`, `.docx`)
|
||||
- PowerPoint (`.ppt`, `.pptx`)
|
||||
- Excel files (`.xls`, `.xlsx`)
|
||||
- JSON files (`.json`)
|
||||
- XML files (`.xml`)
|
||||
- Markdown files (`.md`)
|
||||
- Images (`.png`, `.jpg`, `.jpeg`) - will use OCR
|
||||
- Audio files (`.mp3`, `.wav`) - will be transcribed
|
||||
- Video files (`.mp4`, `.avi`) - will be transcribed
|
||||
|
||||
3. **View selected files**: You should see a list of all selected files with:
|
||||
- File icon
|
||||
- File name
|
||||
- Remove button for each file
|
||||
|
||||
4. **Click "Start Upload"** button
|
||||
|
||||
### Step 4: Monitor Upload Progress
|
||||
|
||||
After clicking "Start Upload", you should see:
|
||||
|
||||
1. **Upload Status**:
|
||||
- Button shows "Uploading..." with spinner
|
||||
- Progress bar appears
|
||||
- Stage messages appear:
|
||||
- "Job received"
|
||||
- "Saving files"
|
||||
- "Extracting document content"
|
||||
- "Calling Claude for causal relations"
|
||||
- "Writing to Neo4j knowledge graph"
|
||||
- "Completed"
|
||||
|
||||
2. **Progress Indicators**:
|
||||
- Progress percentage (0-100%)
|
||||
- Status message showing current stage
|
||||
- Processed files count vs total files count
|
||||
|
||||
3. **Polling**: The frontend automatically polls the job status every 4 seconds
|
||||
|
||||
### Step 5: Verify Results
|
||||
|
||||
Once the job is completed:
|
||||
|
||||
1. **Check Neo4j Graph**:
|
||||
- Open Neo4j Browser: `http://localhost:7474`
|
||||
- Login with:
|
||||
- Username: `neo4j`
|
||||
- Password: `password`
|
||||
- Run Cypher query to see the graph:
|
||||
```cypher
|
||||
MATCH (n)-[r:CAUSES]->(m)
|
||||
RETURN n, r, m
|
||||
LIMIT 50
|
||||
```
|
||||
|
||||
2. **Check Job Status via API**:
|
||||
```bash
|
||||
# Replace {job_id} with the actual job ID from the frontend
|
||||
curl http://localhost:8000/api/multi-docs/jobs/{job_id}
|
||||
```
|
||||
|
||||
3. **Get Graph Summary**:
|
||||
```bash
|
||||
curl http://localhost:8000/api/multi-docs/jobs/{job_id}/graph
|
||||
```
|
||||
|
||||
## Testing Different Scenarios
|
||||
|
||||
### Scenario 1: Single PDF File
|
||||
- Upload one PDF file
|
||||
- Verify it processes correctly
|
||||
- Check Neo4j for causal relationships
|
||||
|
||||
### Scenario 2: Multiple Mixed Format Files
|
||||
- Upload 3-5 files of different formats (PDF, DOCX, JSON, image)
|
||||
- Verify all files are processed
|
||||
- Check that progress updates correctly
|
||||
|
||||
### Scenario 3: Large Files
|
||||
- Upload a large PDF (10+ MB)
|
||||
- Verify it handles large files correctly
|
||||
- Check processing time
|
||||
|
||||
### Scenario 4: Error Handling
|
||||
- Try uploading an unsupported file type
|
||||
- Verify error message appears
|
||||
- Check that the error is displayed clearly
|
||||
|
||||
### Scenario 5: Skip Option
|
||||
- Upload files
|
||||
- Click "Skip" button before completion
|
||||
- Verify you can proceed to the next step
|
||||
- Job continues processing in the background
|
||||
|
||||
## Browser Developer Tools
|
||||
|
||||
### Check Network Requests
|
||||
|
||||
1. **Open Developer Tools** (F12)
|
||||
2. **Go to Network tab**
|
||||
3. **Filter by "multi-docs"**
|
||||
4. **Monitor requests**:
|
||||
- `POST /api/multi-docs/jobs` - Upload files
|
||||
- `GET /api/multi-docs/jobs/{job_id}` - Poll job status
|
||||
- `GET /api/multi-docs/jobs/{job_id}/graph` - Get graph summary
|
||||
|
||||
### Check Console Logs
|
||||
|
||||
1. **Open Console tab**
|
||||
2. **Look for**:
|
||||
- Upload progress logs
|
||||
- Job status updates
|
||||
- Any error messages
|
||||
|
||||
### Check Response Data
|
||||
|
||||
Verify the API responses:
|
||||
|
||||
```javascript
|
||||
// Upload response should be:
|
||||
{
|
||||
"job_id": "uuid-here",
|
||||
"stage": "received",
|
||||
"total_files": 3,
|
||||
"created_at": "2024-01-01T00:00:00Z"
|
||||
}
|
||||
|
||||
// Status response should be:
|
||||
{
|
||||
"job_id": "uuid-here",
|
||||
"stage": "extracting",
|
||||
"status_message": "Extracting document content",
|
||||
"total_files": 3,
|
||||
"processed_files": 1,
|
||||
"error": null,
|
||||
"created_at": "2024-01-01T00:00:00Z",
|
||||
"updated_at": "2024-01-01T00:01:00Z",
|
||||
"files": [...]
|
||||
}
|
||||
```
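
If you are writing a small Python client around these endpoints, the response shapes above can be captured as typed structures; a sketch with field names taken directly from the examples:

```python
from typing import List, Optional, TypedDict

class UploadResponse(TypedDict):
    job_id: str
    stage: str
    total_files: int
    created_at: str

class JobStatus(TypedDict):
    job_id: str
    stage: str
    status_message: str
    total_files: int
    processed_files: int
    error: Optional[str]
    created_at: str
    updated_at: str
    files: List[dict]
```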
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Issue: Upload fails with 502 Bad Gateway
|
||||
**Solution**:
|
||||
- Check if multi-document-upload-service is running:
|
||||
```bash
|
||||
docker-compose ps multi-document-upload-service
|
||||
```
|
||||
- Check service logs:
|
||||
```bash
|
||||
docker-compose logs multi-document-upload-service
|
||||
```
|
||||
|
||||
### Issue: Upload fails with 413 Request Entity Too Large
|
||||
**Solution**:
|
||||
- Check file sizes (max 500MB total per job)
|
||||
- Reduce number of files or file sizes
|
||||
- Check API Gateway body size limits
|
||||
|
||||
### Issue: Status polling stops working
|
||||
**Solution**:
|
||||
- Check browser console for errors
|
||||
- Verify job ID is correct
|
||||
- Check if job completed or failed
|
||||
- Check network tab for failed requests
|
||||
|
||||
### Issue: No causal relationships found
|
||||
**Solution**:
|
||||
- Check Claude API key is configured correctly
|
||||
- Check service logs for Claude API errors
|
||||
- Verify documents contain causal language
|
||||
- Check Neo4j connection
|
||||
|
||||
### Issue: Frontend shows "Failed" status
|
||||
**Solution**:
|
||||
- Check the error message in the frontend
|
||||
- Check backend service logs:
|
||||
```bash
|
||||
docker-compose logs -f multi-document-upload-service
|
||||
```
|
||||
- Verify all dependencies are running (Neo4j, Redis, Postgres)
|
||||
|
||||
## Expected Behavior
|
||||
|
||||
### Successful Flow:
|
||||
1. ✅ Files upload successfully
|
||||
2. ✅ Job ID is returned
|
||||
3. ✅ Status polling starts automatically
|
||||
4. ✅ Progress updates every 4 seconds
|
||||
5. ✅ Stage changes are displayed
|
||||
6. ✅ Progress bar updates
|
||||
7. ✅ Job completes successfully
|
||||
8. ✅ Frontend automatically proceeds to next step
|
||||
9. ✅ Neo4j contains causal relationships
|
||||
|
||||
### Error Flow:
|
||||
1. ✅ Error message is displayed clearly
|
||||
2. ✅ User can retry upload
|
||||
3. ✅ User can skip and proceed
|
||||
4. ✅ Error details are logged in console
|
||||
|
||||
## API Endpoints Reference
|
||||
|
||||
### Upload Files
|
||||
```bash
|
||||
POST /api/multi-docs/jobs
|
||||
Content-Type: multipart/form-data
|
||||
|
||||
Form Data:
|
||||
- files: File[] (multiple files)
|
||||
- job_name: string (optional)
|
||||
```
|
||||
|
||||
### Get Job Status
|
||||
```bash
|
||||
GET /api/multi-docs/jobs/{job_id}
|
||||
```
|
||||
|
||||
### Get Graph Summary
|
||||
```bash
|
||||
GET /api/multi-docs/jobs/{job_id}/graph
|
||||
```
|
||||
|
||||
### Health Check
|
||||
```bash
|
||||
GET /api/multi-docs/health
|
||||
```
|
||||
|
||||
## Next Steps After Testing
|
||||
|
||||
1. **Verify Neo4j Graph**: Check that causal relationships are stored correctly
|
||||
2. **Check Storage**: Verify files are stored in the persistent volume
|
||||
3. **Monitor Performance**: Check processing times for different file types
|
||||
4. **Test Error Scenarios**: Verify error handling works correctly
|
||||
5. **Test Large Batches**: Upload 50+ files to test scalability
|
||||
|
||||
## Support
|
||||
|
||||
If you encounter issues:
|
||||
1. Check service logs: `docker-compose logs multi-document-upload-service`
|
||||
2. Check API Gateway logs: `docker-compose logs api-gateway`
|
||||
3. Check Neo4j logs: `docker-compose logs neo4j`
|
||||
4. Verify all environment variables are set correctly
|
||||
5. Check network connectivity between services
|
||||
|
||||
@ -8,10 +8,6 @@ pydantic-settings>=2.2.1
|
||||
aiofiles>=23.2.1
|
||||
tenacity>=8.2.3
|
||||
python-dotenv>=1.0.1
|
||||
unstructured[pdf]>=0.15.0
|
||||
unstructured[docx]>=0.15.0
|
||||
unstructured[pptx]>=0.15.0
|
||||
unstructured[xlsx]>=0.15.0
|
||||
pdfplumber>=0.11.0
|
||||
python-docx>=1.1.0
|
||||
python-pptx>=0.6.23
|
||||
@ -30,5 +26,13 @@ beautifulsoup4>=4.12.3
|
||||
lxml>=5.2.1
|
||||
sqlalchemy>=2.0.25
|
||||
httpx>=0.27.0
|
||||
tiktoken>=0.7.0
|
||||
dowhy>=0.11.0
|
||||
qdrant-client>=1.7.0
|
||||
sentence-transformers>=2.2.0
|
||||
numpy>=1.24.0
|
||||
scipy>=1.11.0
|
||||
networkx>=3.1
|
||||
spacy>=3.7.0
|
||||
markdown>=3.5.0
|
||||
weasyprint>=60.0
|
||||
|
||||
|
||||
@ -1,328 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List
|
||||
|
||||
from anthropic import Anthropic, BadRequestError
|
||||
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential, RetryCallState
|
||||
|
||||
from .models import CausalRelation
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def is_billing_error(exception: Exception) -> bool:
|
||||
"""Check if the exception is a billing/credit related error that shouldn't be retried."""
|
||||
if isinstance(exception, BadRequestError):
|
||||
error_message = str(exception).lower()
|
||||
billing_keywords = ["credit", "balance", "too low", "billing", "upgrade", "purchase credits"]
|
||||
return any(keyword in error_message for keyword in billing_keywords)
|
||||
return False
|
||||
|
||||
|
||||
def should_retry_exception(retry_state: RetryCallState) -> bool:
|
||||
"""Custom retry condition that excludes billing errors."""
|
||||
exception = retry_state.outcome.exception()
|
||||
if exception is None:
|
||||
return False
|
||||
# Don't retry billing errors - they won't be resolved by retrying
|
||||
if is_billing_error(exception):
|
||||
return False
|
||||
# Retry other exceptions
|
||||
return True
|
||||
|
||||
|
||||
CLAUDE_PROMPT_TEMPLATE = """You are an expert analyst extracting causal relationships from documents.
|
||||
|
||||
Given the following text chunk, identify all explicit or strongly implied cause and effect pairs.
|
||||
Return JSON with the schema:
|
||||
[
|
||||
{
|
||||
"cause": "<short phrase>",
|
||||
"effect": "<short phrase>",
|
||||
"confidence": 0-1 float,
|
||||
"explanation": "<why this is causal>",
|
||||
"source_snippet": "<exact quote or paraphrase>"
|
||||
}
|
||||
]
|
||||
|
||||
Only include items when the causal direction is clear.
|
||||
If none are found, return an empty list [].
|
||||
|
||||
Text chunk:
|
||||
```
|
||||
<<<CHUNK_PLACEHOLDER>>>
|
||||
```"""
|
||||
|
||||
IMAGE_PROMPT_TEMPLATE = """You are an expert analyst extracting causal relationships from images, diagrams, and visual content.
|
||||
|
||||
Analyze this image/diagram for causal relationships. Look for:
|
||||
- Architecture flows (A → B → C)
|
||||
- Dependency relationships
|
||||
- Cause-effect chains in diagrams
|
||||
- Process flows
|
||||
- System interactions
|
||||
- Data flows
|
||||
- Sequential relationships
|
||||
- Visual connections between components
|
||||
|
||||
Return JSON with the schema:
|
||||
[
|
||||
{
|
||||
"cause": "<short phrase describing the cause>",
|
||||
"effect": "<short phrase describing the effect>",
|
||||
"confidence": 0-1 float,
|
||||
"explanation": "<why this is causal, referencing visual elements>",
|
||||
"source_snippet": "<description of what you see in the image that shows this relationship>"
|
||||
}
|
||||
]
|
||||
|
||||
Only include items when the causal direction is clear from the visual structure.
|
||||
If none are found, return an empty list []."""
|
||||
|
||||
|
||||
class ClaudeCausalExtractor:
|
||||
def __init__(self, api_key: str, model: str, max_output_tokens: int = 4000):
|
||||
self.client = Anthropic(api_key=api_key)
|
||||
self.model = model
|
||||
self.max_output_tokens = max_output_tokens
|
||||
|
||||
@retry(
|
||||
retry=should_retry_exception,
|
||||
wait=wait_exponential(multiplier=1, min=1, max=10),
|
||||
stop=stop_after_attempt(3),
|
||||
reraise=True,
|
||||
)
|
||||
def analyze_chunk(self, chunk: str, source_file_id: str) -> List[CausalRelation]:
|
||||
logger.debug("Analyzing chunk with Claude model %s", self.model)
|
||||
|
||||
# Validate chunk is not empty and is readable text
|
||||
if not chunk or not chunk.strip():
|
||||
logger.warning("Empty or whitespace-only chunk, skipping")
|
||||
return []
|
||||
|
||||
# Check if chunk contains mostly readable text (not binary data)
|
||||
# Simple heuristic: if >50% of characters are non-printable or control chars, skip it
|
||||
printable_chars = sum(1 for c in chunk if c.isprintable() or c.isspace())
|
||||
if len(chunk) > 100 and printable_chars / len(chunk) < 0.5:
|
||||
logger.warning("Chunk appears to contain binary data, skipping analysis")
|
||||
return []
|
||||
|
||||
# Use string replacement with a unique placeholder to avoid KeyError with braces in content
|
||||
# This prevents Python's .format() from interpreting braces in the chunk text as format placeholders
|
||||
prompt_text = CLAUDE_PROMPT_TEMPLATE.replace("<<<CHUNK_PLACEHOLDER>>>", chunk)
|
||||
|
||||
try:
|
||||
message = self.client.messages.create(
|
||||
model=self.model,
|
||||
max_tokens=self.max_output_tokens,
|
||||
temperature=0.0,
|
||||
system="You extract causal (cause→effect) relations with high precision.",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [{"type": "text", "text": prompt_text}],
|
||||
}
|
||||
],
|
||||
)
|
||||
except BadRequestError as e:
|
||||
# Check if it's a billing error
|
||||
if is_billing_error(e):
|
||||
error_msg = (
|
||||
"Anthropic API credit balance is too low. "
|
||||
"Please go to Plans & Billing to upgrade or purchase credits. "
|
||||
f"Error: {str(e)}"
|
||||
)
|
||||
logger.error(error_msg)
|
||||
raise RuntimeError(error_msg) from e
|
||||
# Re-raise other BadRequestErrors
|
||||
raise
|
||||
|
||||
content_blocks = message.content or []
|
||||
raw_text = "".join(block.text for block in content_blocks if hasattr(block, "text")) # type: ignore[attr-defined]
|
||||
if not raw_text:
|
||||
return []
|
||||
|
||||
# Try to extract JSON from markdown code blocks if present
|
||||
json_text = raw_text.strip()
|
||||
|
||||
# Look for JSON in markdown code blocks (```json ... ```)
|
||||
json_match = re.search(r'```(?:json)?\s*(\[.*?\])\s*```', json_text, re.DOTALL)
|
||||
if json_match:
|
||||
json_text = json_match.group(1)
|
||||
else:
|
||||
# Look for JSON array/object at the start or end
|
||||
json_match = re.search(r'(\[.*?\]|{.*?})', json_text, re.DOTALL)
|
||||
if json_match:
|
||||
json_text = json_match.group(1)
|
||||
|
||||
try:
|
||||
data = json.loads(json_text)
|
||||
if not isinstance(data, list):
|
||||
logger.warning("Claude response is not a list: %s", type(data))
|
||||
return []
|
||||
|
||||
relations: List[CausalRelation] = []
|
||||
for item in data:
|
||||
if not isinstance(item, dict):
|
||||
continue
|
||||
cause = item.get("cause", "").strip()
|
||||
effect = item.get("effect", "").strip()
|
||||
if not cause or not effect:
|
||||
continue # Skip invalid relations
|
||||
|
||||
relations.append(
|
||||
CausalRelation(
|
||||
cause=cause,
|
||||
effect=effect,
|
||||
confidence=float(item.get("confidence", 0.0)),
|
||||
explanation=item.get("explanation"),
|
||||
source_file_id=source_file_id,
|
||||
source_snippet=item.get("source_snippet"),
|
||||
metadata={"model": self.model},
|
||||
)
|
||||
)
|
||||
logger.info("Extracted %d relations from Claude response", len(relations))
|
||||
return relations
|
||||
except json.JSONDecodeError as e:
|
||||
logger.warning("Failed to parse Claude response as JSON: %s. Raw text: %s", e, raw_text[:200])
|
||||
return []
|
||||
|
||||
def analyze(self, chunks: Iterable[str], source_file_id: str) -> List[CausalRelation]:
|
||||
relations: List[CausalRelation] = []
|
||||
for chunk in chunks:
|
||||
relations.extend(self.analyze_chunk(chunk, source_file_id=source_file_id))
|
||||
return relations
|
||||
|
||||
@retry(
|
||||
retry=should_retry_exception,
|
||||
wait=wait_exponential(multiplier=1, min=1, max=10),
|
||||
stop=stop_after_attempt(3),
|
||||
reraise=True,
|
||||
)
|
||||
def analyze_image(self, image_path: Path, source_file_id: str) -> List[CausalRelation]:
|
||||
"""
|
||||
Analyze an image using Claude Vision API to extract causal relationships.
|
||||
Sends image directly to Claude (no OCR).
|
||||
"""
|
||||
logger.info("Analyzing image with Claude Vision: %s", image_path.name)
|
||||
|
||||
try:
|
||||
# Read and encode image as base64
|
||||
with open(image_path, "rb") as image_file:
|
||||
image_data = image_file.read()
|
||||
|
||||
# Determine media type
|
||||
suffix = image_path.suffix.lower()
|
||||
media_type_map = {
|
||||
".png": "image/png",
|
||||
".jpg": "image/jpeg",
|
||||
".jpeg": "image/jpeg",
|
||||
".gif": "image/gif",
|
||||
".webp": "image/webp",
|
||||
}
|
||||
media_type = media_type_map.get(suffix, "image/png")
|
||||
|
||||
# Encode to base64
|
||||
base64_image = base64.b64encode(image_data).decode("utf-8")
|
||||
|
||||
# Prepare content for Claude Vision API
|
||||
content = [
|
||||
{
|
||||
"type": "image",
|
||||
"source": {
|
||||
"type": "base64",
|
||||
"media_type": media_type,
|
||||
"data": base64_image,
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": IMAGE_PROMPT_TEMPLATE,
|
||||
},
|
||||
]
|
||||
|
||||
# Call Claude Vision API
|
||||
try:
|
||||
message = self.client.messages.create(
|
||||
model=self.model, # Claude models support vision
|
||||
max_tokens=self.max_output_tokens,
|
||||
temperature=0.0,
|
||||
system="You extract causal (cause→effect) relations from visual content with high precision.",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": content,
|
||||
}
|
||||
],
|
||||
)
|
||||
except BadRequestError as e:
|
||||
# Check if it's a billing error
|
||||
if is_billing_error(e):
|
||||
error_msg = (
|
||||
"Anthropic API credit balance is too low. "
|
||||
"Please go to Plans & Billing to upgrade or purchase credits. "
|
||||
f"Error: {str(e)}"
|
||||
)
|
||||
logger.error(error_msg)
|
||||
raise RuntimeError(error_msg) from e
|
||||
# Re-raise other BadRequestErrors
|
||||
raise
|
||||
|
||||
# Parse response
|
||||
content_blocks = message.content or []
|
||||
raw_text = "".join(block.text for block in content_blocks if hasattr(block, "text")) # type: ignore[attr-defined]
|
||||
if not raw_text:
|
||||
logger.warning("No text response from Claude Vision for image %s", image_path.name)
|
||||
return []
|
||||
|
||||
# Extract JSON from response
|
||||
json_text = raw_text.strip()
|
||||
json_match = re.search(r'```(?:json)?\s*(\[.*?\])\s*```', json_text, re.DOTALL)
|
||||
if json_match:
|
||||
json_text = json_match.group(1)
|
||||
else:
|
||||
json_match = re.search(r'(\[.*?\]|{.*?})', json_text, re.DOTALL)
|
||||
if json_match:
|
||||
json_text = json_match.group(1)
|
||||
|
||||
try:
|
||||
data = json.loads(json_text)
|
||||
if not isinstance(data, list):
|
||||
logger.warning("Claude Vision response is not a list: %s", type(data))
|
||||
return []
|
||||
|
||||
relations: List[CausalRelation] = []
|
||||
for item in data:
|
||||
if not isinstance(item, dict):
|
||||
continue
|
||||
cause = item.get("cause", "").strip()
|
||||
effect = item.get("effect", "").strip()
|
||||
if not cause or not effect:
|
||||
continue
|
||||
|
||||
relations.append(
|
||||
CausalRelation(
|
||||
cause=cause,
|
||||
effect=effect,
|
||||
confidence=float(item.get("confidence", 0.0)),
|
||||
explanation=item.get("explanation"),
|
||||
source_file_id=source_file_id,
|
||||
source_snippet=item.get("source_snippet") or f"Image: {image_path.name}",
|
||||
metadata={"model": self.model, "content_type": "image", "image_path": str(image_path)},
|
||||
)
|
||||
)
|
||||
logger.info("Extracted %d relations from image %s", len(relations), image_path.name)
|
||||
return relations
|
||||
except json.JSONDecodeError as e:
|
||||
logger.warning("Failed to parse Claude Vision response as JSON: %s. Raw text: %s", e, raw_text[:200])
|
||||
return []
|
||||
|
||||
except Exception as exc:
|
||||
logger.exception("Failed to analyze image %s: %s", image_path, exc)
|
||||
return []
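A minimal usage sketch (not part of this commit) of the extractor above; the API key source, model name, chunk text, and file paths are placeholders.

import os
from pathlib import Path

extractor = ClaudeCausalExtractor(
    api_key=os.environ["ANTHROPIC_API_KEY"],   # placeholder: the service wires this from Settings
    model="claude-3-5-haiku-latest",
    max_output_tokens=4000,
)

# Text chunks go through analyze(), which calls analyze_chunk() per chunk.
relations = extractor.analyze(
    chunks=["Service A publishes events that Service B consumes."],
    source_file_id="doc-001",
)

# Images are sent straight to Claude Vision, no OCR step.
relations += extractor.analyze_image(Path("diagrams/architecture.png"), source_file_id="img-001")

for rel in relations:
    print(rel.cause, "->", rel.effect, rel.confidence)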
|
||||
|
||||
@ -20,7 +20,7 @@ class Settings(BaseSettings):
|
||||
model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore")
|
||||
|
||||
anthropic_api_key: str | None = Field(default=None, validation_alias="ANTHROPIC_API_KEY")
|
||||
claude_model: str = Field(default=os.getenv("MULTI_DOC_CLAUDE_MODEL", "claude-3-5-sonnet-20241022"))
|
||||
claude_model: str = Field(default=os.getenv("MULTI_DOC_CLAUDE_MODEL", os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest")))
|
||||
claude_max_input_tokens: int = Field(default=200_000)
|
||||
claude_max_output_tokens: int = Field(default=16_000)
|
||||
|
||||
@ -37,6 +37,27 @@ class Settings(BaseSettings):
|
||||
|
||||
job_retention_days: int = Field(default=30)
|
||||
|
||||
# Qwen2.5-VL API configuration
|
||||
qwen_api_key: str | None = Field(default=None, validation_alias="QWEN_API_KEY")
|
||||
qwen_api_url: str = Field(default=os.getenv("QWEN_API_URL", "https://api.example.com/v1/chat/completions"))
|
||||
qwen_model: str = Field(default=os.getenv("QWEN_MODEL", "qwen2.5-vl"))
|
||||
|
||||
# DoWhy configuration
|
||||
dowhy_enabled: bool = Field(default=True)
|
||||
dowhy_confidence_threshold: float = Field(default=0.05)
|
||||
|
||||
# Embedding configuration
|
||||
embedding_model: str = Field(default="sentence-transformers/all-MiniLM-L6-v2")
|
||||
embedding_dimension: int = Field(default=384)
|
||||
|
||||
# Qdrant configuration
|
||||
qdrant_url: str = Field(default=os.getenv("QDRANT_URL", "http://localhost:6333"))
|
||||
qdrant_collection_name: str = Field(default="kg_embeddings")
|
||||
qdrant_vector_size: int = Field(default=384)
|
||||
|
||||
# Report generation configuration
|
||||
report_format: str = Field(default="markdown")
|
||||
|
||||
def ensure_storage_dirs(self) -> None:
|
||||
(self.storage_root / "jobs").mkdir(parents=True, exist_ok=True)
|
||||
(self.storage_root / "uploads").mkdir(parents=True, exist_ok=True)
|
||||
|
||||
@ -1,168 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Try to import unstructured, but fall back to alternatives if not available
|
||||
try:
|
||||
from unstructured.partition.auto import partition
|
||||
HAS_UNSTRUCTURED = True
|
||||
except ImportError:
|
||||
HAS_UNSTRUCTURED = False
|
||||
logger.warning("unstructured not available, will use fallback extractors")
|
||||
|
||||
# Fallback extractors
|
||||
try:
|
||||
import pdfplumber
|
||||
HAS_PDFPLUMBER = True
|
||||
except ImportError:
|
||||
HAS_PDFPLUMBER = False
|
||||
|
||||
try:
|
||||
from docx import Document as DocxDocument
|
||||
HAS_DOCX = True
|
||||
except ImportError:
|
||||
HAS_DOCX = False
|
||||
|
||||
try:
|
||||
from pptx import Presentation
|
||||
HAS_PPTX = True
|
||||
except ImportError:
|
||||
HAS_PPTX = False
|
||||
|
||||
# Image processing libraries
|
||||
try:
|
||||
from PIL import Image
|
||||
import pytesseract
|
||||
HAS_OCR = True
|
||||
except ImportError:
|
||||
HAS_OCR = False
|
||||
logger.warning("OCR libraries not available, image extraction will be limited")
|
||||
|
||||
|
||||
def extract_text(path: Path) -> str:
|
||||
"""
|
||||
Extract text from a file using multiple strategies.
|
||||
Falls back through: unstructured -> format-specific -> plain text read.
|
||||
"""
|
||||
suffix = path.suffix.lower()
|
||||
|
||||
# Validate PDF file before processing
|
||||
if suffix == ".pdf":
|
||||
# Quick validation: check if file starts with PDF magic bytes
|
||||
try:
|
||||
with path.open("rb") as f:
|
||||
header = f.read(4)
|
||||
if header != b"%PDF":
|
||||
raise ValueError(
|
||||
f"File {path.name} does not appear to be a valid PDF. "
|
||||
f"PDF files must start with '%PDF' magic bytes. "
|
||||
f"Got: {header[:20] if len(header) > 0 else 'empty file'}"
|
||||
)
|
||||
except Exception as exc:
|
||||
if isinstance(exc, ValueError):
|
||||
raise
|
||||
logger.warning("Could not validate PDF header: %s", exc)
|
||||
|
||||
# Image files - return empty text (will be processed directly with Claude Vision)
|
||||
# We skip OCR and send images directly to Claude Vision API
|
||||
if suffix in {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}:
|
||||
logger.info("Image file detected: %s. Will be processed directly with Claude Vision (no OCR)", path.name)
|
||||
# Return empty string - images will be handled separately in pipeline
|
||||
return ""
|
||||
|
||||
# Plain text files - direct read
|
||||
if suffix in {".txt", ".md", ".json", ".xml", ".html", ".csv"}:
|
||||
try:
|
||||
return path.read_text(encoding="utf-8", errors="ignore")
|
||||
except Exception as exc:
|
||||
logger.warning("Failed to read %s as text: %s", path, exc)
|
||||
raise
|
||||
|
||||
# Try unstructured first (if available)
|
||||
if HAS_UNSTRUCTURED:
|
||||
try:
|
||||
elements = partition(filename=str(path))
|
||||
lines: List[str] = []
|
||||
for element in elements:
|
||||
text = getattr(element, "text", None)
|
||||
if text:
|
||||
lines.append(text.strip())
|
||||
if lines:
|
||||
logger.info("Extracted %d lines using unstructured", len(lines))
|
||||
return "\n".join(lines)
|
||||
except Exception as exc:
|
||||
logger.warning("unstructured extraction failed for %s: %s", path, exc)
|
||||
# Continue to fallback methods
|
||||
|
||||
# Fallback: PDF with pdfplumber
|
||||
if suffix == ".pdf" and HAS_PDFPLUMBER:
|
||||
try:
|
||||
with pdfplumber.open(path) as pdf:
|
||||
text_parts = []
|
||||
for page in pdf.pages:
|
||||
page_text = page.extract_text()
|
||||
if page_text:
|
||||
text_parts.append(page_text)
|
||||
if text_parts:
|
||||
logger.info("Extracted PDF using pdfplumber")
|
||||
return "\n".join(text_parts)
|
||||
except Exception as exc:
|
||||
logger.warning("pdfplumber extraction failed for %s: %s", path, exc)
|
||||
|
||||
# Fallback: DOCX
|
||||
if suffix == ".docx" and HAS_DOCX:
|
||||
try:
|
||||
doc = DocxDocument(path)
|
||||
paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
|
||||
if paragraphs:
|
||||
logger.info("Extracted DOCX using python-docx")
|
||||
return "\n".join(paragraphs)
|
||||
except Exception as exc:
|
||||
logger.warning("python-docx extraction failed for %s: %s", path, exc)
|
||||
|
||||
# Fallback: PPTX
|
||||
if suffix in {".pptx", ".ppt"} and HAS_PPTX:
|
||||
try:
|
||||
prs = Presentation(path)
|
||||
text_parts = []
|
||||
for slide in prs.slides:
|
||||
for shape in slide.shapes:
|
||||
if hasattr(shape, "text") and shape.text:
|
||||
text_parts.append(shape.text.strip())
|
||||
if text_parts:
|
||||
logger.info("Extracted PPTX using python-pptx")
|
||||
return "\n".join(text_parts)
|
||||
except Exception as exc:
|
||||
logger.warning("python-pptx extraction failed for %s: %s", path, exc)
|
||||
|
||||
# Last resort: try to read as text anyway, but validate it's readable
|
||||
try:
|
||||
content = path.read_text(encoding="utf-8", errors="ignore")
|
||||
if content.strip():
|
||||
# Check if content is actually readable text (not binary data)
|
||||
# Simple heuristic: if >30% of characters are printable, consider it text
|
||||
printable_chars = sum(1 for c in content if c.isprintable() or c.isspace())
|
||||
total_chars = len(content)
|
||||
|
||||
if total_chars > 0 and printable_chars / total_chars > 0.3:
|
||||
logger.warning("Read %s as plain text (may contain binary data)", path)
|
||||
return content
|
||||
else:
|
||||
logger.error("Content from %s appears to be binary data, cannot extract text", path)
|
||||
raise ValueError(f"File {path} appears to be binary or corrupted. Cannot extract readable text.")
|
||||
except Exception as exc:
|
||||
if isinstance(exc, ValueError):
|
||||
raise
|
||||
logger.warning("Failed to read %s as text: %s", path, exc)
|
||||
|
||||
# If all else fails, raise an error
|
||||
raise ValueError(
|
||||
f"Could not extract text from {path}. "
|
||||
f"File type may not be supported, file may be corrupted, or dependencies are missing. "
|
||||
f"Supported formats: PDF, DOCX, PPTX, XLSX, TXT, MD, JSON, XML, HTML, CSV, PNG, JPG, JPEG (with OCR)"
|
||||
)
|
||||
|
||||
@ -0,0 +1,320 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
import fitz # PyMuPDF
|
||||
HAS_PYMUPDF = True
|
||||
except ImportError:
|
||||
HAS_PYMUPDF = False
|
||||
logger.warning("PyMuPDF not available")
|
||||
|
||||
try:
|
||||
from docx import Document as DocxDocument
|
||||
HAS_DOCX = True
|
||||
except ImportError:
|
||||
HAS_DOCX = False
|
||||
logger.warning("python-docx not available")
|
||||
|
||||
try:
|
||||
from pptx import Presentation
|
||||
HAS_PPTX = True
|
||||
except ImportError:
|
||||
HAS_PPTX = False
|
||||
logger.warning("python-pptx not available")
|
||||
|
||||
try:
|
||||
import pandas as pd
|
||||
HAS_PANDAS = True
|
||||
except ImportError:
|
||||
HAS_PANDAS = False
|
||||
logger.warning("pandas not available")
|
||||
|
||||
|
||||
@dataclass
|
||||
class ExtractedText:
|
||||
"""Structured text extraction with context."""
|
||||
text: str
|
||||
page_number: int
|
||||
metadata: dict
|
||||
context: Optional[str] = None # Surrounding context
|
||||
|
||||
|
||||
def extract_text_with_context(path: Path) -> List[ExtractedText]:
|
||||
"""
|
||||
Extract text from PDF using PyMuPDF with page-level context.
|
||||
Returns structured text with metadata.
|
||||
"""
|
||||
if not HAS_PYMUPDF:
|
||||
raise ImportError("PyMuPDF is required for text extraction")
|
||||
|
||||
if not path.exists():
|
||||
raise FileNotFoundError(f"File not found: {path}")
|
||||
|
||||
if path.suffix.lower() != ".pdf":
|
||||
# For non-PDF files, fall back to simple text reading
|
||||
try:
|
||||
text = path.read_text(encoding="utf-8", errors="ignore")
|
||||
return [ExtractedText(
|
||||
text=text,
|
||||
page_number=1,
|
||||
metadata={"file_type": path.suffix, "filename": path.name},
|
||||
context=None
|
||||
)]
|
||||
except Exception as exc:
|
||||
logger.warning("Failed to read %s as text: %s", path, exc)
|
||||
raise
|
||||
|
||||
extracted_pages: List[ExtractedText] = []
|
||||
|
||||
try:
|
||||
doc = fitz.open(path)
|
||||
|
||||
for page_num in range(len(doc)):
|
||||
page = doc[page_num]
|
||||
|
||||
# Extract text
|
||||
text = page.get_text()
|
||||
|
||||
# Extract metadata
|
||||
metadata = {
|
||||
"page_number": page_num + 1,
|
||||
"page_count": len(doc),
|
||||
"filename": path.name,
|
||||
"file_type": "pdf",
|
||||
"page_rect": {
|
||||
"width": page.rect.width,
|
||||
"height": page.rect.height
|
||||
}
|
||||
}
|
||||
|
||||
# Extract context (surrounding pages for better understanding)
|
||||
context = None
|
||||
if page_num > 0:
|
||||
prev_page = doc[page_num - 1]
|
||||
prev_text = prev_page.get_text()[:500] # Last 500 chars of previous page
|
||||
context = f"Previous page context: {prev_text}"
|
||||
|
||||
if text.strip():
|
||||
extracted_pages.append(ExtractedText(
|
||||
text=text,
|
||||
page_number=page_num + 1,
|
||||
metadata=metadata,
|
||||
context=context
|
||||
))
|
||||
|
||||
doc.close()
|
||||
logger.info("Extracted text from %d pages in %s", len(extracted_pages), path.name)
|
||||
return extracted_pages
|
||||
|
||||
except Exception as exc:
|
||||
logger.exception("Failed to extract text from PDF %s: %s", path, exc)
|
||||
raise
|
||||
|
||||
|
||||
def extract_text_from_docx(path: Path) -> str:
|
||||
"""
|
||||
Extract text from DOCX file using python-docx.
|
||||
Reads paragraphs and tables as per README Step 2.2b.
|
||||
"""
|
||||
if not HAS_DOCX:
|
||||
raise ImportError("python-docx is required for DOCX extraction")
|
||||
|
||||
try:
|
||||
doc = DocxDocument(path)
|
||||
text_parts = []
|
||||
|
||||
# Extract paragraphs
|
||||
for paragraph in doc.paragraphs:
|
||||
if paragraph.text.strip():
|
||||
text_parts.append(paragraph.text.strip())
|
||||
|
||||
# Extract tables
|
||||
for table in doc.tables:
|
||||
table_text = []
|
||||
for row in table.rows:
|
||||
row_text = []
|
||||
for cell in row.cells:
|
||||
if cell.text.strip():
|
||||
row_text.append(cell.text.strip())
|
||||
if row_text:
|
||||
table_text.append(" | ".join(row_text))
|
||||
if table_text:
|
||||
text_parts.append("\n".join(table_text))
|
||||
|
||||
result = "\n\n".join(text_parts)
|
||||
logger.info("Extracted %d characters from DOCX %s", len(result), path.name)
|
||||
return result
|
||||
except Exception as exc:
|
||||
logger.exception("Failed to extract text from DOCX %s: %s", path, exc)
|
||||
raise
|
||||
|
||||
|
||||
def extract_text_from_pptx(path: Path) -> str:
|
||||
"""
|
||||
Extract text from PPTX file using python-pptx.
|
||||
Reads slides, titles, and notes as per README Step 2.2c.
|
||||
"""
|
||||
if not HAS_PPTX:
|
||||
raise ImportError("python-pptx is required for PPTX extraction")
|
||||
|
||||
try:
|
||||
prs = Presentation(path)
|
||||
text_parts = []
|
||||
|
||||
for slide_num, slide in enumerate(prs.slides, 1):
|
||||
slide_text = []
|
||||
|
||||
# Extract slide title
|
||||
if slide.shapes.title and slide.shapes.title.text:
|
||||
slide_text.append(f"Slide {slide_num} Title: {slide.shapes.title.text.strip()}")
|
||||
|
||||
# Extract content from shapes
|
||||
for shape in slide.shapes:
|
||||
if hasattr(shape, "text") and shape.text.strip():
|
||||
# Skip title (already extracted)
|
||||
if not (slide.shapes.title and shape == slide.shapes.title):
|
||||
slide_text.append(shape.text.strip())
|
||||
|
||||
# Extract notes (if available)
|
||||
if hasattr(slide, "notes_slide") and slide.notes_slide:
|
||||
notes_text = ""
|
||||
for shape in slide.notes_slide.shapes:
|
||||
if hasattr(shape, "text") and shape.text.strip():
|
||||
notes_text += shape.text.strip() + " "
|
||||
if notes_text.strip():
|
||||
slide_text.append(f"Notes: {notes_text.strip()}")
|
||||
|
||||
if slide_text:
|
||||
text_parts.append("\n".join(slide_text))
|
||||
|
||||
result = "\n\n".join(text_parts)
|
||||
logger.info("Extracted %d characters from PPTX %s (%d slides)",
|
||||
len(result), path.name, len(prs.slides))
|
||||
return result
|
||||
except Exception as exc:
|
||||
logger.exception("Failed to extract text from PPTX %s: %s", path, exc)
|
||||
raise
|
||||
|
||||
|
||||
def extract_text_from_spreadsheet(path: Path) -> str:
|
||||
"""
|
||||
Extract text from CSV/XLSX file using pandas.
|
||||
Reads rows and columns, converts to text representation as per README Step 2.2d.
|
||||
"""
|
||||
if not HAS_PANDAS:
|
||||
raise ImportError("pandas is required for spreadsheet extraction")
|
||||
|
||||
try:
|
||||
suffix = path.suffix.lower()
|
||||
text_parts = []
|
||||
|
||||
if suffix == ".csv":
|
||||
df = pd.read_csv(path, encoding="utf-8", encoding_errors="ignore")  # read_csv has no "errors" kwarg
|
||||
elif suffix in {".xlsx", ".xls"}:
|
||||
# Read first sheet by default
|
||||
df = pd.read_excel(path, engine="openpyxl" if suffix == ".xlsx" else None)
|
||||
else:
|
||||
raise ValueError(f"Unsupported spreadsheet format: {suffix}")
|
||||
|
||||
# Convert DataFrame to text representation
|
||||
# Add column headers
|
||||
headers = " | ".join(str(col) for col in df.columns)
|
||||
text_parts.append(f"Columns: {headers}")
|
||||
|
||||
# Add rows (limit to first 1000 rows to avoid huge output)
|
||||
max_rows = min(1000, len(df))
|
||||
for idx, row in df.head(max_rows).iterrows():
|
||||
row_values = " | ".join(str(val) if pd.notna(val) else "" for val in row)
|
||||
text_parts.append(f"Row {idx + 1}: {row_values}")
|
||||
|
||||
if len(df) > max_rows:
|
||||
text_parts.append(f"... ({len(df) - max_rows} more rows)")
|
||||
|
||||
result = "\n".join(text_parts)
|
||||
logger.info("Extracted %d characters from spreadsheet %s (%d rows)",
|
||||
len(result), path.name, len(df))
|
||||
return result
|
||||
except Exception as exc:
|
||||
logger.exception("Failed to extract text from spreadsheet %s: %s", path, exc)
|
||||
raise
|
||||
|
||||
|
||||
def clean_text(text: str) -> str:
|
||||
"""
|
||||
Clean extracted text as per README Step 2.3.
|
||||
- Remove extra whitespace
|
||||
- Fix encoding issues
|
||||
- Preserve important structure
|
||||
"""
|
||||
if not text:
|
||||
return ""
|
||||
|
||||
# Fix encoding issues (remove non-printable characters except newlines and tabs)
|
||||
cleaned = "".join(char for char in text if char.isprintable() or char in "\n\t\r")
|
||||
|
||||
# Remove extra whitespace (but preserve paragraph breaks)
|
||||
# Replace multiple spaces with single space
|
||||
cleaned = re.sub(r'[ \t]+', ' ', cleaned)
|
||||
|
||||
# Normalize line breaks (preserve double newlines for paragraphs)
|
||||
cleaned = re.sub(r'\r\n', '\n', cleaned) # Windows line breaks
|
||||
cleaned = re.sub(r'\r', '\n', cleaned) # Old Mac line breaks
|
||||
|
||||
# Preserve paragraph structure (double newlines)
|
||||
# But remove excessive blank lines (more than 2 consecutive)
|
||||
cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
|
||||
|
||||
# Remove leading/trailing whitespace from each line
|
||||
lines = [line.strip() for line in cleaned.split('\n')]
|
||||
cleaned = '\n'.join(lines)
|
||||
|
||||
# Remove leading/trailing whitespace overall
|
||||
cleaned = cleaned.strip()
|
||||
|
||||
return cleaned
|
||||
|
||||
|
||||
def extract_all_text(path: Path) -> str:
|
||||
"""
|
||||
Extract all text from a file based on type (as per README Step 2).
|
||||
Routes to appropriate extractor: PDF, DOCX, PPTX, CSV/XLSX, or plain text.
|
||||
"""
|
||||
suffix = path.suffix.lower()
|
||||
|
||||
# Step 2.2a: PDF
|
||||
if suffix == ".pdf" and HAS_PYMUPDF:
|
||||
extracted_pages = extract_text_with_context(path)
|
||||
text = "\n\n".join([page.text for page in extracted_pages])
|
||||
|
||||
# Step 2.2b: DOCX (Word)
|
||||
elif suffix == ".docx" and HAS_DOCX:
|
||||
text = extract_text_from_docx(path)
|
||||
|
||||
# Step 2.2c: PPTX (PowerPoint)
|
||||
elif suffix in {".pptx", ".ppt"} and HAS_PPTX:
|
||||
text = extract_text_from_pptx(path)
|
||||
|
||||
# Step 2.2d: CSV/XLSX (Spreadsheet)
|
||||
elif suffix in {".csv", ".xlsx", ".xls"} and HAS_PANDAS:
|
||||
text = extract_text_from_spreadsheet(path)
|
||||
|
||||
# Fallback: Plain text files
|
||||
else:
|
||||
try:
|
||||
text = path.read_text(encoding="utf-8", errors="ignore")
|
||||
except Exception as exc:
|
||||
logger.warning("Failed to read %s as text: %s", path, exc)
|
||||
raise
|
||||
|
||||
# Step 2.3: TEXT CLEANING
|
||||
text = clean_text(text)
|
||||
|
||||
return text
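A short, hypothetical walkthrough of the routing above; the paths are placeholders and each result has already passed through clean_text().

from pathlib import Path

for sample in [Path("specs/design.pdf"), Path("notes/meeting.docx"), Path("data/metrics.xlsx")]:
    if sample.exists():
        text = extract_all_text(sample)  # dispatches on suffix, then cleans
        print(sample.name, len(text), "characters extracted")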
|
||||
|
||||
@ -0,0 +1,153 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
import httpx
|
||||
|
||||
from ..config import get_settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class QwenVisionClient:
|
||||
"""Client for Qwen2.5-VL API to extract relationships from diagrams and ERDs."""
|
||||
|
||||
def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None, model: Optional[str] = None):
|
||||
settings = get_settings()
|
||||
self.api_key = api_key or settings.qwen_api_key
|
||||
self.api_url = api_url or settings.qwen_api_url
|
||||
self.model = model or settings.qwen_model
|
||||
|
||||
if not self.api_key:
|
||||
logger.warning("Qwen API key not configured")
|
||||
|
||||
def extract_relationships_from_image(self, image_path: Path, source_file_id: str) -> List[Dict]:
|
||||
"""
|
||||
Extract relationships (entities, connections, flows) from an image using Qwen2.5-VL.
|
||||
Returns list of extracted relationships.
|
||||
"""
|
||||
if not self.api_key:
|
||||
logger.warning("Qwen API key not configured, skipping image analysis")
|
||||
return []
|
||||
|
||||
try:
|
||||
# Read and encode image
|
||||
with open(image_path, "rb") as img_file:
|
||||
image_data = img_file.read()
|
||||
|
||||
base64_image = base64.b64encode(image_data).decode("utf-8")
|
||||
|
||||
# Determine media type
|
||||
suffix = image_path.suffix.lower()
|
||||
media_type_map = {
|
||||
".png": "image/png",
|
||||
".jpg": "image/jpeg",
|
||||
".jpeg": "image/jpeg",
|
||||
".gif": "image/gif",
|
||||
".webp": "image/webp",
|
||||
}
|
||||
media_type = media_type_map.get(suffix, "image/png")
|
||||
|
||||
# Prepare prompt for relationship extraction
|
||||
prompt = """Analyze this diagram/ERD/image and extract all relationships, entities, and connections.
|
||||
|
||||
Extract:
|
||||
1. Entities (boxes, nodes, components)
|
||||
2. Relationships between entities (arrows, connections, flows)
|
||||
3. Data flows and dependencies
|
||||
4. Process flows
|
||||
5. Architecture patterns
|
||||
|
||||
Return JSON with this structure:
|
||||
[
|
||||
{
|
||||
"entity1": "name of first entity",
|
||||
"entity2": "name of second entity",
|
||||
"relationship_type": "causes|depends_on|flows_to|contains|uses",
|
||||
"description": "description of the relationship",
|
||||
"confidence": 0.0-1.0
|
||||
}
|
||||
]
|
||||
|
||||
Focus on cause-effect relationships, dependencies, and flows."""
|
||||
|
||||
# Prepare API request
|
||||
payload = {
|
||||
"model": self.model,
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": f"data:{media_type};base64,{base64_image}"
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": prompt
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"max_tokens": 4000,
|
||||
"temperature": 0.0
|
||||
}
|
||||
|
||||
headers = {
|
||||
"Authorization": f"Bearer {self.api_key}",
|
||||
"Content-Type": "application/json"
|
||||
}
|
||||
|
||||
# Make API call
|
||||
with httpx.Client(timeout=60.0) as client:
|
||||
response = client.post(self.api_url, json=payload, headers=headers)
|
||||
response.raise_for_status()
|
||||
result = response.json()
|
||||
|
||||
# Parse response
|
||||
content = result.get("choices", [{}])[0].get("message", {}).get("content", "")
|
||||
|
||||
if not content:
|
||||
logger.warning("Empty response from Qwen API for image %s", image_path.name)
|
||||
return []
|
||||
|
||||
# Extract JSON from response
|
||||
json_text = content.strip()
|
||||
|
||||
# Try to find JSON in markdown code blocks
|
||||
if "```json" in json_text:
|
||||
json_text = json_text.split("```json")[1].split("```")[0].strip()
|
||||
elif "```" in json_text:
|
||||
json_text = json_text.split("```")[1].split("```")[0].strip()
|
||||
|
||||
# Parse JSON
|
||||
try:
|
||||
relationships = json.loads(json_text)
|
||||
if not isinstance(relationships, list):
|
||||
relationships = [relationships]
|
||||
|
||||
# Add source metadata
|
||||
for rel in relationships:
|
||||
rel["source_file_id"] = source_file_id
|
||||
rel["source_image"] = str(image_path.name)
|
||||
rel["extraction_method"] = "qwen2.5-vl"
|
||||
|
||||
logger.info("Extracted %d relationships from image %s using Qwen2.5-VL",
|
||||
len(relationships), image_path.name)
|
||||
return relationships
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
logger.warning("Failed to parse Qwen response as JSON: %s. Content: %s",
|
||||
e, content[:200])
|
||||
return []
|
||||
|
||||
except Exception as exc:
|
||||
logger.exception("Failed to extract relationships from image %s: %s", image_path, exc)
|
||||
return []
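A hypothetical call to the client above; it assumes QWEN_API_KEY and QWEN_API_URL point at an OpenAI-compatible chat-completions endpoint serving a Qwen2.5-VL model, and the image path is a placeholder.

from pathlib import Path

qwen = QwenVisionClient()  # falls back to settings.qwen_api_key / qwen_api_url / qwen_model
rels = qwen.extract_relationships_from_image(Path("diagrams/erd.png"), source_file_id="img-erd")
for rel in rels:
    print(rel.get("entity1"), rel.get("relationship_type"), rel.get("entity2"), rel.get("confidence"))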
|
||||
|
||||
@ -2,15 +2,16 @@ from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
from fastapi import BackgroundTasks, Depends, FastAPI, File, Form, HTTPException, UploadFile
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import FileResponse
|
||||
|
||||
from .claude_client import ClaudeCausalExtractor
|
||||
from .config import Settings, get_settings
|
||||
from .jobs import JobStore
|
||||
from .models import CreateJobResponse, JobGraphSummary, JobStage, JobStatusResponse
|
||||
from .models import CreateJobResponse, JobGraphSummary, JobStage, JobStatusResponse, ProjectReport
|
||||
from .processors.graph_writer import GraphWriter
|
||||
from .storage import StorageManager
|
||||
from .workflows.pipeline import JobPipeline
|
||||
@ -20,8 +21,8 @@ logging.basicConfig(level=logging.INFO)
|
||||
|
||||
app = FastAPI(
|
||||
title="Multi Document Upload Service",
|
||||
version="0.1.0",
|
||||
description="Processes multi-format documents to build causal knowledge graphs using Claude.",
|
||||
version="0.2.0",
|
||||
description="Processes multi-format documents to build knowledge graphs and generate beginner-friendly onboarding reports.",
|
||||
)
|
||||
|
||||
|
||||
@ -40,7 +41,6 @@ class ServiceContainer:
|
||||
storage: StorageManager
|
||||
job_store: JobStore
|
||||
graph_writer: GraphWriter
|
||||
claude_extractor: ClaudeCausalExtractor
|
||||
pipeline: JobPipeline
|
||||
|
||||
|
||||
@ -51,29 +51,24 @@ def get_container() -> ServiceContainer:
|
||||
global _container
|
||||
if _container is None:
|
||||
settings = get_settings()
|
||||
if not settings.anthropic_api_key:
|
||||
raise HTTPException(status_code=500, detail="ANTHROPIC_API_KEY is not configured")
|
||||
# Anthropic API key is only needed for report generation, not required at startup
|
||||
# if not settings.anthropic_api_key:
|
||||
# raise HTTPException(status_code=500, detail="ANTHROPIC_API_KEY is not configured")
|
||||
|
||||
storage = StorageManager(settings.storage_root)
|
||||
job_store = JobStore(settings.storage_root)
|
||||
graph_writer = GraphWriter(settings.neo4j_uri, settings.neo4j_user, settings.neo4j_password)
|
||||
claude_extractor = ClaudeCausalExtractor(
|
||||
api_key=settings.anthropic_api_key,
|
||||
model=settings.claude_model,
|
||||
max_output_tokens=min(settings.claude_max_output_tokens, 4000),
|
||||
)
|
||||
|
||||
pipeline = JobPipeline(
|
||||
job_store=job_store,
|
||||
storage=storage,
|
||||
graph_writer=graph_writer,
|
||||
claude_extractor=claude_extractor,
|
||||
)
|
||||
_container = ServiceContainer(
|
||||
settings=settings,
|
||||
storage=storage,
|
||||
job_store=job_store,
|
||||
graph_writer=graph_writer,
|
||||
claude_extractor=claude_extractor,
|
||||
pipeline=pipeline,
|
||||
)
|
||||
return _container
|
||||
@ -170,14 +165,86 @@ async def get_job_graph(job_id: str, container: ServiceContainer = Depends(get_d
|
||||
)
|
||||
|
||||
|
||||
@app.get("/jobs/{job_id}/report", response_model=ProjectReport)
|
||||
async def get_job_report(job_id: str, container: ServiceContainer = Depends(get_dependencies)) -> ProjectReport:
|
||||
"""Get the generated beginner-friendly onboarding report."""
|
||||
job_store = container.job_store
|
||||
if not job_store.exists(job_id):
|
||||
raise HTTPException(status_code=404, detail="Job not found")
|
||||
job = job_store.get(job_id)
|
||||
if job.stage != JobStage.COMPLETED:
|
||||
raise HTTPException(
|
||||
status_code=409,
|
||||
detail="Report not ready yet. Job is still processing."
|
||||
)
|
||||
if not job.report:
|
||||
# Check if there was an error during report generation
|
||||
error_msg = "Report not found. "
|
||||
if job.error:
|
||||
# Check if error is specifically about report generation
|
||||
if "report generation" in job.error.lower() or "claude" in job.error.lower():
|
||||
error_msg = job.error
|
||||
else:
|
||||
error_msg += f"Error during generation: {job.error}"
|
||||
else:
|
||||
error_msg += "Report generation may have failed (check logs for details)."
|
||||
raise HTTPException(
|
||||
status_code=404,
|
||||
detail=error_msg
|
||||
)
|
||||
return job.report
|
||||
|
||||
|
||||
@app.get("/jobs/{job_id}/report/pdf")
|
||||
async def get_job_report_pdf(job_id: str, container: ServiceContainer = Depends(get_dependencies)):
|
||||
"""Download the PDF version of the onboarding report (as per README Step 7.9)."""
|
||||
job_store = container.job_store
|
||||
if not job_store.exists(job_id):
|
||||
raise HTTPException(status_code=404, detail="Job not found")
|
||||
job = job_store.get(job_id)
|
||||
if job.stage != JobStage.COMPLETED:
|
||||
raise HTTPException(
|
||||
status_code=409,
|
||||
detail="Report not ready yet. Job is still processing."
|
||||
)
|
||||
if not job.report:
|
||||
raise HTTPException(
|
||||
status_code=404,
|
||||
detail="Report not found. Job may have completed without generating report."
|
||||
)
|
||||
|
||||
# Get PDF path from report metadata
|
||||
pdf_path_str = job.report.metadata.get("pdf_path")
|
||||
if not pdf_path_str:
|
||||
raise HTTPException(
|
||||
status_code=404,
|
||||
detail="PDF not available. Report may have been generated without PDF conversion."
|
||||
)
|
||||
|
||||
pdf_path = Path(pdf_path_str)
|
||||
if not pdf_path.exists():
|
||||
raise HTTPException(
|
||||
status_code=404,
|
||||
detail="PDF file not found on server."
|
||||
)
|
||||
|
||||
return FileResponse(
|
||||
path=pdf_path,
|
||||
media_type="application/pdf",
|
||||
filename=f"onboarding_report_{job_id}.pdf"
|
||||
)
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
async def healthcheck(container: ServiceContainer = Depends(get_dependencies)):
|
||||
settings = container.settings
|
||||
return {
|
||||
"status": "ok",
|
||||
"claude_model": settings.claude_model,
|
||||
"max_input_tokens_per_min": settings.claude_max_input_tokens,
|
||||
"max_output_tokens_per_min": settings.claude_max_output_tokens,
|
||||
"qwen_model": settings.qwen_model,
|
||||
"embedding_model": settings.embedding_model,
|
||||
"qdrant_url": settings.qdrant_url,
|
||||
"dowhy_enabled": settings.dowhy_enabled,
|
||||
}
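A sketch of a caller using the new report endpoints; the base URL, port, and job id are placeholders, not values defined by this service.

import httpx

base = "http://localhost:8000"
job_id = "JOB_ID"

report = httpx.get(f"{base}/jobs/{job_id}/report").json()   # ProjectReport JSON once the job is COMPLETED
pdf = httpx.get(f"{base}/jobs/{job_id}/report/pdf")          # 409 while processing, 404 if no PDF was produced
if pdf.status_code == 200:
    with open(f"onboarding_report_{job_id}.pdf", "wb") as fh:
        fh.write(pdf.content)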
|
||||
|
||||
|
||||
|
||||
@ -10,9 +10,10 @@ from pydantic import BaseModel, Field
|
||||
class JobStage(str, Enum):
|
||||
RECEIVED = "received"
|
||||
SAVING_FILES = "saving_files"
|
||||
EXTRACTING = "extracting"
|
||||
ANALYZING = "analyzing"
|
||||
BUILDING_GRAPH = "building_graph"
|
||||
EXTRACTING = "extracting" # PyMuPDF + Qwen2.5-VL
|
||||
BUILDING_GRAPH = "building_graph" # DoWhy + Neo4j
|
||||
INDEXING_VECTORS = "indexing_vectors" # Qdrant
|
||||
GENERATING_REPORT = "generating_report" # Claude onboarding doc
|
||||
COMPLETED = "completed"
|
||||
FAILED = "failed"
|
||||
|
||||
@ -34,6 +35,7 @@ class CausalRelation(BaseModel):
|
||||
explanation: Optional[str] = None
|
||||
source_file_id: Optional[str] = None
|
||||
source_snippet: Optional[str] = None
|
||||
relationship_type: str = Field(default="CAUSES") # DEPENDS_ON, USES, IMPLEMENTS, etc.
|
||||
metadata: Dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
|
||||
@ -46,6 +48,7 @@ class JobRecord(BaseModel):
|
||||
total_files: int = 0
|
||||
processed_files: int = 0
|
||||
relations: List[CausalRelation] = Field(default_factory=list)
|
||||
report: Optional[ProjectReport] = None # Generated onboarding report
|
||||
created_at: datetime = Field(default_factory=datetime.utcnow)
|
||||
updated_at: datetime = Field(default_factory=datetime.utcnow)
|
||||
error: str | None = None
|
||||
@ -82,3 +85,15 @@ class JobGraphSummary(BaseModel):
|
||||
edge_count: int
|
||||
generated_at: datetime
|
||||
|
||||
|
||||
class ProjectReport(BaseModel):
|
||||
"""Beginner-friendly onboarding report generated from project documents."""
|
||||
job_id: str
|
||||
title: str = "Project Onboarding Guide"
|
||||
content: str # Markdown content
|
||||
sections: Dict[str, str] = Field(default_factory=dict) # Section name -> content
|
||||
key_concepts: List[str] = Field(default_factory=list) # Important concepts covered
|
||||
total_pages: int = 0 # Estimated pages
|
||||
generated_at: datetime = Field(default_factory=datetime.utcnow)
|
||||
metadata: Dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
|
||||
@ -1,24 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Iterable, List
|
||||
|
||||
import tiktoken
|
||||
|
||||
|
||||
class TextChunker:
|
||||
def __init__(self, model_name: str, token_target: int = 800, overlap: int = 200):
|
||||
self.encoder = tiktoken.encoding_for_model("gpt-4o") if "claude" not in model_name else tiktoken.get_encoding("cl100k_base")
|
||||
self.token_target = token_target
|
||||
self.overlap = overlap
|
||||
|
||||
def chunk(self, text: str) -> Iterable[str]:
|
||||
tokens = self.encoder.encode(text)
|
||||
step = max(self.token_target - self.overlap, 1)
|
||||
chunks: List[str] = []
|
||||
for start in range(0, len(tokens), step):
|
||||
end = min(start + self.token_target, len(tokens))
|
||||
chunk_tokens = tokens[start:end]
|
||||
chunk_text = self.encoder.decode(chunk_tokens)
|
||||
chunks.append(chunk_text)
|
||||
return chunks
|
||||
|
||||
@ -0,0 +1,187 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import List, Optional
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from ..config import get_settings
|
||||
from ..models import CausalRelation
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
import dowhy
|
||||
from dowhy import CausalModel
|
||||
HAS_DOWHY = True
|
||||
except ImportError:
|
||||
HAS_DOWHY = False
|
||||
logger.warning("DoWhy not available")
|
||||
|
||||
|
||||
class DoWhyAnalyzer:
|
||||
"""Validate causal relationships using DoWhy Structural Causal Models."""
|
||||
|
||||
def __init__(self, confidence_threshold: Optional[float] = None):
|
||||
if not HAS_DOWHY:
|
||||
raise ImportError("DoWhy is required for causal analysis")
|
||||
|
||||
settings = get_settings()
|
||||
self.confidence_threshold = confidence_threshold or settings.dowhy_confidence_threshold
|
||||
self.enabled = settings.dowhy_enabled
|
||||
|
||||
def validate_relationships(
|
||||
self,
|
||||
relationships: List[CausalRelation],
|
||||
text_data: Optional[str] = None
|
||||
) -> List[CausalRelation]:
|
||||
"""
|
||||
Validate causal relationships using DoWhy SCM.
|
||||
Filters out relationships that don't pass validation.
|
||||
"""
|
||||
if not self.enabled:
|
||||
logger.info("DoWhy validation is disabled, returning all relationships")
|
||||
return relationships
|
||||
|
||||
if not relationships:
|
||||
return []
|
||||
|
||||
validated: List[CausalRelation] = []
|
||||
|
||||
# Group relationships by cause to build SCM
|
||||
cause_groups = {}
|
||||
for rel in relationships:
|
||||
cause = rel.cause
|
||||
if cause not in cause_groups:
|
||||
cause_groups[cause] = []
|
||||
cause_groups[cause].append(rel)
|
||||
|
||||
# Validate each group
|
||||
for cause, effects in cause_groups.items():
|
||||
for rel in effects:
|
||||
try:
|
||||
is_valid = self._validate_single_relationship(rel, relationships, text_data)
|
||||
if is_valid:
|
||||
# Update confidence with validation score
|
||||
rel.confidence = min(rel.confidence + 0.1, 0.95) # Boost validated relationships
|
||||
rel.metadata["dowhy_validated"] = True
|
||||
validated.append(rel)
|
||||
else:
|
||||
logger.debug("DoWhy validation failed for: %s -> %s", rel.cause, rel.effect)
|
||||
except Exception as exc:
|
||||
logger.warning("DoWhy validation error for %s -> %s: %s",
|
||||
rel.cause, rel.effect, exc)
|
||||
# If validation fails, keep the relationship but mark it
|
||||
rel.metadata["dowhy_validated"] = False
|
||||
rel.metadata["dowhy_error"] = str(exc)
|
||||
validated.append(rel) # Keep it but with lower confidence
|
||||
|
||||
logger.info("DoWhy validated %d/%d relationships", len(validated), len(relationships))
|
||||
return validated
|
||||
|
||||
def _validate_single_relationship(
|
||||
self,
|
||||
relationship: CausalRelation,
|
||||
all_relationships: List[CausalRelation],
|
||||
text_data: Optional[str] = None
|
||||
) -> bool:
|
||||
"""
|
||||
Validate a single relationship using DoWhy.
|
||||
Returns True if relationship is valid, False otherwise.
|
||||
"""
|
||||
try:
|
||||
# Build a simple causal graph from relationships
|
||||
# Extract unique variables (causes and effects)
|
||||
variables = set()
|
||||
for rel in all_relationships:
|
||||
variables.add(rel.cause)
|
||||
variables.add(rel.effect)
|
||||
|
||||
# Create a simple dataset for DoWhy
|
||||
# Since we don't have actual data, we'll use a heuristic approach
|
||||
# based on relationship frequency and structure
|
||||
|
||||
# Check if there's a path from cause to effect in the graph
|
||||
has_path = self._check_causal_path(
|
||||
relationship.cause,
|
||||
relationship.effect,
|
||||
all_relationships
|
||||
)
|
||||
|
||||
if not has_path:
|
||||
return False
|
||||
|
||||
# Additional validation: check for confounders
|
||||
# If there are many relationships involving both cause and effect,
|
||||
# it's more likely to be valid
|
||||
related_count = sum(
|
||||
1 for rel in all_relationships
|
||||
if rel.cause == relationship.cause or rel.effect == relationship.effect
|
||||
)
|
||||
|
||||
# If there are multiple relationships involving these concepts,
|
||||
# it's more likely to be a valid causal relationship
|
||||
if related_count >= 2:
|
||||
return True
|
||||
|
||||
# For single relationships, use confidence threshold
|
||||
return relationship.confidence >= 0.6
|
||||
|
||||
except Exception as exc:
|
||||
logger.warning("DoWhy validation error: %s", exc)
|
||||
return False
|
||||
|
||||
def _check_causal_path(
|
||||
self,
|
||||
cause: str,
|
||||
effect: str,
|
||||
relationships: List[CausalRelation],
|
||||
max_depth: int = 3
|
||||
) -> bool:
|
||||
"""Check if there's a causal path from cause to effect."""
|
||||
if max_depth == 0:
|
||||
return False
|
||||
|
||||
# Direct relationship
|
||||
for rel in relationships:
|
||||
if rel.cause == cause and rel.effect == effect:
|
||||
return True
|
||||
|
||||
# Indirect relationship (transitive)
|
||||
for rel in relationships:
|
||||
if rel.cause == cause:
|
||||
# Check if rel.effect leads to the target effect
|
||||
if self._check_causal_path(rel.effect, effect, relationships, max_depth - 1):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def build_scm_from_relationships(
|
||||
self,
|
||||
relationships: List[CausalRelation]
|
||||
) -> Optional[CausalModel]:
|
||||
"""
|
||||
Build a Structural Causal Model from relationships.
|
||||
This is a simplified version for text-based causal inference.
|
||||
"""
|
||||
if not relationships:
|
||||
return None
|
||||
|
||||
try:
|
||||
# Extract all unique variables
|
||||
variables = set()
|
||||
for rel in relationships:
|
||||
variables.add(rel.cause)
|
||||
variables.add(rel.effect)
|
||||
|
||||
# Create a simple adjacency matrix representation
|
||||
# This is a heuristic approach since we don't have actual data
|
||||
|
||||
# For now, return None as building a full SCM requires actual data
|
||||
# The validation uses graph-based heuristics instead
|
||||
return None
|
||||
|
||||
except Exception as exc:
|
||||
logger.warning("Failed to build SCM: %s", exc)
|
||||
return None
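An illustrative run of the analyzer above (assumes DoWhy is installed); the sample relations are invented, and validation relies on the path/confidence heuristics shown rather than a fitted SCM.

sample = [
    CausalRelation(cause="missing index", effect="slow queries", confidence=0.8),
    CausalRelation(cause="slow queries", effect="request timeouts", confidence=0.7),
]
analyzer = DoWhyAnalyzer()
validated = analyzer.validate_relationships(sample)
print(f"{len(validated)}/{len(sample)} relations kept after validation")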
|
||||
|
||||
@ -0,0 +1,85 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import List
|
||||
|
||||
from ..config import get_settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
from sentence_transformers import SentenceTransformer
|
||||
HAS_SENTENCE_TRANSFORMERS = True
|
||||
except ImportError:
|
||||
HAS_SENTENCE_TRANSFORMERS = False
|
||||
logger.warning("sentence-transformers not available")
|
||||
|
||||
|
||||
class Embedder:
|
||||
"""Generate embeddings using sentence-transformers."""
|
||||
|
||||
def __init__(self, model_name: str | None = None):
|
||||
if not HAS_SENTENCE_TRANSFORMERS:
|
||||
raise ImportError("sentence-transformers is required for embeddings")
|
||||
|
||||
settings = get_settings()
|
||||
self.model_name = model_name or settings.embedding_model
|
||||
|
||||
logger.info("Loading embedding model: %s", self.model_name)
|
||||
try:
|
||||
self.model = SentenceTransformer(self.model_name)
|
||||
self.dimension = self.model.get_sentence_embedding_dimension()
|
||||
logger.info("Loaded embedding model with dimension: %d", self.dimension)
|
||||
except Exception as exc:
|
||||
logger.exception("Failed to load embedding model %s: %s", self.model_name, exc)
|
||||
raise
|
||||
|
||||
def embed_text(self, text: str) -> List[float]:
|
||||
"""Generate embedding for a single text."""
|
||||
if not text or not text.strip():
|
||||
# Return zero vector for empty text
|
||||
return [0.0] * self.dimension
|
||||
|
||||
try:
|
||||
embedding = self.model.encode(text, normalize_embeddings=True)
|
||||
return embedding.tolist()
|
||||
except Exception as exc:
|
||||
logger.warning("Failed to embed text: %s", exc)
|
||||
return [0.0] * self.dimension
|
||||
|
||||
def embed_batch(self, texts: List[str], batch_size: int = 32) -> List[List[float]]:
|
||||
"""Generate embeddings for a batch of texts."""
|
||||
if not texts:
|
||||
return []
|
||||
|
||||
try:
|
||||
embeddings = self.model.encode(
|
||||
texts,
|
||||
batch_size=batch_size,
|
||||
normalize_embeddings=True,
|
||||
show_progress_bar=False
|
||||
)
|
||||
return embeddings.tolist()
|
||||
except Exception as exc:
|
||||
logger.warning("Failed to embed batch: %s", exc)
|
||||
return [[0.0] * self.dimension] * len(texts)
|
||||
|
||||
def embed_relation(self, cause: str, effect: str, explanation: str | None = None) -> List[float]:
|
||||
"""Generate embedding for a cause-effect relationship."""
|
||||
# Combine cause, effect, and explanation into a single text
|
||||
parts = [cause, "causes", effect]
|
||||
if explanation:
|
||||
parts.append(explanation)
|
||||
|
||||
text = " ".join(parts)
|
||||
return self.embed_text(text)
|
||||
|
||||
def embed_concept(self, concept_name: str, description: str | None = None) -> List[float]:
|
||||
"""Generate embedding for a concept/node."""
|
||||
if description:
|
||||
text = f"{concept_name}: {description}"
|
||||
else:
|
||||
text = concept_name
|
||||
|
||||
return self.embed_text(text)
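A brief sketch of the embedder in use; the model is downloaded on first instantiation, and the relation/concept helpers simply compose text before encoding.

embedder = Embedder()  # defaults to settings.embedding_model (all-MiniLM-L6-v2, 384 dimensions)
vec = embedder.embed_relation("missing index", "slow queries", explanation="full table scans")
assert len(vec) == embedder.dimension
batch = embedder.embed_batch(["API gateway", "billing service", "event bus"])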
|
||||
|
||||
@ -0,0 +1,253 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from typing import Dict, List, Set
|
||||
|
||||
from anthropic import Anthropic, BadRequestError
|
||||
|
||||
from ..config import get_settings
|
||||
from ..models import CausalRelation
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class EntityResolver:
|
||||
"""
|
||||
Resolve entity mentions using Claude AI as per README Stage 4.
|
||||
Identifies that different mentions refer to the same entity.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
settings = get_settings()
|
||||
self.api_key = settings.anthropic_api_key
|
||||
self.model = settings.claude_model
|
||||
self.max_output_tokens = settings.claude_max_output_tokens
|
||||
|
||||
if not self.api_key:
|
||||
logger.warning("ANTHROPIC_API_KEY not set - Entity resolution will be skipped")
|
||||
self.client = None
|
||||
else:
|
||||
try:
|
||||
self.client = Anthropic(api_key=self.api_key)
|
||||
logger.info("EntityResolver initialized with Claude AI")
|
||||
except Exception as e:
|
||||
logger.warning("Failed to initialize Claude AI for entity resolution: %s", e)
|
||||
self.client = None
|
||||
|
||||
def resolve_entities(self, relations: List[CausalRelation]) -> Dict[str, Dict]:
|
||||
"""
|
||||
Resolve entity mentions across all documents as per README Step 4.
|
||||
|
||||
Step 4.1: Collect all entities
|
||||
Step 4.2: Group by entity type
|
||||
Step 4.3: AI-powered resolution (Claude API)
|
||||
Step 4.4: Create canonical names
|
||||
|
||||
Returns mapping: canonical_name -> {mentions, type, role, confidence}
|
||||
"""
|
||||
if not self.client:
|
||||
logger.info("Entity resolution skipped (Claude AI not available)")
|
||||
return {}
|
||||
|
||||
if not relations:
|
||||
return {}
|
||||
|
||||
# Step 4.1: COLLECT ALL ENTITIES
|
||||
all_mentions: Set[str] = set()
|
||||
for rel in relations:
|
||||
all_mentions.add(rel.cause.strip())
|
||||
all_mentions.add(rel.effect.strip())
|
||||
|
||||
if not all_mentions:
|
||||
return {}
|
||||
|
||||
logger.info("Collecting %d entity mentions for resolution", len(all_mentions))
|
||||
|
||||
# Step 4.2: GROUP BY ENTITY TYPE (simple heuristic)
|
||||
people_mentions = []
|
||||
project_mentions = []
|
||||
team_mentions = []
|
||||
other_mentions = []
|
||||
|
||||
for mention in all_mentions:
|
||||
mention_lower = mention.lower()
|
||||
if any(word in mention_lower for word in ["team", "department", "group", "division"]):
|
||||
team_mentions.append(mention)
|
||||
elif any(word in mention_lower for word in ["project", "system", "application", "platform"]):
|
||||
project_mentions.append(mention)
|
||||
elif len(mention.split()) <= 3 and not any(char.isdigit() for char in mention):
|
||||
# Likely a person name (short, no numbers)
|
||||
people_mentions.append(mention)
|
||||
else:
|
||||
other_mentions.append(mention)
|
||||
|
||||
# Step 4.3: AI-POWERED RESOLUTION (Claude API)
|
||||
resolved_entities = {}
|
||||
|
||||
# Resolve people
|
||||
if people_mentions:
|
||||
people_resolved = self._resolve_with_claude(people_mentions, "Person")
|
||||
resolved_entities.update(people_resolved)
|
||||
|
||||
# Resolve projects
|
||||
if project_mentions:
|
||||
projects_resolved = self._resolve_with_claude(project_mentions, "Project")
|
||||
resolved_entities.update(projects_resolved)
|
||||
|
||||
# Resolve teams
|
||||
if team_mentions:
|
||||
teams_resolved = self._resolve_with_claude(team_mentions, "Team")
|
||||
resolved_entities.update(teams_resolved)
|
||||
|
||||
# Resolve others
|
||||
if other_mentions:
|
||||
others_resolved = self._resolve_with_claude(other_mentions, "Entity")
|
||||
resolved_entities.update(others_resolved)
|
||||
|
||||
logger.info("Resolved %d entities from %d mentions", len(resolved_entities), len(all_mentions))
|
||||
|
||||
return resolved_entities
|
||||
|
||||
def _resolve_with_claude(self, mentions: List[str], entity_type: str) -> Dict[str, Dict]:
|
||||
"""Use Claude AI to resolve entity mentions."""
|
||||
if not self.client or not mentions:
|
||||
return {}
|
||||
|
||||
try:
|
||||
system_prompt = """You are an expert at entity resolution. Your task is to identify which mentions refer to the same real-world entity.
|
||||
|
||||
Analyze the given list of entity mentions and group them by the actual entity they refer to.
|
||||
|
||||
Return a JSON object where:
|
||||
- Key: Canonical name (best/most complete name)
|
||||
- Value: Object with:
|
||||
- "mentions": List of all mentions that refer to this entity
|
||||
- "type": Entity type (Person, Project, Team, etc.)
|
||||
- "role": Role or description (if applicable)
|
||||
- "confidence": Confidence score (0.0 to 1.0)
|
||||
|
||||
Example:
|
||||
{
|
||||
"John Smith": {
|
||||
"mentions": ["John", "J. Smith", "John Smith", "Smith"],
|
||||
"type": "Person",
|
||||
"role": "Project Lead",
|
||||
"confidence": 0.95
|
||||
},
|
||||
"Project Alpha": {
|
||||
"mentions": ["Project Alpha", "Alpha", "The Alpha Project"],
|
||||
"type": "Project",
|
||||
"role": null,
|
||||
"confidence": 0.90
|
||||
}
|
||||
}
|
||||
|
||||
Be thorough and group all related mentions together."""
|
||||
|
||||
user_prompt = f"""Analyze these {entity_type} entity mentions and resolve which ones refer to the same entity:
|
||||
|
||||
{json.dumps(mentions, indent=2)}
|
||||
|
||||
Return a JSON object mapping canonical names to their resolved mentions."""
|
||||
|
||||
message = self.client.messages.create(
|
||||
model=self.model,
|
||||
max_tokens=self.max_output_tokens,
|
||||
temperature=0.2, # Lower temperature for more consistent resolution
|
||||
system=system_prompt,
|
||||
messages=[{"role": "user", "content": user_prompt}]
|
||||
)
|
||||
|
||||
response_text = "".join(
|
||||
block.text for block in message.content
|
||||
if hasattr(block, "text")
|
||||
)
|
||||
|
||||
if not response_text:
|
||||
logger.warning("Empty response from Claude for entity resolution")
|
||||
return {}
|
||||
|
||||
# Parse JSON response
|
||||
try:
|
||||
json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
|
||||
if json_match:
|
||||
json_text = json_match.group(0)
|
||||
else:
|
||||
json_text = response_text
|
||||
|
||||
resolved = json.loads(json_text)
|
||||
|
||||
# Validate and structure the response
|
||||
result = {}
|
||||
for canonical_name, entity_data in resolved.items():
|
||||
if isinstance(entity_data, dict):
|
||||
result[canonical_name] = {
|
||||
"mentions": entity_data.get("mentions", [canonical_name]),
|
||||
"type": entity_data.get("type", entity_type),
|
||||
"role": entity_data.get("role"),
|
||||
"confidence": float(entity_data.get("confidence", 0.85))
|
||||
}
|
||||
else:
|
||||
# Fallback if structure is different
|
||||
result[canonical_name] = {
|
||||
"mentions": [canonical_name] if isinstance(entity_data, str) else entity_data,
|
||||
"type": entity_type,
|
||||
"role": None,
|
||||
"confidence": 0.8
|
||||
}
|
||||
|
||||
return result
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
logger.warning("Failed to parse Claude response as JSON: %s. Response: %s",
|
||||
e, response_text[:500])
|
||||
return {}
|
||||
|
||||
except BadRequestError as e:
|
||||
logger.warning("Claude API error during entity resolution: %s", e)
|
||||
return {}
|
||||
except Exception as e:
|
||||
logger.warning("Entity resolution failed: %s", e)
|
||||
return {}
|
||||
|
||||
def apply_resolution_to_relations(
|
||||
self,
|
||||
relations: List[CausalRelation],
|
||||
resolved_entities: Dict[str, Dict]
|
||||
) -> List[CausalRelation]:
|
||||
"""
|
||||
Apply entity resolution to relationships.
|
||||
Replace mentions with canonical names.
|
||||
"""
|
||||
if not resolved_entities:
|
||||
return relations
|
||||
|
||||
# Create reverse mapping: mention -> canonical_name
|
||||
mention_to_canonical: Dict[str, str] = {}
|
||||
for canonical_name, entity_data in resolved_entities.items():
|
||||
mentions = entity_data.get("mentions", [])
|
||||
for mention in mentions:
|
||||
mention_to_canonical[mention.lower()] = canonical_name
|
||||
|
||||
# Update relations with canonical names
|
||||
updated_relations = []
|
||||
for rel in relations:
|
||||
# Resolve cause
|
||||
cause_lower = rel.cause.strip().lower()
|
||||
if cause_lower in mention_to_canonical:
|
||||
rel.cause = mention_to_canonical[cause_lower]
|
||||
|
||||
# Resolve effect
|
||||
effect_lower = rel.effect.strip().lower()
|
||||
if effect_lower in mention_to_canonical:
|
||||
rel.effect = mention_to_canonical[effect_lower]
|
||||
|
||||
# Store resolution info in metadata
|
||||
rel.metadata["entity_resolved"] = True
|
||||
updated_relations.append(rel)
|
||||
|
||||
logger.info("Applied entity resolution to %d relationships", len(updated_relations))
|
||||
return updated_relations
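    # Illustrative sketch (hypothetical data, not from the pipeline): given a
    # resolution map such as
    #   resolved = {"Payment Service": {"mentions": ["payment svc", "payments"],
    #                                   "type": "Service", "role": None, "confidence": 0.9}}
    #   relations = resolver.apply_resolution_to_relations(relations, resolved)
    # a relation whose cause is "payment svc" is rewritten to cause "Payment Service",
    # while names that appear in no mentions list pass through unchanged.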
|
||||
|
||||
@ -1,38 +1,65 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
from typing import Dict, Iterable, List, Optional
|
||||
|
||||
from anthropic import Anthropic, BadRequestError
|
||||
from neo4j import GraphDatabase, Transaction
|
||||
|
||||
from ..config import get_settings
|
||||
from ..models import CausalRelation
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
MERGE_QUERY = """
|
||||
MERGE (cause:Concept {name: $cause})
|
||||
ON CREATE SET cause.created_at = timestamp(), cause.lastSeen = timestamp()
|
||||
ON MATCH SET cause.lastSeen = timestamp()
|
||||
MERGE (effect:Concept {name: $effect})
|
||||
ON CREATE SET effect.created_at = timestamp(), effect.lastSeen = timestamp()
|
||||
ON MATCH SET effect.lastSeen = timestamp()
|
||||
MERGE (cause)-[r:CAUSES]->(effect)
|
||||
ON CREATE SET r.confidence = $confidence,
|
||||
r.explanation = $explanation,
|
||||
r.source_file_id = $source_file_id,
|
||||
r.source_snippet = $source_snippet,
|
||||
r.job_id = $job_id,
|
||||
r.model = $model,
|
||||
r.created_at = timestamp(),
|
||||
r.updated_at = timestamp()
|
||||
ON MATCH SET r.confidence = $confidence,
|
||||
r.explanation = $explanation,
|
||||
r.source_file_id = $source_file_id,
|
||||
r.source_snippet = $source_snippet,
|
||||
r.job_id = $job_id,
|
||||
r.model = $model,
|
||||
r.updated_at = timestamp()
"""

# Query to create Document node
|
||||
CREATE_DOCUMENT_QUERY = """
|
||||
MERGE (doc:Document {filename: $filename})
|
||||
ON CREATE SET doc.uploaded_at = timestamp(),
|
||||
doc.file_path = $file_path,
|
||||
doc.job_id = $job_id,
|
||||
doc.created_at = timestamp()
|
||||
ON MATCH SET doc.lastSeen = timestamp()
|
||||
"""
|
||||
|
||||
# Query to create Entity nodes and relationship with dynamic type
|
||||
CREATE_ENTITY_RELATIONSHIP_QUERY = """
|
||||
MERGE (source:Entity:Concept {name: $source})
|
||||
ON CREATE SET source.created_at = timestamp(),
|
||||
source.lastSeen = timestamp(),
|
||||
source.type = COALESCE($source_type, 'Entity')
|
||||
ON MATCH SET source.lastSeen = timestamp()
|
||||
|
||||
MERGE (target:Entity:Concept {name: $target})
|
||||
ON CREATE SET target.created_at = timestamp(),
|
||||
target.lastSeen = timestamp(),
|
||||
target.type = COALESCE($target_type, 'Entity')
|
||||
ON MATCH SET target.lastSeen = timestamp()
|
||||
|
||||
WITH source, target
|
||||
CALL apoc.merge.relationship(
|
||||
source,
|
||||
$rel_type,
|
||||
{confidence: $confidence,
|
||||
explanation: $explanation,
|
||||
source_file_id: $source_file_id,
|
||||
source_snippet: $source_snippet,
|
||||
job_id: $job_id,
|
||||
model: $model,
|
||||
created_at: timestamp(),
|
||||
updated_at: timestamp()},
|
||||
{confidence: $confidence,
|
||||
explanation: $explanation,
|
||||
source_file_id: $source_file_id,
|
||||
source_snippet: $source_snippet,
|
||||
job_id: $job_id,
|
||||
model: $model,
|
||||
updated_at: timestamp()},
|
||||
target
|
||||
) YIELD rel
|
||||
RETURN rel
|
||||
"""
|
||||
|
||||
|
||||
@ -43,12 +70,42 @@ class GraphWriter:
|
||||
def close(self) -> None:
|
||||
self._driver.close()
|
||||
|
||||
def write_documents(self, job_id: str, files: Iterable) -> None:
|
||||
"""Create Document nodes for uploaded files."""
|
||||
files_list = list(files)
|
||||
if not files_list:
|
||||
return
|
||||
|
||||
logger.info("Creating %d document nodes for job %s", len(files_list), job_id)
|
||||
|
||||
with self._driver.session() as session:
|
||||
def _write_docs(tx: Transaction) -> None:
|
||||
for file_record in files_list:
|
||||
try:
|
||||
tx.run(
|
||||
CREATE_DOCUMENT_QUERY,
|
||||
filename=file_record.filename,
|
||||
file_path=file_record.stored_path,
|
||||
job_id=job_id
|
||||
)
|
||||
logger.debug("Created document node: %s", file_record.filename)
|
||||
except Exception as exc:
|
||||
logger.warning("Failed to create document node for %s: %s", file_record.filename, exc)
|
||||
|
||||
session.execute_write(_write_docs)
|
||||
logger.info("Created document nodes for job %s", job_id)
|
||||
|
||||
def write_relations(self, job_id: str, relations: Iterable[CausalRelation], files: Optional[Iterable] = None) -> None:
|
||||
"""Write entities and relationships to Neo4j with multiple relationship types."""
|
||||
relations_list = list(relations)
|
||||
if not relations_list:
|
||||
logger.warning("No relations to write for job %s", job_id)
|
||||
return
|
||||
|
||||
# Create document nodes if files provided
|
||||
if files:
|
||||
self.write_documents(job_id, files)
|
||||
|
||||
logger.info("Writing %d relations to Neo4j for job %s", len(relations_list), job_id)
|
||||
|
||||
with self._driver.session() as session:
|
||||
@ -58,11 +115,70 @@ class GraphWriter:
|
||||
if not relation.cause or not relation.effect:
|
||||
logger.warning("Skipping relation with empty cause or effect: %s -> %s", relation.cause, relation.effect)
|
||||
continue
|
||||
|
||||
# Get relationship type (default to CAUSES for backward compatibility)
|
||||
rel_type = getattr(relation, 'relationship_type', None) or "CAUSES"
|
||||
|
||||
# Sanitize relationship type (only allow alphanumeric and underscores)
|
||||
rel_type = re.sub(r'[^A-Z0-9_]', '', rel_type.upper())
|
||||
if not rel_type:
|
||||
rel_type = "CAUSES"
|
||||
|
||||
# Infer entity types from names (simple heuristic)
|
||||
source_type = self._infer_entity_type(relation.cause)
|
||||
target_type = self._infer_entity_type(relation.effect)
|
||||
|
||||
try:
|
||||
# Create source entity
|
||||
tx.run("""
|
||||
MERGE (source:Entity:Concept {name: $source})
|
||||
ON CREATE SET source.created_at = timestamp(),
|
||||
source.lastSeen = timestamp(),
|
||||
source.type = $source_type
|
||||
ON MATCH SET source.lastSeen = timestamp()
|
||||
""",
|
||||
source=relation.cause.strip(),
|
||||
source_type=source_type
|
||||
)
|
||||
|
||||
# Create target entity
|
||||
tx.run("""
|
||||
MERGE (target:Entity:Concept {name: $target})
|
||||
ON CREATE SET target.created_at = timestamp(),
|
||||
target.lastSeen = timestamp(),
|
||||
target.type = $target_type
|
||||
ON MATCH SET target.lastSeen = timestamp()
|
||||
""",
|
||||
target=relation.effect.strip(),
|
||||
target_type=target_type
|
||||
)
|
||||
|
||||
# Create relationship with dynamic type (sanitized)
|
||||
query = f"""
|
||||
MATCH (source:Entity {{name: $source}})
|
||||
MATCH (target:Entity {{name: $target}})
|
||||
MERGE (source)-[r:{rel_type}]->(target)
|
||||
ON CREATE SET r.confidence = $confidence,
|
||||
r.explanation = $explanation,
|
||||
r.source_file_id = $source_file_id,
|
||||
r.source_snippet = $source_snippet,
|
||||
r.job_id = $job_id,
|
||||
r.model = $model,
|
||||
r.created_at = timestamp(),
|
||||
r.updated_at = timestamp()
|
||||
ON MATCH SET r.confidence = $confidence,
|
||||
r.explanation = $explanation,
|
||||
r.source_file_id = $source_file_id,
|
||||
r.source_snippet = $source_snippet,
|
||||
r.job_id = $job_id,
|
||||
r.model = $model,
|
||||
r.updated_at = timestamp()
|
||||
"""
|
||||
|
||||
result = tx.run(
|
||||
query,
source=relation.cause.strip(),
target=relation.effect.strip(),
|
||||
confidence=float(relation.confidence) if relation.confidence else 0.0,
|
||||
explanation=relation.explanation or "",
|
||||
source_file_id=relation.source_file_id or "",
|
||||
@ -70,12 +186,145 @@ class GraphWriter:
|
||||
job_id=job_id,
|
||||
model=relation.metadata.get("model") or "",
|
||||
)
|
||||
|
||||
# Link entities to documents if source_file_id is a filename
|
||||
if relation.source_file_id and relation.source_file_id != "combined_text":
|
||||
link_query = f"""
|
||||
MATCH (entity:Entity {{name: $entity_name}})
|
||||
MATCH (doc:Document {{filename: $filename}})
|
||||
MERGE (entity)-[:EXTRACTED_FROM]->(doc)
|
||||
"""
|
||||
try:
|
||||
tx.run(link_query, entity_name=relation.cause.strip(), filename=relation.source_file_id)
|
||||
tx.run(link_query, entity_name=relation.effect.strip(), filename=relation.source_file_id)
|
||||
except Exception:
    pass  # Ignore if the document node doesn't exist
|
||||
|
||||
count += 1
|
||||
logger.debug("Wrote relation: %s -> %s (confidence: %s)", relation.cause, relation.effect, relation.confidence)
|
||||
logger.debug("Wrote relation: %s -[%s]-> %s (confidence: %s)",
|
||||
relation.cause, rel_type, relation.effect, relation.confidence)
|
||||
except Exception as exc:
|
||||
logger.exception("Failed to write relation %s -> %s: %s", relation.cause, relation.effect, exc)
|
||||
logger.info("Successfully wrote %d/%d relations to Neo4j", count, len(relations_list))
|
||||
|
||||
session.execute_write(_write)
|
||||
logger.info("Persisted causal relations for job %s", job_id)
|
||||
logger.info("Persisted relations for job %s", job_id)
|
||||
|
||||
def _infer_entity_type(self, entity_name: str) -> str:
|
||||
"""Infer entity type from name (simple heuristic)."""
|
||||
name_lower = entity_name.lower()
|
||||
|
||||
# Technology patterns
|
||||
if any(tech in name_lower for tech in ['react', 'node', 'python', 'java', 'postgres', 'mysql', 'redis', 'mongodb', 'docker', 'kubernetes']):
|
||||
return "Technology"
|
||||
|
||||
# Service patterns
|
||||
if any(word in name_lower for word in ['service', 'api', 'gateway', 'auth', 'payment', 'notification']):
|
||||
return "Service"
|
||||
|
||||
# Component patterns
|
||||
if any(word in name_lower for word in ['component', 'module', 'system', 'application', 'platform']):
|
||||
return "Component"
|
||||
|
||||
# Process patterns
|
||||
if any(word in name_lower for word in ['flow', 'process', 'workflow', 'pipeline', 'procedure']):
|
||||
return "Process"
|
||||
|
||||
# Default
|
||||
return "Entity"
|
||||
|
||||
def query_causal_chains(
|
||||
self,
|
||||
job_id: str,
|
||||
min_length: int = 2,
|
||||
max_length: int = 4,
|
||||
min_confidence: float = 0.8,
|
||||
limit: int = 20
|
||||
) -> List[Dict]:
|
||||
"""
|
||||
Query Neo4j for causal chains as per README Step 7.3.
|
||||
Returns sequences of connected events.
|
||||
"""
|
||||
# Query for causal chains - match any relationship type
|
||||
query = f"""
|
||||
MATCH path = (start:Entity)-[r*{min_length}..{max_length}]->(end:Entity)
|
||||
WHERE ALL(rel in relationships(path) WHERE rel.job_id = $job_id AND rel.confidence >= $min_confidence)
|
||||
WITH path,
|
||||
[node in nodes(path) | node.name] as chain,
|
||||
[rel in relationships(path) | rel.confidence] as confidences,
|
||||
[rel in relationships(path) | type(rel)] as rel_types,
|
||||
[rel in relationships(path) | rel.explanation] as explanations
|
||||
RETURN chain, confidences, rel_types, explanations
|
||||
ORDER BY reduce(conf = 0.0, c in confidences | conf + c) DESC
|
||||
LIMIT $limit
|
||||
"""
|
||||
|
||||
try:
|
||||
with self._driver.session() as session:
|
||||
result = session.run(
|
||||
query,
|
||||
job_id=job_id,
|
||||
min_confidence=min_confidence,
|
||||
limit=limit
|
||||
)
|
||||
|
||||
chains = []
|
||||
for record in result:
|
||||
chain = record["chain"]
|
||||
confidences = record["confidences"]
|
||||
rel_types = record["rel_types"]
|
||||
explanations = record["explanations"]
|
||||
|
||||
# Calculate average confidence
|
||||
avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
|
||||
|
||||
chains.append({
|
||||
"chain": chain,
|
||||
"confidences": confidences,
|
||||
"rel_types": rel_types,
|
||||
"explanations": explanations,
|
||||
"avg_confidence": avg_confidence,
|
||||
"length": len(chain) - 1
|
||||
})
|
||||
|
||||
logger.info("Found %d causal chains for job %s", len(chains), job_id)
|
||||
return chains
|
||||
except Exception as exc:
|
||||
logger.exception("Failed to query causal chains: %s", exc)
|
||||
return []
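    # Usage sketch (values are illustrative): each returned item bundles the node
    # names along the path plus per-hop metadata, e.g.
    #   writer.query_causal_chains(job_id="job-123", min_confidence=0.8)
    #   -> [{"chain": ["Load spike", "Timeouts", "Retries"],
    #        "confidences": [0.9, 0.85], "rel_types": ["CAUSES", "TRIGGERS"],
    #        "explanations": [...], "avg_confidence": 0.875, "length": 2}, ...]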
|
||||
|
||||
def query_key_entities(self, job_id: str, limit: int = 20) -> List[Dict]:
|
||||
"""
|
||||
Query Neo4j for key entities (most involved) as per README Step 7.3.
|
||||
"""
|
||||
query = """
|
||||
MATCH (e:Entity)-[r]->(target)
|
||||
WHERE r.job_id = $job_id
|
||||
WITH e, count(r) as relation_count, collect(DISTINCT type(r)) as rel_types
|
||||
RETURN e.name as name,
|
||||
e.type as type,
|
||||
relation_count,
|
||||
rel_types
|
||||
ORDER BY relation_count DESC
|
||||
LIMIT $limit
|
||||
"""
|
||||
|
||||
try:
|
||||
with self._driver.session() as session:
|
||||
result = session.run(query, job_id=job_id, limit=limit)
|
||||
|
||||
entities = []
|
||||
for record in result:
|
||||
entities.append({
|
||||
"name": record["name"],
|
||||
"type": record.get("type", "Entity"),
|
||||
"relation_count": record["relation_count"],
|
||||
"relation_types": record["rel_types"]
|
||||
})
|
||||
|
||||
logger.info("Found %d key entities for job %s", len(entities), job_id)
|
||||
return entities
|
||||
except Exception as exc:
|
||||
logger.exception("Failed to query key entities: %s", exc)
|
||||
return []
|
||||
|
||||
|
||||
@ -0,0 +1,625 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from anthropic import Anthropic, BadRequestError
|
||||
|
||||
from ..config import get_settings
|
||||
from ..models import CausalRelation
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Try to import SpaCy
|
||||
try:
|
||||
import spacy
|
||||
from spacy.lang.en import English
|
||||
HAS_SPACY = True
|
||||
except ImportError:
|
||||
HAS_SPACY = False
|
||||
logger.warning("spacy not available - NLP detection will be skipped")
|
||||
|
||||
|
||||
class RelationshipExtractor:
|
||||
"""Extract potential cause-effect relationships from text using NLP (SpaCy) + Claude AI."""
|
||||
|
||||
# Causal keywords for NLP detection (Step 3.1)
|
||||
CAUSAL_KEYWORDS = [
|
||||
"because", "due to", "as a result", "led to", "caused", "therefore",
|
||||
"consequently", "hence", "thus", "so", "since", "owing to",
|
||||
"resulted in", "brought about", "gave rise to", "triggered",
|
||||
"provoked", "induced", "generated", "produced", "created"
|
||||
]
|
||||
|
||||
# Common cause-effect patterns (expanded for architecture/technical documents)
|
||||
CAUSE_EFFECT_PATTERNS = [
|
||||
# Direct causal patterns
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+causes?\s+(\w+(?:\s+\w+){0,15})", "causes"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+leads?\s+to\s+(\w+(?:\s+\w+){0,15})", "leads_to"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+results?\s+in\s+(\w+(?:\s+\w+){0,15})", "results_in"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+triggers?\s+(\w+(?:\s+\w+){0,15})", "triggers"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+produces?\s+(\w+(?:\s+\w+){0,15})", "produces"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+enables?\s+(\w+(?:\s+\w+){0,15})", "enables"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+allows?\s+(\w+(?:\s+\w+){0,15})", "allows"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+facilitates?\s+(\w+(?:\s+\w+){0,15})", "facilitates"),
|
||||
|
||||
# Dependency patterns
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+depends?\s+on\s+(\w+(?:\s+\w+){0,15})", "depends_on"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+requires?\s+(\w+(?:\s+\w+){0,15})", "requires"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+needs?\s+(\w+(?:\s+\w+){0,15})", "needs"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+relies?\s+on\s+(\w+(?:\s+\w+){0,15})", "relies_on"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+uses?\s+(\w+(?:\s+\w+){0,15})", "uses"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+utilizes?\s+(\w+(?:\s+\w+){0,15})", "utilizes"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+leverages?\s+(\w+(?:\s+\w+){0,15})", "leverages"),
|
||||
|
||||
# Architectural/System patterns
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+connects?\s+to\s+(\w+(?:\s+\w+){0,15})", "connects_to"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+communicates?\s+with\s+(\w+(?:\s+\w+){0,15})", "communicates_with"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+interacts?\s+with\s+(\w+(?:\s+\w+){0,15})", "interacts_with"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+integrates?\s+with\s+(\w+(?:\s+\w+){0,15})", "integrates_with"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+provides?\s+(\w+(?:\s+\w+){0,15})", "provides"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+supports?\s+(\w+(?:\s+\w+){0,15})", "supports"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+handles?\s+(\w+(?:\s+\w+){0,15})", "handles"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+manages?\s+(\w+(?:\s+\w+){0,15})", "manages"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+controls?\s+(\w+(?:\s+\w+){0,15})", "controls"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+processes?\s+(\w+(?:\s+\w+){0,15})", "processes"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+generates?\s+(\w+(?:\s+\w+){0,15})", "generates"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+creates?\s+(\w+(?:\s+\w+){0,15})", "creates"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+implements?\s+(\w+(?:\s+\w+){0,15})", "implements"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+delivers?\s+(\w+(?:\s+\w+){0,15})", "delivers"),
|
||||
|
||||
# Flow patterns
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+flows?\s+to\s+(\w+(?:\s+\w+){0,15})", "flows_to"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+sends?\s+to\s+(\w+(?:\s+\w+){0,15})", "sends_to"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+transmits?\s+to\s+(\w+(?:\s+\w+){0,15})", "transmits_to"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+receives?\s+from\s+(\w+(?:\s+\w+){0,15})", "receives_from"),
|
||||
|
||||
# Conditional patterns
|
||||
(r"if\s+(\w+(?:\s+\w+){0,15}),\s+then\s+(\w+(?:\s+\w+){0,15})", "if_then"),
|
||||
(r"when\s+(\w+(?:\s+\w+){0,15}),\s+(\w+(?:\s+\w+){0,15})\s+occurs?", "when_then"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+implies?\s+(\w+(?:\s+\w+){0,15})", "implies"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+ensures?\s+(\w+(?:\s+\w+){0,15})", "ensures"),
|
||||
|
||||
# Sequential patterns
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+follows?\s+(\w+(?:\s+\w+){0,15})", "follows"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+comes?\s+after\s+(\w+(?:\s+\w+){0,15})", "comes_after"),
|
||||
(r"first\s+(\w+(?:\s+\w+){0,15}),\s+then\s+(\w+(?:\s+\w+){0,15})", "first_then"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+precedes?\s+(\w+(?:\s+\w+){0,15})", "precedes"),
|
||||
|
||||
# Containment patterns
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+contains?\s+(\w+(?:\s+\w+){0,15})", "contains"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+includes?\s+(\w+(?:\s+\w+){0,15})", "includes"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+consists?\s+of\s+(\w+(?:\s+\w+){0,15})", "consists_of"),
|
||||
|
||||
# Influence patterns
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+affects?\s+(\w+(?:\s+\w+){0,15})", "affects"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+impacts?\s+(\w+(?:\s+\w+){0,15})", "impacts"),
|
||||
(r"(\w+(?:\s+\w+){0,15})\s+influences?\s+(\w+(?:\s+\w+){0,15})", "influences"),
|
||||
]
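    # A quick illustration of how one of these patterns behaves (the sentence is
    # made up): the "depends_on" regex captures up to 16 words on each side.
    #
    #   import re
    #   pat = r"(\w+(?:\s+\w+){0,15})\s+depends?\s+on\s+(\w+(?:\s+\w+){0,15})"
    #   m = re.search(pat, "The billing service depends on the auth service", re.IGNORECASE)
    #   m.group(1)  # -> "The billing service"
    #   m.group(2)  # -> "the auth service"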
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize NLP and Claude AI components."""
|
||||
settings = get_settings()
|
||||
|
||||
# Initialize SpaCy NLP model (Step 3.1)
|
||||
self.nlp = None
|
||||
if HAS_SPACY:
|
||||
try:
|
||||
# Try to load English model, fallback to blank if not available
|
||||
try:
|
||||
self.nlp = spacy.load("en_core_web_sm")
|
||||
except OSError:
|
||||
logger.warning("en_core_web_sm model not found, using blank English model")
|
||||
self.nlp = English()
|
||||
self.nlp.add_pipe("sentencizer")
|
||||
logger.info("SpaCy NLP model loaded")
|
||||
except Exception as e:
|
||||
logger.warning("Failed to load SpaCy model: %s", e)
|
||||
self.nlp = None
|
||||
|
||||
# Initialize Claude AI client (Step 3.2)
|
||||
self.claude_client = None
|
||||
self.claude_model = settings.claude_model
|
||||
self.claude_max_input_tokens = settings.claude_max_input_tokens
|
||||
self.claude_max_output_tokens = settings.claude_max_output_tokens
|
||||
|
||||
if settings.anthropic_api_key:
|
||||
try:
|
||||
self.claude_client = Anthropic(api_key=settings.anthropic_api_key)
|
||||
logger.info("Claude AI client initialized")
|
||||
except Exception as e:
|
||||
logger.warning("Failed to initialize Claude AI client: %s", e)
|
||||
else:
|
||||
logger.warning("ANTHROPIC_API_KEY not set - Claude AI extraction will be skipped")
|
||||
|
||||
def extract_from_text(self, text: str, source_file_id: str) -> List[CausalRelation]:
|
||||
"""
|
||||
Extract cause-effect relationships using NLP (SpaCy) + Claude AI.
|
||||
Implements Step 3.1 (NLP Detection) and Step 3.2 (Claude AI Extraction).
|
||||
"""
|
||||
if not text or not text.strip():
|
||||
return []
|
||||
|
||||
all_relationships: List[CausalRelation] = []
|
||||
|
||||
# Step 3.1: BASIC NLP DETECTION (SpaCy)
|
||||
nlp_relationships = self._extract_with_nlp(text, source_file_id)
|
||||
all_relationships.extend(nlp_relationships)
|
||||
logger.info("NLP (SpaCy) extracted %d candidate relationships (low confidence)",
|
||||
len(nlp_relationships))
|
||||
|
||||
# Step 3.2: AI-POWERED EXTRACTION (Claude API)
|
||||
if self.claude_client:
|
||||
claude_relationships = self._extract_with_claude(text, source_file_id)
|
||||
all_relationships.extend(claude_relationships)
|
||||
logger.info("Claude AI extracted %d relationships (high confidence)",
|
||||
len(claude_relationships))
|
||||
else:
|
||||
logger.info("Claude AI extraction skipped (API key not configured)")
|
||||
|
||||
# Also run pattern matching as fallback
|
||||
pattern_relationships = self._extract_with_patterns(text, source_file_id)
|
||||
all_relationships.extend(pattern_relationships)
|
||||
logger.info("Pattern matching extracted %d relationships", len(pattern_relationships))
|
||||
|
||||
# Deduplicate relationships
|
||||
seen = set()
|
||||
unique_relationships = []
|
||||
for rel in all_relationships:
|
||||
key = (rel.cause.lower().strip(), rel.effect.lower().strip())
|
||||
if key not in seen:
|
||||
seen.add(key)
|
||||
unique_relationships.append(rel)
|
||||
|
||||
logger.info("Total unique relationships extracted: %d (from %d total)",
|
||||
len(unique_relationships), len(all_relationships))
|
||||
return unique_relationships
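    # Hedged usage sketch (names as defined in this module; the file id is illustrative):
    #   extractor = RelationshipExtractor()
    #   rels = extractor.extract_from_text(document_text, source_file_id="design.md")
    # The result mixes low-confidence SpaCy candidates, high-confidence Claude
    # extractions (when an API key is configured) and pattern-matching fallbacks,
    # de-duplicated on the lowercased (cause, effect) pair.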
|
||||
|
||||
def _extract_with_nlp(self, text: str, source_file_id: str) -> List[CausalRelation]:
|
||||
"""
|
||||
Step 3.1: Basic NLP Detection using SpaCy.
|
||||
Look for causal keywords and find sentences containing these patterns.
|
||||
Returns potential causal relationships (low confidence).
|
||||
"""
|
||||
if not self.nlp:
|
||||
return []
|
||||
|
||||
relationships: List[CausalRelation] = []
|
||||
|
||||
try:
|
||||
# Process text with SpaCy
|
||||
doc = self.nlp(text)
|
||||
|
||||
# Find sentences containing causal keywords
|
||||
for sent in doc.sents:
|
||||
sent_text = sent.text.strip()
|
||||
if len(sent_text) < 10:
|
||||
continue
|
||||
|
||||
# Check if sentence contains causal keywords
|
||||
sent_lower = sent_text.lower()
|
||||
has_causal_keyword = any(keyword in sent_lower for keyword in self.CAUSAL_KEYWORDS)
|
||||
|
||||
if has_causal_keyword:
|
||||
# Try to extract cause-effect using dependency parsing
|
||||
cause = None
|
||||
effect = None
|
||||
|
||||
# Look for causal conjunctions
|
||||
for token in sent:
|
||||
if token.text.lower() in ["because", "due", "since", "as"]:
|
||||
# Find the clause after the causal conjunction
|
||||
if token.dep_ in ["mark", "prep"]:
|
||||
# Try to extract cause and effect
|
||||
cause_span = None
|
||||
effect_span = None
|
||||
|
||||
# Simple heuristic: text before "because/due to" is effect, after is cause
|
||||
if "because" in sent_lower or "since" in sent_lower:
|
||||
parts = re.split(r'\b(because|since)\b', sent_text, flags=re.IGNORECASE)
|
||||
if len(parts) >= 3:
|
||||
effect = parts[0].strip()
|
||||
cause = parts[2].strip()
|
||||
elif "due to" in sent_lower:
|
||||
parts = re.split(r'\bdue to\b', sent_text, flags=re.IGNORECASE)
|
||||
if len(parts) >= 2:
|
||||
effect = parts[0].strip()
|
||||
cause = parts[1].strip()
|
||||
|
||||
if cause and effect:
|
||||
# Clean up cause and effect
|
||||
cause = re.sub(r'^[,\s]+|[,\s]+$', '', cause)
|
||||
effect = re.sub(r'^[,\s]+|[,\s]+$', '', effect)
|
||||
|
||||
if len(cause) >= 3 and len(effect) >= 3:
|
||||
relationships.append(CausalRelation(
|
||||
cause=cause,
|
||||
effect=effect,
|
||||
confidence=0.5, # Low confidence for NLP
|
||||
explanation=f"Extracted using NLP (SpaCy) - found causal keyword",
|
||||
source_file_id=source_file_id,
|
||||
source_snippet=sent_text[:200],
|
||||
relationship_type="CAUSES",
|
||||
metadata={
|
||||
"extraction_method": "spacy_nlp",
|
||||
"sentence": sent_text
|
||||
}
|
||||
))
|
||||
except Exception as e:
|
||||
logger.warning("NLP extraction failed: %s", e)
|
||||
|
||||
return relationships
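    # Worked example of the split heuristic (sentence invented; requires the full
    # en_core_web_sm parse so that "because" is tagged as a marker):
    #   "Deploys failed because the registry was unreachable"
    # splits on "because" into effect="Deploys failed" and
    # cause="the registry was unreachable", emitted with confidence 0.5.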
|
||||
|
||||
def _extract_with_claude(self, text: str, source_file_id: str) -> List[CausalRelation]:
|
||||
"""
|
||||
Step 3.2: AI-Powered Extraction using Claude API.
|
||||
Send full document text to Claude AI and ask it to find ALL causal relationships.
|
||||
Returns high-quality causal relationships (high confidence).
|
||||
"""
|
||||
if not self.claude_client:
|
||||
return []
|
||||
|
||||
relationships: List[CausalRelation] = []
|
||||
|
||||
try:
|
||||
# Prepare prompt for Claude
|
||||
system_prompt = """You are an expert at analyzing text and extracting cause-effect relationships.
|
||||
Your task is to identify ALL causal relationships in the given text, including both explicit and implicit ones.
|
||||
|
||||
For each causal relationship, extract:
|
||||
- Cause: What triggered or led to this?
|
||||
- Effect: What was the result or outcome?
|
||||
- Context: Additional background information
|
||||
- Entities: Who or what is involved (people, teams, projects, systems)
|
||||
- Confidence: How certain are you? (0.0 to 1.0)
|
||||
- Source sentence: The sentence or passage where this relationship was found
|
||||
- Date: When did this happen (if mentioned)
|
||||
|
||||
Return the results as a JSON array of objects with this structure:
|
||||
[
|
||||
{
|
||||
"cause": "string",
|
||||
"effect": "string",
|
||||
"context": "string (optional)",
|
||||
"entities": ["string"],
|
||||
"confidence": 0.0-1.0,
|
||||
"source_sentence": "string",
|
||||
"date": "string (optional)"
|
||||
}
|
||||
]
|
||||
|
||||
Focus on:
|
||||
- Explicit relationships ("because X, therefore Y")
|
||||
- Implicit relationships (strongly implied cause-effect)
|
||||
- Technical and architectural dependencies
|
||||
- Business decisions and their impacts
|
||||
- Process flows and sequences"""
|
||||
|
||||
# Truncate text to fit within token limits (rough estimate: 1 token ≈ 4 characters)
|
||||
max_chars = (self.claude_max_input_tokens - 1000) * 4
|
||||
truncated_text = text[:max_chars] if len(text) > max_chars else text
|
||||
|
||||
user_prompt = f"""Analyze the following text and extract ALL causal relationships.
|
||||
|
||||
Text:
|
||||
{truncated_text}
|
||||
|
||||
Return a JSON array of causal relationships. Be thorough and find both explicit and implicit relationships."""
|
||||
|
||||
# Call Claude API
|
||||
message = self.claude_client.messages.create(
|
||||
model=self.claude_model,
|
||||
max_tokens=self.claude_max_output_tokens,
|
||||
temperature=0.3, # Lower temperature for more focused extraction
|
||||
system=system_prompt,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": user_prompt
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
# Extract response text
|
||||
content_blocks = message.content or []
|
||||
response_text = "".join(
|
||||
block.text for block in content_blocks
|
||||
if hasattr(block, "text")
|
||||
)
|
||||
|
||||
if not response_text:
|
||||
logger.warning("Empty response from Claude AI")
|
||||
return []
|
||||
|
||||
# Parse JSON response
|
||||
try:
|
||||
# Try to extract JSON from response (might have markdown code blocks)
|
||||
json_match = re.search(r'\[.*\]', response_text, re.DOTALL)
|
||||
if json_match:
|
||||
json_text = json_match.group(0)
|
||||
else:
|
||||
json_text = response_text
|
||||
|
||||
claude_results = json.loads(json_text)
|
||||
|
||||
# Convert Claude results to CausalRelation objects
|
||||
for result in claude_results:
|
||||
cause = result.get("cause", "").strip()
|
||||
effect = result.get("effect", "").strip()
|
||||
context = result.get("context", "")
|
||||
entities = result.get("entities", [])
|
||||
confidence = float(result.get("confidence", 0.85))
|
||||
source_sentence = result.get("source_sentence", "")
|
||||
date = result.get("date", "")
|
||||
|
||||
if not cause or not effect:
|
||||
continue
|
||||
|
||||
# Map to Neo4j relationship type (default to CAUSES)
|
||||
relationship_type = "CAUSES"
|
||||
|
||||
explanation = context or f"Extracted by Claude AI"
|
||||
if entities:
|
||||
explanation += f" (Entities: {', '.join(entities)})"
|
||||
|
||||
relationships.append(CausalRelation(
|
||||
cause=cause,
|
||||
effect=effect,
|
||||
confidence=min(confidence, 0.95), # Cap at 0.95
|
||||
explanation=explanation,
|
||||
source_file_id=source_file_id,
|
||||
source_snippet=source_sentence[:200] if source_sentence else "",
|
||||
relationship_type=relationship_type,
|
||||
metadata={
|
||||
"extraction_method": "claude_ai",
|
||||
"context": context,
|
||||
"entities": entities,
|
||||
"date": date,
|
||||
"source_sentence": source_sentence
|
||||
}
|
||||
))
|
||||
|
||||
logger.info("Claude AI successfully extracted %d relationships", len(relationships))
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
logger.warning("Failed to parse Claude AI response as JSON: %s. Response: %s",
|
||||
e, response_text[:500])
|
||||
except Exception as e:
|
||||
logger.warning("Error processing Claude AI response: %s", e)
|
||||
|
||||
except BadRequestError as e:
|
||||
logger.warning("Claude API error: %s", e)
|
||||
except Exception as e:
|
||||
logger.warning("Claude AI extraction failed: %s", e)
|
||||
|
||||
return relationships
|
||||
|
||||
def _extract_with_patterns(self, text: str, source_file_id: str) -> List[CausalRelation]:
|
||||
"""
|
||||
Fallback: Pattern-based extraction (original method).
|
||||
Returns candidate relationships for DoWhy validation.
|
||||
"""
|
||||
if not text or not text.strip():
|
||||
return []
|
||||
|
||||
relationships: List[CausalRelation] = []
|
||||
seen = set() # Avoid duplicates
|
||||
|
||||
# Normalize text
|
||||
text = re.sub(r'\s+', ' ', text)
|
||||
sentences = re.split(r'[.!?]\s+', text)
|
||||
|
||||
for sentence in sentences:
|
||||
sentence = sentence.strip()
|
||||
if len(sentence) < 10: # Skip very short sentences
|
||||
continue
|
||||
|
||||
for pattern, rel_type in self.CAUSE_EFFECT_PATTERNS:
|
||||
matches = re.finditer(pattern, sentence, re.IGNORECASE)
|
||||
|
||||
for match in matches:
|
||||
cause = match.group(1).strip()
|
||||
effect = match.group(2).strip()
|
||||
|
||||
# Filter out very short or very long phrases (increased limit for technical terms)
|
||||
if len(cause) < 3 or len(cause) > 150:
|
||||
continue
|
||||
if len(effect) < 3 or len(effect) > 150:
|
||||
continue
|
||||
|
||||
# Skip common false positives
|
||||
if cause.lower() in ["this", "that", "it", "they", "we"]:
|
||||
continue
|
||||
if effect.lower() in ["this", "that", "it", "they", "we"]:
|
||||
continue
|
||||
|
||||
# Create unique key
|
||||
key = (cause.lower(), effect.lower())
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
|
||||
# Calculate confidence based on pattern type
|
||||
confidence = self._calculate_confidence(rel_type, sentence)
|
||||
|
||||
# Map pattern type to Neo4j relationship type (uppercase with underscores)
|
||||
neo4j_rel_type = self._map_to_neo4j_relationship_type(rel_type)
|
||||
|
||||
relationships.append(CausalRelation(
|
||||
cause=cause,
|
||||
effect=effect,
|
||||
confidence=confidence,
|
||||
explanation=f"Extracted from text using pattern: {rel_type}",
|
||||
source_file_id=source_file_id,
|
||||
source_snippet=sentence[:200], # First 200 chars
|
||||
relationship_type=neo4j_rel_type,
|
||||
metadata={
|
||||
"extraction_method": "pattern_matching",
|
||||
"pattern_type": rel_type,
|
||||
"sentence": sentence
|
||||
}
|
||||
))
|
||||
|
||||
logger.info("Extracted %d candidate relationships from text (source: %s)",
|
||||
len(relationships), source_file_id)
|
||||
return relationships
|
||||
|
||||
def _calculate_confidence(self, rel_type: str, sentence: str) -> float:
|
||||
"""Calculate confidence score based on pattern type and sentence quality."""
|
||||
base_confidence = {
|
||||
"causes": 0.8,
|
||||
"leads_to": 0.75,
|
||||
"results_in": 0.75,
|
||||
"triggers": 0.7,
|
||||
"produces": 0.7,
|
||||
"depends_on": 0.65,
|
||||
"requires": 0.65,
|
||||
"needs": 0.6,
|
||||
"if_then": 0.8,
|
||||
"when_then": 0.75,
|
||||
"implies": 0.7,
|
||||
"follows": 0.6,
|
||||
"comes_after": 0.6,
|
||||
"first_then": 0.7,
|
||||
"enables": 0.7,
|
||||
"allows": 0.65,
|
||||
"facilitates": 0.65,
|
||||
"relies_on": 0.65,
|
||||
"uses": 0.6,
|
||||
"utilizes": 0.6,
|
||||
"leverages": 0.6,
|
||||
"connects_to": 0.7,
|
||||
"communicates_with": 0.7,
|
||||
"interacts_with": 0.7,
|
||||
"integrates_with": 0.7,
|
||||
"provides": 0.7,
|
||||
"supports": 0.7,
|
||||
"handles": 0.65,
|
||||
"manages": 0.65,
|
||||
"controls": 0.65,
|
||||
"processes": 0.65,
|
||||
"generates": 0.7,
|
||||
"creates": 0.7,
|
||||
"implements": 0.7,
|
||||
"delivers": 0.7,
|
||||
"flows_to": 0.7,
|
||||
"sends_to": 0.7,
|
||||
"transmits_to": 0.7,
|
||||
"receives_from": 0.7,
|
||||
"ensures": 0.75,
|
||||
"precedes": 0.6,
|
||||
"contains": 0.6,
|
||||
"includes": 0.6,
|
||||
"consists_of": 0.6,
|
||||
"affects": 0.65,
|
||||
"impacts": 0.65,
|
||||
"influences": 0.65,
|
||||
}.get(rel_type, 0.5)
|
||||
|
||||
# Adjust based on sentence length (longer sentences might be more descriptive)
|
||||
if len(sentence) > 50:
|
||||
base_confidence += 0.05
|
||||
|
||||
return min(base_confidence, 0.95)
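    # Example (hypothetical sentence): rel_type="causes" starts at 0.8; a sentence
    # longer than 50 characters adds 0.05, giving 0.85, and scores are always
    # capped at 0.95.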
|
||||
|
||||
def _map_to_neo4j_relationship_type(self, pattern_type: str) -> str:
|
||||
"""Map pattern type to Neo4j relationship type (uppercase with underscores)."""
|
||||
# Map lowercase pattern types to Neo4j relationship types
|
||||
mapping = {
|
||||
"causes": "CAUSES",
|
||||
"leads_to": "LEADS_TO",
|
||||
"results_in": "RESULTS_IN",
|
||||
"triggers": "TRIGGERS",
|
||||
"produces": "PRODUCES",
|
||||
"depends_on": "DEPENDS_ON",
|
||||
"requires": "REQUIRES",
|
||||
"needs": "NEEDS",
|
||||
"relies_on": "RELIES_ON",
|
||||
"uses": "USES",
|
||||
"utilizes": "UTILIZES",
|
||||
"leverages": "LEVERAGES",
|
||||
"connects_to": "CONNECTS_TO",
|
||||
"communicates_with": "COMMUNICATES_WITH",
|
||||
"interacts_with": "INTERACTS_WITH",
|
||||
"integrates_with": "INTEGRATES_WITH",
|
||||
"provides": "PROVIDES",
|
||||
"supports": "SUPPORTS",
|
||||
"handles": "HANDLES",
|
||||
"manages": "MANAGES",
|
||||
"controls": "CONTROLS",
|
||||
"processes": "PROCESSES",
|
||||
"generates": "GENERATES",
|
||||
"creates": "CREATES",
|
||||
"implements": "IMPLEMENTS",
|
||||
"delivers": "DELIVERS",
|
||||
"flows_to": "FLOWS_TO",
|
||||
"sends_to": "SENDS_TO",
|
||||
"transmits_to": "TRANSMITS_TO",
|
||||
"receives_from": "RECEIVES_FROM",
|
||||
"if_then": "IF_THEN",
|
||||
"when_then": "WHEN_THEN",
|
||||
"implies": "IMPLIES",
|
||||
"ensures": "ENSURES",
|
||||
"follows": "FOLLOWS",
|
||||
"comes_after": "COMES_AFTER",
|
||||
"first_then": "FIRST_THEN",
|
||||
"precedes": "PRECEDES",
|
||||
"contains": "CONTAINS",
|
||||
"includes": "INCLUDES",
|
||||
"consists_of": "CONSISTS_OF",
|
||||
"affects": "AFFECTS",
|
||||
"impacts": "IMPACTS",
|
||||
"influences": "INFLUENCES",
|
||||
"enables": "ENABLES",
|
||||
"allows": "ALLOWS",
|
||||
"facilitates": "FACILITATES",
|
||||
}
|
||||
return mapping.get(pattern_type, "CAUSES") # Default to CAUSES if not found
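    # Examples:
    #   _map_to_neo4j_relationship_type("depends_on") -> "DEPENDS_ON"
    #   _map_to_neo4j_relationship_type("unknown")    -> "CAUSES"   (fallback)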
|
||||
|
||||
def extract_from_qwen_results(self, qwen_results: List[Dict], source_file_id: str) -> List[CausalRelation]:
|
||||
"""Convert Qwen2.5-VL extraction results to CausalRelation objects."""
|
||||
relationships: List[CausalRelation] = []
|
||||
|
||||
for result in qwen_results:
|
||||
entity1 = result.get("entity1", "").strip()
|
||||
entity2 = result.get("entity2", "").strip()
|
||||
rel_type = result.get("relationship_type", "").strip()
|
||||
description = result.get("description", "").strip()
|
||||
confidence = float(result.get("confidence", 0.7))
|
||||
|
||||
if not entity1 or not entity2:
|
||||
continue
|
||||
|
||||
# Map relationship type to cause-effect
|
||||
# For most types, entity1 is cause, entity2 is effect
|
||||
cause = entity1
|
||||
effect = entity2
|
||||
|
||||
# Some relationship types might need reversal
|
||||
if rel_type in ["depends_on", "requires", "needs"]:
|
||||
# If A depends on B, then B is the cause, A is the effect
|
||||
cause, effect = effect, cause
|
||||
|
||||
# Map Qwen relationship type to Neo4j format
|
||||
neo4j_rel_type = self._map_to_neo4j_relationship_type(rel_type.lower().replace("-", "_"))
|
||||
|
||||
relationships.append(CausalRelation(
|
||||
cause=cause,
|
||||
effect=effect,
|
||||
confidence=confidence,
|
||||
explanation=description or f"Extracted from diagram: {rel_type}",
|
||||
source_file_id=source_file_id,
|
||||
source_snippet=description,
|
||||
relationship_type=neo4j_rel_type,
|
||||
metadata={
|
||||
"extraction_method": "qwen2.5-vl",
|
||||
"relationship_type": rel_type,
|
||||
"original_entity1": entity1,
|
||||
"original_entity2": entity2
|
||||
}
|
||||
))
|
||||
|
||||
return relationships
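    # Sketch of the expected Qwen payload (field names follow this method's
    # .get() calls; values are illustrative):
    #   {"entity1": "Frontend", "entity2": "API Gateway",
    #    "relationship_type": "depends_on",
    #    "description": "Frontend calls the gateway", "confidence": 0.8}
    # Because "depends_on" triggers the reversal above, the resulting relation is
    # cause="API Gateway", effect="Frontend" with relationship type DEPENDS_ON.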
|
||||
|
||||
@ -0,0 +1,570 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Set
|
||||
|
||||
from anthropic import Anthropic, BadRequestError
|
||||
|
||||
from ..config import get_settings
|
||||
from ..models import CausalRelation, ProjectReport
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Try to import PDF generation libraries
|
||||
try:
|
||||
import markdown
|
||||
from markdown.extensions import codehilite, fenced_code, tables
|
||||
HAS_MARKDOWN = True
|
||||
except ImportError:
|
||||
HAS_MARKDOWN = False
|
||||
logger.warning("markdown library not available - PDF conversion will be limited")
|
||||
|
||||
try:
|
||||
from weasyprint import HTML, CSS
|
||||
from weasyprint.text.fonts import FontConfiguration
|
||||
HAS_WEASYPRINT = True
|
||||
except ImportError:
|
||||
HAS_WEASYPRINT = False
|
||||
logger.warning("weasyprint not available - PDF conversion will be skipped")
|
||||
|
||||
|
||||
class ReportGenerator:
|
||||
"""Generate beginner-friendly onboarding reports from knowledge graph."""
|
||||
|
||||
def __init__(self, api_key: str | None = None, model: str | None = None):
|
||||
settings = get_settings()
|
||||
self.api_key = api_key or settings.anthropic_api_key
|
||||
self.model = model or settings.claude_model
|
||||
self.max_output_tokens = settings.claude_max_output_tokens
|
||||
|
||||
if not self.api_key:
|
||||
raise ValueError("Anthropic API key is required for report generation")
|
||||
|
||||
self.client = Anthropic(api_key=self.api_key)
|
||||
|
||||
def generate_onboarding_report(
|
||||
self,
|
||||
job_id: str,
|
||||
relations: List[CausalRelation],
|
||||
vector_store,
|
||||
embedder,
|
||||
graph_writer=None,
|
||||
kg_summary: Dict | None = None
|
||||
) -> ProjectReport:
|
||||
"""
|
||||
Generate a beginner-friendly onboarding report from the knowledge graph.
|
||||
"""
|
||||
logger.info("Generating onboarding report for job %s", job_id)
|
||||
|
||||
# Step 1: Analyze KG structure
|
||||
key_concepts = self._analyze_kg_structure(relations)
|
||||
|
||||
# Step 2: Semantic search for different topics
|
||||
overview_content = self._search_topic(
|
||||
"project overview main purpose goals objectives",
|
||||
vector_store, embedder, job_id, top_k=10
|
||||
)
|
||||
|
||||
concepts_content = self._search_topic(
|
||||
"core concepts definitions key terms important ideas",
|
||||
vector_store, embedder, job_id, top_k=15
|
||||
)
|
||||
|
||||
processes_content = self._search_topic(
|
||||
"how system works processes flows procedures steps",
|
||||
vector_store, embedder, job_id, top_k=15
|
||||
)
|
||||
|
||||
relationships_content = self._search_topic(
|
||||
"cause effect dependencies relationships connections",
|
||||
vector_store, embedder, job_id, top_k=20
|
||||
)
|
||||
|
||||
components_content = self._search_topic(
|
||||
"components modules systems parts architecture",
|
||||
vector_store, embedder, job_id, top_k=15
|
||||
)
|
||||
|
||||
# Step 3: Query Neo4j for causal chains (as per README Step 7.3)
|
||||
causal_chains = []
|
||||
key_entities = []
|
||||
if graph_writer:
|
||||
try:
|
||||
# Query 1: Get critical causal chains
|
||||
causal_chains = graph_writer.query_causal_chains(
|
||||
job_id=job_id,
|
||||
min_length=2,
|
||||
max_length=4,
|
||||
min_confidence=0.8,
|
||||
limit=20
|
||||
)
|
||||
logger.info("Retrieved %d causal chains from Neo4j", len(causal_chains))
|
||||
|
||||
# Query 2: Get key entities
|
||||
key_entities = graph_writer.query_key_entities(job_id=job_id, limit=20)
|
||||
logger.info("Retrieved %d key entities from Neo4j", len(key_entities))
|
||||
except Exception as neo4j_exc:
|
||||
logger.warning("Failed to query Neo4j: %s", neo4j_exc)
|
||||
|
||||
# Step 4: Organize content hierarchically
|
||||
organized_content = self._organize_content(
|
||||
key_concepts,
|
||||
overview_content,
|
||||
concepts_content,
|
||||
processes_content,
|
||||
relationships_content,
|
||||
components_content,
|
||||
causal_chains,
|
||||
key_entities
|
||||
)
|
||||
|
||||
# Step 5: Generate report with Claude
|
||||
report_content = self._claude_generate_report(
|
||||
job_id=job_id,
|
||||
relations=relations,
|
||||
organized_content=organized_content,
|
||||
kg_summary=kg_summary or {}
|
||||
)
|
||||
|
||||
# Step 6: Parse sections
|
||||
sections = self._parse_sections(report_content)
|
||||
|
||||
# Step 7: Convert to PDF (as per README Step 7.8)
|
||||
pdf_path = None
|
||||
if HAS_WEASYPRINT and HAS_MARKDOWN:
|
||||
try:
|
||||
pdf_path = self._convert_to_pdf(report_content, job_id)
|
||||
logger.info("Generated PDF report: %s", pdf_path)
|
||||
except Exception as pdf_exc:
|
||||
logger.warning("PDF conversion failed: %s", pdf_exc)
|
||||
|
||||
# Estimate pages (rough: ~500 words per page)
|
||||
word_count = len(report_content.split())
|
||||
estimated_pages = max(1, word_count // 500)
|
||||
|
||||
return ProjectReport(
|
||||
job_id=job_id,
|
||||
title="Project Onboarding Guide",
|
||||
content=report_content,
|
||||
sections=sections,
|
||||
key_concepts=list(key_concepts)[:20], # Top 20 concepts
|
||||
total_pages=estimated_pages,
|
||||
generated_at=datetime.utcnow(),
|
||||
metadata={
|
||||
"total_relations": len(relations),
|
||||
"total_concepts": len(key_concepts),
|
||||
"causal_chains_count": len(causal_chains),
|
||||
"key_entities_count": len(key_entities),
|
||||
"model": self.model,
|
||||
"pdf_path": str(pdf_path) if pdf_path else None
|
||||
}
|
||||
)
|
||||
|
||||
def _analyze_kg_structure(self, relations: List[CausalRelation]) -> Set[str]:
|
||||
"""Identify key concepts from the knowledge graph."""
|
||||
concepts = set()
|
||||
|
||||
for rel in relations:
|
||||
concepts.add(rel.cause)
|
||||
concepts.add(rel.effect)
|
||||
|
||||
# Identify high-degree nodes (concepts involved in many relationships)
|
||||
cause_counts: Dict[str, int] = {}
|
||||
effect_counts: Dict[str, int] = {}
|
||||
|
||||
for rel in relations:
|
||||
cause_counts[rel.cause] = cause_counts.get(rel.cause, 0) + 1
|
||||
effect_counts[rel.effect] = effect_counts.get(rel.effect, 0) + 1
|
||||
|
||||
# Key concepts are those with high degree (appear in many relationships)
|
||||
all_counts = {**cause_counts, **effect_counts}
|
||||
threshold = max(1, len(relations) // 10) # Top 10% most connected
|
||||
|
||||
key_concepts = {
|
||||
concept for concept, count in all_counts.items()
|
||||
if count >= threshold
|
||||
}
|
||||
|
||||
# If threshold is too high, use top N concepts
|
||||
if len(key_concepts) < 5:
|
||||
sorted_concepts = sorted(all_counts.items(), key=lambda x: x[1], reverse=True)
|
||||
key_concepts = {concept for concept, _ in sorted_concepts[:20]}
|
||||
|
||||
logger.info("Identified %d key concepts from %d relationships",
|
||||
len(key_concepts), len(relations))
|
||||
return key_concepts
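    # Worked example (numbers invented): with 40 relations the threshold is
    # 40 // 10 = 4, so only concepts appearing in at least 4 relationships are
    # kept; if that leaves fewer than 5 concepts, the top 20 by degree are used instead.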
|
||||
|
||||
def _search_topic(
|
||||
self,
|
||||
query: str,
|
||||
vector_store,
|
||||
embedder,
|
||||
job_id: str,
|
||||
top_k: int = 10
|
||||
) -> List[Dict]:
|
||||
"""Search for content related to a topic."""
|
||||
try:
|
||||
results = vector_store.search_by_text(
|
||||
query_text=query,
|
||||
embedder=embedder,
|
||||
job_id=job_id,
|
||||
top_k=top_k
|
||||
)
|
||||
return results
|
||||
except Exception as exc:
|
||||
logger.warning("Search failed for topic '%s': %s", query, exc)
|
||||
return []
|
||||
|
||||
def _organize_content(
|
||||
self,
|
||||
key_concepts: Set[str],
|
||||
overview_content: List[Dict],
|
||||
concepts_content: List[Dict],
|
||||
processes_content: List[Dict],
|
||||
relationships_content: List[Dict],
|
||||
components_content: List[Dict],
|
||||
causal_chains: Optional[List[Dict]] = None,
key_entities: Optional[List[Dict]] = None
|
||||
) -> Dict:
|
||||
"""Organize retrieved content into a structured format."""
|
||||
return {
|
||||
"key_concepts": list(key_concepts),
|
||||
"overview": [r.get("payload", {}) for r in overview_content],
|
||||
"concepts": [r.get("payload", {}) for r in concepts_content],
|
||||
"processes": [r.get("payload", {}) for r in processes_content],
|
||||
"relationships": [r.get("payload", {}) for r in relationships_content],
|
||||
"components": [r.get("payload", {}) for r in components_content],
|
||||
"causal_chains": causal_chains or [],
|
||||
"key_entities": key_entities or [],
|
||||
}
|
||||
|
||||
def _claude_generate_report(
|
||||
self,
|
||||
job_id: str,
|
||||
relations: List[CausalRelation],
|
||||
organized_content: Dict,
|
||||
kg_summary: Dict
|
||||
) -> str:
|
||||
"""Generate report using Claude AI."""
|
||||
|
||||
# Build KG summary text
|
||||
kg_summary_text = self._build_kg_summary(relations, organized_content)
|
||||
|
||||
# Build system prompt
|
||||
system_prompt = """You are an expert technical writer specializing in creating beginner-friendly onboarding documentation for new team members.
|
||||
|
||||
Your goal is to explain complex project information in simple, clear language that anyone can understand, even without technical background.
|
||||
|
||||
Guidelines:
|
||||
- Use simple, clear language - avoid jargon or explain it when necessary
|
||||
- Use examples and analogies to make concepts relatable
|
||||
- Structure information logically (basics first, then advanced)
|
||||
- Make it engaging and easy to follow
|
||||
- Cover all important aspects comprehensively
|
||||
- Write in a friendly, welcoming tone
|
||||
- Use headings, bullet points, and clear sections
|
||||
- Explain "why" not just "what"
|
||||
|
||||
Generate a comprehensive onboarding document that helps a new team member understand the entire project."""
|
||||
|
||||
# Format causal chains from Neo4j
|
||||
causal_chains_text = self._format_causal_chains(organized_content.get('causal_chains', []))
|
||||
key_entities_text = self._format_key_entities(organized_content.get('key_entities', []))
|
||||
|
||||
# Build user prompt
|
||||
user_prompt = f"""Generate a comprehensive, beginner-friendly onboarding document for this project.
|
||||
|
||||
KNOWLEDGE GRAPH SUMMARY:
|
||||
{kg_summary_text}
|
||||
|
||||
IMPORTANT RELATIONSHIPS:
|
||||
{self._format_relationships(relations[:50])}
|
||||
|
||||
CAUSAL CHAINS (from Knowledge Graph):
|
||||
{causal_chains_text}
|
||||
|
||||
KEY ENTITIES (from Knowledge Graph):
|
||||
{key_entities_text}
|
||||
|
||||
KEY CONCEPTS:
|
||||
{', '.join(organized_content.get('key_concepts', [])[:30])}
|
||||
|
||||
REQUIRED SECTIONS:
|
||||
1. Project Overview
|
||||
- What is this project about?
|
||||
- Main purpose and goals
|
||||
- Key stakeholders or users
|
||||
|
||||
2. Core Concepts (Explained Simply)
|
||||
- Explain each important concept in simple terms
|
||||
- Why each concept matters
|
||||
- How concepts relate to each other
|
||||
|
||||
3. How Things Work Together
|
||||
- System flow (simple explanation)
|
||||
- Key processes and workflows
|
||||
- Dependencies explained simply
|
||||
|
||||
4. Important Relationships
|
||||
- Cause → Effect relationships (explained in plain language)
|
||||
- "When X happens, Y occurs because..."
|
||||
- Visual flow if possible (describe it)
|
||||
|
||||
5. Key Components
|
||||
- Main modules/systems/components
|
||||
- What each does (beginner-friendly)
|
||||
- How they interact
|
||||
|
||||
6. Getting Started
|
||||
- Where to start learning
|
||||
- What to understand first
|
||||
- Recommended learning path
|
||||
|
||||
7. Common Questions
|
||||
- FAQ based on the knowledge graph
|
||||
- Answers in simple terms
|
||||
|
||||
Generate the complete onboarding document in Markdown format. Make it comprehensive, beginner-friendly, and easy to follow."""
|
||||
|
||||
try:
|
||||
message = self.client.messages.create(
|
||||
model=self.model,
|
||||
max_tokens=self.max_output_tokens,
|
||||
temperature=0.3, # Slightly creative but focused
|
||||
system=system_prompt,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": user_prompt
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
content_blocks = message.content or []
|
||||
report_text = "".join(
|
||||
block.text for block in content_blocks
|
||||
if hasattr(block, "text")
|
||||
)
|
||||
|
||||
if not report_text:
|
||||
logger.warning("Empty report generated")
|
||||
return "# Project Onboarding Guide\n\nNo content available."
|
||||
|
||||
logger.info("Generated onboarding report (%d characters)", len(report_text))
|
||||
return report_text
|
||||
|
||||
except BadRequestError as e:
|
||||
# Handle API credit/authentication errors gracefully
|
||||
error_msg = str(e)
|
||||
if "credit balance" in error_msg.lower() or "too low" in error_msg.lower():
|
||||
logger.error("Claude API credit balance too low. Cannot generate report.")
|
||||
raise ValueError("Claude API credit balance is too low. Please add credits to your Anthropic account to generate reports.")
|
||||
elif "invalid_request_error" in error_msg.lower():
|
||||
logger.error("Claude API invalid request: %s", error_msg)
|
||||
raise ValueError(f"Claude API request failed: {error_msg}")
|
||||
else:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.exception("Failed to generate report: %s", e)
|
||||
raise
|
||||
|
||||
def _build_kg_summary(
|
||||
self,
|
||||
relations: List[CausalRelation],
|
||||
organized_content: Dict
|
||||
) -> str:
|
||||
"""Build a text summary of the knowledge graph."""
|
||||
summary_parts = [
|
||||
f"Total Relationships: {len(relations)}",
|
||||
f"Total Concepts: {len(organized_content.get('key_concepts', []))}",
|
||||
"",
|
||||
"Top Relationships:",
|
||||
]
|
||||
|
||||
# Show top relationships by confidence
|
||||
top_relations = sorted(relations, key=lambda r: r.confidence, reverse=True)[:20]
|
||||
for i, rel in enumerate(top_relations, 1):
|
||||
summary_parts.append(
|
||||
f"{i}. {rel.cause} → {rel.effect} "
|
||||
f"(confidence: {rel.confidence:.2f})"
|
||||
)
|
||||
|
||||
return "\n".join(summary_parts)
|
||||
|
||||
def _format_relationships(self, relations: List[CausalRelation]) -> str:
|
||||
"""Format relationships for the prompt."""
|
||||
if not relations:
|
||||
return "No relationships found."
|
||||
|
||||
lines = []
|
||||
for rel in relations[:50]: # Limit to 50
|
||||
line = f"- {rel.cause} → {rel.effect}"
|
||||
if rel.explanation:
|
||||
line += f" ({rel.explanation[:100]})"
|
||||
lines.append(line)
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
def _parse_sections(self, content: str) -> Dict[str, str]:
|
||||
"""Parse markdown content into sections."""
|
||||
sections = {}
|
||||
current_section = None
|
||||
current_content = []
|
||||
|
||||
lines = content.split('\n')
|
||||
|
||||
for line in lines:
|
||||
# Check if it's a heading (starts with #)
|
||||
if line.strip().startswith('#'):
|
||||
# Save previous section
|
||||
if current_section:
|
||||
sections[current_section] = '\n'.join(current_content).strip()
|
||||
|
||||
# Start new section
|
||||
current_section = line.strip().lstrip('#').strip()
|
||||
current_content = [line]
|
||||
else:
|
||||
if current_section:
|
||||
current_content.append(line)
|
||||
else:
|
||||
# Content before first heading
|
||||
if 'introduction' not in sections:
|
||||
sections['introduction'] = line
|
||||
else:
|
||||
sections['introduction'] += '\n' + line
|
||||
|
||||
# Save last section
|
||||
if current_section:
|
||||
sections[current_section] = '\n'.join(current_content).strip()
|
||||
|
||||
return sections
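    # Example (input is illustrative):
    #   _parse_sections("# Overview\nIntro text\n## Goals\n- ship it")
    #   -> {"Overview": "# Overview\nIntro text", "Goals": "## Goals\n- ship it"}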
|
||||
|
||||
def _format_causal_chains(self, causal_chains: List[Dict]) -> str:
|
||||
"""Format causal chains from Neo4j for the prompt."""
|
||||
if not causal_chains:
|
||||
return "No causal chains found in knowledge graph."
|
||||
|
||||
lines = []
|
||||
for i, chain_data in enumerate(causal_chains[:20], 1): # Top 20 chains
|
||||
chain = chain_data.get("chain", [])
|
||||
avg_confidence = chain_data.get("avg_confidence", 0.0)
|
||||
|
||||
if len(chain) >= 2:
|
||||
chain_text = " → ".join(chain)
|
||||
lines.append(f"{i}. {chain_text} (confidence: {avg_confidence:.2f})")
|
||||
|
||||
return "\n".join(lines) if lines else "No causal chains found."
|
||||
|
||||
def _format_key_entities(self, key_entities: List[Dict]) -> str:
|
||||
"""Format key entities from Neo4j for the prompt."""
|
||||
if not key_entities:
|
||||
return "No key entities found in knowledge graph."
|
||||
|
||||
lines = []
|
||||
for entity in key_entities[:20]: # Top 20 entities
|
||||
name = entity.get("name", "")
|
||||
entity_type = entity.get("type", "Entity")
|
||||
relation_count = entity.get("relation_count", 0)
|
||||
lines.append(f"- {name} ({entity_type}): involved in {relation_count} relationships")
|
||||
|
||||
return "\n".join(lines) if lines else "No key entities found."
|
||||
|
||||
def _convert_to_pdf(self, markdown_content: str, job_id: str) -> Optional[Path]:
|
||||
"""
|
||||
Convert Markdown report to PDF as per README Step 7.8.
|
||||
Uses markdown + weasyprint for PDF generation.
|
||||
"""
|
||||
if not HAS_MARKDOWN or not HAS_WEASYPRINT:
|
||||
return None
|
||||
|
||||
try:
|
||||
# Convert Markdown to HTML
|
||||
html_content = markdown.markdown(
|
||||
markdown_content,
|
||||
extensions=['codehilite', 'fenced_code', 'tables']
|
||||
)
|
||||
|
||||
# Add CSS styling
|
||||
css_style = """
|
||||
@page {
|
||||
size: A4;
|
||||
margin: 2cm;
|
||||
}
|
||||
body {
|
||||
font-family: 'Georgia', serif;
|
||||
line-height: 1.6;
|
||||
color: #333;
|
||||
}
|
||||
h1, h2, h3, h4 {
|
||||
color: #2c3e50;
|
||||
margin-top: 1.5em;
|
||||
margin-bottom: 0.5em;
|
||||
}
|
||||
h1 { font-size: 2em; border-bottom: 2px solid #3498db; padding-bottom: 0.3em; }
|
||||
h2 { font-size: 1.5em; border-bottom: 1px solid #95a5a6; padding-bottom: 0.2em; }
|
||||
h3 { font-size: 1.2em; }
|
||||
code {
|
||||
background-color: #f4f4f4;
|
||||
padding: 2px 4px;
|
||||
border-radius: 3px;
|
||||
font-family: 'Courier New', monospace;
|
||||
}
|
||||
pre {
|
||||
background-color: #f4f4f4;
|
||||
padding: 1em;
|
||||
border-radius: 5px;
|
||||
overflow-x: auto;
|
||||
}
|
||||
table {
|
||||
border-collapse: collapse;
|
||||
width: 100%;
|
||||
margin: 1em 0;
|
||||
}
|
||||
th, td {
|
||||
border: 1px solid #ddd;
|
||||
padding: 8px;
|
||||
text-align: left;
|
||||
}
|
||||
th {
|
||||
background-color: #3498db;
|
||||
color: white;
|
||||
}
|
||||
"""
|
||||
|
||||
# Create full HTML document
|
||||
full_html = f"""
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<title>Project Onboarding Guide</title>
|
||||
</head>
|
||||
<body>
|
||||
{html_content}
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
# Generate PDF
|
||||
settings = get_settings()
|
||||
storage_root = Path(settings.storage_root)
|
||||
reports_dir = storage_root / "reports"
|
||||
reports_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
pdf_path = reports_dir / f"report_{job_id}.pdf"
|
||||
|
||||
HTML(string=full_html).write_pdf(
|
||||
pdf_path,
|
||||
stylesheets=[CSS(string=css_style)]
|
||||
)
|
||||
|
||||
logger.info("PDF report generated: %s", pdf_path)
|
||||
return pdf_path
|
||||
|
||||
except Exception as exc:
|
||||
logger.exception("Failed to convert Markdown to PDF: %s", exc)
|
||||
return None
|
||||
|
||||
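The Markdown-to-PDF path above can be exercised on its own. A minimal sketch, assuming the markdown and weasyprint packages are installed; the sample content, output filename and CSS here are illustrative only:

import markdown
from weasyprint import HTML, CSS

# Render a tiny Markdown snippet to HTML, then to PDF (demo values, not from the commit).
sample_md = "# Onboarding Guide\n\nQdrant stores the knowledge-graph embeddings."
html_body = markdown.markdown(sample_md, extensions=["fenced_code", "tables"])
HTML(string=f"<html><body>{html_body}</body></html>").write_pdf(
    "report_demo.pdf",
    stylesheets=[CSS(string="body { font-family: Georgia, serif; }")],
)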
@ -0,0 +1,269 @@
from __future__ import annotations

import logging
from typing import Dict, List, Optional
from uuid import uuid4

from ..config import get_settings
from ..models import CausalRelation

logger = logging.getLogger(__name__)

try:
    from qdrant_client import QdrantClient
    from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue
    HAS_QDRANT = True
except ImportError:
    HAS_QDRANT = False
    logger.warning("qdrant-client not available")

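The constructor below reads qdrant_url, qdrant_collection_name and qdrant_vector_size from get_settings(); the settings object itself is not part of this diff. A sketch of its assumed shape, with the field names taken from the usage below and the defaults purely illustrative:

from dataclasses import dataclass

@dataclass(frozen=True)
class Settings:  # hypothetical shape of the object returned by get_settings()
    qdrant_url: str = "http://qdrant:6333"          # assumed default
    qdrant_collection_name: str = "kg_embeddings"   # assumed default
    qdrant_vector_size: int = 384                   # assumed; must match the embedding model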
class VectorStore:
    """Qdrant vector database client for storing KG embeddings."""

    def __init__(
        self,
        url: str | None = None,
        collection_name: str | None = None,
        vector_size: int | None = None
    ):
        if not HAS_QDRANT:
            raise ImportError("qdrant-client is required for vector storage")

        settings = get_settings()
        self.url = url or settings.qdrant_url
        self.collection_name = collection_name or settings.qdrant_collection_name
        self.vector_size = vector_size or settings.qdrant_vector_size

        logger.info("Connecting to Qdrant at %s", self.url)
        try:
            self.client = QdrantClient(url=self.url)
            logger.info("Connected to Qdrant")
        except Exception as exc:
            logger.exception("Failed to connect to Qdrant: %s", exc)
            raise

        # Ensure collection exists
        self._ensure_collection()

    def _ensure_collection(self) -> None:
        """Create collection if it doesn't exist."""
        try:
            collections = self.client.get_collections()
            collection_names = [col.name for col in collections.collections]

            if self.collection_name not in collection_names:
                logger.info("Creating Qdrant collection: %s", self.collection_name)
                try:
                    self.client.create_collection(
                        collection_name=self.collection_name,
                        vectors_config=VectorParams(
                            size=self.vector_size,
                            distance=Distance.COSINE
                        )
                    )
                    logger.info("Created collection: %s", self.collection_name)
                except Exception as create_exc:
                    # Collection might have been created by another instance
                    if "already exists" in str(create_exc).lower() or "409" in str(create_exc):
                        logger.info("Collection %s already exists (created by another instance)", self.collection_name)
                    else:
                        raise
            else:
                logger.debug("Collection %s already exists", self.collection_name)
        except Exception as exc:
            logger.exception("Failed to ensure collection: %s", exc)
            raise

    def store_relation(
        self,
        relation: CausalRelation,
        embedding: List[float],
        job_id: str
    ) -> str:
        """Store a relationship embedding in Qdrant."""
        point_id = str(uuid4())

        payload = {
            "job_id": job_id,
            "cause": relation.cause,
            "effect": relation.effect,
            "confidence": relation.confidence,
            "source_file_id": relation.source_file_id or "",
            "source_snippet": relation.source_snippet or "",
            "explanation": relation.explanation or "",
        }

        point = PointStruct(
            id=point_id,
            vector=embedding,
            payload=payload
        )

        try:
            self.client.upsert(
                collection_name=self.collection_name,
                points=[point]
            )
            logger.debug("Stored relation embedding: %s -> %s", relation.cause, relation.effect)
            return point_id
        except Exception as exc:
            logger.warning("Failed to store relation: %s", exc)
            return ""

    def store_concept(
        self,
        concept_name: str,
        embedding: List[float],
        job_id: str,
        description: str | None = None
    ) -> str:
        """Store a concept/node embedding in Qdrant."""
        point_id = str(uuid4())

        payload = {
            "job_id": job_id,
            "concept_name": concept_name,
            "description": description or "",
            "type": "concept"
        }

        point = PointStruct(
            id=point_id,
            vector=embedding,
            payload=payload
        )

        try:
            self.client.upsert(
                collection_name=self.collection_name,
                points=[point]
            )
            logger.debug("Stored concept embedding: %s", concept_name)
            return point_id
        except Exception as exc:
            logger.warning("Failed to store concept: %s", exc)
            return ""

    def search(
        self,
        query_embedding: List[float],
        job_id: str | None = None,
        top_k: int = 10,
        score_threshold: float = 0.5
    ) -> List[Dict]:
        """Search for similar vectors in Qdrant."""
        try:
            # Build filter if job_id is provided
            query_filter = None
            if job_id:
                query_filter = Filter(
                    must=[
                        FieldCondition(
                            key="job_id",
                            match=MatchValue(value=job_id)
                        )
                    ]
                )

            # Use the collections API for search
            # Check if client has search method (newer versions) or use query_points (older)
            if hasattr(self.client, 'search'):
                results = self.client.search(
                    collection_name=self.collection_name,
                    query_vector=query_embedding,
                    query_filter=query_filter,
                    limit=top_k,
                    score_threshold=score_threshold
                )
            elif hasattr(self.client, 'query_points'):
                # Fallback for older API
                results = self.client.query_points(
                    collection_name=self.collection_name,
                    query=query_embedding,
                    query_filter=query_filter,
                    top=top_k,
                    score_threshold=score_threshold
                )
            else:
                # Try using the collection directly
                collection = self.client.get_collection(self.collection_name)
                if hasattr(collection, 'search'):
                    results = collection.search(
                        query_vector=query_embedding,
                        query_filter=query_filter,
                        limit=top_k,
                        score_threshold=score_threshold
                    )
                else:
                    logger.error("QdrantClient does not have search or query_points method")
                    return []

            # Convert to list of dicts
            search_results = []
            for result in results:
                search_results.append({
                    "id": str(result.id),
                    "score": result.score,
                    "payload": result.payload
                })

            return search_results

        except Exception as exc:
            logger.warning("Vector search failed: %s", exc)
            import traceback
            logger.debug("Search error traceback: %s", traceback.format_exc())
            return []

    def search_by_text(
        self,
        query_text: str,
        embedder,
        job_id: str | None = None,
        top_k: int = 10
    ) -> List[Dict]:
        """Search using text query (embeds it first)."""
        query_embedding = embedder.embed_text(query_text)
        return self.search(query_embedding, job_id=job_id, top_k=top_k)

    def delete_job_vectors(self, job_id: str) -> int:
        """Delete all vectors for a specific job."""
        try:
            # Qdrant doesn't have a direct delete by filter, so we need to:
            # 1. Search for all points with job_id
            # 2. Delete them by ID

            # This is a simplified version - in production, you might want
            # to use scroll API for large datasets
            query_filter = Filter(
                must=[
                    FieldCondition(
                        key="job_id",
                        match=MatchValue(value=job_id)
                    )
                ]
            )

            # Scroll to get all points
            points, _ = self.client.scroll(
                collection_name=self.collection_name,
                scroll_filter=query_filter,
                limit=10000  # Adjust based on expected size
            )

            if points:
                point_ids = [str(point.id) for point in points]
                self.client.delete(
                    collection_name=self.collection_name,
                    points_selector=point_ids
                )
                logger.info("Deleted %d vectors for job %s", len(point_ids), job_id)
                return len(point_ids)

            return 0

        except Exception as exc:
            logger.warning("Failed to delete job vectors: %s", exc)
            return 0

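A quick smoke test of the class above, roughly as it might be driven from a shell or test; the import path, URL, collection name, tiny vector size and job id are illustrative assumptions, not values fixed by this commit:

# Hypothetical VectorStore smoke test (module path assumed).
from services.multi_doc.processors.vector_store import VectorStore

store = VectorStore(url="http://localhost:6333", collection_name="kg_demo", vector_size=4)
vec = [0.1, 0.2, 0.3, 0.4]  # stand-in embedding matching the 4-dim demo collection
store.store_concept("Customer Churn", vec, job_id="job-123", description="demo concept")
hits = store.search(vec, job_id="job-123", top_k=5, score_threshold=0.0)
for hit in hits:
    print(hit["score"], hit["payload"].get("concept_name"))
store.delete_job_vectors("job-123")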
@ -4,14 +4,19 @@ import logging
from pathlib import Path
from typing import Iterable, List

from ..claude_client import ClaudeCausalExtractor
from ..config import get_settings
from ..extractors.auto import extract_text
from ..extractors.image_extractor import extract_images_from_file
from ..extractors.pymupdf_extractor import extract_all_text, extract_text_with_context
from ..extractors.qwen_vision import QwenVisionClient
from ..jobs import JobStore
from ..models import CausalRelation, JobStage
from ..processors.chunker import TextChunker
from ..processors.dowhy_analyzer import DoWhyAnalyzer
from ..processors.embedder import Embedder
from ..processors.entity_resolver import EntityResolver
from ..processors.graph_writer import GraphWriter
from ..processors.relationship_extractor import RelationshipExtractor
from ..processors.report_generator import ReportGenerator
from ..processors.vector_store import VectorStore
from ..storage import StorageManager

logger = logging.getLogger(__name__)
@ -23,31 +28,60 @@ class JobPipeline:
        job_store: JobStore,
        storage: StorageManager,
        graph_writer: GraphWriter,
        claude_extractor: ClaudeCausalExtractor,
    ):
        self.job_store = job_store
        self.storage = storage
        self.graph_writer = graph_writer
        self.claude_extractor = claude_extractor

        settings = get_settings()
        self.chunker = TextChunker(
            model_name=settings.claude_model,
            token_target=settings.chunk_token_target,
            overlap=settings.chunk_token_overlap,
        )

        # Initialize extractors
        self.qwen_client = QwenVisionClient()  # Only for images/diagrams
        self.relationship_extractor = RelationshipExtractor()  # NLP (SpaCy) + Claude AI for text (as per README)
        self.entity_resolver = EntityResolver()  # Claude AI entity resolution (as per README Stage 4)

        # Initialize processors
        try:
            self.dowhy_analyzer = DoWhyAnalyzer() if settings.dowhy_enabled else None
        except Exception as e:
            logger.warning("DoWhy not available: %s", e)
            self.dowhy_analyzer = None

        try:
            self.embedder = Embedder()
            self.vector_store = VectorStore()
        except Exception as e:
            logger.warning("Vector store not available: %s", e)
            self.embedder = None
            self.vector_store = None

        try:
            self.report_generator = ReportGenerator()
        except Exception as e:
            logger.warning("Report generator not available: %s", e)
            self.report_generator = None

    def process_job(self, job_id: str, saved_files: Iterable[str]) -> None:
        job = self.job_store.get(job_id)
        logger.info("Processing job %s with %d files", job_id, job.total_files)

        relations: List[CausalRelation] = []
        all_text_content: List[str] = []
        all_relations: List[CausalRelation] = []

        try:
            self.job_store.update(job_id, stage=JobStage.EXTRACTING, status_message="Extracting content")
            # ============================================================
            # STEP 1: CONTENT EXTRACTION (PyMuPDF + Qwen2.5-VL)
            # ============================================================
            self.job_store.update(
                job_id,
                stage=JobStage.EXTRACTING,
                status_message="Extracting content from documents"
            )

            for count, file_path in enumerate(saved_files, start=1):
                file_path_obj = Path(file_path)
                file_record = next((f for f in job.files if f.stored_path == file_path), None)
                logger.info("Processing %s", file_path_obj.name)
                logger.info("Processing %s (%d/%d)", file_path_obj.name, count, job.total_files)
                source_file_id = file_record.id if file_record else file_path_obj.name
                suffix = file_path_obj.suffix.lower()

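process_job is synchronous, so the API layer presumably hands it off to a worker after saving the uploads. A minimal sketch of that hand-off; only the process_job signature comes from this diff, the FastAPI wiring is an assumption:

# Hypothetical hand-off from an upload endpoint to the pipeline (wiring assumed).
from fastapi import BackgroundTasks

def start_job(background_tasks: BackgroundTasks, pipeline, job_id: str, saved_files: list[str]) -> None:
    # Run the blocking pipeline outside the request/response cycle.
    background_tasks.add_task(pipeline.process_job, job_id, saved_files)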
@ -55,27 +89,36 @@ class JobPipeline:
                is_direct_image = suffix in {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}

                try:
                    # Extract text from document (if not a direct image)
                    # Step 2.1: IDENTIFY FILE TYPE and route to appropriate extractor
                    # Step 2.2: Extract text based on file type (as per README)
                    text = ""
                    if not is_direct_image:
                        try:
                            text = extract_text(file_path_obj)
                            # extract_all_text() handles routing:
                            # - PDF → PyMuPDF (Step 2.2a)
                            # - DOCX → python-docx (Step 2.2b)
                            # - PPTX → python-pptx (Step 2.2c)
                            # - CSV/XLSX → pandas (Step 2.2d)
                            # - Text files → direct read
                            # Also performs Step 2.3: Text cleaning
                            text = extract_all_text(file_path_obj)

                            # Process text if available
                            if text and text.strip():
                                # Validate text is readable
                                # Validate text is readable (basic check)
                                printable_chars = sum(1 for c in text if c.isprintable() or c.isspace())
                                total_chars = len(text)
                                if total_chars > 100 and printable_chars / total_chars < 0.3:
                                    logger.warning("Text from %s appears to be binary, skipping text processing", file_path_obj.name)
                                    logger.warning("Text from %s appears to be binary, skipping", file_path_obj.name)
                                    text = ""
                                else:
                                    # Step 2.4: STORE EXTRACTED TEXT
                                    all_text_content.append(text)
                                    extracted_path = self.storage.stage_extracted_content(job_id, file_path_obj.name, text)
                                    if file_record:
                                        file_record.extracted_path = str(extracted_path)
                                    logger.info("Successfully extracted %d characters from %s", len(text), file_path_obj.name)
                                    logger.info("Extracted %d characters from %s", len(text), file_path_obj.name)
                        except Exception as text_exc:
                            logger.warning("Text extraction failed for %s: %s. Will continue with image extraction if available.", file_path_obj.name, text_exc)
                            logger.warning("Text extraction failed for %s: %s", file_path_obj.name, text_exc)
                            text = ""

                    # Extract images from documents (PDF, DOCX, PPTX)
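The comments in the hunk above enumerate how extract_all_text routes files by suffix. The helper below is only a sketch of that idea (the real implementation lives in extractors/pymupdf_extractor.py and is not shown in this diff), using calls from PyMuPDF, python-docx, python-pptx and pandas:

from pathlib import Path
import fitz                    # PyMuPDF
import docx                    # python-docx
import pandas as pd
from pptx import Presentation  # python-pptx

def route_and_extract(path: Path) -> str:
    """Illustrative suffix-based router; extract_all_text may differ in detail."""
    suffix = path.suffix.lower()
    if suffix == ".pdf":
        with fitz.open(path) as doc:
            return "\n".join(page.get_text() for page in doc)
    if suffix == ".docx":
        return "\n".join(p.text for p in docx.Document(str(path)).paragraphs)
    if suffix == ".pptx":
        slides = Presentation(str(path)).slides
        return "\n".join(shape.text for slide in slides for shape in slide.shapes if shape.has_text_frame)
    if suffix in {".csv", ".xlsx"}:
        frame = pd.read_csv(path) if suffix == ".csv" else pd.read_excel(path)
        return frame.to_csv(index=False)
    return path.read_text(encoding="utf-8", errors="ignore")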
@ -93,7 +136,25 @@ class JobPipeline:
                        extracted_images = [file_path_obj]
                        logger.info("Direct image upload detected: %s", file_path_obj.name)

                except Exception as exc:  # noqa: BLE001
                    # Process images with Qwen2.5-VL
                    if extracted_images:
                        for image_path in extracted_images:
                            try:
                                qwen_results = self.qwen_client.extract_relationships_from_image(
                                    image_path, source_file_id
                                )
                                if qwen_results:
                                    # Convert Qwen results to CausalRelation objects
                                    qwen_relations = self.relationship_extractor.extract_from_qwen_results(
                                        qwen_results, source_file_id
                                    )
                                    all_relations.extend(qwen_relations)
                                    logger.info("Extracted %d relations from image %s using Qwen2.5-VL",
                                                len(qwen_relations), image_path.name)
                            except Exception as img_exc:
                                logger.warning("Failed to analyze image %s with Qwen: %s", image_path, img_exc)

                except Exception as exc:
                    logger.exception("Extraction failed for %s", file_path_obj)
                    if file_record:
                        file_record.error = str(exc)
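extract_from_qwen_results is not shown in this diff; as a rough sketch of the conversion it presumably performs, assuming the vision client returns a list of dicts with cause/effect/confidence keys (the CausalRelation constructor call is likewise an assumption, its field names mirror the payload used by VectorStore.store_relation above):

# Illustrative conversion of vision-model output into CausalRelation objects.
def qwen_results_to_relations(qwen_results: list[dict], source_file_id: str) -> list[CausalRelation]:
    relations = []
    for item in qwen_results:
        if item.get("cause") and item.get("effect"):
            relations.append(CausalRelation(
                cause=item["cause"],
                effect=item["effect"],
                confidence=float(item.get("confidence", 0.5)),
                source_file_id=source_file_id,
                explanation=item.get("explanation", ""),
            ))
    return relations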
@ -103,62 +164,188 @@ class JobPipeline:
                self.job_store.update(
                    job_id,
                    files=job.files,
                    processed_files=count,
                    status_message=f"Analyzing causal relations ({count}/{job.total_files})",
                    stage=JobStage.ANALYZING,
                    status_message=f"Extracting content ({count}/{job.total_files})",
                )

                # Process text content
                if text and text.strip():
                    chunks = self.chunker.chunk(text)
                    text_relations = self.claude_extractor.analyze(chunks, source_file_id=source_file_id)
                    relations.extend(text_relations)
                    logger.info("Extracted %d relations from text in %s", len(text_relations), file_path_obj.name)
            # ============================================================
            # STEP 2: RELATIONSHIP EXTRACTION (NLP + Claude AI as per README)
            # ============================================================
            logger.info("Extracting relationships from text content using NLP (SpaCy) + Claude AI")
            combined_text = "\n\n".join(all_text_content)

                # Process images (extracted from documents or direct uploads)
                if extracted_images:
                    for image_path in extracted_images:
                        try:
                            image_relations = self.claude_extractor.analyze_image(image_path, source_file_id=source_file_id)
                            relations.extend(image_relations)
                            logger.info("Extracted %d relations from image %s", len(image_relations), image_path.name)
                        except Exception as img_exc:
                            logger.warning("Failed to analyze image %s: %s", image_path, img_exc)
                            # Continue with other images
                elif not text or not text.strip():
                    # No text and no images - file might be empty or unsupported
                    logger.warning("File %s has no extractable text or images", file_path_obj.name)
                    if file_record:
                        file_record.error = "No extractable content found (no text or images)"
            if combined_text.strip():
                # Extract relationships using NLP (Step 3.1) + Claude AI (Step 3.2)
                # This implements the flow described in README.md
                text_relations = self.relationship_extractor.extract_from_text(
                    combined_text,
                    source_file_id="combined_text"
                )
                all_relations.extend(text_relations)
                logger.info("NLP + Claude AI extracted %d relationships from text", len(text_relations))

            # Write relations to Neo4j if any were found
            if relations:
                self.job_store.update(job_id, status_message="Writing to knowledge graph", stage=JobStage.BUILDING_GRAPH)
            # ============================================================
            # STEP 3: ENTITY RESOLUTION (Claude AI as per README Stage 4)
            # ============================================================
            if all_relations and self.entity_resolver.client:
                logger.info("Resolving entities using Claude AI")
                resolved_entities = self.entity_resolver.resolve_entities(all_relations)
                if resolved_entities:
                    # Apply resolution to relationships
                    all_relations = self.entity_resolver.apply_resolution_to_relations(
                        all_relations, resolved_entities
                    )
                    logger.info("Entity resolution completed: %d canonical entities", len(resolved_entities))
                else:
                    logger.info("Entity resolution returned no results")
            else:
                if not self.entity_resolver.client:
                    logger.info("Entity resolution skipped (Claude AI not available)")

            # ============================================================
            # STEP 4: DOWHY VALIDATION
            # ============================================================
            if self.dowhy_analyzer and all_relations:
                self.job_store.update(
                    job_id,
                    status_message="Validating relationships with DoWhy",
                    stage=JobStage.BUILDING_GRAPH
                )
                logger.info("Validating %d relationships with DoWhy", len(all_relations))
                validated_relations = self.dowhy_analyzer.validate_relationships(
                    all_relations,
                    text_data=combined_text
                )
                all_relations = validated_relations
                logger.info("DoWhy validated %d relationships", len(all_relations))
            else:
                if not self.dowhy_analyzer:
                    logger.info("DoWhy validation skipped (not available)")
            self.job_store.update(
                job_id,
                status_message="Building knowledge graph",
                stage=JobStage.BUILDING_GRAPH
            )

            # ============================================================
            # STEP 5: WRITE TO NEO4J (Documents, Entities, Relationships)
            # ============================================================
            if all_relations:
                try:
                    self.graph_writer.write_relations(job_id, relations)
                    logger.info("Wrote %d relations to Neo4j for job %s", len(relations), job_id)
                    status_message = f"Completed with {len(relations)} causal relationship(s) written to Neo4j"
                    # Write documents, entities, and relationships with types
                    self.graph_writer.write_relations(job_id, all_relations, files=job.files)
                    logger.info("Wrote %d relations to Neo4j for job %s", len(all_relations), job_id)
                except Exception as graph_exc:
                    logger.exception("Failed to write relations to Neo4j for job %s: %s", job_id, graph_exc)
                    status_message = f"Completed with {len(relations)} relations extracted, but failed to write to Neo4j: {graph_exc}"
            else:
                logger.warning("Job %s completed with 0 relations - no causal relationships found", job_id)
                # Check if any files failed to extract
                failed_files = [f for f in job.files if f.error]
                if failed_files:
                    status_message = f"Completed but {len(failed_files)} file(s) failed to extract. No relations found."
                else:
                    status_message = "Completed but no causal relationships were found in the documents."
                    logger.exception("Failed to write relations to Neo4j: %s", graph_exc)
                    raise

            # ============================================================
            # STEP 6: VECTOR DATABASE INDEXING (Qdrant)
            # ============================================================
            if self.vector_store and self.embedder and all_relations:
                self.job_store.update(
                    job_id,
                    status_message="Indexing knowledge graph in vector database",
                    stage=JobStage.INDEXING_VECTORS
                )
                logger.info("Indexing %d relationships in Qdrant", len(all_relations))

                indexed_count = 0
                for relation in all_relations:
                    try:
                        # Generate embedding for the relationship
                        embedding = self.embedder.embed_relation(
                            relation.cause,
                            relation.effect,
                            relation.explanation
                        )

                        # Store in Qdrant
                        self.vector_store.store_relation(relation, embedding, job_id)
                        indexed_count += 1
                    except Exception as e:
                        logger.warning("Failed to index relation %s -> %s: %s",
                                       relation.cause, relation.effect, e)

                # Also index concepts (nodes)
                concepts = set()
                for rel in all_relations:
                    concepts.add(rel.cause)
                    concepts.add(rel.effect)

                for concept in concepts:
                    try:
                        embedding = self.embedder.embed_concept(concept)
                        self.vector_store.store_concept(concept, embedding, job_id)
                    except Exception as e:
                        logger.warning("Failed to index concept %s: %s", concept, e)

                logger.info("Indexed %d relationships and %d concepts in Qdrant",
                            indexed_count, len(concepts))

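embed_relation and embed_concept come from processors/embedder.py, which this diff does not show. A plausible minimal sketch, assuming a sentence-transformers backend whose output dimension matches qdrant_vector_size (model choice and phrasing are assumptions):

# Hypothetical Embedder sketch; the real class may use a different model or API.
from sentence_transformers import SentenceTransformer

class Embedder:
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):  # 384-dim output (assumed)
        self.model = SentenceTransformer(model_name)

    def embed_text(self, text: str) -> list[float]:
        return self.model.encode(text).tolist()

    def embed_relation(self, cause: str, effect: str, explanation: str | None = None) -> list[float]:
        sentence = f"{cause} causes {effect}. {explanation or ''}".strip()
        return self.embed_text(sentence)

    def embed_concept(self, concept: str) -> list[float]:
        return self.embed_text(concept)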
            # ============================================================
            # STEP 7: GENERATE ONBOARDING REPORT
            # ============================================================
            if self.report_generator and self.vector_store and self.embedder:
                self.job_store.update(
                    job_id,
                    status_message="Generating beginner-friendly onboarding report",
                    stage=JobStage.GENERATING_REPORT
                )
                logger.info("Generating onboarding report for job %s", job_id)

                try:
                    kg_summary = {
                        "total_relations": len(all_relations),
                        "total_files": job.total_files,
                        "processed_files": job.processed_files
                    }

                    report = self.report_generator.generate_onboarding_report(
                        job_id=job_id,
                        relations=all_relations,
                        vector_store=self.vector_store,
                        embedder=self.embedder,
                        graph_writer=self.graph_writer,  # Pass graph_writer for Neo4j queries
                        kg_summary=kg_summary
                    )

                    logger.info("Generated onboarding report: %d sections, %d pages",
                                len(report.sections), report.total_pages)

                except Exception as report_exc:
                    logger.exception("Failed to generate report: %s", report_exc)
                    report = None
                    # Store report generation error in job metadata
                    report_error_msg = str(report_exc)
                    if "credit balance" in report_error_msg.lower() or "too low" in report_error_msg.lower():
                        report_error_msg = "Report generation failed: Claude API credit balance is too low. Please add credits to your Anthropic account."
                    self.job_store.update(
                        job_id,
                        error=f"Report generation failed: {report_error_msg}"
                    )
            else:
                logger.warning("Report generation skipped (components not available)")
                report = None

            # ============================================================
            # FINAL UPDATE
            # ============================================================
            status_message = f"Completed successfully"
            if all_relations:
                status_message += f" with {len(all_relations)} relationships"
            if report:
                status_message += f" and generated onboarding report"

            # Final update
            self.job_store.update(
                job_id,
                stage=JobStage.COMPLETED,
                status_message=status_message,
                relations=relations,
                relations=all_relations,
                report=report,
                processed_files=job.total_files,
            )
            logger.info("Job %s completed with %d relations", job_id, len(relations))
        except Exception as exc:  # noqa: BLE001
            logger.info("Job %s completed successfully", job_id)

        except Exception as exc:
            logger.exception("Job %s failed: %s", job_id, exc)
            self.job_store.mark_error(job_id, f"Pipeline failed: {exc}")

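The pipeline drives a job through EXTRACTING → ANALYZING → BUILDING_GRAPH → INDEXING_VECTORS → GENERATING_REPORT → COMPLETED, so the new INDEXING_VECTORS and GENERATING_REPORT stages imply an extended JobStage enum in models.py. The shape below is an assumption based only on the names used above:

# Hypothetical JobStage definition; member names are inferred from pipeline usage.
from enum import Enum

class JobStage(str, Enum):
    EXTRACTING = "extracting"
    ANALYZING = "analyzing"
    BUILDING_GRAPH = "building_graph"
    INDEXING_VECTORS = "indexing_vectors"
    GENERATING_REPORT = "generating_report"
    COMPLETED = "completed"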