added qdrant db in multi doc service
This commit is contained in:
parent
603e9b4b20
commit
72fea0dee8
@ -196,27 +196,45 @@ services:
|
|||||||
# retries: 5
|
# retries: 5
|
||||||
# start_period: 60s
|
# start_period: 60s
|
||||||
|
|
||||||
chromadb:
|
# chromadb:
|
||||||
image: chromadb/chroma:latest
|
# image: chromadb/chroma:latest
|
||||||
container_name: pipeline_chromadb
|
# container_name: pipeline_chromadb
|
||||||
|
# ports:
|
||||||
|
# - "8010:8000"
|
||||||
|
# environment:
|
||||||
|
# - CHROMA_SERVER_HOST=0.0.0.0
|
||||||
|
# - CHROMA_SERVER_HTTP_PORT=8000
|
||||||
|
# - IS_PERSISTENT=TRUE
|
||||||
|
# - PERSIST_DIRECTORY=/chroma/chroma
|
||||||
|
# - ANONYMIZED_TELEMETRY=TRUE
|
||||||
|
# volumes:
|
||||||
|
# - chromadb_data:/chroma/chroma
|
||||||
|
# networks:
|
||||||
|
# - pipeline_network
|
||||||
|
# healthcheck:
|
||||||
|
# test: ["CMD-SHELL", "timeout 5 bash -c '</dev/tcp/127.0.0.1/8000' || exit 1"]
|
||||||
|
# interval: 15s
|
||||||
|
# timeout: 10s
|
||||||
|
# retries: 3
|
||||||
|
# start_period: 30s
|
||||||
|
|
||||||
|
qdrant:
|
||||||
|
image: qdrant/qdrant:latest
|
||||||
|
container_name: pipeline_qdrant
|
||||||
ports:
|
ports:
|
||||||
- "8010:8000"
|
- "6333:6333"
|
||||||
environment:
|
- "6334:6334"
|
||||||
- CHROMA_SERVER_HOST=0.0.0.0
|
|
||||||
- CHROMA_SERVER_HTTP_PORT=8000
|
|
||||||
- IS_PERSISTENT=TRUE
|
|
||||||
- PERSIST_DIRECTORY=/chroma/chroma
|
|
||||||
- ANONYMIZED_TELEMETRY=TRUE
|
|
||||||
volumes:
|
volumes:
|
||||||
- chromadb_data:/chroma/chroma
|
- qdrant_data:/qdrant/storage
|
||||||
networks:
|
networks:
|
||||||
- pipeline_network
|
- pipeline_network
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: ["CMD-SHELL", "timeout 5 bash -c '</dev/tcp/127.0.0.1/8000' || exit 1"]
|
test: ["CMD-SHELL", "timeout 2 bash -c '</dev/tcp/127.0.0.1/6333' || exit 1"]
|
||||||
interval: 15s
|
interval: 30s
|
||||||
timeout: 10s
|
timeout: 10s
|
||||||
retries: 3
|
retries: 5
|
||||||
start_period: 30s
|
start_period: 30s
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -294,97 +312,97 @@ services:
|
|||||||
start_period: 40s
|
start_period: 40s
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
|
||||||
requirement-processor:
|
# requirement-processor:
|
||||||
build: ./services/requirement-processor
|
# build: ./services/requirement-processor
|
||||||
container_name: pipeline_requirement_processor
|
# container_name: pipeline_requirement_processor
|
||||||
ports:
|
# ports:
|
||||||
- "8001:8001"
|
# - "8001:8001"
|
||||||
environment:
|
# environment:
|
||||||
- POSTGRES_HOST=postgres
|
# - POSTGRES_HOST=postgres
|
||||||
- POSTGRES_PORT=5432
|
# - POSTGRES_PORT=5432
|
||||||
- POSTGRES_DB=dev_pipeline
|
# - POSTGRES_DB=dev_pipeline
|
||||||
- POSTGRES_USER=pipeline_admin
|
# - POSTGRES_USER=pipeline_admin
|
||||||
- POSTGRES_PASSWORD=secure_pipeline_2024
|
# - POSTGRES_PASSWORD=secure_pipeline_2024
|
||||||
- DATABASE_URL=postgresql://pipeline_admin:secure_pipeline_2024@postgres:5432/dev_pipeline
|
# - DATABASE_URL=postgresql://pipeline_admin:secure_pipeline_2024@postgres:5432/dev_pipeline
|
||||||
- REDIS_HOST=redis
|
# - REDIS_HOST=redis
|
||||||
- REDIS_PORT=6379
|
# - REDIS_PORT=6379
|
||||||
- REDIS_PASSWORD=redis_secure_2024
|
# - REDIS_PASSWORD=redis_secure_2024
|
||||||
- MONGODB_HOST=mongodb
|
# - MONGODB_HOST=mongodb
|
||||||
- MONGODB_PORT=27017
|
# - MONGODB_PORT=27017
|
||||||
- NEO4J_URI=bolt://neo4j:7687
|
# - NEO4J_URI=bolt://neo4j:7687
|
||||||
- NEO4J_USER=neo4j
|
# - NEO4J_USER=neo4j
|
||||||
- NEO4J_PASSWORD=password
|
# - NEO4J_PASSWORD=password
|
||||||
- CHROMA_HOST=chromadb
|
# - CHROMA_HOST=chromadb
|
||||||
- CHROMA_PORT=8000
|
# - CHROMA_PORT=8000
|
||||||
- REDIS_URL=redis://:redis_secure_2024@redis:6379
|
# - REDIS_URL=redis://:redis_secure_2024@redis:6379
|
||||||
networks:
|
# networks:
|
||||||
- pipeline_network
|
# - pipeline_network
|
||||||
depends_on:
|
# depends_on:
|
||||||
postgres:
|
# postgres:
|
||||||
condition: service_healthy
|
# condition: service_healthy
|
||||||
redis:
|
# redis:
|
||||||
condition: service_healthy
|
# condition: service_healthy
|
||||||
mongodb:
|
# mongodb:
|
||||||
condition: service_started
|
# condition: service_started
|
||||||
migrations:
|
# migrations:
|
||||||
condition: service_completed_successfully
|
# condition: service_completed_successfully
|
||||||
|
|
||||||
tech-stack-selector:
|
# tech-stack-selector:
|
||||||
build: ./services/tech-stack-selector
|
# build: ./services/tech-stack-selector
|
||||||
container_name: pipeline_tech_stack_selector
|
# container_name: pipeline_tech_stack_selector
|
||||||
ports:
|
# ports:
|
||||||
- "8002:8002"
|
# - "8002:8002"
|
||||||
environment:
|
# environment:
|
||||||
- POSTGRES_HOST=postgres
|
# - POSTGRES_HOST=postgres
|
||||||
- POSTGRES_PORT=5432
|
# - POSTGRES_PORT=5432
|
||||||
- POSTGRES_DB=dev_pipeline
|
# - POSTGRES_DB=dev_pipeline
|
||||||
- POSTGRES_USER=pipeline_admin
|
# - POSTGRES_USER=pipeline_admin
|
||||||
- POSTGRES_PASSWORD=secure_pipeline_2024
|
# - POSTGRES_PASSWORD=secure_pipeline_2024
|
||||||
- REDIS_HOST=redis
|
# - REDIS_HOST=redis
|
||||||
- REDIS_PORT=6379
|
# - REDIS_PORT=6379
|
||||||
- REDIS_PASSWORD=redis_secure_2024
|
# - REDIS_PASSWORD=redis_secure_2024
|
||||||
- CLAUDE_API_KEY=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
|
# - CLAUDE_API_KEY=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
|
||||||
networks:
|
# networks:
|
||||||
- pipeline_network
|
# - pipeline_network
|
||||||
depends_on:
|
# depends_on:
|
||||||
postgres:
|
# postgres:
|
||||||
condition: service_healthy
|
# condition: service_healthy
|
||||||
redis:
|
# redis:
|
||||||
condition: service_healthy
|
# condition: service_healthy
|
||||||
migrations:
|
# migrations:
|
||||||
condition: service_completed_successfully
|
# condition: service_completed_successfully
|
||||||
|
|
||||||
architecture-designer:
|
# architecture-designer:
|
||||||
build: ./services/architecture-designer
|
# build: ./services/architecture-designer
|
||||||
container_name: pipeline_architecture_designer
|
# container_name: pipeline_architecture_designer
|
||||||
ports:
|
# ports:
|
||||||
- "8003:8003"
|
# - "8003:8003"
|
||||||
environment:
|
# environment:
|
||||||
- PORT=8003
|
# - PORT=8003
|
||||||
- HOST=0.0.0.0
|
# - HOST=0.0.0.0
|
||||||
- CLAUDE_API_KEY=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
|
# - CLAUDE_API_KEY=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
|
||||||
- ANTHROPIC_API_KEY=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
|
# - ANTHROPIC_API_KEY=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
|
||||||
- POSTGRES_HOST=postgres
|
# - POSTGRES_HOST=postgres
|
||||||
- POSTGRES_PORT=5432
|
# - POSTGRES_PORT=5432
|
||||||
- POSTGRES_DB=dev_pipeline
|
# - POSTGRES_DB=dev_pipeline
|
||||||
- POSTGRES_USER=pipeline_admin
|
# - POSTGRES_USER=pipeline_admin
|
||||||
- POSTGRES_PASSWORD=secure_pipeline_2024
|
# - POSTGRES_PASSWORD=secure_pipeline_2024
|
||||||
- MONGODB_HOST=mongodb
|
# - MONGODB_HOST=mongodb
|
||||||
- MONGODB_PORT=27017
|
# - MONGODB_PORT=27017
|
||||||
networks:
|
# networks:
|
||||||
- pipeline_network
|
# - pipeline_network
|
||||||
depends_on:
|
# depends_on:
|
||||||
postgres:
|
# postgres:
|
||||||
condition: service_healthy
|
# condition: service_healthy
|
||||||
mongodb:
|
# mongodb:
|
||||||
condition: service_started
|
# condition: service_started
|
||||||
migrations:
|
# migrations:
|
||||||
condition: service_completed_successfully
|
# condition: service_completed_successfully
|
||||||
healthcheck:
|
# healthcheck:
|
||||||
test: ["CMD", "curl", "-f", "http://localhost:8003/health"]
|
# test: ["CMD", "curl", "-f", "http://localhost:8003/health"]
|
||||||
interval: 30s
|
# interval: 30s
|
||||||
timeout: 10s
|
# timeout: 10s
|
||||||
retries: 3
|
# retries: 3
|
||||||
|
|
||||||
# code-generator:
|
# code-generator:
|
||||||
# build: ./services/code-generator
|
# build: ./services/code-generator
|
||||||
@ -461,34 +479,34 @@ services:
|
|||||||
migrations:
|
migrations:
|
||||||
condition: service_completed_successfully
|
condition: service_completed_successfully
|
||||||
|
|
||||||
deployment-manager:
|
# deployment-manager:
|
||||||
build: ./services/deployment-manager
|
# build: ./services/deployment-manager
|
||||||
container_name: pipeline_deployment_manager
|
# container_name: pipeline_deployment_manager
|
||||||
ports:
|
# ports:
|
||||||
- "8006:8006"
|
# - "8006:8006"
|
||||||
environment:
|
# environment:
|
||||||
- POSTGRES_HOST=postgres
|
# - POSTGRES_HOST=postgres
|
||||||
- POSTGRES_PORT=5432
|
# - POSTGRES_PORT=5432
|
||||||
- POSTGRES_DB=dev_pipeline
|
# - POSTGRES_DB=dev_pipeline
|
||||||
- POSTGRES_USER=pipeline_admin
|
# - POSTGRES_USER=pipeline_admin
|
||||||
- POSTGRES_PASSWORD=secure_pipeline_2024
|
# - POSTGRES_PASSWORD=secure_pipeline_2024
|
||||||
- MONGODB_HOST=mongodb
|
# - MONGODB_HOST=mongodb
|
||||||
- MONGODB_PORT=27017
|
# - MONGODB_PORT=27017
|
||||||
- RABBITMQ_HOST=rabbitmq
|
# - RABBITMQ_HOST=rabbitmq
|
||||||
- RABBITMQ_PORT=5672
|
# - RABBITMQ_PORT=5672
|
||||||
- RABBITMQ_USER=pipeline_admin
|
# - RABBITMQ_USER=pipeline_admin
|
||||||
- RABBITMQ_PASSWORD=rabbit_secure_2024
|
# - RABBITMQ_PASSWORD=rabbit_secure_2024
|
||||||
networks:
|
# networks:
|
||||||
- pipeline_network
|
# - pipeline_network
|
||||||
depends_on:
|
# depends_on:
|
||||||
postgres:
|
# postgres:
|
||||||
condition: service_healthy
|
# condition: service_healthy
|
||||||
rabbitmq:
|
# rabbitmq:
|
||||||
condition: service_healthy
|
# condition: service_healthy
|
||||||
mongodb:
|
# mongodb:
|
||||||
condition: service_started
|
# condition: service_started
|
||||||
migrations:
|
# migrations:
|
||||||
condition: service_completed_successfully
|
# condition: service_completed_successfully
|
||||||
|
|
||||||
user-auth:
|
user-auth:
|
||||||
build: ./services/user-auth
|
build: ./services/user-auth
|
||||||
@ -583,38 +601,38 @@ services:
|
|||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
|
||||||
# AI Mockup / Wireframe Generation Service
|
# AI Mockup / Wireframe Generation Service
|
||||||
ai-mockup-service:
|
# ai-mockup-service:
|
||||||
build: ./services/ai-mockup-service
|
# build: ./services/ai-mockup-service
|
||||||
container_name: pipeline_ai_mockup_service
|
# container_name: pipeline_ai_mockup_service
|
||||||
ports:
|
# ports:
|
||||||
- "8021:8021"
|
# - "8021:8021"
|
||||||
environment:
|
# environment:
|
||||||
- PORT=8021
|
# - PORT=8021
|
||||||
- HOST=0.0.0.0
|
# - HOST=0.0.0.0
|
||||||
- CLAUDE_API_KEY=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
|
# - CLAUDE_API_KEY=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
|
||||||
- POSTGRES_HOST=postgres
|
# - POSTGRES_HOST=postgres
|
||||||
- POSTGRES_PORT=5432
|
# - POSTGRES_PORT=5432
|
||||||
- POSTGRES_DB=dev_pipeline
|
# - POSTGRES_DB=dev_pipeline
|
||||||
- POSTGRES_USER=pipeline_admin
|
# - POSTGRES_USER=pipeline_admin
|
||||||
- POSTGRES_PASSWORD=secure_pipeline_2024
|
# - POSTGRES_PASSWORD=secure_pipeline_2024
|
||||||
- REDIS_HOST=redis
|
# - REDIS_HOST=redis
|
||||||
- REDIS_PORT=6379
|
# - REDIS_PORT=6379
|
||||||
- REDIS_PASSWORD=redis_secure_2024
|
# - REDIS_PASSWORD=redis_secure_2024
|
||||||
- JWT_ACCESS_SECRET=access-secret-key-2024-tech4biz-secure_pipeline_2024
|
# - JWT_ACCESS_SECRET=access-secret-key-2024-tech4biz-secure_pipeline_2024
|
||||||
- USER_AUTH_SERVICE_URL=http://user-auth:8011
|
# - USER_AUTH_SERVICE_URL=http://user-auth:8011
|
||||||
- FLASK_ENV=development
|
# - FLASK_ENV=development
|
||||||
networks:
|
# networks:
|
||||||
- pipeline_network
|
# - pipeline_network
|
||||||
depends_on:
|
# depends_on:
|
||||||
postgres:
|
# postgres:
|
||||||
condition: service_healthy
|
# condition: service_healthy
|
||||||
user-auth:
|
# user-auth:
|
||||||
condition: service_healthy
|
# condition: service_healthy
|
||||||
healthcheck:
|
# healthcheck:
|
||||||
test: ["CMD", "curl", "-f", "http://localhost:8021/health"]
|
# test: ["CMD", "curl", "-f", "http://localhost:8021/health"]
|
||||||
interval: 30s
|
# interval: 30s
|
||||||
timeout: 10s
|
# timeout: 10s
|
||||||
retries: 3
|
# retries: 3
|
||||||
|
|
||||||
git-integration:
|
git-integration:
|
||||||
build: ./services/git-integration
|
build: ./services/git-integration
|
||||||
@ -731,7 +749,7 @@ services:
|
|||||||
environment:
|
environment:
|
||||||
- PORT=8022
|
- PORT=8022
|
||||||
- HOST=0.0.0.0
|
- HOST=0.0.0.0
|
||||||
- ANTHROPIC_API_KEY=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
|
- ANTHROPIC_API_KEY=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
|
||||||
|
|
||||||
# Neo4j Configuration
|
# Neo4j Configuration
|
||||||
- USE_NEO4J_KG=true
|
- USE_NEO4J_KG=true
|
||||||
@ -790,17 +808,37 @@ services:
|
|||||||
environment:
|
environment:
|
||||||
- PORT=8024
|
- PORT=8024
|
||||||
- HOST=0.0.0.0
|
- HOST=0.0.0.0
|
||||||
- ANTHROPIC_API_KEY=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
|
|
||||||
|
# Claude/Anthropic Configuration
|
||||||
|
- ANTHROPIC_API_KEY=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
|
||||||
|
- MULTI_DOC_CLAUDE_MODEL=claude-3-5-haiku-latest
|
||||||
- CLAUDE_MODEL=claude-3-5-haiku-latest
|
- CLAUDE_MODEL=claude-3-5-haiku-latest
|
||||||
|
|
||||||
|
# Qwen2.5-VL API Configuration
|
||||||
|
- QWEN_API_KEY=${QWEN_API_KEY:-}
|
||||||
|
- QWEN_API_URL=${QWEN_API_URL:-https://api.example.com/v1/chat/completions}
|
||||||
|
- QWEN_MODEL=qwen2.5-vl
|
||||||
|
|
||||||
# Neo4j Configuration
|
# Neo4j Configuration
|
||||||
- NEO4J_URI=bolt://neo4j:7687
|
- NEO4J_URI=bolt://neo4j:7687
|
||||||
- NEO4J_USER=neo4j
|
- NEO4J_USER=neo4j
|
||||||
- NEO4J_PASSWORD=password
|
- NEO4J_PASSWORD=password
|
||||||
- NEO4J_DATABASE=neo4j
|
- NEO4J_DATABASE=neo4j
|
||||||
|
|
||||||
|
# Qdrant Configuration
|
||||||
|
- QDRANT_URL=http://qdrant:6333
|
||||||
|
- QDRANT_COLLECTION_NAME=kg_embeddings
|
||||||
|
|
||||||
|
# DoWhy Configuration
|
||||||
|
- DOWHY_ENABLED=true
|
||||||
|
- DOWHY_CONFIDENCE_THRESHOLD=0.05
|
||||||
|
|
||||||
|
# Embedding Configuration
|
||||||
|
- EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
|
||||||
|
- EMBEDDING_DIMENSION=384
|
||||||
|
|
||||||
# Storage Configuration
|
# Storage Configuration
|
||||||
- STORAGE_DIR=/app/storage
|
- MULTI_DOC_STORAGE_ROOT=/app/storage
|
||||||
|
|
||||||
# Database configurations (optional, for job tracking)
|
# Database configurations (optional, for job tracking)
|
||||||
- POSTGRES_HOST=pipeline_postgres
|
- POSTGRES_HOST=pipeline_postgres
|
||||||
@ -817,6 +855,8 @@ services:
|
|||||||
depends_on:
|
depends_on:
|
||||||
neo4j:
|
neo4j:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
|
qdrant:
|
||||||
|
condition: service_healthy
|
||||||
postgres:
|
postgres:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
redis:
|
redis:
|
||||||
@ -958,6 +998,8 @@ volumes:
|
|||||||
driver: local
|
driver: local
|
||||||
multi_document_storage:
|
multi_document_storage:
|
||||||
driver: local
|
driver: local
|
||||||
|
qdrant_data:
|
||||||
|
driver: local
|
||||||
|
|
||||||
# =====================================
|
# =====================================
|
||||||
# Networks
|
# Networks
|
||||||
|
|||||||
@ -7094,8 +7094,29 @@ async def main():
|
|||||||
js_files = [fa for fa in frontend_files if fa.path.lower().endswith(('.js', '.jsx', '.mjs', '.cjs'))]
|
js_files = [fa for fa in frontend_files if fa.path.lower().endswith(('.js', '.jsx', '.mjs', '.cjs'))]
|
||||||
ts_files = [fa for fa in frontend_files if fa.path.lower().endswith(('.ts', '.tsx'))]
|
ts_files = [fa for fa in frontend_files if fa.path.lower().endswith(('.ts', '.tsx'))]
|
||||||
|
|
||||||
|
# Allocate frontend persona
|
||||||
|
from persona_system import allocate_code_persona, build_code_analysis_persona_prompt
|
||||||
|
|
||||||
|
# Determine if it's UI or state management focused
|
||||||
|
has_state_files = len(state_files) > 0
|
||||||
|
sample_file = frontend_files[0] if frontend_files else None
|
||||||
|
sample_path = sample_file.path if sample_file else ""
|
||||||
|
sample_content = getattr(sample_file, 'content', '')[:1000] if sample_file else ""
|
||||||
|
|
||||||
|
# Allocate persona - prefer state management if state files exist
|
||||||
|
if has_state_files:
|
||||||
|
# Try to get state management persona
|
||||||
|
persona = allocate_code_persona("store/state.ts", sample_content, "frontend_state")
|
||||||
|
if "state" not in persona.get("role", "").lower():
|
||||||
|
# Fallback to UI persona
|
||||||
|
persona = allocate_code_persona(sample_path, sample_content, "frontend_ui")
|
||||||
|
else:
|
||||||
|
persona = allocate_code_persona(sample_path, sample_content, "frontend_ui")
|
||||||
|
|
||||||
|
assignment_context = f"CTO has assigned you to analyze the frontend codebase for this project. You are analyzing {len(frontend_files)} frontend files including components, routing, state management, and configuration."
|
||||||
|
|
||||||
front_end_prompt = f"""
|
front_end_prompt = f"""
|
||||||
You are a Senior Frontend Architect and Technical Writer with 20+ years of experience. Analyze this frontend codebase and produce a comprehensive, technically precise report. The audience includes senior engineers and stakeholders who expect evidence-based, objective findings.
|
Analyze this frontend codebase and produce a comprehensive, technically precise report. The audience includes senior engineers and stakeholders who expect evidence-based, objective findings.
|
||||||
|
|
||||||
STRICT STYLE RULES:
|
STRICT STYLE RULES:
|
||||||
- Use professional, technical language only. Do not use analogies, metaphors, storytelling, or colloquial comparisons.
|
- Use professional, technical language only. Do not use analogies, metaphors, storytelling, or colloquial comparisons.
|
||||||
@ -7211,6 +7232,9 @@ FINAL REQUIREMENTS:
|
|||||||
- Ensure total length between 2000-3000 words.
|
- Ensure total length between 2000-3000 words.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# Enhance prompt with persona
|
||||||
|
enhanced_prompt = build_code_analysis_persona_prompt(front_end_prompt, persona, assignment_context)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
print(f"🤖 [FRONTEND AI] Calling Claude API for comprehensive frontend analysis...")
|
print(f"🤖 [FRONTEND AI] Calling Claude API for comprehensive frontend analysis...")
|
||||||
print(f"🤖 [FRONTEND AI] Analyzing {len(frontend_files)} frontend files...")
|
print(f"🤖 [FRONTEND AI] Analyzing {len(frontend_files)} frontend files...")
|
||||||
@ -7220,7 +7244,7 @@ FINAL REQUIREMENTS:
|
|||||||
model=os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest"),
|
model=os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest"),
|
||||||
max_tokens=8000, # Increased from 6000 to 8000 for more detailed analysis
|
max_tokens=8000, # Increased from 6000 to 8000 for more detailed analysis
|
||||||
temperature=0.1,
|
temperature=0.1,
|
||||||
messages=[{"role": "user", "content": front_end_prompt}]
|
messages=[{"role": "user", "content": enhanced_prompt}]
|
||||||
)
|
)
|
||||||
|
|
||||||
ai_analysis = message.content[0].text.strip()
|
ai_analysis = message.content[0].text.strip()
|
||||||
@ -7230,7 +7254,7 @@ FINAL REQUIREMENTS:
|
|||||||
if not ai_analysis or len(ai_analysis) < 100:
|
if not ai_analysis or len(ai_analysis) < 100:
|
||||||
print("⚠️ [FRONTEND AI] AI analysis too short, regenerating...")
|
print("⚠️ [FRONTEND AI] AI analysis too short, regenerating...")
|
||||||
# Retry with more emphasis on detail
|
# Retry with more emphasis on detail
|
||||||
retry_prompt = front_end_prompt + "\n\nIMPORTANT: Provide a VERY DETAILED analysis. The previous response was too short. Please provide at least 2000 words of detailed explanation."
|
retry_prompt = enhanced_prompt + "\n\nIMPORTANT: Provide a VERY DETAILED analysis. The previous response was too short. Please provide at least 2000 words of detailed explanation."
|
||||||
message = self.client.messages.create(
|
message = self.client.messages.create(
|
||||||
model=os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest"),
|
model=os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest"),
|
||||||
max_tokens=8000,
|
max_tokens=8000,
|
||||||
|
|||||||
@ -524,7 +524,11 @@ class ChunkAnalyzer:
|
|||||||
def _build_chunk_analysis_prompt(self, file_path: str, chunk: ChunkInfo,
|
def _build_chunk_analysis_prompt(self, file_path: str, chunk: ChunkInfo,
|
||||||
chunk_index: int, total_chunks: int,
|
chunk_index: int, total_chunks: int,
|
||||||
context_memories: Dict[str, Any]) -> str:
|
context_memories: Dict[str, Any]) -> str:
|
||||||
"""Build comprehensive analysis prompt for a chunk."""
|
"""Build comprehensive analysis prompt for a chunk with persona."""
|
||||||
|
from persona_system import allocate_code_persona, build_code_analysis_persona_prompt
|
||||||
|
|
||||||
|
# Allocate persona based on file path and chunk content
|
||||||
|
persona = allocate_code_persona(file_path, chunk.content, chunk.chunk_type)
|
||||||
|
|
||||||
# Build context information
|
# Build context information
|
||||||
context_info = ""
|
context_info = ""
|
||||||
@ -538,8 +542,10 @@ class ChunkAnalyzer:
|
|||||||
for practice in context_memories['best_practices'][:3]:
|
for practice in context_memories['best_practices'][:3]:
|
||||||
context_info += f"- {practice['content'][:100]}...\n"
|
context_info += f"- {practice['content'][:100]}...\n"
|
||||||
|
|
||||||
|
assignment_context = f"CTO has assigned you to analyze chunk {chunk_index + 1} of {total_chunks} from file: {file_path}. This is a {chunk.chunk_type} chunk covering lines {chunk.start_line}-{chunk.end_line}."
|
||||||
|
|
||||||
prompt = f"""
|
prompt = f"""
|
||||||
You are a senior software engineer analyzing chunk {chunk_index + 1} of {total_chunks} from file: {file_path}
|
Analyzing chunk {chunk_index + 1} of {total_chunks} from file: {file_path}
|
||||||
|
|
||||||
CHUNK INFORMATION:
|
CHUNK INFORMATION:
|
||||||
- Chunk Type: {chunk.chunk_type}
|
- Chunk Type: {chunk.chunk_type}
|
||||||
@ -564,7 +570,10 @@ Provide a focused analysis of this specific chunk, considering:
|
|||||||
|
|
||||||
Focus on actionable insights for this specific code section.
|
Focus on actionable insights for this specific code section.
|
||||||
"""
|
"""
|
||||||
return prompt
|
|
||||||
|
# Enhance with persona
|
||||||
|
enhanced_prompt = build_code_analysis_persona_prompt(prompt, persona, assignment_context)
|
||||||
|
return enhanced_prompt
|
||||||
|
|
||||||
def _detect_language_from_path(self, file_path: str) -> str:
|
def _detect_language_from_path(self, file_path: str) -> str:
|
||||||
"""Detect language from file path."""
|
"""Detect language from file path."""
|
||||||
|
|||||||
755
services/ai-analysis-service/persona_system.py
Normal file
755
services/ai-analysis-service/persona_system.py
Normal file
@ -0,0 +1,755 @@
|
|||||||
|
"""
|
||||||
|
World-Class Persona System for AI Analysis
|
||||||
|
Simulates real-world team allocation with domain-specific experts from top companies.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Dict, List, Optional, Tuple
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# CODE ANALYSIS PERSONAS (for AI Analysis Service)
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
CODE_ANALYSIS_PERSONAS = {
|
||||||
|
# BACKEND DOMAINS
|
||||||
|
"backend_api": {
|
||||||
|
"role": "Senior Backend API Architect",
|
||||||
|
"companies": ["Google", "Amazon", "Stripe"],
|
||||||
|
"expertise": ["REST APIs", "GraphQL", "gRPC", "API Gateway", "Microservices"],
|
||||||
|
"experience_years": "18+",
|
||||||
|
"achievements": [
|
||||||
|
"Designed APIs at Google Cloud Platform handling 10M+ requests/day",
|
||||||
|
"Built scalable API infrastructure at Amazon AWS serving millions of customers",
|
||||||
|
"Led API architecture at Stripe processing billions in transactions"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["api", "controller", "route", "endpoint", "service", "rest", "graphql"],
|
||||||
|
"focus_areas": [
|
||||||
|
"API design patterns and best practices",
|
||||||
|
"API versioning and backward compatibility",
|
||||||
|
"Rate limiting and throttling strategies",
|
||||||
|
"API documentation quality",
|
||||||
|
"Security vulnerabilities in API endpoints"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
"backend_database": {
|
||||||
|
"role": "Senior Database Architect",
|
||||||
|
"companies": ["Amazon", "Oracle", "MongoDB"],
|
||||||
|
"expertise": ["SQL", "NoSQL", "Database Design", "Query Optimization", "Data Modeling"],
|
||||||
|
"experience_years": "20+",
|
||||||
|
"achievements": [
|
||||||
|
"Designed database systems at Amazon handling petabytes of data",
|
||||||
|
"Optimized databases at Oracle for enterprise-scale applications",
|
||||||
|
"Built distributed databases at MongoDB for global scale"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["database", "db", "model", "schema", "migration", "repository", "orm", "query"],
|
||||||
|
"focus_areas": [
|
||||||
|
"Database schema design and normalization",
|
||||||
|
"Query performance and optimization",
|
||||||
|
"Data integrity and constraints",
|
||||||
|
"Indexing strategies",
|
||||||
|
"Transaction management"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
"backend_business": {
|
||||||
|
"role": "Senior Backend Business Logic Architect",
|
||||||
|
"companies": ["Microsoft", "Salesforce", "SAP"],
|
||||||
|
"expertise": ["Business Logic", "Domain Modeling", "Design Patterns", "Service Layer"],
|
||||||
|
"experience_years": "17+",
|
||||||
|
"achievements": [
|
||||||
|
"Architected business logic systems at Microsoft for enterprise applications",
|
||||||
|
"Designed domain models at Salesforce for CRM platforms",
|
||||||
|
"Built service layers at SAP for ERP systems"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["service", "business", "logic", "domain", "entity", "dto", "handler"],
|
||||||
|
"focus_areas": [
|
||||||
|
"Code organization and structure",
|
||||||
|
"Design patterns implementation",
|
||||||
|
"Business logic maintainability",
|
||||||
|
"Domain modeling quality",
|
||||||
|
"Service layer architecture"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
# FRONTEND DOMAINS
|
||||||
|
"frontend_ui": {
|
||||||
|
"role": "Senior Frontend UI Architect",
|
||||||
|
"companies": ["Apple", "Meta", "Netflix"],
|
||||||
|
"expertise": ["React", "Vue", "Angular", "Component Design", "UI/UX"],
|
||||||
|
"experience_years": "15+",
|
||||||
|
"achievements": [
|
||||||
|
"Built user interfaces at Apple used by millions daily",
|
||||||
|
"Led React architecture at Meta (Facebook) for large-scale applications",
|
||||||
|
"Designed performance-optimized UIs at Netflix for 200M+ users"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["component", "ui", "view", "page", "jsx", "tsx", "vue", "template"],
|
||||||
|
"focus_areas": [
|
||||||
|
"Component architecture and reusability",
|
||||||
|
"User experience and accessibility",
|
||||||
|
"UI performance optimization",
|
||||||
|
"Design system consistency",
|
||||||
|
"Responsive design implementation"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
"frontend_state": {
|
||||||
|
"role": "Senior Frontend State Management Architect",
|
||||||
|
"companies": ["Meta", "Netflix", "Airbnb"],
|
||||||
|
"expertise": ["Redux", "Zustand", "Context API", "State Management", "Data Flow"],
|
||||||
|
"experience_years": "14+",
|
||||||
|
"achievements": [
|
||||||
|
"Architected state management at Meta for complex applications",
|
||||||
|
"Designed data flow patterns at Netflix for real-time updates",
|
||||||
|
"Built state systems at Airbnb for booking platforms"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["store", "state", "redux", "context", "recoil", "zustand", "mobx"],
|
||||||
|
"focus_areas": [
|
||||||
|
"State architecture and patterns",
|
||||||
|
"Data flow optimization",
|
||||||
|
"State synchronization",
|
||||||
|
"Performance in state updates",
|
||||||
|
"State management best practices"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
# DEVOPS DOMAINS
|
||||||
|
"devops_ci_cd": {
|
||||||
|
"role": "Senior DevOps CI/CD Architect",
|
||||||
|
"companies": ["Google", "Netflix", "Uber"],
|
||||||
|
"expertise": ["CI/CD", "Jenkins", "GitHub Actions", "GitLab CI", "Deployment Automation"],
|
||||||
|
"experience_years": "12+",
|
||||||
|
"achievements": [
|
||||||
|
"Built CI/CD pipelines at Google handling 50K+ deployments/day",
|
||||||
|
"Designed deployment systems at Netflix for zero-downtime releases",
|
||||||
|
"Architected automation at Uber for global scale"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["ci", "cd", "pipeline", "jenkins", "github-actions", "gitlab", "deploy"],
|
||||||
|
"focus_areas": [
|
||||||
|
"CI/CD pipeline efficiency",
|
||||||
|
"Deployment strategy and automation",
|
||||||
|
"Quality gates and testing",
|
||||||
|
"Rollback strategies",
|
||||||
|
"Build optimization"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
"devops_infrastructure": {
|
||||||
|
"role": "Senior Infrastructure Architect",
|
||||||
|
"companies": ["Amazon", "Google", "Microsoft"],
|
||||||
|
"expertise": ["Kubernetes", "Docker", "Terraform", "Cloud Infrastructure", "Scalability"],
|
||||||
|
"experience_years": "16+",
|
||||||
|
"achievements": [
|
||||||
|
"Designed infrastructure at Amazon AWS for global scale",
|
||||||
|
"Built container orchestration at Google for millions of containers",
|
||||||
|
"Architected cloud systems at Microsoft Azure with 99.99% uptime"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["docker", "kubernetes", "terraform", "infrastructure", "cloud", "aws", "gcp", "azure"],
|
||||||
|
"focus_areas": [
|
||||||
|
"Infrastructure scalability",
|
||||||
|
"System reliability and uptime",
|
||||||
|
"Cost optimization",
|
||||||
|
"Security in infrastructure",
|
||||||
|
"Monitoring and observability"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
# SECURITY DOMAINS
|
||||||
|
"security_engineer": {
|
||||||
|
"role": "Senior Security Engineer",
|
||||||
|
"companies": ["Google", "Microsoft", "Cloudflare"],
|
||||||
|
"expertise": ["Security", "Vulnerability Assessment", "Penetration Testing", "Security Architecture"],
|
||||||
|
"experience_years": "15+",
|
||||||
|
"achievements": [
|
||||||
|
"Led security initiatives at Google protecting billions of users",
|
||||||
|
"Designed security systems at Microsoft for enterprise applications",
|
||||||
|
"Built security infrastructure at Cloudflare for DDoS protection"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["security", "auth", "encryption", "jwt", "oauth", "ssl", "tls", "cors"],
|
||||||
|
"focus_areas": [
|
||||||
|
"Security vulnerabilities and threats",
|
||||||
|
"Authentication and authorization",
|
||||||
|
"Data encryption and protection",
|
||||||
|
"Security best practices",
|
||||||
|
"Compliance and regulations"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
# DATA DOMAINS
|
||||||
|
"data_engineer": {
|
||||||
|
"role": "Senior Data Engineer",
|
||||||
|
"companies": ["Google", "Netflix", "Uber"],
|
||||||
|
"expertise": ["Data Pipelines", "ETL", "Big Data", "Data Warehousing", "Spark"],
|
||||||
|
"experience_years": "13+",
|
||||||
|
"achievements": [
|
||||||
|
"Built data pipelines at Google processing petabytes daily",
|
||||||
|
"Designed ETL systems at Netflix for real-time analytics",
|
||||||
|
"Architected data infrastructure at Uber for millions of rides"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["data", "pipeline", "etl", "warehouse", "spark", "hadoop", "kafka"],
|
||||||
|
"focus_areas": [
|
||||||
|
"Data architecture and pipelines",
|
||||||
|
"ETL performance and optimization",
|
||||||
|
"Data quality and validation",
|
||||||
|
"Scalability in data processing",
|
||||||
|
"Data governance"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
"ml_engineer": {
|
||||||
|
"role": "Senior ML/AI Engineer",
|
||||||
|
"companies": ["OpenAI", "Anthropic", "Google DeepMind"],
|
||||||
|
"expertise": ["Machine Learning", "Deep Learning", "AI Systems", "Model Training"],
|
||||||
|
"experience_years": "12+",
|
||||||
|
"achievements": [
|
||||||
|
"Developed ML models at OpenAI for language understanding",
|
||||||
|
"Built AI systems at Anthropic for safety-critical applications",
|
||||||
|
"Designed training pipelines at Google DeepMind for large-scale models"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["ml", "ai", "model", "training", "neural", "tensorflow", "pytorch", "learning"],
|
||||||
|
"focus_areas": [
|
||||||
|
"ML model architecture",
|
||||||
|
"Training pipeline optimization",
|
||||||
|
"Model performance and accuracy",
|
||||||
|
"Scalability in ML systems",
|
||||||
|
"AI safety and ethics"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
# TESTING DOMAINS
|
||||||
|
"qa_automation": {
|
||||||
|
"role": "Senior QA Automation Architect",
|
||||||
|
"companies": ["Google", "Microsoft", "Amazon"],
|
||||||
|
"expertise": ["Test Automation", "Selenium", "Cypress", "Jest", "Testing Strategy"],
|
||||||
|
"experience_years": "14+",
|
||||||
|
"achievements": [
|
||||||
|
"Built test automation at Google for thousands of test cases",
|
||||||
|
"Designed testing frameworks at Microsoft for enterprise software",
|
||||||
|
"Architected QA systems at Amazon for e-commerce platforms"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["test", "spec", "jest", "cypress", "selenium", "pytest", "testing"],
|
||||||
|
"focus_areas": [
|
||||||
|
"Test coverage and quality",
|
||||||
|
"Automation strategy",
|
||||||
|
"Test maintainability",
|
||||||
|
"Performance testing",
|
||||||
|
"Testing best practices"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
"performance_engineer": {
|
||||||
|
"role": "Senior Performance Engineer",
|
||||||
|
"companies": ["Google", "Netflix", "Amazon"],
|
||||||
|
"expertise": ["Performance Optimization", "Load Testing", "Profiling", "Scalability"],
|
||||||
|
"experience_years": "16+",
|
||||||
|
"achievements": [
|
||||||
|
"Optimized systems at Google handling billions of requests",
|
||||||
|
"Designed performance solutions at Netflix for streaming at scale",
|
||||||
|
"Built performance infrastructure at Amazon for peak traffic"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["performance", "load", "stress", "benchmark", "profiling", "optimization"],
|
||||||
|
"focus_areas": [
|
||||||
|
"Performance bottlenecks",
|
||||||
|
"Optimization strategies",
|
||||||
|
"Scalability concerns",
|
||||||
|
"Resource utilization",
|
||||||
|
"Performance testing"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
# CTO (for synthesis)
|
||||||
|
"cto": {
|
||||||
|
"role": "Chief Technology Officer",
|
||||||
|
"companies": ["Google", "Microsoft", "Amazon"],
|
||||||
|
"expertise": ["Strategic Planning", "System Architecture", "Team Leadership", "Technology Strategy"],
|
||||||
|
"experience_years": "25+",
|
||||||
|
"achievements": [
|
||||||
|
"Former VP of Engineering at Google, leading teams of 500+ engineers",
|
||||||
|
"CTO at Microsoft Azure, responsible for cloud infrastructure strategy",
|
||||||
|
"Strategic advisor at Amazon Web Services for enterprise architecture"
|
||||||
|
],
|
||||||
|
"focus_areas": [
|
||||||
|
"Strategic technology insights",
|
||||||
|
"System-wide risk assessment",
|
||||||
|
"Architectural recommendations",
|
||||||
|
"Cross-domain synthesis",
|
||||||
|
"Executive-level analysis"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# DOCUMENT ANALYSIS PERSONAS (for Multi-Document Upload Service)
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
DOCUMENT_ANALYSIS_PERSONAS = {
|
||||||
|
"technical_doc_analyst": {
|
||||||
|
"role": "Senior Technical Documentation Analyst",
|
||||||
|
"companies": ["Google", "Stripe", "Microsoft"],
|
||||||
|
"expertise_domain": "technical documentation and API specifications",
|
||||||
|
"document_types": ["API docs", "technical specs", "developer guides"],
|
||||||
|
"experience_years": "15+",
|
||||||
|
"achievements": [
|
||||||
|
"Analyzed technical documentation at Google for millions of API integrations",
|
||||||
|
"Led documentation analysis at Stripe for developer experience",
|
||||||
|
"Mapped technical relationships at Microsoft for enterprise systems"
|
||||||
|
],
|
||||||
|
"focus_areas": [
|
||||||
|
"Technical dependencies and relationships",
|
||||||
|
"System integration points",
|
||||||
|
"API contract relationships",
|
||||||
|
"Technical process flows",
|
||||||
|
"Code-to-documentation mappings"
|
||||||
|
],
|
||||||
|
"visual_focus_areas": [
|
||||||
|
"API flow diagrams",
|
||||||
|
"System integration diagrams",
|
||||||
|
"Technical architecture flows"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["api", "technical", "specification", "documentation", "guide", "reference", "developer"]
|
||||||
|
},
|
||||||
|
|
||||||
|
"business_process_analyst": {
|
||||||
|
"role": "Senior Business Process Analyst",
|
||||||
|
"companies": ["McKinsey", "Deloitte", "Accenture"],
|
||||||
|
"expertise_domain": "business processes and stakeholder requirements",
|
||||||
|
"document_types": ["business requirements", "user stories", "business plans"],
|
||||||
|
"experience_years": "18+",
|
||||||
|
"achievements": [
|
||||||
|
"Analyzed business processes at McKinsey for Fortune 500 companies",
|
||||||
|
"Led process mapping at Deloitte for enterprise transformations",
|
||||||
|
"Mapped stakeholder relationships at Accenture for global projects"
|
||||||
|
],
|
||||||
|
"focus_areas": [
|
||||||
|
"Business process flows",
|
||||||
|
"Requirement dependencies",
|
||||||
|
"Stakeholder impact chains",
|
||||||
|
"Business decision consequences",
|
||||||
|
"Organizational impact analysis"
|
||||||
|
],
|
||||||
|
"visual_focus_areas": [
|
||||||
|
"Business process diagrams",
|
||||||
|
"Stakeholder impact maps",
|
||||||
|
"Decision flowcharts"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["business", "requirement", "stakeholder", "user story", "process", "workflow", "business plan"]
|
||||||
|
},
|
||||||
|
|
||||||
|
"system_architecture_analyst": {
|
||||||
|
"role": "Senior System Architecture Document Analyst",
|
||||||
|
"companies": ["Google", "Amazon", "Microsoft"],
|
||||||
|
"expertise_domain": "system architecture and design documents",
|
||||||
|
"document_types": ["architecture docs", "design documents", "system designs"],
|
||||||
|
"experience_years": "20+",
|
||||||
|
"achievements": [
|
||||||
|
"Analyzed architecture documents at Google for large-scale distributed systems",
|
||||||
|
"Mapped system relationships at Amazon for cloud infrastructure",
|
||||||
|
"Led architecture analysis at Microsoft for enterprise solutions"
|
||||||
|
],
|
||||||
|
"focus_areas": [
|
||||||
|
"Architecture relationships",
|
||||||
|
"Component dependencies",
|
||||||
|
"System interaction flows",
|
||||||
|
"Design decision impacts",
|
||||||
|
"Scalability relationships"
|
||||||
|
],
|
||||||
|
"visual_focus_areas": [
|
||||||
|
"Architecture diagrams",
|
||||||
|
"Component interaction diagrams",
|
||||||
|
"System dependency maps"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["architecture", "design", "system", "component", "diagram", "architectural"]
|
||||||
|
},
|
||||||
|
|
||||||
|
"requirements_analyst": {
|
||||||
|
"role": "Senior Requirements & Specification Analyst",
|
||||||
|
"companies": ["IBM", "Oracle", "SAP"],
|
||||||
|
"expertise_domain": "requirements and functional specifications",
|
||||||
|
"document_types": ["requirements docs", "functional specs", "feature specs"],
|
||||||
|
"experience_years": "17+",
|
||||||
|
"achievements": [
|
||||||
|
"Analyzed requirements at IBM for enterprise software implementations",
|
||||||
|
"Mapped specifications at Oracle for database systems",
|
||||||
|
"Led requirement analysis at SAP for ERP platforms"
|
||||||
|
],
|
||||||
|
"focus_areas": [
|
||||||
|
"Requirement dependencies",
|
||||||
|
"Feature relationships",
|
||||||
|
"Specification impacts",
|
||||||
|
"Change propagation",
|
||||||
|
"Implementation dependencies"
|
||||||
|
],
|
||||||
|
"visual_focus_areas": [
|
||||||
|
"Requirement traceability diagrams",
|
||||||
|
"Feature dependency maps",
|
||||||
|
"Impact analysis charts"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["requirement", "specification", "feature", "functional", "traceability", "spec"]
|
||||||
|
},
|
||||||
|
|
||||||
|
"process_flow_analyst": {
|
||||||
|
"role": "Senior Process Flow Analyst",
|
||||||
|
"companies": ["Amazon", "Netflix", "Uber"],
|
||||||
|
"expertise_domain": "operational processes and workflows",
|
||||||
|
"document_types": ["process docs", "workflows", "operational manuals"],
|
||||||
|
"experience_years": "14+",
|
||||||
|
"achievements": [
|
||||||
|
"Analyzed processes at Amazon for fulfillment operations",
|
||||||
|
"Mapped workflows at Netflix for content delivery",
|
||||||
|
"Led process analysis at Uber for ride-sharing operations"
|
||||||
|
],
|
||||||
|
"focus_areas": [
|
||||||
|
"Process step relationships",
|
||||||
|
"Workflow dependencies",
|
||||||
|
"Sequential cause-effects",
|
||||||
|
"Decision impacts",
|
||||||
|
"Operational dependencies"
|
||||||
|
],
|
||||||
|
"visual_focus_areas": [
|
||||||
|
"Process flowcharts",
|
||||||
|
"Workflow diagrams",
|
||||||
|
"Decision trees",
|
||||||
|
"Operational flow maps"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["process", "workflow", "procedure", "operational", "manual", "step", "flow"]
|
||||||
|
},
|
||||||
|
|
||||||
|
"visual_architecture_analyst": {
|
||||||
|
"role": "Senior Visual Architecture Analyst",
|
||||||
|
"companies": ["Google", "Microsoft", "Apple"],
|
||||||
|
"expertise_domain": "visual diagrams and architecture drawings",
|
||||||
|
"document_types": ["diagrams", "flowcharts", "architecture drawings"],
|
||||||
|
"experience_years": "16+",
|
||||||
|
"achievements": [
|
||||||
|
"Analyzed visual diagrams at Google for complex system mappings",
|
||||||
|
"Mapped architecture drawings at Microsoft for enterprise solutions",
|
||||||
|
"Led visual analysis at Apple for product architecture"
|
||||||
|
],
|
||||||
|
"focus_areas": [
|
||||||
|
"Visual relationship extraction",
|
||||||
|
"Diagram dependency mapping",
|
||||||
|
"Flow analysis",
|
||||||
|
"Component interactions",
|
||||||
|
"Visual pattern recognition"
|
||||||
|
],
|
||||||
|
"visual_focus_areas": [
|
||||||
|
"All types of visual diagrams",
|
||||||
|
"Architecture drawings",
|
||||||
|
"Flowcharts and process diagrams",
|
||||||
|
"Component and sequence diagrams"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["diagram", "flowchart", "visual", "drawing", "chart", "map", "image"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# DOCUMENT TYPE MAPPING
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
DOCUMENT_PERSONA_MAPPING = {
|
||||||
|
# Technical Documents
|
||||||
|
"api_documentation": "technical_doc_analyst",
|
||||||
|
"technical_specification": "technical_doc_analyst",
|
||||||
|
"code_documentation": "technical_doc_analyst",
|
||||||
|
"developer_guide": "technical_doc_analyst",
|
||||||
|
|
||||||
|
# Business Documents
|
||||||
|
"business_requirements": "business_process_analyst",
|
||||||
|
"user_stories": "business_process_analyst",
|
||||||
|
"business_plan": "business_process_analyst",
|
||||||
|
"product_specification": "business_process_analyst",
|
||||||
|
"stakeholder_document": "business_process_analyst",
|
||||||
|
|
||||||
|
# Architecture Documents
|
||||||
|
"architecture_document": "system_architecture_analyst",
|
||||||
|
"system_design": "system_architecture_analyst",
|
||||||
|
"design_document": "system_architecture_analyst",
|
||||||
|
"technical_design": "system_architecture_analyst",
|
||||||
|
|
||||||
|
# Requirements Documents
|
||||||
|
"requirements_document": "requirements_analyst",
|
||||||
|
"functional_specification": "requirements_analyst",
|
||||||
|
"feature_specification": "requirements_analyst",
|
||||||
|
|
||||||
|
# Process Documents
|
||||||
|
"process_document": "process_flow_analyst",
|
||||||
|
"workflow_document": "process_flow_analyst",
|
||||||
|
"procedure_guide": "process_flow_analyst",
|
||||||
|
"operational_manual": "process_flow_analyst",
|
||||||
|
|
||||||
|
# Visual/Diagram Documents
|
||||||
|
"architecture_diagram": "visual_architecture_analyst",
|
||||||
|
"flowchart": "visual_architecture_analyst",
|
||||||
|
"sequence_diagram": "visual_architecture_analyst",
|
||||||
|
"component_diagram": "visual_architecture_analyst",
|
||||||
|
"process_diagram": "visual_architecture_analyst",
|
||||||
|
"system_diagram": "visual_architecture_analyst",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# PERSONA ALLOCATION FUNCTIONS
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def allocate_code_persona(file_path: str, content: str, chunk_type: str = "module") -> Dict:
|
||||||
|
"""
|
||||||
|
Intelligently allocates code analysis persona based on file path, content, and type.
|
||||||
|
Returns persona config with prompt context.
|
||||||
|
"""
|
||||||
|
file_lower = file_path.lower()
|
||||||
|
content_lower = content.lower()[:2000] if content else "" # Sample content
|
||||||
|
|
||||||
|
# Score each persona based on detection rules
|
||||||
|
persona_scores = {}
|
||||||
|
|
||||||
|
for persona_id, persona_config in CODE_ANALYSIS_PERSONAS.items():
|
||||||
|
if persona_id == "cto": # Skip CTO for individual analysis
|
||||||
|
continue
|
||||||
|
|
||||||
|
score = 0
|
||||||
|
detection_keywords = persona_config.get("detection_keywords", [])
|
||||||
|
|
||||||
|
# Check file path (higher weight)
|
||||||
|
for keyword in detection_keywords:
|
||||||
|
if keyword in file_lower:
|
||||||
|
score += 15
|
||||||
|
|
||||||
|
# Check content (medium weight)
|
||||||
|
for keyword in detection_keywords:
|
||||||
|
if keyword in content_lower:
|
||||||
|
score += 8
|
||||||
|
|
||||||
|
# Check chunk type
|
||||||
|
if chunk_type and chunk_type.lower() in detection_keywords:
|
||||||
|
score += 10
|
||||||
|
|
||||||
|
# Domain-specific boosts
|
||||||
|
if "test" in file_lower and "qa" in persona_id:
|
||||||
|
score += 20
|
||||||
|
if "security" in file_lower and "security" in persona_id:
|
||||||
|
score += 20
|
||||||
|
if "performance" in file_lower and "performance" in persona_id:
|
||||||
|
score += 20
|
||||||
|
|
||||||
|
if score > 0:
|
||||||
|
persona_scores[persona_id] = score
|
||||||
|
|
||||||
|
# Select top persona
|
||||||
|
if persona_scores:
|
||||||
|
selected_id = max(persona_scores, key=persona_scores.get)
|
||||||
|
return CODE_ANALYSIS_PERSONAS[selected_id]
|
||||||
|
|
||||||
|
# Default fallback to backend business logic
|
||||||
|
return CODE_ANALYSIS_PERSONAS.get("backend_business", {})
|
||||||
|
|
||||||
|
|
||||||
|
def allocate_document_persona(file_path: str, content: str, file_type: str = "text") -> Dict:
|
||||||
|
"""
|
||||||
|
Intelligently allocates document analysis persona based on file path, content, and type.
|
||||||
|
Returns persona config for document analysis.
|
||||||
|
"""
|
||||||
|
file_lower = file_path.lower()
|
||||||
|
content_lower = content.lower()[:2000] if content else ""
|
||||||
|
|
||||||
|
# Check if it's an image/diagram
|
||||||
|
if file_type == "image" or any(ext in file_lower for ext in [".png", ".jpg", ".jpeg", ".gif", ".svg", ".pdf"]):
|
||||||
|
return DOCUMENT_ANALYSIS_PERSONAS.get("visual_architecture_analyst", {})
|
||||||
|
|
||||||
|
# Score each persona based on detection rules
|
||||||
|
persona_scores = {}
|
||||||
|
|
||||||
|
for persona_id, persona_config in DOCUMENT_ANALYSIS_PERSONAS.items():
|
||||||
|
score = 0
|
||||||
|
detection_keywords = persona_config.get("detection_keywords", [])
|
||||||
|
|
||||||
|
# Check file path (higher weight)
|
||||||
|
for keyword in detection_keywords:
|
||||||
|
if keyword in file_lower:
|
||||||
|
score += 15
|
||||||
|
|
||||||
|
# Check content (medium weight)
|
||||||
|
for keyword in detection_keywords:
|
||||||
|
if keyword in content_lower:
|
||||||
|
score += 8
|
||||||
|
|
||||||
|
# Check document type mapping
|
||||||
|
for doc_type, mapped_persona in DOCUMENT_PERSONA_MAPPING.items():
|
||||||
|
if doc_type in file_lower and mapped_persona == persona_id:
|
||||||
|
score += 20
|
||||||
|
|
||||||
|
if score > 0:
|
||||||
|
persona_scores[persona_id] = score
|
||||||
|
|
||||||
|
# Select top persona
|
||||||
|
if persona_scores:
|
||||||
|
selected_id = max(persona_scores, key=persona_scores.get)
|
||||||
|
return DOCUMENT_ANALYSIS_PERSONAS[selected_id]
|
||||||
|
|
||||||
|
# Default fallback to technical doc analyst
|
||||||
|
return DOCUMENT_ANALYSIS_PERSONAS.get("technical_doc_analyst", {})
|
||||||
|
|
||||||
|
|
||||||
|
def get_cto_persona() -> Dict:
|
||||||
|
"""Returns CTO persona for synthesis and high-level analysis."""
|
||||||
|
return CODE_ANALYSIS_PERSONAS.get("cto", {})
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# PROMPT BUILDING FUNCTIONS
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def build_persona_intro(persona: Dict, assignment_context: str = "", analysis_type: str = "code") -> str:
|
||||||
|
"""
|
||||||
|
Builds persona introduction section for prompts.
|
||||||
|
Works for both code and document analysis.
|
||||||
|
"""
|
||||||
|
if not persona:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
role = persona.get("role", "Senior Engineer")
|
||||||
|
companies = persona.get("companies", [])
|
||||||
|
experience = persona.get("experience_years", "15+")
|
||||||
|
achievements = persona.get("achievements", [])
|
||||||
|
focus_areas = persona.get("focus_areas", [])
|
||||||
|
|
||||||
|
# Build company background
|
||||||
|
company_bg = ""
|
||||||
|
if companies:
|
||||||
|
company_bg = f"- Previously worked at {', '.join(companies[:2])}"
|
||||||
|
if len(companies) > 2:
|
||||||
|
company_bg += f" and {companies[2]}"
|
||||||
|
|
||||||
|
# Build achievements section
|
||||||
|
achievements_text = ""
|
||||||
|
if achievements:
|
||||||
|
achievements_text = "\n".join([f"- {achievement}" for achievement in achievements[:2]])
|
||||||
|
|
||||||
|
# Build focus areas
|
||||||
|
focus_text = ""
|
||||||
|
if focus_areas:
|
||||||
|
focus_text = "\n".join([f"- {focus}" for focus in focus_areas[:5]])
|
||||||
|
|
||||||
|
intro = f"""You are {role} with {experience} years of experience.
|
||||||
|
|
||||||
|
COMPANY BACKGROUND:
|
||||||
|
{company_bg}
|
||||||
|
|
||||||
|
KEY ACHIEVEMENTS:
|
||||||
|
{achievements_text}
|
||||||
|
|
||||||
|
YOUR ASSIGNMENT:
|
||||||
|
{assignment_context if assignment_context else 'Analyze the provided code/document for quality, issues, and recommendations.'}
|
||||||
|
|
||||||
|
YOUR FOCUS AREAS:
|
||||||
|
{focus_text}
|
||||||
|
|
||||||
|
---
|
||||||
|
"""
|
||||||
|
return intro
|
||||||
|
|
||||||
|
|
||||||
|
def build_code_analysis_persona_prompt(base_prompt: str, persona: Dict,
|
||||||
|
assignment_context: str = "") -> str:
|
||||||
|
"""
|
||||||
|
Enhances code analysis prompt with persona context.
|
||||||
|
"""
|
||||||
|
if not persona:
|
||||||
|
return base_prompt
|
||||||
|
|
||||||
|
persona_intro = build_persona_intro(persona, assignment_context, "code")
|
||||||
|
return persona_intro + base_prompt
|
||||||
|
|
||||||
|
|
||||||
|
def build_document_analysis_persona_prompt(base_prompt: str, persona: Dict,
|
||||||
|
document_type: str = "document",
|
||||||
|
assignment_context: str = "") -> str:
|
||||||
|
"""
|
||||||
|
Enhances document analysis prompt with persona context.
|
||||||
|
"""
|
||||||
|
if not persona:
|
||||||
|
return base_prompt
|
||||||
|
|
||||||
|
role = persona.get("role", "Senior Analyst")
|
||||||
|
companies = persona.get("companies", [])
|
||||||
|
expertise_domain = persona.get("expertise_domain", "document analysis")
|
||||||
|
experience = persona.get("experience_years", "15+")
|
||||||
|
achievements = persona.get("achievements", [])
|
||||||
|
focus_areas = persona.get("focus_areas", [])
|
||||||
|
|
||||||
|
company_bg = f"- Previously worked at {', '.join(companies[:2])}" if companies else ""
|
||||||
|
achievements_text = "\n".join([f"- {achievement}" for achievement in achievements[:2]]) if achievements else ""
|
||||||
|
focus_text = "\n".join([f"- {focus}" for focus in focus_areas[:5]]) if focus_areas else ""
|
||||||
|
|
||||||
|
intro = f"""You are {role}, a specialist in analyzing {expertise_domain} with {experience} years of experience.
|
||||||
|
|
||||||
|
COMPANY BACKGROUND:
|
||||||
|
{company_bg}
|
||||||
|
|
||||||
|
KEY ACHIEVEMENTS:
|
||||||
|
{achievements_text}
|
||||||
|
|
||||||
|
YOUR SPECIALIZATION:
|
||||||
|
You excel at identifying:
|
||||||
|
{focus_text}
|
||||||
|
|
||||||
|
YOUR ASSIGNMENT:
|
||||||
|
{assignment_context if assignment_context else f'Analyze this {document_type} to extract causal relationships and dependencies.'}
|
||||||
|
|
||||||
|
---
|
||||||
|
"""
|
||||||
|
return intro + base_prompt
|
||||||
|
|
||||||
|
|
||||||
|
def build_cto_synthesis_prompt(base_prompt: str, team_findings: List[Dict] = None) -> str:
|
||||||
|
"""
|
||||||
|
Builds CTO-level synthesis prompt with team allocation context.
|
||||||
|
"""
|
||||||
|
cto_persona = get_cto_persona()
|
||||||
|
|
||||||
|
if not cto_persona:
|
||||||
|
return base_prompt
|
||||||
|
|
||||||
|
role = cto_persona.get("role", "Chief Technology Officer")
|
||||||
|
companies = cto_persona.get("companies", [])
|
||||||
|
experience = cto_persona.get("experience_years", "25+")
|
||||||
|
achievements = cto_persona.get("achievements", [])
|
||||||
|
focus_areas = cto_persona.get("focus_areas", [])
|
||||||
|
|
||||||
|
company_bg = f"- Former VP of Engineering at {companies[0] if companies else 'Google'}, leading teams of 500+ engineers"
|
||||||
|
if len(companies) > 1:
|
||||||
|
company_bg += f"\n- CTO at {companies[1]}, responsible for cloud infrastructure strategy"
|
||||||
|
|
||||||
|
achievements_text = "\n".join([f"- {achievement}" for achievement in achievements[:2]]) if achievements else ""
|
||||||
|
focus_text = "\n".join([f"- {focus}" for focus in focus_areas[:5]]) if focus_areas else ""
|
||||||
|
|
||||||
|
team_allocation = ""
|
||||||
|
if team_findings:
|
||||||
|
team_allocation = "\n\nTEAM ALLOCATION:\n"
|
||||||
|
team_allocation += "You have allocated your expert team to analyze different domains:\n"
|
||||||
|
for finding in team_findings[:5]:
|
||||||
|
domain = finding.get("domain", "unknown")
|
||||||
|
team_allocation += f"- {domain}: Expert analysis completed\n"
|
||||||
|
|
||||||
|
intro = f"""You are {role} with {experience} years of experience.
|
||||||
|
|
||||||
|
COMPANY BACKGROUND:
|
||||||
|
{company_bg}
|
||||||
|
|
||||||
|
KEY ACHIEVEMENTS:
|
||||||
|
{achievements_text}
|
||||||
|
{team_allocation}
|
||||||
|
|
||||||
|
YOUR ROLE:
|
||||||
|
You have received this project and allocated your expert team to analyze different domains.
|
||||||
|
Now, synthesize all team findings into strategic recommendations.
|
||||||
|
|
||||||
|
YOUR FOCUS AREAS:
|
||||||
|
{focus_text}
|
||||||
|
|
||||||
|
---
|
||||||
|
"""
|
||||||
|
return intro + base_prompt
|
||||||
|
|
||||||
@ -2673,8 +2673,10 @@ def build_intelligent_chunk_prompt(chunk: Dict, analysis_state: Optional[Dict] =
     """
     Build comprehensive prompt for analyzing a semantically grouped chunk.
     Generates detailed module-level analysis with context awareness.
-    Now includes progressive context from previous chunks.
+    Now includes progressive context from previous chunks and world-class persona.
     """
+    from persona_system import allocate_code_persona, build_code_analysis_persona_prompt
+
     chunk_name = chunk.get('name', 'unknown')
     chunk_type = chunk.get('chunk_type', 'module')
     files_batch = chunk.get('files', [])
@ -2694,15 +2696,22 @@ def build_intelligent_chunk_prompt(chunk: Dict, analysis_state: Optional[Dict] =
         optimized_files.append((file_path, optimized_content))

+    # Allocate appropriate persona based on files in chunk
+    # Use the first file to determine persona (or combine if multiple domains)
+    primary_file_path = optimized_files[0][0] if optimized_files else ""
+    primary_content = optimized_files[0][1] if optimized_files else ""
+    persona = allocate_code_persona(primary_file_path, primary_content, chunk_type)
+
     # Build context from previous analyses (progressive learning)
     context_section = build_context_from_state(analysis_state, chunk)

+    # Build assignment context
+    assignment_context = f"CTO has assigned you to analyze the '{chunk_name}' module/chunk for this project. This is a {chunk_type} type chunk containing {len(optimized_files)} files."
+
     # Build comprehensive prompt with module context
     prompt_parts = [
         f"# COMPREHENSIVE ANALYSIS: {chunk_name.upper()}",
         f"Chunk Type: {chunk_type}",
-        "",
-        "You are a senior software architect with 30+ years of experience. Analyze this module/chunk comprehensively.",
         ""
     ]
@ -2794,7 +2803,12 @@ def build_intelligent_chunk_prompt(chunk: Dict, analysis_state: Optional[Dict] =
         "Focus on providing detailed, actionable insights that help understand the complete module context."
     ])

-    return "\n".join(prompt_parts)
+    base_prompt = "\n".join(prompt_parts)
+
+    # Enhance with persona
+    enhanced_prompt = build_code_analysis_persona_prompt(base_prompt, persona, assignment_context)
+
+    return enhanced_prompt

 def build_smart_batch_prompt(files_batch: List[Tuple[str, str]]) -> str:
     """Legacy function: Build prompt for simple batch (backward compatibility)."""
@ -4719,13 +4733,13 @@ def build_synthesis_prompt(analysis_state: Dict, all_chunk_analyses: List[Dict]
     """
     Build comprehensive prompt for cross-module synthesis analysis.
     Synthesizes all individual module analyses into system-level insights.
+    Uses CTO persona for executive-level synthesis.
     """
+    from persona_system import get_cto_persona, build_cto_synthesis_prompt
+
     prompt_parts = [
         "# CROSS-MODULE SYNTHESIS ANALYSIS",
         "",
-        "You are a senior software architect with 30+ years of experience. Your task is to synthesize",
-        "findings from multiple module-level analyses into comprehensive system-level insights.",
-        "",
         "## CONTEXT: PREVIOUSLY ANALYZED MODULES",
         ""
     ]
@ -4842,7 +4856,19 @@ def build_synthesis_prompt(analysis_state: Dict, all_chunk_analyses: List[Dict]
         "across all analyzed modules, not just repeating individual module findings."
     ])

-    return "\n".join(prompt_parts)
+    base_prompt = "\n".join(prompt_parts)
+
+    # Get team findings for CTO context
+    team_findings = []
+    if all_chunk_analyses:
+        for chunk_analysis in all_chunk_analyses:
+            module_name = chunk_analysis.get('module_name', 'unknown')
+            team_findings.append({"domain": module_name, "analysis": chunk_analysis})
+
+    # Enhance with CTO persona
+    enhanced_prompt = build_cto_synthesis_prompt(base_prompt, team_findings)
+
+    return enhanced_prompt

 def parse_synthesis_response(response_text: str) -> Dict:
     """Parse synthesis response from Claude API."""
@ -141,17 +141,19 @@ router.get('/auth/github/callback', async (req, res) => {
     setImmediate(async () => {
       try {
         console.log('[GitHub OAuth] Starting background repository attachment for:', repoContext.repoUrl);
+        console.log('[GitHub OAuth] Using newly stored token for user:', user_id);
         const GitHubIntegrationService = require('../services/github-integration.service');
         const database = require('../config/database');
         const githubService = new GitHubIntegrationService();
         const { owner, repo, branch } = githubService.parseGitHubUrl(repoContext.repoUrl);

-        // Get metadata using authenticated Octokit
-        const repositoryData = await githubService.fetchRepositoryMetadata(owner, repo);
+        // Get metadata using authenticated Octokit with the specific user's token
+        // Pass userId to ensure we use the newly stored token
+        const repositoryData = await githubService.fetchRepositoryMetadata(owner, repo, false, user_id);
         let actualBranch = repoContext.branchName || branch || repositoryData.default_branch || 'main';

-        // Attempt analysis and sync with fallback
-        const codebaseAnalysis = await githubService.analyzeCodebase(owner, repo, actualBranch, false);
+        // Attempt analysis and sync with fallback - use userId to ensure correct token
+        const codebaseAnalysis = await githubService.analyzeCodebase(owner, repo, actualBranch, false, user_id);
         const insertQuery = `
           INSERT INTO all_repositories (
             repository_url, repository_name, owner_name,
@ -170,14 +172,14 @@ router.get('/auth/github/callback', async (req, res) => {
           JSON.stringify(codebaseAnalysis),
           'syncing',
           repositoryData.visibility === 'private',
-          repoContext.userId || null,
+          user_id || repoContext.userId || null, // Use user_id from OAuth callback (most reliable)
           'github' // This is GitHub OAuth callback, so provider is always github
         ];
         const insertResult = await database.query(insertQuery, insertValues);
         const repositoryRecord = insertResult.rows[0];

-        // Clone repository
-        const downloadResult = await githubService.syncRepositoryWithFallback(owner, repo, actualBranch, repositoryRecord.id, repositoryData.visibility !== 'private');
+        // Clone repository - use userId to ensure correct token
+        const downloadResult = await githubService.syncRepositoryWithFallback(owner, repo, actualBranch, repositoryRecord.id, repositoryData.visibility !== 'private', user_id);
         const finalSyncStatus = downloadResult.success ? 'synced' : 'error';
         await database.query('UPDATE all_repositories SET sync_status = $1, updated_at = NOW() WHERE id = $2', [finalSyncStatus, repositoryRecord.id]);
@ -163,12 +163,28 @@ router.post('/:provider/attach-repository', async (req, res) => {
     const { template_id, repository_url, branch_name } = req.body;
     const userId = req.headers['x-user-id'] || req.query.user_id || req.body.user_id || (req.user && (req.user.id || req.user.userId));

+    console.log(`[VCS Attach] Extracted userId:`, userId, `from headers:`, req.headers['x-user-id'], `query:`, req.query.user_id, `body:`, req.body.user_id);
+
     // Validate input - only repository_url is required (like GitHub)
     if (!repository_url) {
       return res.status(400).json({ success: false, message: 'Repository URL is required' });
     }

-    const { owner, repo, branch } = provider.parseRepoUrl(repository_url);
+    // Clean and normalize the repository URL (trim whitespace, decode URL encoding)
+    let cleanedUrl = repository_url.trim();
+    // Decode URL-encoded characters (like %20 for spaces)
+    try {
+      cleanedUrl = decodeURIComponent(cleanedUrl);
+    } catch (e) {
+      // If decoding fails, use original URL
+      console.warn(`[VCS Attach] Failed to decode URL, using original: ${cleanedUrl}`);
+    }
+    // Trim again after decoding
+    cleanedUrl = cleanedUrl.trim();
+
+    console.log(`[VCS Attach] Original URL: ${repository_url}, Cleaned URL: ${cleanedUrl}`);
+
+    const { owner, repo, branch } = provider.parseRepoUrl(cleanedUrl);

     // Enhanced flow: Detect private repos and redirect to OAuth immediately
     const providerKey = (req.params.provider || '').toLowerCase();
@ -248,7 +264,44 @@ router.post('/:provider/attach-repository', async (req, res) => {
     // For public repos or authenticated private repos, proceed with normal flow
     const accessCheck = await provider.checkRepositoryAccess(owner, repo, userId);

+    console.log(`[VCS Attach] Access check result for ${owner}/${repo}:`, {
+      hasAccess: accessCheck.hasAccess,
+      requiresAuth: accessCheck.requiresAuth,
+      authError: accessCheck.authError,
+      error: accessCheck.error,
+      exists: accessCheck.exists,
+      github_username: accessCheck.github_username
+    });
+
     if (!accessCheck.hasAccess) {
+      // If access check failed but requires auth, trigger OAuth flow
+      if (accessCheck.requiresAuth || accessCheck.authError) {
+        const oauthService = getOAuthService(providerKey);
+        if (oauthService) {
+          console.log(`🔒 [VCS Attach] Token exists but cannot access repository (or no valid token), redirecting to OAuth: ${repository_url}`);
+          console.log(`🔒 [VCS Attach] Reason: ${accessCheck.error || 'Authentication required'}, userId: ${userId}`);
+
+          // Generate OAuth URL with repository context in state
+          const stateBase = Math.random().toString(36).substring(7);
+          const state = `${stateBase}|uid=${userId || 'unknown'}|repo=${encodeURIComponent(repository_url)}|branch=${encodeURIComponent(branch_name || 'main')}|private_repo=true`;
+
+          const authUrl = oauthService.getAuthUrl(state, userId);
+
+          console.log(`🔒 [VCS Attach] Generated OAuth URL for ${providerKey}, returning requires_auth response`);
+
+          return res.json({
+            success: false,
+            message: `${providerKey.charAt(0).toUpperCase() + providerKey.slice(1)} authentication required for private repository`,
+            requires_auth: true,
+            is_private_repo: true,
+            auth_url: authUrl,
+            state: state
+          });
+        }
+      }
+
+      // If it's not an auth issue, return 404
+      console.log(`[VCS Attach] Access check failed without auth requirement, returning 404`);
       return res.status(404).json({ success: false, message: accessCheck.error || 'Repository not accessible' });
     }
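A hedged client-side sketch of the new attach flow (Python for brevity). The `x-user-id` header, the `requires_auth`/`auth_url` response fields, and the URL trimming/decoding behaviour come from the route above; the gateway base URL and the `/api/vcs` mount prefix are assumptions to adjust for your deployment.

```python
# Sketch only: the '/api/vcs' prefix is assumed - the diff shows just
# router.post('/:provider/attach-repository'); adjust to wherever the router is mounted.
import requests

resp = requests.post(
    "http://localhost:8000/api/vcs/github/attach-repository",  # assumed mount point
    headers={"x-user-id": "42"},  # userId may also be passed via query or body
    json={
        # Trailing whitespace / %20 is now decoded and trimmed server-side.
        "repository_url": "https://github.com/acme/private-repo%20",
        "branch_name": "main",
    },
)
data = resp.json()

if not data.get("success") and data.get("requires_auth"):
    # Private repo without a usable token: redirect the user to the provider OAuth page;
    # the state carries uid, repo and branch so the callback can resume the attach.
    print("Redirect to:", data["auth_url"])
else:
    print(data)
```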
@ -21,8 +21,8 @@ class GitHubIntegrationService {
   }

   // Get authenticated Octokit instance
-  async getAuthenticatedOctokit() {
-    return await this.oauthService.getAuthenticatedOctokit();
+  async getAuthenticatedOctokit(userId = null) {
+    return await this.oauthService.getAuthenticatedOctokit(userId);
   }

   // Extract owner, repo, and branch from GitHub URL using parse-github-url library
@ -31,8 +31,15 @@ class GitHubIntegrationService {
       throw new Error('URL must be a non-empty string');
     }

-    // Normalize the URL first
+    // Normalize the URL first - trim and decode URL encoding
     let normalizedUrl = url.trim();
+    // Decode URL-encoded characters (like %20 for spaces)
+    try {
+      normalizedUrl = decodeURIComponent(normalizedUrl).trim();
+    } catch (e) {
+      // If decoding fails, just trim
+      normalizedUrl = normalizedUrl.trim();
+    }

     // Remove trailing slashes and .git extensions
     normalizedUrl = normalizedUrl.replace(/\/+$/, '').replace(/\.git$/, '');
@ -216,7 +223,7 @@ class GitHubIntegrationService {
       };
     }

-    // No token found - try unauthenticated access first to check if it's public
+    // No token found that can access this repo - try unauthenticated access to check if it's public
     try {
       const unauthenticatedOctokit = new Octokit({
         userAgent: 'CodeNuk-GitIntegration/1.0.0',
@ -234,13 +241,18 @@ class GitHubIntegrationService {
       };
     } catch (unauthenticatedError) {
       if (unauthenticatedError.status === 404) {
-        // Repository truly doesn't exist
+        // 404 from unauthenticated access could mean:
+        // 1. Repository truly doesn't exist
+        // 2. Repository is private and requires authentication
+        // Since we already tried to find a token and none could access it,
+        // and we're being called from a private repo flow, assume it requires auth
+        console.log(`🔒 [GitHub] 404 from unauthenticated access - assuming private repo requires authentication`);
         return {
-          exists: false,
+          exists: null, // Unknown - could be missing or private
           isPrivate: null,
           hasAccess: false,
-          requiresAuth: false,
+          requiresAuth: true, // Changed from false to true - trigger OAuth
-          error: 'Repository not found'
+          error: 'Repository not found or requires authentication'
         };
       } else if (unauthenticatedError.status === 401 || unauthenticatedError.status === 403) {
         // Repository exists but requires authentication (private) - generate auth URL
@ -289,13 +301,13 @@ class GitHubIntegrationService {
   }

   // Get repository information from GitHub
-  async fetchRepositoryMetadata(owner, repo, skipAuth = false) {
+  async fetchRepositoryMetadata(owner, repo, skipAuth = false, userId = null) {
     // If skipAuth is true, try with unauthenticated octokit first to check visibility
     let octokit;
     if (skipAuth) {
       octokit = this.octokit; // Use unauthenticated instance
     } else {
-      octokit = await this.getAuthenticatedOctokit();
+      octokit = await this.getAuthenticatedOctokit(userId);
     }

     const safe = async (fn, fallback) => {
@ -309,26 +321,41 @@ class GitHubIntegrationService {
     let repoData;
     try {
+      console.log(`🔍 [GitHub] fetchRepositoryMetadata: skipAuth=${skipAuth}, calling octokit.repos.get for ${owner}/${repo}`);
       const response = await octokit.repos.get({ owner, repo });
-      if (skipAuth) {
-        if (response.status === 401 || response.status === 403) {
-          throw new Error('Authentication required to access repository');
-        } else if (response.status === 404) {
-          throw new Error('Repository not found');
-        }
-      }
       repoData = response.data;
+      console.log(`✅ [GitHub] Successfully fetched repository data: ${repoData?.full_name || 'no full_name'}`);
+
+      // Validate we got real data
+      if (!repoData || !repoData.full_name) {
+        console.log(`❌ [GitHub] Invalid repository data received, throwing error`);
+        throw new Error('Invalid repository data received');
+      }
     } catch (error) {
-      console.log(`🔍 [GitHub] Error in fetchRepositoryMetadata:`, error.message, error.status);
+      // Check error status from various possible locations
+      const status = error.status || error.response?.status || error.code;
+      const errorMessage = error.message || '';
+      const is404 = status === 404 || status === '404' || errorMessage.includes('404') || errorMessage.includes('Not Found');
+      const isAuthError = status === 401 || status === 403 || status === '401' || status === '403';
+
+      console.log(`🔍 [GitHub] Error in fetchRepositoryMetadata CATCH BLOCK:`, errorMessage, `Status: ${status || 'unknown'}`, `is404: ${is404}`, `isAuthError: ${isAuthError}`, `skipAuth: ${skipAuth}`);
+      console.log(`🔍 [GitHub] Error object:`, JSON.stringify({
+        status: error.status,
+        responseStatus: error.response?.status,
+        code: error.code,
+        message: error.message,
+        name: error.name
+      }));
+
       if (skipAuth) {
-        // For GitHub, any error when skipAuth=true likely means private repo
-        if (error.status === 401 || error.status === 403 || error.status === 404) {
-          throw new Error('Authentication required to access repository');
-        }
-        // For other errors, also assume private repo
+        // For GitHub, any error when skipAuth=true means private repo or doesn't exist
+        // Always throw authentication required - let the caller decide if it's truly missing or private
+        console.log(`🔒 [GitHub] skipAuth=true, THROWING authentication required error - NOT using safe fallback`);
         throw new Error('Authentication required to access repository');
       }
-      // For other errors, use safe fallback
+
+      // For authenticated requests, use safe fallback (but only if skipAuth is false)
+      console.log(`⚠️ [GitHub] skipAuth=false, using safe fallback`);
       repoData = await safe(
         async () => {
           const response = await octokit.repos.get({ owner, repo });
@ -336,6 +363,12 @@ class GitHubIntegrationService {
         },
         {}
       );
+
+      // If safe fallback also failed, throw
+      if (!repoData || !repoData.full_name) {
+        console.log(`❌ [GitHub] Safe fallback also failed, throwing Repository not found`);
+        throw new Error('Repository not found');
+      }
     }

     const languages = await safe(
@ -364,7 +397,7 @@ class GitHubIntegrationService {
   }

   // Analyze codebase structure
-  async analyzeCodebase(owner, repo, branch, isPublicRepo = false) {
+  async analyzeCodebase(owner, repo, branch, isPublicRepo = false, userId = null) {
     try {
       // Use appropriate octokit instance based on repository type
       let octokit;
@ -374,8 +407,8 @@ class GitHubIntegrationService {
           userAgent: 'CodeNuk-GitIntegration/1.0.0',
         });
       } else {
-        // For private repos, use authenticated octokit
-        octokit = await this.getAuthenticatedOctokit();
+        // For private repos, use authenticated octokit with userId
+        octokit = await this.getAuthenticatedOctokit(userId);
       }

       // Get the commit SHA for the branch
@ -519,7 +552,7 @@ class GitHubIntegrationService {
   }

   // Git-based: clone or update local repo and re-index into DB
-  async syncRepositoryWithGit(owner, repo, branch, repositoryId, isPublicRepo = false) {
+  async syncRepositoryWithGit(owner, repo, branch, repositoryId, isPublicRepo = false, userId = null) {
     const database = require('../config/database');
     const localPath = this.gitRepoService.getLocalRepoPath(owner, repo, branch);
     let storageRecord = null;
@ -544,7 +577,7 @@ class GitHubIntegrationService {
           console.warn(`Failed to clone public repo without auth: ${error.message}`);
           // Fallback to authenticated clone if available
           try {
-            const tokenRecord = await this.oauthService.getToken();
+            const tokenRecord = userId ? await this.oauthService.getTokenForUser(userId) : await this.oauthService.getToken();
             if (tokenRecord?.access_token) {
               repoPath = await this.gitRepoService.cloneIfMissingWithAuth(
                 owner,
@ -560,7 +593,7 @@ class GitHubIntegrationService {
       } else {
         // For private repos, try authenticated clone first
         try {
-          const tokenRecord = await this.oauthService.getToken();
+          const tokenRecord = userId ? await this.oauthService.getTokenForUser(userId) : await this.oauthService.getToken();
           if (tokenRecord?.access_token) {
             repoPath = await this.gitRepoService.cloneIfMissingWithAuth(
               owner,
@ -628,7 +661,7 @@ class GitHubIntegrationService {
     try {
       // Try to ensure repo exists for the preferred branch
       try {
-        const tokenRecord = await this.oauthService.getToken().catch(() => null);
+        const tokenRecord = userId ? await this.oauthService.getTokenForUser(userId).catch(() => null) : await this.oauthService.getToken().catch(() => null);
         if (tokenRecord?.access_token) {
           repoPath = await this.gitRepoService.cloneIfMissingWithAuth(owner, repo, preferredBranch, 'github.com', tokenRecord.access_token, 'oauth2');
         } else {
@ -637,7 +670,7 @@ class GitHubIntegrationService {
       } catch (cloneErr) {
         // If the branch doesn't exist (e.g., refs/heads not found), try the alternate branch
         try {
-          const tokenRecordAlt = await this.oauthService.getToken().catch(() => null);
+          const tokenRecordAlt = userId ? await this.oauthService.getTokenForUser(userId).catch(() => null) : await this.oauthService.getToken().catch(() => null);
           repoPath = tokenRecordAlt?.access_token
             ? await this.gitRepoService.cloneIfMissingWithAuth(owner, repo, alternateBranch, 'github.com', tokenRecordAlt.access_token, 'oauth2')
             : await this.gitRepoService.cloneIfMissing(owner, repo, alternateBranch);
@ -679,7 +712,7 @@ class GitHubIntegrationService {
     try {
       // Ensure repo exists similarly to diff flow
       try {
-        const tokenRecord = await this.oauthService.getToken().catch(() => null);
+        const tokenRecord = userId ? await this.oauthService.getTokenForUser(userId).catch(() => null) : await this.oauthService.getToken().catch(() => null);
         if (tokenRecord?.access_token) {
           repoPath = await this.gitRepoService.cloneIfMissingWithAuth(owner, repo, preferredBranch, 'github.com', tokenRecord.access_token, 'oauth2');
         } else {
@ -687,7 +720,7 @@ class GitHubIntegrationService {
       }
     } catch (_) {
       try {
-        const tokenRecordAlt = await this.oauthService.getToken().catch(() => null);
+        const tokenRecordAlt = userId ? await this.oauthService.getTokenForUser(userId).catch(() => null) : await this.oauthService.getToken().catch(() => null);
         repoPath = tokenRecordAlt?.access_token
           ? await this.gitRepoService.cloneIfMissingWithAuth(owner, repo, alternateBranch, 'github.com', tokenRecordAlt.access_token, 'oauth2')
           : await this.gitRepoService.cloneIfMissing(owner, repo, alternateBranch);
@ -720,15 +753,15 @@ class GitHubIntegrationService {
   }

   // Try git-based sync first, fall back to GitHub API download on failure
-  async syncRepositoryWithFallback(owner, repo, branch, repositoryId, isPublicRepo = false) {
+  async syncRepositoryWithFallback(owner, repo, branch, repositoryId, isPublicRepo = false, userId = null) {
     // First attempt: full git clone/fetch and index
-    const gitResult = await this.syncRepositoryWithGit(owner, repo, branch, repositoryId, isPublicRepo);
+    const gitResult = await this.syncRepositoryWithGit(owner, repo, branch, repositoryId, isPublicRepo, userId);
     if (gitResult && gitResult.success) {
       return { method: 'git', ...gitResult };
     }

     // Fallback: API-based download and storage
-    const apiResult = await this.downloadRepositoryWithStorage(owner, repo, branch, repositoryId, isPublicRepo);
+    const apiResult = await this.downloadRepositoryWithStorage(owner, repo, branch, repositoryId, isPublicRepo, userId);
     if (apiResult && apiResult.success) {
       return { method: 'api', ...apiResult, git_error: gitResult?.error };
     }
@ -737,7 +770,7 @@ class GitHubIntegrationService {
   }

   // Download repository files locally and store in database
-  async downloadRepositoryWithStorage(owner, repo, branch, repositoryId, isPublicRepo = false) {
+  async downloadRepositoryWithStorage(owner, repo, branch, repositoryId, isPublicRepo = false, userId = null) {
     const targetDir = path.join(
       process.env.ATTACHED_REPOS_DIR,
       `${owner}__${repo}__${branch}`
@ -765,8 +798,8 @@ class GitHubIntegrationService {
         userAgent: 'CodeNuk-GitIntegration/1.0.0',
       });
     } else {
-      // For private repos, use authenticated octokit
-      octokit = await this.getAuthenticatedOctokit();
+      // For private repos, use authenticated octokit with userId
+      octokit = await this.getAuthenticatedOctokit(userId);
     }

     // Get the commit SHA for the branch
@ -199,8 +199,16 @@ class GitHubOAuthService {
   }

   // Create authenticated Octokit instance
-  async getAuthenticatedOctokit() {
-    const tokenRecord = await this.getToken();
+  async getAuthenticatedOctokit(userId = null) {
+    // If userId is provided, get the newest token for that user
+    // Otherwise, get the newest token overall
+    let tokenRecord;
+    if (userId) {
+      tokenRecord = await this.getTokenForUser(userId);
+      console.log(`[GitHub OAuth] Using token for user ${userId}: ${tokenRecord?.github_username || 'none'}`);
+    } else {
+      tokenRecord = await this.getToken();
+    }

     if (!tokenRecord) {
       throw new Error('No GitHub token found. Please authenticate with GitHub first.');
@ -15,7 +15,11 @@ class GithubAdapter {
     return this.impl.parseGitHubUrl(url);
   }

-  async checkRepositoryAccess(owner, repo) {
+  async checkRepositoryAccess(owner, repo, userId = null) {
+    // Use user-specific method if userId is provided
+    if (userId) {
+      return await this.impl.checkRepositoryAccessWithUser(owner, repo, userId);
+    }
     return await this.impl.checkRepositoryAccess(owner, repo);
   }
58  services/multi-document-upload-service/.dockerignore  Normal file
@ -0,0 +1,58 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
*.egg-info/
dist/
build/
*.egg

# Virtual environments
venv/
env/
ENV/
.venv

# IDE
.vscode/
.idea/
*.swp
*.swo
*~

# Documentation
*.md
!README.md

# Testing
.pytest_cache/
.coverage
htmlcov/
*.log

# Storage and temporary files
storage/
*.tmp
*.temp

# Git
.git/
.gitignore

# Docker
Dockerfile*
docker-compose*.yml
.dockerignore

# Environment files
.env
.env.local
*.env

# OS
.DS_Store
Thumbs.db
@ -1,29 +1,60 @@
-FROM python:3.11-slim
+# Build stage - install dependencies that require compilation
+FROM python:3.11-slim as builder

 ENV PYTHONDONTWRITEBYTECODE=1 \
     PYTHONUNBUFFERED=1

 WORKDIR /app

+# Install build dependencies only
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
     build-essential \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy and install Python dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir --user -r requirements.txt && \
+    pip cache purge
+
+# Download SpaCy English model
+RUN python -m spacy download en_core_web_sm
+
+# Runtime stage - minimal image with only runtime dependencies
+FROM python:3.11-slim
+
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PYTHONPATH=/app/src \
+    PATH=/root/.local/bin:$PATH \
+    MULTI_DOC_STORAGE_ROOT=/app/storage \
+    MULTI_DOC_CLAUDE_MODEL=claude-3-5-haiku-latest \
+    CLAUDE_MODEL=claude-3-5-haiku-latest \
+    PORT=8024
+
+WORKDIR /app
+
+# Install only runtime dependencies (no build tools)
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
     poppler-utils \
     tesseract-ocr \
     ffmpeg \
     libmagic1 \
-    && rm -rf /var/lib/apt/lists/*
+    curl \
+    # Required for some Python packages at runtime
+    libgomp1 \
+    libglib2.0-0 \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean

-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
+# Copy Python packages from builder stage (includes spacy model)
+COPY --from=builder /root/.local /root/.local

+# Copy application code
 COPY src ./src

-ENV PYTHONPATH=/app/src \
-    MULTI_DOC_STORAGE_ROOT=/app/storage \
-    MULTI_DOC_CLAUDE_MODEL=claude-3-5-sonnet-20241022 \
-    PORT=8024
-
 EXPOSE 8024

 CMD ["sh", "-c", "uvicorn multi_document_upload_service.main:app --host 0.0.0.0 --port ${PORT:-8024}"]
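A quick way to confirm the SpaCy model downloaded in the builder stage survives the `COPY --from=builder` step is to run a small check inside the built container. This is a sketch only, assuming the image is built from the Dockerfile above (for example via `docker-compose build multi-document-upload-service`).

```python
# check_spacy_model.py - run inside the multi-document-upload-service container.
# Raises OSError if en_core_web_sm was not copied from the builder stage.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Heavy rain causes flooding.")
print([(token.text, token.pos_) for token in doc])
```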
@ -1,144 +0,0 @@
|
|||||||
# Fix: Empty Graph in Neo4j (No Relationships Found)
|
|
||||||
|
|
||||||
## Problem
|
|
||||||
|
|
||||||
When querying Neo4j for `CAUSES` relationships, you get "(no changes, no records)" because:
|
|
||||||
|
|
||||||
1. **PDF extraction failed** - Missing dependencies (`unstructured[pdf]`)
|
|
||||||
2. **0 relations extracted** - No text was extracted, so no analysis happened
|
|
||||||
3. **0 relations written** - Nothing was written to Neo4j (correct behavior)
|
|
||||||
|
|
||||||
## Root Cause
|
|
||||||
|
|
||||||
The service completed with 0 relations because:
|
|
||||||
- PDF file extraction failed: `partition_pdf() is not available because one or more dependencies are not installed`
|
|
||||||
- No text was extracted from the PDF
|
|
||||||
- No chunks were created
|
|
||||||
- No Claude analysis happened
|
|
||||||
- 0 relations were extracted
|
|
||||||
- 0 relations were written to Neo4j
|
|
||||||
|
|
||||||
## Solution
|
|
||||||
|
|
||||||
### Step 1: Update Dependencies
|
|
||||||
|
|
||||||
The `requirements.txt` has been updated to include:
|
|
||||||
```
|
|
||||||
unstructured[pdf]>=0.15.0
|
|
||||||
unstructured[docx]>=0.15.0
|
|
||||||
unstructured[pptx]>=0.15.0
|
|
||||||
unstructured[xlsx]>=0.15.0
|
|
||||||
```
|
|
||||||
|
|
||||||
### Step 2: Rebuild the Service
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd /home/tech4biz/Desktop/prakash/codenuk/backend_new1/codenuk_backend_mine
|
|
||||||
|
|
||||||
# Rebuild the service with new dependencies
|
|
||||||
docker-compose build multi-document-upload-service
|
|
||||||
|
|
||||||
# Restart the service
|
|
||||||
docker-compose restart multi-document-upload-service
|
|
||||||
|
|
||||||
# Check logs to verify it's working
|
|
||||||
docker-compose logs -f multi-document-upload-service
|
|
||||||
```
|
|
||||||
|
|
||||||
### Step 3: Verify Dependencies
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Check if unstructured[pdf] is installed
|
|
||||||
docker-compose exec multi-document-upload-service pip list | grep unstructured
|
|
||||||
```
|
|
||||||
|
|
||||||
### Step 4: Re-upload Documents
|
|
||||||
|
|
||||||
1. Go to Project Builder in the frontend
|
|
||||||
2. Click on "Upload Documents for Knowledge Graph"
|
|
||||||
3. Upload a PDF or other document
|
|
||||||
4. Wait for processing to complete
|
|
||||||
5. Check Neo4j for relationships
|
|
||||||
|
|
||||||
### Step 5: Check Neo4j
|
|
||||||
|
|
||||||
Run these queries in Neo4j Browser:
|
|
||||||
|
|
||||||
```cypher
|
|
||||||
// Check if any nodes exist
|
|
||||||
MATCH (n)
|
|
||||||
RETURN count(n) as node_count
|
|
||||||
|
|
||||||
// Check for CAUSES relationships
|
|
||||||
MATCH (n:Concept)-[r:CAUSES]->(m:Concept)
|
|
||||||
RETURN n.name as cause, m.name as effect, r.confidence as confidence
|
|
||||||
LIMIT 50
|
|
||||||
```
|
|
||||||
|
|
||||||
## Expected Behavior After Fix
|
|
||||||
|
|
||||||
1. **PDF extraction succeeds** - Text is extracted from PDF files
|
|
||||||
2. **Text is chunked** - Document is split into manageable chunks
|
|
||||||
3. **Claude analyzes** - Causal relationships are extracted
|
|
||||||
4. **Relations are written** - Relationships are stored in Neo4j
|
|
||||||
5. **Query returns results** - Neo4j query shows relationships
|
|
||||||
|
|
||||||
## Verification Steps
|
|
||||||
|
|
||||||
1. **Check service logs**:
|
|
||||||
```bash
|
|
||||||
docker-compose logs multi-document-upload-service | grep -i "extracted\|relation\|neo4j"
|
|
||||||
```
|
|
||||||
|
|
||||||
2. **Check job status**:
|
|
||||||
```bash
|
|
||||||
curl http://localhost:8000/api/multi-docs/jobs/{job_id}
|
|
||||||
```
|
|
||||||
Should show: `"processed_files": 1` and relations count > 0
|
|
||||||
|
|
||||||
3. **Check Neo4j**:
|
|
||||||
```cypher
|
|
||||||
MATCH (n:Concept)-[r:CAUSES]->(m:Concept)
|
|
||||||
RETURN count(r) as relation_count
|
|
||||||
```
|
|
||||||
|
|
||||||
## Improvements Made
|
|
||||||
|
|
||||||
1. ✅ **Added PDF dependencies** - `unstructured[pdf]`, `unstructured[docx]`, etc.
|
|
||||||
2. ✅ **Added fallback extractors** - Uses `pdfplumber` if unstructured fails
|
|
||||||
3. ✅ **Better error handling** - Shows actual errors in job status
|
|
||||||
4. ✅ **Improved logging** - More detailed logs for debugging
|
|
||||||
5. ✅ **Better Neo4j query** - Validates data before writing
|
|
||||||
|
|
||||||
## Troubleshooting
|
|
||||||
|
|
||||||
If you still see 0 relations after rebuilding:
|
|
||||||
|
|
||||||
1. **Check extraction logs**:
|
|
||||||
```bash
|
|
||||||
docker-compose logs multi-document-upload-service | grep -i "extract"
|
|
||||||
```
|
|
||||||
|
|
||||||
2. **Check Claude analysis**:
|
|
||||||
```bash
|
|
||||||
docker-compose logs multi-document-upload-service | grep -i "claude\|analyze"
|
|
||||||
```
|
|
||||||
|
|
||||||
3. **Check Neo4j connection**:
|
|
||||||
```bash
|
|
||||||
docker-compose logs multi-document-upload-service | grep -i "neo4j\|graph"
|
|
||||||
```
|
|
||||||
|
|
||||||
4. **Verify document has causal language**:
|
|
||||||
- Not all documents contain causal relationships
|
|
||||||
- Try uploading a document with clear cause-effect statements
|
|
||||||
- Example: "Smoking causes lung cancer" or "Rain causes flooding"
|
|
||||||
|
|
||||||
## Next Steps
|
|
||||||
|
|
||||||
1. Rebuild the service with new dependencies
|
|
||||||
2. Re-upload documents
|
|
||||||
3. Check Neo4j for relationships
|
|
||||||
4. If still no results, check service logs for errors
|
|
||||||
5. Verify the document contains causal language
|
|
||||||
|
|
||||||
@ -1,176 +0,0 @@
|
|||||||
# Neo4j Diagnostic Queries
|
|
||||||
|
|
||||||
## Issue: No relationships found in Neo4j
|
|
||||||
|
|
||||||
If you're seeing "(no changes, no records)" when querying for `CAUSES` relationships, here are diagnostic queries to check what's actually in the database.
|
|
||||||
|
|
||||||
## Diagnostic Queries
|
|
||||||
|
|
||||||
### 1. Check if any nodes exist
|
|
||||||
```cypher
|
|
||||||
MATCH (n)
|
|
||||||
RETURN count(n) as node_count
|
|
||||||
LIMIT 1
|
|
||||||
```
|
|
||||||
|
|
||||||
### 2. Check if Concept nodes exist
|
|
||||||
```cypher
|
|
||||||
MATCH (n:Concept)
|
|
||||||
RETURN count(n) as concept_count,
|
|
||||||
collect(DISTINCT labels(n)) as labels,
|
|
||||||
collect(DISTINCT keys(n)) as properties
|
|
||||||
LIMIT 10
|
|
||||||
```
|
|
||||||
|
|
||||||
### 3. Check all relationship types
|
|
||||||
```cypher
|
|
||||||
CALL db.relationshipTypes() YIELD relationshipType
|
|
||||||
RETURN relationshipType
|
|
||||||
```
|
|
||||||
|
|
||||||
### 4. Check all node labels
|
|
||||||
```cypher
|
|
||||||
CALL db.labels() YIELD label
|
|
||||||
RETURN label
|
|
||||||
```
|
|
||||||
|
|
||||||
### 5. Check all relationships (any type)
|
|
||||||
```cypher
|
|
||||||
MATCH (n)-[r]->(m)
|
|
||||||
RETURN type(r) as relationship_type,
|
|
||||||
count(r) as count,
|
|
||||||
labels(n) as from_labels,
|
|
||||||
labels(m) as to_labels
|
|
||||||
LIMIT 50
|
|
||||||
```
|
|
||||||
|
|
||||||
### 6. Check for CAUSES relationships specifically
|
|
||||||
```cypher
|
|
||||||
MATCH (n)-[r:CAUSES]->(m)
|
|
||||||
RETURN n, r, m
|
|
||||||
LIMIT 50
|
|
||||||
```
|
|
||||||
|
|
||||||
### 7. Check for relationships with lowercase "causes"
|
|
||||||
```cypher
|
|
||||||
MATCH (n)-[r]->(m)
|
|
||||||
WHERE type(r) =~ '(?i)causes'
|
|
||||||
RETURN type(r) as relationship_type, n, r, m
|
|
||||||
LIMIT 50
|
|
||||||
```
|
|
||||||
|
|
||||||
### 8. Check all nodes and their relationships
|
|
||||||
```cypher
|
|
||||||
MATCH (n)
|
|
||||||
OPTIONAL MATCH (n)-[r]->(m)
|
|
||||||
RETURN n, labels(n) as node_labels,
|
|
||||||
type(r) as relationship_type,
|
|
||||||
m, labels(m) as target_labels
|
|
||||||
LIMIT 50
|
|
||||||
```
|
|
||||||
|
|
||||||
### 9. Check for nodes created by the service (by job_id property)
|
|
||||||
```cypher
|
|
||||||
MATCH (n)-[r]->(m)
|
|
||||||
WHERE r.job_id IS NOT NULL
|
|
||||||
RETURN n, r, m, r.job_id as job_id
|
|
||||||
LIMIT 50
|
|
||||||
```
|
|
||||||
|
|
||||||
### 10. Check database statistics
|
|
||||||
```cypher
|
|
||||||
MATCH (n)
|
|
||||||
RETURN count(n) as total_nodes,
|
|
||||||
size([(n)-[r]->() | r]) as total_relationships
|
|
||||||
```
|
|
||||||
|
|
||||||
## Common Issues and Solutions
|
|
||||||
|
|
||||||
### Issue 1: No nodes at all
|
|
||||||
**Symptom**: Query 1 returns 0 nodes
|
|
||||||
**Cause**: Service hasn't written anything to Neo4j, or connection failed
|
|
||||||
**Solution**:
|
|
||||||
- Check service logs: `docker-compose logs multi-document-upload-service`
|
|
||||||
- Verify Neo4j connection in service configuration
|
|
||||||
- Check if job completed with 0 relations (extraction failed)
|
|
||||||
|
|
||||||
### Issue 2: Nodes exist but no relationships
|
|
||||||
**Symptom**: Query 1 returns nodes, but Query 6 returns no relationships
|
|
||||||
**Cause**: Relationships weren't created, or different relationship type
|
|
||||||
**Solution**:
|
|
||||||
- Check Query 5 to see what relationship types actually exist
|
|
||||||
- Check service logs for graph writing errors
|
|
||||||
- Verify the job actually extracted relations (check job status)
|
|
||||||
|
|
||||||
### Issue 3: Different relationship type
|
|
||||||
**Symptom**: Query 5 shows relationships but not `CAUSES`
|
|
||||||
**Cause**: Service might be using a different relationship type
|
|
||||||
**Solution**:
|
|
||||||
- Check Query 3 to see all relationship types
|
|
||||||
- Update query to use the correct relationship type
|
|
||||||
|
|
||||||
### Issue 4: Different node labels
|
|
||||||
**Symptom**: Query 6 returns no results, but Query 2 shows different labels
|
|
||||||
**Cause**: Service might be using different node labels
|
|
||||||
**Solution**:
|
|
||||||
- Check Query 2 to see what labels exist
|
|
||||||
- Update query to match actual labels
|
|
||||||
|
|
||||||
## Expected Structure
|
|
||||||
|
|
||||||
After a successful upload, you should see:
|
|
||||||
|
|
||||||
### Nodes
|
|
||||||
- **Label**: `Concept`
|
|
||||||
- **Properties**: `name`, `lastSeen`
|
|
||||||
|
|
||||||
### Relationships
|
|
||||||
- **Type**: `CAUSES`
|
|
||||||
- **Properties**: `confidence`, `explanation`, `source_file_id`, `source_snippet`, `job_id`, `model`, `updated_at`
|
|
||||||
|
|
||||||
### Example Query
|
|
||||||
```cypher
|
|
||||||
MATCH (cause:Concept)-[r:CAUSES]->(effect:Concept)
|
|
||||||
RETURN cause.name as cause,
|
|
||||||
effect.name as effect,
|
|
||||||
r.confidence as confidence,
|
|
||||||
r.job_id as job_id,
|
|
||||||
r.source_file_id as source_file
|
|
||||||
LIMIT 50
|
|
||||||
```
|
|
||||||
|
|
||||||
## Troubleshooting Steps
|
|
||||||
|
|
||||||
1. **Check service logs**:
|
|
||||||
```bash
|
|
||||||
docker-compose logs -f multi-document-upload-service
|
|
||||||
```
|
|
||||||
|
|
||||||
2. **Check if job completed successfully**:
|
|
||||||
```bash
|
|
||||||
curl http://localhost:8000/api/multi-docs/jobs/{job_id}
|
|
||||||
```
|
|
||||||
|
|
||||||
3. **Check Neo4j connection**:
|
|
||||||
```bash
|
|
||||||
docker-compose logs neo4j | grep -i error
|
|
||||||
```
|
|
||||||
|
|
||||||
4. **Verify Neo4j is running**:
|
|
||||||
```bash
|
|
||||||
docker-compose ps neo4j
|
|
||||||
```
|
|
||||||
|
|
||||||
5. **Test Neo4j connection manually**:
|
|
||||||
```bash
|
|
||||||
docker-compose exec neo4j cypher-shell -u neo4j -p password "MATCH (n) RETURN count(n)"
|
|
||||||
```
|
|
||||||
|
|
||||||
## Next Steps
|
|
||||||
|
|
||||||
1. Run the diagnostic queries above
|
|
||||||
2. Check the service logs for errors
|
|
||||||
3. Verify the job status via API
|
|
||||||
4. Re-upload documents after fixing dependencies
|
|
||||||
5. Check if relations were actually extracted (job status should show relation count)
|
|
||||||
|
|
||||||
@ -1,85 +0,0 @@
|
|||||||
# Quick Testing Guide - Multi-Document Upload
|
|
||||||
|
|
||||||
## 🚀 Quick Start Testing
|
|
||||||
|
|
||||||
### 1. Start Services
|
|
||||||
```bash
|
|
||||||
cd /home/tech4biz/Desktop/prakash/codenuk/backend_new1/codenuk_backend_mine
|
|
||||||
docker-compose up -d multi-document-upload-service neo4j redis postgres api-gateway
|
|
||||||
```
|
|
||||||
|
|
||||||
### 2. Verify Services
|
|
||||||
```bash
|
|
||||||
# Check health
|
|
||||||
curl http://localhost:8024/health
|
|
||||||
curl http://localhost:8000/api/multi-docs/health
|
|
||||||
```
|
|
||||||
|
|
||||||
### 3. Test via Frontend
|
|
||||||
|
|
||||||
1. **Open Frontend**: `http://localhost:3001`
|
|
||||||
2. **Login** (if required)
|
|
||||||
3. **Go to Project Builder**
|
|
||||||
4. **Complete Steps 1-2** (Project Type & Features)
|
|
||||||
5. **Step 3: Multi Docs Upload** appears
|
|
||||||
6. **Upload files**:
|
|
||||||
- Click upload area
|
|
||||||
- Select multiple files (PDF, DOCX, etc.)
|
|
||||||
- Click "Start Upload"
|
|
||||||
7. **Watch Progress**:
|
|
||||||
- Progress bar updates
|
|
||||||
- Status messages appear
|
|
||||||
- Polls every 4 seconds
|
|
||||||
8. **Auto-proceeds** when completed
|
|
||||||
|
|
||||||
### 4. Verify in Neo4j
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Open Neo4j Browser: http://localhost:7474
|
|
||||||
# Login: neo4j / password
|
|
||||||
|
|
||||||
# Query causal relationships:
|
|
||||||
MATCH (n)-[r:CAUSES]->(m)
|
|
||||||
RETURN n, r, m
|
|
||||||
LIMIT 50
|
|
||||||
```
|
|
||||||
|
|
||||||
## 📝 Test Checklist
|
|
||||||
|
|
||||||
- [ ] Service starts successfully
|
|
||||||
- [ ] Health endpoint works
|
|
||||||
- [ ] Frontend component renders
|
|
||||||
- [ ] File upload works
|
|
||||||
- [ ] Progress updates correctly
|
|
||||||
- [ ] Job completes successfully
|
|
||||||
- [ ] Neo4j graph contains relationships
|
|
||||||
- [ ] Error handling works
|
|
||||||
- [ ] Skip button works
|
|
||||||
|
|
||||||
## 🔍 Debug Commands
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# View service logs
|
|
||||||
docker-compose logs -f multi-document-upload-service
|
|
||||||
|
|
||||||
# Check job status (replace {job_id})
|
|
||||||
curl http://localhost:8000/api/multi-docs/jobs/{job_id}
|
|
||||||
|
|
||||||
# Check graph summary
|
|
||||||
curl http://localhost:8000/api/multi-docs/jobs/{job_id}/graph
|
|
||||||
```
|
|
||||||
|
|
||||||
## ⚠️ Common Issues
|
|
||||||
|
|
||||||
1. **502 Bad Gateway**: Service not running → `docker-compose ps`
|
|
||||||
2. **413 Too Large**: File too big → Reduce file size
|
|
||||||
3. **No progress**: Check browser console → Check network tab
|
|
||||||
4. **No relationships**: Check Claude API key → Check service logs
|
|
||||||
|
|
||||||
## 🎯 Expected Flow
|
|
||||||
|
|
||||||
```
|
|
||||||
Upload Files → Job Created → Files Saved → Content Extracted →
|
|
||||||
Claude Analysis → Graph Built → Completed → Auto-proceed to Next Step
|
|
||||||
```
|
|
||||||
|
|
||||||
File diff suppressed because it is too large
@@ -1,152 +0,0 @@
# Rebuild Instructions - Multi-Document Upload Service

## Issue: Empty Graph in Neo4j

**Problem**: Query returns "(no changes, no records)" because the job completed with 0 relations.

**Root Cause**: PDF extraction failed due to missing dependencies (`unstructured[pdf]`).

## Fixes Applied

1. ✅ Added PDF dependencies (`unstructured[pdf]`, `unstructured[docx]`, etc.)
2. ✅ Added fallback extractors (pdfplumber, python-docx, python-pptx)
3. ✅ Improved error handling and logging
4. ✅ Fixed Neo4j query syntax
5. ✅ Better status messages

## Rebuild Steps

### Step 1: Rebuild the Service

```bash
cd /home/tech4biz/Desktop/prakash/codenuk/backend_new1/codenuk_backend_mine

# Stop the service
docker-compose stop multi-document-upload-service

# Rebuild with new dependencies
docker-compose build --no-cache multi-document-upload-service

# Start the service
docker-compose up -d multi-document-upload-service

# Check logs to verify it's starting correctly
docker-compose logs -f multi-document-upload-service
```

### Step 2: Verify Dependencies

```bash
# Check if unstructured[pdf] is installed
docker-compose exec multi-document-upload-service pip list | grep unstructured

# You should see:
# unstructured
# unstructured-pdf
# unstructured-docx
# etc.
```

### Step 3: Test the Service

```bash
# Check health endpoint
curl http://localhost:8024/health

# Should return:
# {
#   "status": "ok",
#   "claude_model": "claude-3-5-haiku-latest",
#   ...
# }
```

### Step 4: Re-upload Documents

1. Open the frontend: `http://localhost:3001/project-builder`
2. Go to Step 1: Project Type
3. Find the "Upload Documents for Knowledge Graph" section
4. Upload a PDF or other document
5. Wait for processing to complete
6. Check the status - it should show a relation count > 0

### Step 5: Verify in Neo4j

Run these queries in Neo4j Browser (`http://localhost:7474`):

```cypher
// Check if any nodes exist
MATCH (n)
RETURN count(n) as node_count

// Check for CAUSES relationships
MATCH (n:Concept)-[r:CAUSES]->(m:Concept)
RETURN n.name as cause,
       m.name as effect,
       r.confidence as confidence,
       r.job_id as job_id
LIMIT 50
```

## Expected Results

After rebuilding and re-uploading:

1. **PDF extraction succeeds** ✅
2. **Text is extracted** ✅
3. **Relations are extracted** ✅
4. **Relations are written to Neo4j** ✅
5. **Query returns results** ✅

## Troubleshooting

If you still see 0 relations:

1. **Check service logs**:
   ```bash
   docker-compose logs multi-document-upload-service | tail -50
   ```

2. **Check extraction logs**:
   ```bash
   docker-compose logs multi-document-upload-service | grep -i "extract\|pdf"
   ```

3. **Check Claude analysis**:
   ```bash
   docker-compose logs multi-document-upload-service | grep -i "claude\|analyze\|relation"
   ```

4. **Check Neo4j connection**:
   ```bash
   docker-compose logs multi-document-upload-service | grep -i "neo4j\|graph\|write"
   ```

5. **Verify the document has causal language**:
   - Not all documents contain causal relationships
   - Try uploading a document with clear cause-effect statements
   - Example: "Smoking causes lung cancer"

## Quick Test

Test with a simple text file:

1. Create a test file `test_causal.txt`:
   ```
   Smoking cigarettes causes lung cancer.
   Heavy rain causes flooding.
   Exercise improves health.
   ```

2. Upload it via the frontend
3. Check Neo4j for relationships
4. You should see 3 causal relationships

## Next Steps

1. Rebuild the service
2. Re-upload documents
3. Check Neo4j for relationships
4. If there are still no results, check the service logs
5. Verify the document contains causal language
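
As a complement to Step 5, the same check can be scripted. This is a minimal sketch using the `neo4j` Python driver; the Bolt URI and the `neo4j`/`password` credentials are assumptions taken from the local docker-compose setup described in the testing guide.

```python
# Minimal sketch: count CAUSES edges written by the service.
# Assumes a local Neo4j at bolt://localhost:7687 with neo4j/password credentials.
from neo4j import GraphDatabase

URI = "bolt://localhost:7687"   # assumption: default local Bolt port
AUTH = ("neo4j", "password")    # assumption: credentials from the testing guide


def count_causal_relations() -> int:
    """Return the number of (:Concept)-[:CAUSES]->(:Concept) relationships."""
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        records, _, _ = driver.execute_query(
            "MATCH (:Concept)-[r:CAUSES]->(:Concept) RETURN count(r) AS relation_count"
        )
        return records[0]["relation_count"]


if __name__ == "__main__":
    print(f"CAUSES relationships in Neo4j: {count_causal_relations()}")
```

A count greater than zero after re-uploading confirms the rebuild fixed the extraction path.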
@@ -1,300 +0,0 @@
# Multi-Document Upload Service - Frontend Testing Guide

## Prerequisites

1. **Backend Services Running**:
   ```bash
   cd /home/tech4biz/Desktop/prakash/codenuk/backend_new1/codenuk_backend_mine
   docker-compose up -d
   ```

2. **Verify Services are Running**:
   - API Gateway: `http://localhost:8000/health`
   - Multi-Document Upload Service: `http://localhost:8024/health`
   - Neo4j: `http://localhost:7474` (Browser interface)
   - Frontend: `http://localhost:3001` (or your frontend port)

3. **Check Service Health**:
   ```bash
   # Check API Gateway
   curl http://localhost:8000/health

   # Check Multi-Document Upload Service directly
   curl http://localhost:8024/health

   # Check via API Gateway proxy
   curl http://localhost:8000/api/multi-docs/health
   ```

## Frontend Testing Steps

### Step 1: Navigate to Project Builder

1. Open your browser and go to: `http://localhost:3001` (or your frontend URL)
2. Log in if required
3. Click on **"Project Builder"** in the navigation

### Step 2: Go to Multi Docs Upload Step

1. In the Project Builder, you should see the workflow steps:
   - **Step 1**: Project Type
   - **Step 2**: Features
   - **Step 3**: Multi Docs Upload ← **This is the new step**
   - **Step 4**: Business Context
   - **Step 5**: Generate
   - **Step 6**: Architecture

2. Complete Steps 1 and 2 (Project Type and Features selection)
3. You will automatically be taken to **Step 3: Multi Docs Upload**

### Step 3: Upload Documents

1. **Click on the upload area** or **drag and drop files**
2. **Select multiple files** (you can mix different formats):
   - PDF files (`.pdf`)
   - Word documents (`.doc`, `.docx`)
   - PowerPoint (`.ppt`, `.pptx`)
   - Excel files (`.xls`, `.xlsx`)
   - JSON files (`.json`)
   - XML files (`.xml`)
   - Markdown files (`.md`)
   - Images (`.png`, `.jpg`, `.jpeg`) - will use OCR
   - Audio files (`.mp3`, `.wav`) - will be transcribed
   - Video files (`.mp4`, `.avi`) - will be transcribed

3. **View selected files**: You should see a list of all selected files with:
   - File icon
   - File name
   - Remove button for each file

4. **Click the "Start Upload" button**

### Step 4: Monitor Upload Progress

After clicking "Start Upload", you should see:

1. **Upload Status**:
   - Button shows "Uploading..." with a spinner
   - Progress bar appears
   - Stage messages appear:
     - "Job received"
     - "Saving files"
     - "Extracting document content"
     - "Calling Claude for causal relations"
     - "Writing to Neo4j knowledge graph"
     - "Completed"

2. **Progress Indicators**:
   - Progress percentage (0-100%)
   - Status message showing the current stage
   - Processed files count vs total files count

3. **Polling**: The frontend automatically polls the job status every 4 seconds

### Step 5: Verify Results

Once the job is completed:

1. **Check Neo4j Graph**:
   - Open Neo4j Browser: `http://localhost:7474`
   - Login with:
     - Username: `neo4j`
     - Password: `password`
   - Run a Cypher query to see the graph:
     ```cypher
     MATCH (n)-[r:CAUSES]->(m)
     RETURN n, r, m
     LIMIT 50
     ```

2. **Check Job Status via API**:
   ```bash
   # Replace {job_id} with the actual job ID from the frontend
   curl http://localhost:8000/api/multi-docs/jobs/{job_id}
   ```

3. **Get Graph Summary**:
   ```bash
   curl http://localhost:8000/api/multi-docs/jobs/{job_id}/graph
   ```

## Testing Different Scenarios

### Scenario 1: Single PDF File
- Upload one PDF file
- Verify it processes correctly
- Check Neo4j for causal relationships

### Scenario 2: Multiple Mixed Format Files
- Upload 3-5 files of different formats (PDF, DOCX, JSON, image)
- Verify all files are processed
- Check that progress updates correctly

### Scenario 3: Large Files
- Upload a large PDF (10+ MB)
- Verify it handles large files correctly
- Check processing time

### Scenario 4: Error Handling
- Try uploading an unsupported file type
- Verify an error message appears
- Check that the error is displayed clearly

### Scenario 5: Skip Option
- Upload files
- Click the "Skip" button before completion
- Verify you can proceed to the next step
- The job continues processing in the background

## Browser Developer Tools

### Check Network Requests

1. **Open Developer Tools** (F12)
2. **Go to the Network tab**
3. **Filter by "multi-docs"**
4. **Monitor requests**:
   - `POST /api/multi-docs/jobs` - Upload files
   - `GET /api/multi-docs/jobs/{job_id}` - Poll job status
   - `GET /api/multi-docs/jobs/{job_id}/graph` - Get graph summary

### Check Console Logs

1. **Open the Console tab**
2. **Look for**:
   - Upload progress logs
   - Job status updates
   - Any error messages

### Check Response Data

Verify the API responses:

```javascript
// Upload response should be:
{
  "job_id": "uuid-here",
  "stage": "received",
  "total_files": 3,
  "created_at": "2024-01-01T00:00:00Z"
}

// Status response should be:
{
  "job_id": "uuid-here",
  "stage": "extracting",
  "status_message": "Extracting document content",
  "total_files": 3,
  "processed_files": 1,
  "error": null,
  "created_at": "2024-01-01T00:00:00Z",
  "updated_at": "2024-01-01T00:01:00Z",
  "files": [...]
}
```

## Troubleshooting

### Issue: Upload fails with 502 Bad Gateway
**Solution**:
- Check if multi-document-upload-service is running:
  ```bash
  docker-compose ps multi-document-upload-service
  ```
- Check service logs:
  ```bash
  docker-compose logs multi-document-upload-service
  ```

### Issue: Upload fails with 413 Request Entity Too Large
**Solution**:
- Check file sizes (max 500MB total per job)
- Reduce the number of files or file sizes
- Check API Gateway body size limits

### Issue: Status polling stops working
**Solution**:
- Check the browser console for errors
- Verify the job ID is correct
- Check if the job completed or failed
- Check the network tab for failed requests

### Issue: No causal relationships found
**Solution**:
- Check the Claude API key is configured correctly
- Check service logs for Claude API errors
- Verify the documents contain causal language
- Check the Neo4j connection

### Issue: Frontend shows "Failed" status
**Solution**:
- Check the error message in the frontend
- Check backend service logs:
  ```bash
  docker-compose logs -f multi-document-upload-service
  ```
- Verify all dependencies are running (Neo4j, Redis, Postgres)

## Expected Behavior

### Successful Flow:
1. ✅ Files upload successfully
2. ✅ Job ID is returned
3. ✅ Status polling starts automatically
4. ✅ Progress updates every 4 seconds
5. ✅ Stage changes are displayed
6. ✅ Progress bar updates
7. ✅ Job completes successfully
8. ✅ Frontend automatically proceeds to the next step
9. ✅ Neo4j contains causal relationships

### Error Flow:
1. ✅ Error message is displayed clearly
2. ✅ User can retry the upload
3. ✅ User can skip and proceed
4. ✅ Error details are logged in the console

## API Endpoints Reference

### Upload Files
```bash
POST /api/multi-docs/jobs
Content-Type: multipart/form-data

Form Data:
- files: File[] (multiple files)
- job_name: string (optional)
```

### Get Job Status
```bash
GET /api/multi-docs/jobs/{job_id}
```

### Get Graph Summary
```bash
GET /api/multi-docs/jobs/{job_id}/graph
```

### Health Check
```bash
GET /api/multi-docs/health
```

## Next Steps After Testing

1. **Verify Neo4j Graph**: Check that causal relationships are stored correctly
2. **Check Storage**: Verify files are stored in the persistent volume
3. **Monitor Performance**: Check processing times for different file types
4. **Test Error Scenarios**: Verify error handling works correctly
5. **Test Large Batches**: Upload 50+ files to test scalability

## Support

If you encounter issues:
1. Check service logs: `docker-compose logs multi-document-upload-service`
2. Check API Gateway logs: `docker-compose logs api-gateway`
3. Check Neo4j logs: `docker-compose logs neo4j`
4. Verify all environment variables are set correctly
5. Check network connectivity between services
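
The upload-and-poll flow described above can also be exercised without the frontend. This is a rough smoke-test sketch against the documented gateway endpoints; the terminal stage names ("completed"/"failed") and the `job_name` value are assumptions, not guaranteed by the API contract shown here.

```python
# Sketch: upload files to the multi-docs API and poll the job until it finishes.
# Assumes the API Gateway proxy at localhost:8000 and 4-second polling as above.
import time

import httpx

BASE = "http://localhost:8000/api/multi-docs"


def upload_and_wait(paths: list[str], poll_seconds: int = 4) -> dict:
    """Upload files, then poll the job status until a terminal stage is reached."""
    files = [("files", (p.split("/")[-1], open(p, "rb"))) for p in paths]
    resp = httpx.post(f"{BASE}/jobs", files=files, data={"job_name": "smoke-test"}, timeout=120)
    resp.raise_for_status()
    job_id = resp.json()["job_id"]

    while True:
        status = httpx.get(f"{BASE}/jobs/{job_id}", timeout=30).json()
        print(status["stage"], status.get("status_message"))
        if status["stage"] in {"completed", "failed"}:  # assumed terminal stage names
            return status
        time.sleep(poll_seconds)  # same 4s cadence the frontend uses


if __name__ == "__main__":
    print(upload_and_wait(["test_causal.txt"]))
```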
@@ -8,10 +8,6 @@ pydantic-settings>=2.2.1
 aiofiles>=23.2.1
 tenacity>=8.2.3
 python-dotenv>=1.0.1
-unstructured[pdf]>=0.15.0
-unstructured[docx]>=0.15.0
-unstructured[pptx]>=0.15.0
-unstructured[xlsx]>=0.15.0
 pdfplumber>=0.11.0
 python-docx>=1.1.0
 python-pptx>=0.6.23
@@ -30,5 +26,13 @@ beautifulsoup4>=4.12.3
 lxml>=5.2.1
 sqlalchemy>=2.0.25
 httpx>=0.27.0
-tiktoken>=0.7.0
+dowhy>=0.11.0
+qdrant-client>=1.7.0
+sentence-transformers>=2.2.0
+numpy>=1.24.0
+scipy>=1.11.0
+networkx>=3.1
+spacy>=3.7.0
+markdown>=3.5.0
+weasyprint>=60.0
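
The new `qdrant-client` and `sentence-transformers` dependencies are what the service needs to store knowledge-graph embeddings in the Qdrant container added by this commit. The sketch below shows one way they fit together, using the collection name (`kg_embeddings`), vector size (384), and embedding model defaults introduced in the config; the sample texts and point IDs are illustrative only.

```python
# Sketch: embed text snippets and index them in Qdrant for similarity search.
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, PointStruct, VectorParams
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # 384-dim vectors
client = QdrantClient(url="http://localhost:6333")

# Create (or reset) the collection the service defaults point at.
client.recreate_collection(
    collection_name="kg_embeddings",
    vectors_config=VectorParams(size=384, distance=Distance.COSINE),
)

snippets = ["Heavy rain causes flooding.", "Smoking causes lung cancer."]
vectors = model.encode(snippets)

client.upsert(
    collection_name="kg_embeddings",
    points=[
        PointStruct(id=i, vector=vec.tolist(), payload={"text": text})
        for i, (vec, text) in enumerate(zip(vectors, snippets))
    ],
)

hits = client.search(
    collection_name="kg_embeddings",
    query_vector=model.encode("What causes floods?").tolist(),
    limit=1,
)
print(hits[0].payload["text"])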
@@ -1,328 +0,0 @@
from __future__ import annotations

import base64
import json
import logging
import re
from pathlib import Path
from typing import Iterable, List

from anthropic import Anthropic, BadRequestError
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential, RetryCallState

from .models import CausalRelation

logger = logging.getLogger(__name__)


def is_billing_error(exception: Exception) -> bool:
    """Check if the exception is a billing/credit related error that shouldn't be retried."""
    if isinstance(exception, BadRequestError):
        error_message = str(exception).lower()
        billing_keywords = ["credit", "balance", "too low", "billing", "upgrade", "purchase credits"]
        return any(keyword in error_message for keyword in billing_keywords)
    return False


def should_retry_exception(retry_state: RetryCallState) -> bool:
    """Custom retry condition that excludes billing errors."""
    exception = retry_state.outcome.exception()
    if exception is None:
        return False
    # Don't retry billing errors - they won't be resolved by retrying
    if is_billing_error(exception):
        return False
    # Retry other exceptions
    return True


CLAUDE_PROMPT_TEMPLATE = """You are an expert analyst extracting causal relationships from documents.

Given the following text chunk, identify all explicit or strongly implied cause and effect pairs.
Return JSON with the schema:
[
  {
    "cause": "<short phrase>",
    "effect": "<short phrase>",
    "confidence": 0-1 float,
    "explanation": "<why this is causal>",
    "source_snippet": "<exact quote or paraphrase>"
  }
]

Only include items when the causal direction is clear.
If none are found, return an empty list [].

Text chunk:
```
<<<CHUNK_PLACEHOLDER>>>
```"""

IMAGE_PROMPT_TEMPLATE = """You are an expert analyst extracting causal relationships from images, diagrams, and visual content.

Analyze this image/diagram for causal relationships. Look for:
- Architecture flows (A → B → C)
- Dependency relationships
- Cause-effect chains in diagrams
- Process flows
- System interactions
- Data flows
- Sequential relationships
- Visual connections between components

Return JSON with the schema:
[
  {
    "cause": "<short phrase describing the cause>",
    "effect": "<short phrase describing the effect>",
    "confidence": 0-1 float,
    "explanation": "<why this is causal, referencing visual elements>",
    "source_snippet": "<description of what you see in the image that shows this relationship>"
  }
]

Only include items when the causal direction is clear from the visual structure.
If none are found, return an empty list []."""


class ClaudeCausalExtractor:
    def __init__(self, api_key: str, model: str, max_output_tokens: int = 4000):
        self.client = Anthropic(api_key=api_key)
        self.model = model
        self.max_output_tokens = max_output_tokens

    @retry(
        retry=should_retry_exception,
        wait=wait_exponential(multiplier=1, min=1, max=10),
        stop=stop_after_attempt(3),
        reraise=True,
    )
    def analyze_chunk(self, chunk: str, source_file_id: str) -> List[CausalRelation]:
        logger.debug("Analyzing chunk with Claude model %s", self.model)

        # Validate chunk is not empty and is readable text
        if not chunk or not chunk.strip():
            logger.warning("Empty or whitespace-only chunk, skipping")
            return []

        # Check if chunk contains mostly readable text (not binary data)
        # Simple heuristic: if >50% of characters are non-printable or control chars, skip it
        printable_chars = sum(1 for c in chunk if c.isprintable() or c.isspace())
        if len(chunk) > 100 and printable_chars / len(chunk) < 0.5:
            logger.warning("Chunk appears to contain binary data, skipping analysis")
            return []

        # Use string replacement with a unique placeholder to avoid KeyError with braces in content
        # This prevents Python's .format() from interpreting braces in the chunk text as format placeholders
        prompt_text = CLAUDE_PROMPT_TEMPLATE.replace("<<<CHUNK_PLACEHOLDER>>>", chunk)

        try:
            message = self.client.messages.create(
                model=self.model,
                max_tokens=self.max_output_tokens,
                temperature=0.0,
                system="You extract causal (cause→effect) relations with high precision.",
                messages=[
                    {
                        "role": "user",
                        "content": [{"type": "text", "text": prompt_text}],
                    }
                ],
            )
        except BadRequestError as e:
            # Check if it's a billing error
            if is_billing_error(e):
                error_msg = (
                    "Anthropic API credit balance is too low. "
                    "Please go to Plans & Billing to upgrade or purchase credits. "
                    f"Error: {str(e)}"
                )
                logger.error(error_msg)
                raise RuntimeError(error_msg) from e
            # Re-raise other BadRequestErrors
            raise

        content_blocks = message.content or []
        raw_text = "".join(block.text for block in content_blocks if hasattr(block, "text"))  # type: ignore[attr-defined]
        if not raw_text:
            return []

        # Try to extract JSON from markdown code blocks if present
        json_text = raw_text.strip()

        # Look for JSON in markdown code blocks (```json ... ```)
        json_match = re.search(r'```(?:json)?\s*(\[.*?\])\s*```', json_text, re.DOTALL)
        if json_match:
            json_text = json_match.group(1)
        else:
            # Look for JSON array/object at the start or end
            json_match = re.search(r'(\[.*?\]|{.*?})', json_text, re.DOTALL)
            if json_match:
                json_text = json_match.group(1)

        try:
            data = json.loads(json_text)
            if not isinstance(data, list):
                logger.warning("Claude response is not a list: %s", type(data))
                return []

            relations: List[CausalRelation] = []
            for item in data:
                if not isinstance(item, dict):
                    continue
                cause = item.get("cause", "").strip()
                effect = item.get("effect", "").strip()
                if not cause or not effect:
                    continue  # Skip invalid relations

                relations.append(
                    CausalRelation(
                        cause=cause,
                        effect=effect,
                        confidence=float(item.get("confidence", 0.0)),
                        explanation=item.get("explanation"),
                        source_file_id=source_file_id,
                        source_snippet=item.get("source_snippet"),
                        metadata={"model": self.model},
                    )
                )
            logger.info("Extracted %d relations from Claude response", len(relations))
            return relations
        except json.JSONDecodeError as e:
            logger.warning("Failed to parse Claude response as JSON: %s. Raw text: %s", e, raw_text[:200])
            return []

    def analyze(self, chunks: Iterable[str], source_file_id: str) -> List[CausalRelation]:
        relations: List[CausalRelation] = []
        for chunk in chunks:
            relations.extend(self.analyze_chunk(chunk, source_file_id=source_file_id))
        return relations

    @retry(
        retry=should_retry_exception,
        wait=wait_exponential(multiplier=1, min=1, max=10),
        stop=stop_after_attempt(3),
        reraise=True,
    )
    def analyze_image(self, image_path: Path, source_file_id: str) -> List[CausalRelation]:
        """
        Analyze an image using Claude Vision API to extract causal relationships.
        Sends image directly to Claude (no OCR).
        """
        logger.info("Analyzing image with Claude Vision: %s", image_path.name)

        try:
            # Read and encode image as base64
            with open(image_path, "rb") as image_file:
                image_data = image_file.read()

            # Determine media type
            suffix = image_path.suffix.lower()
            media_type_map = {
                ".png": "image/png",
                ".jpg": "image/jpeg",
                ".jpeg": "image/jpeg",
                ".gif": "image/gif",
                ".webp": "image/webp",
            }
            media_type = media_type_map.get(suffix, "image/png")

            # Encode to base64
            base64_image = base64.b64encode(image_data).decode("utf-8")

            # Prepare content for Claude Vision API
            content = [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": media_type,
                        "data": base64_image,
                    },
                },
                {
                    "type": "text",
                    "text": IMAGE_PROMPT_TEMPLATE,
                },
            ]

            # Call Claude Vision API
            try:
                message = self.client.messages.create(
                    model=self.model,  # Claude models support vision
                    max_tokens=self.max_output_tokens,
                    temperature=0.0,
                    system="You extract causal (cause→effect) relations from visual content with high precision.",
                    messages=[
                        {
                            "role": "user",
                            "content": content,
                        }
                    ],
                )
            except BadRequestError as e:
                # Check if it's a billing error
                if is_billing_error(e):
                    error_msg = (
                        "Anthropic API credit balance is too low. "
                        "Please go to Plans & Billing to upgrade or purchase credits. "
                        f"Error: {str(e)}"
                    )
                    logger.error(error_msg)
                    raise RuntimeError(error_msg) from e
                # Re-raise other BadRequestErrors
                raise

            # Parse response
            content_blocks = message.content or []
            raw_text = "".join(block.text for block in content_blocks if hasattr(block, "text"))  # type: ignore[attr-defined]
            if not raw_text:
                logger.warning("No text response from Claude Vision for image %s", image_path.name)
                return []

            # Extract JSON from response
            json_text = raw_text.strip()
            json_match = re.search(r'```(?:json)?\s*(\[.*?\])\s*```', json_text, re.DOTALL)
            if json_match:
                json_text = json_match.group(1)
            else:
                json_match = re.search(r'(\[.*?\]|{.*?})', json_text, re.DOTALL)
                if json_match:
                    json_text = json_match.group(1)

            try:
                data = json.loads(json_text)
                if not isinstance(data, list):
                    logger.warning("Claude Vision response is not a list: %s", type(data))
                    return []

                relations: List[CausalRelation] = []
                for item in data:
                    if not isinstance(item, dict):
                        continue
                    cause = item.get("cause", "").strip()
                    effect = item.get("effect", "").strip()
                    if not cause or not effect:
                        continue

                    relations.append(
                        CausalRelation(
                            cause=cause,
                            effect=effect,
                            confidence=float(item.get("confidence", 0.0)),
                            explanation=item.get("explanation"),
                            source_file_id=source_file_id,
                            source_snippet=item.get("source_snippet") or f"Image: {image_path.name}",
                            metadata={"model": self.model, "content_type": "image", "image_path": str(image_path)},
                        )
                    )
                logger.info("Extracted %d relations from image %s", len(relations), image_path.name)
                return relations
            except json.JSONDecodeError as e:
                logger.warning("Failed to parse Claude Vision response as JSON: %s. Raw text: %s", e, raw_text[:200])
                return []

        except Exception as exc:
            logger.exception("Failed to analyze image %s: %s", image_path, exc)
            return []
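
For reference, this is how the removed extractor was typically driven: chunk the document text, call `analyze`, and read the resulting `CausalRelation` objects. The sample chunk and file ID are illustrative, and the model name is the default documented elsewhere in this commit.

```python
# Illustrative usage of ClaudeCausalExtractor (removed by this commit).
import os

extractor = ClaudeCausalExtractor(
    api_key=os.environ["ANTHROPIC_API_KEY"],
    model="claude-3-5-haiku-latest",
    max_output_tokens=4000,
)
chunks = ["Heavy rain causes flooding in low-lying areas."]
relations = extractor.analyze(chunks, source_file_id="demo-file-1")
for rel in relations:
    print(f"{rel.cause} -> {rel.effect} (confidence={rel.confidence})")
```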
@@ -20,7 +20,7 @@ class Settings(BaseSettings):
     model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore")

     anthropic_api_key: str | None = Field(default=None, validation_alias="ANTHROPIC_API_KEY")
-    claude_model: str = Field(default=os.getenv("MULTI_DOC_CLAUDE_MODEL", "claude-3-5-sonnet-20241022"))
+    claude_model: str = Field(default=os.getenv("MULTI_DOC_CLAUDE_MODEL", os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest")))
     claude_max_input_tokens: int = Field(default=200_000)
     claude_max_output_tokens: int = Field(default=16_000)

@@ -37,6 +37,27 @@ class Settings(BaseSettings):
     job_retention_days: int = Field(default=30)

+    # Qwen2.5-VL API configuration
+    qwen_api_key: str | None = Field(default=None, validation_alias="QWEN_API_KEY")
+    qwen_api_url: str = Field(default=os.getenv("QWEN_API_URL", "https://api.example.com/v1/chat/completions"))
+    qwen_model: str = Field(default=os.getenv("QWEN_MODEL", "qwen2.5-vl"))
+
+    # DoWhy configuration
+    dowhy_enabled: bool = Field(default=True)
+    dowhy_confidence_threshold: float = Field(default=0.05)
+
+    # Embedding configuration
+    embedding_model: str = Field(default="sentence-transformers/all-MiniLM-L6-v2")
+    embedding_dimension: int = Field(default=384)
+
+    # Qdrant configuration
+    qdrant_url: str = Field(default=os.getenv("QDRANT_URL", "http://localhost:6333"))
+    qdrant_collection_name: str = Field(default="kg_embeddings")
+    qdrant_vector_size: int = Field(default=384)
+
+    # Report generation configuration
+    report_format: str = Field(default="markdown")
+
     def ensure_storage_dirs(self) -> None:
         (self.storage_root / "jobs").mkdir(parents=True, exist_ok=True)
         (self.storage_root / "uploads").mkdir(parents=True, exist_ok=True)
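
The new `dowhy_enabled` / `dowhy_confidence_threshold` settings suggest refuting extracted edges against tabular evidence. The sketch below is one plausible shape for that check; the synthetic data, column names, and the way the 0.05 threshold would gate acceptance are all assumptions rather than service code.

```python
# Rough sketch: sanity-check a cause→effect edge with DoWhy on synthetic data.
import numpy as np
import pandas as pd
from dowhy import CausalModel

rng = np.random.default_rng(0)
season = rng.integers(0, 2, size=500)                      # hypothetical confounder
rain = (season | (rng.random(500) > 0.7)).astype(int)      # treatment
flooding = ((rain == 1) & (rng.random(500) > 0.3)).astype(int)  # outcome
df = pd.DataFrame({"season": season, "rain": rain, "flooding": flooding})

model = CausalModel(data=df, treatment="rain", outcome="flooding", common_causes=["season"])
estimand = model.identify_effect(proceed_when_unidentifiable=True)
estimate = model.estimate_effect(estimand, method_name="backdoor.linear_regression")
print("estimated effect:", estimate.value)

# A refuter provides a p-value-style check; a service could keep the edge only when
# the refutation does not reject it at the configured threshold (default 0.05).
refutation = model.refute_estimate(estimand, estimate, method_name="placebo_treatment_refuter")
print(refutation)
```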
@@ -1,168 +0,0 @@
from __future__ import annotations

import logging
from pathlib import Path
from typing import List

logger = logging.getLogger(__name__)

# Try to import unstructured, but fall back to alternatives if not available
try:
    from unstructured.partition.auto import partition
    HAS_UNSTRUCTURED = True
except ImportError:
    HAS_UNSTRUCTURED = False
    logger.warning("unstructured not available, will use fallback extractors")

# Fallback extractors
try:
    import pdfplumber
    HAS_PDFPLUMBER = True
except ImportError:
    HAS_PDFPLUMBER = False

try:
    from docx import Document as DocxDocument
    HAS_DOCX = True
except ImportError:
    HAS_DOCX = False

try:
    from pptx import Presentation
    HAS_PPTX = True
except ImportError:
    HAS_PPTX = False

# Image processing libraries
try:
    from PIL import Image
    import pytesseract
    HAS_OCR = True
except ImportError:
    HAS_OCR = False
    logger.warning("OCR libraries not available, image extraction will be limited")


def extract_text(path: Path) -> str:
    """
    Extract text from a file using multiple strategies.
    Falls back through: unstructured -> format-specific -> plain text read.
    """
    suffix = path.suffix.lower()

    # Validate PDF file before processing
    if suffix == ".pdf":
        # Quick validation: check if file starts with PDF magic bytes
        try:
            with path.open("rb") as f:
                header = f.read(4)
                if header != b"%PDF":
                    raise ValueError(
                        f"File {path.name} does not appear to be a valid PDF. "
                        f"PDF files must start with '%PDF' magic bytes. "
                        f"Got: {header[:20] if len(header) > 0 else 'empty file'}"
                    )
        except Exception as exc:
            if isinstance(exc, ValueError):
                raise
            logger.warning("Could not validate PDF header: %s", exc)

    # Image files - return empty text (will be processed directly with Claude Vision)
    # We skip OCR and send images directly to Claude Vision API
    if suffix in {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}:
        logger.info("Image file detected: %s. Will be processed directly with Claude Vision (no OCR)", path.name)
        # Return empty string - images will be handled separately in pipeline
        return ""

    # Plain text files - direct read
    if suffix in {".txt", ".md", ".json", ".xml", ".html", ".csv"}:
        try:
            return path.read_text(encoding="utf-8", errors="ignore")
        except Exception as exc:
            logger.warning("Failed to read %s as text: %s", path, exc)
            raise

    # Try unstructured first (if available)
    if HAS_UNSTRUCTURED:
        try:
            elements = partition(filename=str(path))
            lines: List[str] = []
            for element in elements:
                text = getattr(element, "text", None)
                if text:
                    lines.append(text.strip())
            if lines:
                logger.info("Extracted %d lines using unstructured", len(lines))
                return "\n".join(lines)
        except Exception as exc:
            logger.warning("unstructured extraction failed for %s: %s", path, exc)
            # Continue to fallback methods

    # Fallback: PDF with pdfplumber
    if suffix == ".pdf" and HAS_PDFPLUMBER:
        try:
            with pdfplumber.open(path) as pdf:
                text_parts = []
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text_parts.append(page_text)
                if text_parts:
                    logger.info("Extracted PDF using pdfplumber")
                    return "\n".join(text_parts)
        except Exception as exc:
            logger.warning("pdfplumber extraction failed for %s: %s", path, exc)

    # Fallback: DOCX
    if suffix == ".docx" and HAS_DOCX:
        try:
            doc = DocxDocument(path)
            paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
            if paragraphs:
                logger.info("Extracted DOCX using python-docx")
                return "\n".join(paragraphs)
        except Exception as exc:
            logger.warning("python-docx extraction failed for %s: %s", path, exc)

    # Fallback: PPTX
    if suffix in {".pptx", ".ppt"} and HAS_PPTX:
        try:
            prs = Presentation(path)
            text_parts = []
            for slide in prs.slides:
                for shape in slide.shapes:
                    if hasattr(shape, "text") and shape.text:
                        text_parts.append(shape.text.strip())
            if text_parts:
                logger.info("Extracted PPTX using python-pptx")
                return "\n".join(text_parts)
        except Exception as exc:
            logger.warning("python-pptx extraction failed for %s: %s", path, exc)

    # Last resort: try to read as text anyway, but validate it's readable
    try:
        content = path.read_text(encoding="utf-8", errors="ignore")
        if content.strip():
            # Check if content is actually readable text (not binary data)
            # Simple heuristic: if >30% of characters are printable, consider it text
            printable_chars = sum(1 for c in content if c.isprintable() or c.isspace())
            total_chars = len(content)

            if total_chars > 0 and printable_chars / total_chars > 0.3:
                logger.warning("Read %s as plain text (may contain binary data)", path)
                return content
            else:
                logger.error("Content from %s appears to be binary data, cannot extract text", path)
                raise ValueError(f"File {path} appears to be binary or corrupted. Cannot extract readable text.")
    except Exception as exc:
        if isinstance(exc, ValueError):
            raise
        logger.warning("Failed to read %s as text: %s", path, exc)

    # If all else fails, raise an error
    raise ValueError(
        f"Could not extract text from {path}. "
        f"File type may not be supported, file may be corrupted, or dependencies are missing. "
        f"Supported formats: PDF, DOCX, PPTX, XLSX, TXT, MD, JSON, XML, HTML, CSV, PNG, JPG, JPEG (with OCR)"
    )
@@ -0,0 +1,320 @@
from __future__ import annotations

import logging
import re
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional

logger = logging.getLogger(__name__)

try:
    import fitz  # PyMuPDF
    HAS_PYMUPDF = True
except ImportError:
    HAS_PYMUPDF = False
    logger.warning("PyMuPDF not available")

try:
    from docx import Document as DocxDocument
    HAS_DOCX = True
except ImportError:
    HAS_DOCX = False
    logger.warning("python-docx not available")

try:
    from pptx import Presentation
    HAS_PPTX = True
except ImportError:
    HAS_PPTX = False
    logger.warning("python-pptx not available")

try:
    import pandas as pd
    HAS_PANDAS = True
except ImportError:
    HAS_PANDAS = False
    logger.warning("pandas not available")


@dataclass
class ExtractedText:
    """Structured text extraction with context."""
    text: str
    page_number: int
    metadata: dict
    context: Optional[str] = None  # Surrounding context


def extract_text_with_context(path: Path) -> List[ExtractedText]:
    """
    Extract text from PDF using PyMuPDF with page-level context.
    Returns structured text with metadata.
    """
    if not HAS_PYMUPDF:
        raise ImportError("PyMuPDF is required for text extraction")

    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    if path.suffix.lower() != ".pdf":
        # For non-PDF files, fall back to simple text reading
        try:
            text = path.read_text(encoding="utf-8", errors="ignore")
            return [ExtractedText(
                text=text,
                page_number=1,
                metadata={"file_type": path.suffix, "filename": path.name},
                context=None
            )]
        except Exception as exc:
            logger.warning("Failed to read %s as text: %s", path, exc)
            raise

    extracted_pages: List[ExtractedText] = []

    try:
        doc = fitz.open(path)

        for page_num in range(len(doc)):
            page = doc[page_num]

            # Extract text
            text = page.get_text()

            # Extract metadata
            metadata = {
                "page_number": page_num + 1,
                "page_count": len(doc),
                "filename": path.name,
                "file_type": "pdf",
                "page_rect": {
                    "width": page.rect.width,
                    "height": page.rect.height
                }
            }

            # Extract context (surrounding pages for better understanding)
            context = None
            if page_num > 0:
                prev_page = doc[page_num - 1]
                prev_text = prev_page.get_text()[:500]  # First 500 chars of the previous page
                context = f"Previous page context: {prev_text}"

            if text.strip():
                extracted_pages.append(ExtractedText(
                    text=text,
                    page_number=page_num + 1,
                    metadata=metadata,
                    context=context
                ))

        doc.close()
        logger.info("Extracted text from %d pages in %s", len(extracted_pages), path.name)
        return extracted_pages

    except Exception as exc:
        logger.exception("Failed to extract text from PDF %s: %s", path, exc)
        raise


def extract_text_from_docx(path: Path) -> str:
    """
    Extract text from DOCX file using python-docx.
    Reads paragraphs and tables as per README Step 2.2b.
    """
    if not HAS_DOCX:
        raise ImportError("python-docx is required for DOCX extraction")

    try:
        doc = DocxDocument(path)
        text_parts = []

        # Extract paragraphs
        for paragraph in doc.paragraphs:
            if paragraph.text.strip():
                text_parts.append(paragraph.text.strip())

        # Extract tables
        for table in doc.tables:
            table_text = []
            for row in table.rows:
                row_text = []
                for cell in row.cells:
                    if cell.text.strip():
                        row_text.append(cell.text.strip())
                if row_text:
                    table_text.append(" | ".join(row_text))
            if table_text:
                text_parts.append("\n".join(table_text))

        result = "\n\n".join(text_parts)
        logger.info("Extracted %d characters from DOCX %s", len(result), path.name)
        return result
    except Exception as exc:
        logger.exception("Failed to extract text from DOCX %s: %s", path, exc)
        raise


def extract_text_from_pptx(path: Path) -> str:
    """
    Extract text from PPTX file using python-pptx.
    Reads slides, titles, and notes as per README Step 2.2c.
    """
    if not HAS_PPTX:
        raise ImportError("python-pptx is required for PPTX extraction")

    try:
        prs = Presentation(path)
        text_parts = []

        for slide_num, slide in enumerate(prs.slides, 1):
            slide_text = []

            # Extract slide title
            if slide.shapes.title and slide.shapes.title.text:
                slide_text.append(f"Slide {slide_num} Title: {slide.shapes.title.text.strip()}")

            # Extract content from shapes
            for shape in slide.shapes:
                if hasattr(shape, "text") and shape.text.strip():
                    # Skip title (already extracted)
                    if not (slide.shapes.title and shape == slide.shapes.title):
                        slide_text.append(shape.text.strip())

            # Extract notes (if available)
            if hasattr(slide, "notes_slide") and slide.notes_slide:
                notes_text = ""
                for shape in slide.notes_slide.shapes:
                    if hasattr(shape, "text") and shape.text.strip():
                        notes_text += shape.text.strip() + " "
                if notes_text.strip():
                    slide_text.append(f"Notes: {notes_text.strip()}")

            if slide_text:
                text_parts.append("\n".join(slide_text))

        result = "\n\n".join(text_parts)
        logger.info("Extracted %d characters from PPTX %s (%d slides)",
                    len(result), path.name, len(prs.slides))
        return result
    except Exception as exc:
        logger.exception("Failed to extract text from PPTX %s: %s", path, exc)
        raise


def extract_text_from_spreadsheet(path: Path) -> str:
    """
    Extract text from CSV/XLSX file using pandas.
    Reads rows and columns, converts to text representation as per README Step 2.2d.
    """
    if not HAS_PANDAS:
        raise ImportError("pandas is required for spreadsheet extraction")

    try:
        suffix = path.suffix.lower()
        text_parts = []

        if suffix == ".csv":
            df = pd.read_csv(path, encoding="utf-8", encoding_errors="ignore")
        elif suffix in {".xlsx", ".xls"}:
            # Read first sheet by default
            df = pd.read_excel(path, engine="openpyxl" if suffix == ".xlsx" else None)
        else:
            raise ValueError(f"Unsupported spreadsheet format: {suffix}")

        # Convert DataFrame to text representation
        # Add column headers
        headers = " | ".join(str(col) for col in df.columns)
        text_parts.append(f"Columns: {headers}")

        # Add rows (limit to first 1000 rows to avoid huge output)
        max_rows = min(1000, len(df))
        for idx, row in df.head(max_rows).iterrows():
            row_values = " | ".join(str(val) if pd.notna(val) else "" for val in row)
            text_parts.append(f"Row {idx + 1}: {row_values}")

        if len(df) > max_rows:
            text_parts.append(f"... ({len(df) - max_rows} more rows)")

        result = "\n".join(text_parts)
        logger.info("Extracted %d characters from spreadsheet %s (%d rows)",
                    len(result), path.name, len(df))
        return result
    except Exception as exc:
        logger.exception("Failed to extract text from spreadsheet %s: %s", path, exc)
        raise


def clean_text(text: str) -> str:
    """
    Clean extracted text as per README Step 2.3.
    - Remove extra whitespace
    - Fix encoding issues
    - Preserve important structure
    """
    if not text:
        return ""

    # Fix encoding issues (remove non-printable characters except newlines and tabs)
    cleaned = "".join(char for char in text if char.isprintable() or char in "\n\t\r")

    # Remove extra whitespace (but preserve paragraph breaks)
    # Replace multiple spaces with single space
    cleaned = re.sub(r'[ \t]+', ' ', cleaned)

    # Normalize line breaks (preserve double newlines for paragraphs)
    cleaned = re.sub(r'\r\n', '\n', cleaned)  # Windows line breaks
    cleaned = re.sub(r'\r', '\n', cleaned)  # Old Mac line breaks

    # Preserve paragraph structure (double newlines)
    # But remove excessive blank lines (more than 2 consecutive)
    cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)

    # Remove leading/trailing whitespace from each line
    lines = [line.strip() for line in cleaned.split('\n')]
    cleaned = '\n'.join(lines)

    # Remove leading/trailing whitespace overall
    cleaned = cleaned.strip()

    return cleaned


def extract_all_text(path: Path) -> str:
    """
    Extract all text from a file based on type (as per README Step 2).
    Routes to appropriate extractor: PDF, DOCX, PPTX, CSV/XLSX, or plain text.
    """
    suffix = path.suffix.lower()

    # Step 2.2a: PDF
    if suffix == ".pdf" and HAS_PYMUPDF:
        extracted_pages = extract_text_with_context(path)
        text = "\n\n".join([page.text for page in extracted_pages])

    # Step 2.2b: DOCX (Word)
    elif suffix == ".docx" and HAS_DOCX:
        text = extract_text_from_docx(path)

    # Step 2.2c: PPTX (PowerPoint)
    elif suffix in {".pptx", ".ppt"} and HAS_PPTX:
        text = extract_text_from_pptx(path)

    # Step 2.2d: CSV/XLSX (Spreadsheet)
    elif suffix in {".csv", ".xlsx", ".xls"} and HAS_PANDAS:
        text = extract_text_from_spreadsheet(path)

    # Fallback: Plain text files
    else:
        try:
            text = path.read_text(encoding="utf-8", errors="ignore")
        except Exception as exc:
            logger.warning("Failed to read %s as text: %s", path, exc)
            raise

    # Step 2.3: TEXT CLEANING
    text = clean_text(text)

    return text
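
A quick illustration of the new extraction entry points defined above; the file paths are placeholders.

```python
# Illustrative usage of the new extractors; "uploads/requirements_spec.pdf" is a placeholder path.
from pathlib import Path

# Route any supported file through the type-specific extractors plus the cleaning step.
text = extract_all_text(Path("uploads/requirements_spec.pdf"))
print(text[:500])

# For PDFs, the page-level API also exposes metadata and previous-page context.
for page in extract_text_with_context(Path("uploads/requirements_spec.pdf")):
    print(page.page_number, page.metadata["page_count"], len(page.text))
```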
@@ -0,0 +1,153 @@
from __future__ import annotations

import base64
import json
import logging
from pathlib import Path
from typing import Dict, List, Optional

import httpx

from ..config import get_settings

logger = logging.getLogger(__name__)


class QwenVisionClient:
    """Client for Qwen2.5-VL API to extract relationships from diagrams and ERDs."""

    def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None, model: Optional[str] = None):
        settings = get_settings()
        self.api_key = api_key or settings.qwen_api_key
        self.api_url = api_url or settings.qwen_api_url
        self.model = model or settings.qwen_model

        if not self.api_key:
            logger.warning("Qwen API key not configured")

    def extract_relationships_from_image(self, image_path: Path, source_file_id: str) -> List[Dict]:
        """
        Extract relationships (entities, connections, flows) from an image using Qwen2.5-VL.
        Returns a list of extracted relationships.
        """
        if not self.api_key:
            logger.warning("Qwen API key not configured, skipping image analysis")
            return []

        try:
            # Read and encode image
            with open(image_path, "rb") as img_file:
                image_data = img_file.read()

            base64_image = base64.b64encode(image_data).decode("utf-8")

            # Determine media type
            suffix = image_path.suffix.lower()
            media_type_map = {
                ".png": "image/png",
                ".jpg": "image/jpeg",
                ".jpeg": "image/jpeg",
                ".gif": "image/gif",
                ".webp": "image/webp",
            }
            media_type = media_type_map.get(suffix, "image/png")

            # Prepare prompt for relationship extraction
            prompt = """Analyze this diagram/ERD/image and extract all relationships, entities, and connections.

Extract:
1. Entities (boxes, nodes, components)
2. Relationships between entities (arrows, connections, flows)
3. Data flows and dependencies
4. Process flows
5. Architecture patterns

Return JSON with this structure:
[
  {
    "entity1": "name of first entity",
    "entity2": "name of second entity",
    "relationship_type": "causes|depends_on|flows_to|contains|uses",
    "description": "description of the relationship",
    "confidence": 0.0-1.0
  }
]

Focus on cause-effect relationships, dependencies, and flows."""

            # Prepare API request
            payload = {
                "model": self.model,
                "messages": [
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:{media_type};base64,{base64_image}"
                                }
                            },
                            {
                                "type": "text",
                                "text": prompt
                            }
                        ]
                    }
                ],
                "max_tokens": 4000,
                "temperature": 0.0
            }

            headers = {
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json"
            }

            # Make API call
            with httpx.Client(timeout=60.0) as client:
                response = client.post(self.api_url, json=payload, headers=headers)
                response.raise_for_status()
                result = response.json()

            # Parse response
            content = result.get("choices", [{}])[0].get("message", {}).get("content", "")

            if not content:
                logger.warning("Empty response from Qwen API for image %s", image_path.name)
                return []

            # Extract JSON from response
            json_text = content.strip()

            # Try to find JSON in markdown code blocks
            if "```json" in json_text:
                json_text = json_text.split("```json")[1].split("```")[0].strip()
            elif "```" in json_text:
                json_text = json_text.split("```")[1].split("```")[0].strip()

            # Parse JSON
            try:
                relationships = json.loads(json_text)
                if not isinstance(relationships, list):
                    relationships = [relationships]

                # Add source metadata
                for rel in relationships:
                    rel["source_file_id"] = source_file_id
                    rel["source_image"] = str(image_path.name)
                    rel["extraction_method"] = "qwen2.5-vl"

                logger.info("Extracted %d relationships from image %s using Qwen2.5-VL",
                            len(relationships), image_path.name)
                return relationships

            except json.JSONDecodeError as e:
                logger.warning("Failed to parse Qwen response as JSON: %s. Content: %s",
                               e, content[:200])
                return []

        except Exception as exc:
            logger.exception("Failed to extract relationships from image %s: %s", image_path, exc)
            return []
|
|||||||
|
|
||||||
import logging
|
import logging
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
from pathlib import Path
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
from fastapi import BackgroundTasks, Depends, FastAPI, File, Form, HTTPException, UploadFile
|
from fastapi import BackgroundTasks, Depends, FastAPI, File, Form, HTTPException, UploadFile
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
|
from fastapi.responses import FileResponse
|
||||||
|
|
||||||
from .claude_client import ClaudeCausalExtractor
|
|
||||||
from .config import Settings, get_settings
|
from .config import Settings, get_settings
|
||||||
from .jobs import JobStore
|
from .jobs import JobStore
|
||||||
from .models import CreateJobResponse, JobGraphSummary, JobStage, JobStatusResponse
|
from .models import CreateJobResponse, JobGraphSummary, JobStage, JobStatusResponse, ProjectReport
|
||||||
from .processors.graph_writer import GraphWriter
|
from .processors.graph_writer import GraphWriter
|
||||||
from .storage import StorageManager
|
from .storage import StorageManager
|
||||||
from .workflows.pipeline import JobPipeline
|
from .workflows.pipeline import JobPipeline
|
||||||
@ -20,8 +21,8 @@ logging.basicConfig(level=logging.INFO)
|
|||||||
|
|
||||||
app = FastAPI(
|
app = FastAPI(
|
||||||
title="Multi Document Upload Service",
|
title="Multi Document Upload Service",
|
||||||
version="0.1.0",
|
version="0.2.0",
|
||||||
description="Processes multi-format documents to build causal knowledge graphs using Claude.",
|
description="Processes multi-format documents to build knowledge graphs and generate beginner-friendly onboarding reports.",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -40,7 +41,6 @@ class ServiceContainer:
|
|||||||
storage: StorageManager
|
storage: StorageManager
|
||||||
job_store: JobStore
|
job_store: JobStore
|
||||||
graph_writer: GraphWriter
|
graph_writer: GraphWriter
|
||||||
claude_extractor: ClaudeCausalExtractor
|
|
||||||
pipeline: JobPipeline
|
pipeline: JobPipeline
|
||||||
|
|
||||||
|
|
||||||
@ -51,29 +51,24 @@ def get_container() -> ServiceContainer:
|
|||||||
global _container
|
global _container
|
||||||
if _container is None:
|
if _container is None:
|
||||||
settings = get_settings()
|
settings = get_settings()
|
||||||
if not settings.anthropic_api_key:
|
# Anthropic API key is only needed for report generation, not required at startup
|
||||||
raise HTTPException(status_code=500, detail="ANTHROPIC_API_KEY is not configured")
|
# if not settings.anthropic_api_key:
|
||||||
|
# raise HTTPException(status_code=500, detail="ANTHROPIC_API_KEY is not configured")
|
||||||
|
|
||||||
storage = StorageManager(settings.storage_root)
|
storage = StorageManager(settings.storage_root)
|
||||||
job_store = JobStore(settings.storage_root)
|
job_store = JobStore(settings.storage_root)
|
||||||
graph_writer = GraphWriter(settings.neo4j_uri, settings.neo4j_user, settings.neo4j_password)
|
graph_writer = GraphWriter(settings.neo4j_uri, settings.neo4j_user, settings.neo4j_password)
|
||||||
claude_extractor = ClaudeCausalExtractor(
|
|
||||||
api_key=settings.anthropic_api_key,
|
|
||||||
model=settings.claude_model,
|
|
||||||
max_output_tokens=min(settings.claude_max_output_tokens, 4000),
|
|
||||||
)
|
|
||||||
pipeline = JobPipeline(
|
pipeline = JobPipeline(
|
||||||
job_store=job_store,
|
job_store=job_store,
|
||||||
storage=storage,
|
storage=storage,
|
||||||
graph_writer=graph_writer,
|
graph_writer=graph_writer,
|
||||||
claude_extractor=claude_extractor,
|
|
||||||
)
|
)
|
||||||
_container = ServiceContainer(
|
_container = ServiceContainer(
|
||||||
settings=settings,
|
settings=settings,
|
||||||
storage=storage,
|
storage=storage,
|
||||||
job_store=job_store,
|
job_store=job_store,
|
||||||
graph_writer=graph_writer,
|
graph_writer=graph_writer,
|
||||||
claude_extractor=claude_extractor,
|
|
||||||
pipeline=pipeline,
|
pipeline=pipeline,
|
||||||
)
|
)
|
||||||
return _container
|
return _container
|
||||||
@ -170,14 +165,86 @@ async def get_job_graph(job_id: str, container: ServiceContainer = Depends(get_d
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/jobs/{job_id}/report", response_model=ProjectReport)
|
||||||
|
async def get_job_report(job_id: str, container: ServiceContainer = Depends(get_dependencies)) -> ProjectReport:
|
||||||
|
"""Get the generated beginner-friendly onboarding report."""
|
||||||
|
job_store = container.job_store
|
||||||
|
if not job_store.exists(job_id):
|
||||||
|
raise HTTPException(status_code=404, detail="Job not found")
|
||||||
|
job = job_store.get(job_id)
|
||||||
|
if job.stage != JobStage.COMPLETED:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=409,
|
||||||
|
detail="Report not ready yet. Job is still processing."
|
||||||
|
)
|
||||||
|
if not job.report:
|
||||||
|
# Check if there was an error during report generation
|
||||||
|
error_msg = "Report not found. "
|
||||||
|
if job.error:
|
||||||
|
# Check if error is specifically about report generation
|
||||||
|
if "report generation" in job.error.lower() or "claude" in job.error.lower():
|
||||||
|
error_msg = job.error
|
||||||
|
else:
|
||||||
|
error_msg += f"Error during generation: {job.error}"
|
||||||
|
else:
|
||||||
|
error_msg += "Report generation may have failed (check logs for details)."
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=404,
|
||||||
|
detail=error_msg
|
||||||
|
)
|
||||||
|
return job.report
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/jobs/{job_id}/report/pdf")
|
||||||
|
async def get_job_report_pdf(job_id: str, container: ServiceContainer = Depends(get_dependencies)):
|
||||||
|
"""Download the PDF version of the onboarding report (as per README Step 7.9)."""
|
||||||
|
job_store = container.job_store
|
||||||
|
if not job_store.exists(job_id):
|
||||||
|
raise HTTPException(status_code=404, detail="Job not found")
|
||||||
|
job = job_store.get(job_id)
|
||||||
|
if job.stage != JobStage.COMPLETED:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=409,
|
||||||
|
detail="Report not ready yet. Job is still processing."
|
||||||
|
)
|
||||||
|
if not job.report:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=404,
|
||||||
|
detail="Report not found. Job may have completed without generating report."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get PDF path from report metadata
|
||||||
|
pdf_path_str = job.report.metadata.get("pdf_path")
|
||||||
|
if not pdf_path_str:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=404,
|
||||||
|
detail="PDF not available. Report may have been generated without PDF conversion."
|
||||||
|
)
|
||||||
|
|
||||||
|
pdf_path = Path(pdf_path_str)
|
||||||
|
if not pdf_path.exists():
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=404,
|
||||||
|
detail="PDF file not found on server."
|
||||||
|
)
|
||||||
|
|
||||||
|
return FileResponse(
|
||||||
|
path=pdf_path,
|
||||||
|
media_type="application/pdf",
|
||||||
|
filename=f"onboarding_report_{job_id}.pdf"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@app.get("/health")
|
@app.get("/health")
|
||||||
async def healthcheck(container: ServiceContainer = Depends(get_dependencies)):
|
async def healthcheck(container: ServiceContainer = Depends(get_dependencies)):
|
||||||
settings = container.settings
|
settings = container.settings
|
||||||
return {
|
return {
|
||||||
"status": "ok",
|
"status": "ok",
|
||||||
"claude_model": settings.claude_model,
|
"claude_model": settings.claude_model,
|
||||||
"max_input_tokens_per_min": settings.claude_max_input_tokens,
|
"qwen_model": settings.qwen_model,
|
||||||
"max_output_tokens_per_min": settings.claude_max_output_tokens,
|
"embedding_model": settings.embedding_model,
|
||||||
|
"qdrant_url": settings.qdrant_url,
|
||||||
|
"dowhy_enabled": settings.dowhy_enabled,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
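A minimal client sketch for the two new report endpoints, assuming the service is reachable at http://localhost:8000; the base URL, port, and job id are placeholders, not values taken from this diff:

    import requests

    BASE_URL = "http://localhost:8000"  # assumed service address
    job_id = "example-job-id"           # placeholder

    # JSON report: 409 while the job is still processing, 404 if generation failed
    resp = requests.get(f"{BASE_URL}/jobs/{job_id}/report", timeout=30)
    if resp.status_code == 200:
        report = resp.json()
        print(report["title"], "-", len(report["content"]), "characters of Markdown")

    # PDF download, available only if the pipeline stored a pdf_path in report.metadata
    pdf = requests.get(f"{BASE_URL}/jobs/{job_id}/report/pdf", timeout=60)
    if pdf.status_code == 200:
        with open(f"onboarding_report_{job_id}.pdf", "wb") as fh:
            fh.write(pdf.content)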
@ -10,9 +10,10 @@ from pydantic import BaseModel, Field

 class JobStage(str, Enum):
     RECEIVED = "received"
     SAVING_FILES = "saving_files"
-    EXTRACTING = "extracting"
+    EXTRACTING = "extracting"  # PyMuPDF + Qwen2.5-VL
-    ANALYZING = "analyzing"
+    BUILDING_GRAPH = "building_graph"  # DoWhy + Neo4j
-    BUILDING_GRAPH = "building_graph"
+    INDEXING_VECTORS = "indexing_vectors"  # Qdrant
+    GENERATING_REPORT = "generating_report"  # Claude onboarding doc
     COMPLETED = "completed"
     FAILED = "failed"

@ -34,6 +35,7 @@ class CausalRelation(BaseModel):

     explanation: Optional[str] = None
     source_file_id: Optional[str] = None
     source_snippet: Optional[str] = None
+    relationship_type: str = Field(default="CAUSES")  # DEPENDS_ON, USES, IMPLEMENTS, etc.
     metadata: Dict[str, Any] = Field(default_factory=dict)

@ -46,6 +48,7 @@ class JobRecord(BaseModel):

     total_files: int = 0
     processed_files: int = 0
     relations: List[CausalRelation] = Field(default_factory=list)
+    report: Optional[ProjectReport] = None  # Generated onboarding report
     created_at: datetime = Field(default_factory=datetime.utcnow)
     updated_at: datetime = Field(default_factory=datetime.utcnow)
     error: str | None = None

@ -82,3 +85,15 @@ class JobGraphSummary(BaseModel):

     edge_count: int
     generated_at: datetime
+
+
+class ProjectReport(BaseModel):
+    """Beginner-friendly onboarding report generated from project documents."""
+    job_id: str
+    title: str = "Project Onboarding Guide"
+    content: str  # Markdown content
+    sections: Dict[str, str] = Field(default_factory=dict)  # Section name -> content
+    key_concepts: List[str] = Field(default_factory=list)  # Important concepts covered
+    total_pages: int = 0  # Estimated pages
+    generated_at: datetime = Field(default_factory=datetime.utcnow)
+    metadata: Dict[str, Any] = Field(default_factory=dict)
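An illustrative ProjectReport instance of the kind the pipeline is expected to attach to JobRecord.report once generation succeeds; every field value below is made up for the example:

    report = ProjectReport(
        job_id="example-job-id",
        content="# Project Onboarding Guide\n\n## Overview\n...",
        sections={"Overview": "...", "Architecture": "..."},
        key_concepts=["API Gateway", "Neo4j", "Qdrant"],
        total_pages=12,
        metadata={"pdf_path": "/data/reports/example-job-id/onboarding.pdf"},  # read by /report/pdf
    )
    print(report.title, "-", len(report.key_concepts), "key concepts")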
@ -1,24 +0,0 @@
from __future__ import annotations

from typing import Iterable, List

import tiktoken


class TextChunker:
    def __init__(self, model_name: str, token_target: int = 800, overlap: int = 200):
        self.encoder = tiktoken.encoding_for_model("gpt-4o") if "claude" not in model_name else tiktoken.get_encoding("cl100k_base")
        self.token_target = token_target
        self.overlap = overlap

    def chunk(self, text: str) -> Iterable[str]:
        tokens = self.encoder.encode(text)
        step = max(self.token_target - self.overlap, 1)
        chunks: List[str] = []
        for start in range(0, len(tokens), step):
            end = min(start + self.token_target, len(tokens))
            chunk_tokens = tokens[start:end]
            chunk_text = self.encoder.decode(chunk_tokens)
            chunks.append(chunk_text)
        return chunks
@ -0,0 +1,187 @@
from __future__ import annotations

import logging
from typing import List, Optional

import pandas as pd

from ..config import get_settings
from ..models import CausalRelation

logger = logging.getLogger(__name__)

try:
    import dowhy
    from dowhy import CausalModel
    HAS_DOWHY = True
except ImportError:
    HAS_DOWHY = False
    logger.warning("DoWhy not available")


class DoWhyAnalyzer:
    """Validate causal relationships using DoWhy Structural Causal Models."""

    def __init__(self, confidence_threshold: Optional[float] = None):
        if not HAS_DOWHY:
            raise ImportError("DoWhy is required for causal analysis")

        settings = get_settings()
        self.confidence_threshold = confidence_threshold or settings.dowhy_confidence_threshold
        self.enabled = settings.dowhy_enabled

    def validate_relationships(
        self,
        relationships: List[CausalRelation],
        text_data: Optional[str] = None
    ) -> List[CausalRelation]:
        """
        Validate causal relationships using DoWhy SCM.
        Filters out relationships that don't pass validation.
        """
        if not self.enabled:
            logger.info("DoWhy validation is disabled, returning all relationships")
            return relationships

        if not relationships:
            return []

        validated: List[CausalRelation] = []

        # Group relationships by cause to build SCM
        cause_groups = {}
        for rel in relationships:
            cause = rel.cause
            if cause not in cause_groups:
                cause_groups[cause] = []
            cause_groups[cause].append(rel)

        # Validate each group
        for cause, effects in cause_groups.items():
            for rel in effects:
                try:
                    is_valid = self._validate_single_relationship(rel, relationships, text_data)
                    if is_valid:
                        # Update confidence with validation score
                        rel.confidence = min(rel.confidence + 0.1, 0.95)  # Boost validated relationships
                        rel.metadata["dowhy_validated"] = True
                        validated.append(rel)
                    else:
                        logger.debug("DoWhy validation failed for: %s -> %s", rel.cause, rel.effect)
                except Exception as exc:
                    logger.warning("DoWhy validation error for %s -> %s: %s",
                                   rel.cause, rel.effect, exc)
                    # If validation fails, keep the relationship but mark it
                    rel.metadata["dowhy_validated"] = False
                    rel.metadata["dowhy_error"] = str(exc)
                    validated.append(rel)  # Keep it but with lower confidence

        logger.info("DoWhy validated %d/%d relationships", len(validated), len(relationships))
        return validated

    def _validate_single_relationship(
        self,
        relationship: CausalRelation,
        all_relationships: List[CausalRelation],
        text_data: Optional[str] = None
    ) -> bool:
        """
        Validate a single relationship using DoWhy.
        Returns True if relationship is valid, False otherwise.
        """
        try:
            # Build a simple causal graph from relationships
            # Extract unique variables (causes and effects)
            variables = set()
            for rel in all_relationships:
                variables.add(rel.cause)
                variables.add(rel.effect)

            # Create a simple dataset for DoWhy
            # Since we don't have actual data, we'll use a heuristic approach
            # based on relationship frequency and structure

            # Check if there's a path from cause to effect in the graph
            has_path = self._check_causal_path(
                relationship.cause,
                relationship.effect,
                all_relationships
            )

            if not has_path:
                return False

            # Additional validation: check for confounders
            # If there are many relationships involving both cause and effect,
            # it's more likely to be valid
            related_count = sum(
                1 for rel in all_relationships
                if rel.cause == relationship.cause or rel.effect == relationship.effect
            )

            # If there are multiple relationships involving these concepts,
            # it's more likely to be a valid causal relationship
            if related_count >= 2:
                return True

            # For single relationships, use confidence threshold
            return relationship.confidence >= 0.6

        except Exception as exc:
            logger.warning("DoWhy validation error: %s", exc)
            return False

    def _check_causal_path(
        self,
        cause: str,
        effect: str,
        relationships: List[CausalRelation],
        max_depth: int = 3
    ) -> bool:
        """Check if there's a causal path from cause to effect."""
        if max_depth == 0:
            return False

        # Direct relationship
        for rel in relationships:
            if rel.cause == cause and rel.effect == effect:
                return True

        # Indirect relationship (transitive)
        for rel in relationships:
            if rel.cause == cause:
                # Check if rel.effect leads to the target effect
                if self._check_causal_path(rel.effect, effect, relationships, max_depth - 1):
                    return True

        return False

    def build_scm_from_relationships(
        self,
        relationships: List[CausalRelation]
    ) -> Optional[CausalModel]:
        """
        Build a Structural Causal Model from relationships.
        This is a simplified version for text-based causal inference.
        """
        if not relationships:
            return None

        try:
            # Extract all unique variables
            variables = set()
            for rel in relationships:
                variables.add(rel.cause)
                variables.add(rel.effect)

            # Create a simple adjacency matrix representation
            # This is a heuristic approach since we don't have actual data

            # For now, return None as building a full SCM requires actual data
            # The validation uses graph-based heuristics instead
            return None

        except Exception as exc:
            logger.warning("Failed to build SCM: %s", exc)
            return None
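A rough sketch of how the validator above might be called from the pipeline; the import paths and the sample relations are assumptions, not code from this diff:

    from app.models import CausalRelation                     # path assumed
    from app.processors.dowhy_analyzer import DoWhyAnalyzer   # path assumed

    relations = [
        CausalRelation(cause="API Gateway", effect="Auth Service", confidence=0.7,
                       relationship_type="DEPENDS_ON"),
        CausalRelation(cause="Auth Service", effect="User Database", confidence=0.9),
    ]

    analyzer = DoWhyAnalyzer()  # raises ImportError when DoWhy is not installed
    validated = analyzer.validate_relationships(relations)
    for rel in validated:
        print(rel.cause, "->", rel.effect, "validated:", rel.metadata.get("dowhy_validated"))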
@ -0,0 +1,85 @@
from __future__ import annotations

import logging
from typing import List

from ..config import get_settings

logger = logging.getLogger(__name__)

try:
    from sentence_transformers import SentenceTransformer
    HAS_SENTENCE_TRANSFORMERS = True
except ImportError:
    HAS_SENTENCE_TRANSFORMERS = False
    logger.warning("sentence-transformers not available")


class Embedder:
    """Generate embeddings using sentence-transformers."""

    def __init__(self, model_name: str | None = None):
        if not HAS_SENTENCE_TRANSFORMERS:
            raise ImportError("sentence-transformers is required for embeddings")

        settings = get_settings()
        self.model_name = model_name or settings.embedding_model

        logger.info("Loading embedding model: %s", self.model_name)
        try:
            self.model = SentenceTransformer(self.model_name)
            self.dimension = self.model.get_sentence_embedding_dimension()
            logger.info("Loaded embedding model with dimension: %d", self.dimension)
        except Exception as exc:
            logger.exception("Failed to load embedding model %s: %s", self.model_name, exc)
            raise

    def embed_text(self, text: str) -> List[float]:
        """Generate embedding for a single text."""
        if not text or not text.strip():
            # Return zero vector for empty text
            return [0.0] * self.dimension

        try:
            embedding = self.model.encode(text, normalize_embeddings=True)
            return embedding.tolist()
        except Exception as exc:
            logger.warning("Failed to embed text: %s", exc)
            return [0.0] * self.dimension

    def embed_batch(self, texts: List[str], batch_size: int = 32) -> List[List[float]]:
        """Generate embeddings for a batch of texts."""
        if not texts:
            return []

        try:
            embeddings = self.model.encode(
                texts,
                batch_size=batch_size,
                normalize_embeddings=True,
                show_progress_bar=False
            )
            return embeddings.tolist()
        except Exception as exc:
            logger.warning("Failed to embed batch: %s", exc)
            return [[0.0] * self.dimension] * len(texts)

    def embed_relation(self, cause: str, effect: str, explanation: str | None = None) -> List[float]:
        """Generate embedding for a cause-effect relationship."""
        # Combine cause, effect, and explanation into a single text
        parts = [cause, "causes", effect]
        if explanation:
            parts.append(explanation)

        text = " ".join(parts)
        return self.embed_text(text)

    def embed_concept(self, concept_name: str, description: str | None = None) -> List[float]:
        """Generate embedding for a concept/node."""
        if description:
            text = f"{concept_name}: {description}"
        else:
            text = concept_name

        return self.embed_text(text)
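A sketch of how this embedder could feed the Qdrant instance added to docker-compose above (port 6333); the collection name, point id, and payload shape are assumptions, and the qdrant-client calls shown are the standard ones from recent versions of that library:

    from qdrant_client import QdrantClient
    from qdrant_client.models import Distance, PointStruct, VectorParams

    embedder = Embedder()  # uses settings.embedding_model
    client = QdrantClient(url="http://localhost:6333")

    collection = "causal_relations"  # assumed collection name
    existing = {c.name for c in client.get_collections().collections}
    if collection not in existing:
        client.create_collection(
            collection_name=collection,
            vectors_config=VectorParams(size=embedder.dimension, distance=Distance.COSINE),
        )

    vector = embedder.embed_relation("API Gateway", "Auth Service", "routes login traffic")
    client.upsert(
        collection_name=collection,
        points=[PointStruct(id=1, vector=vector,
                            payload={"cause": "API Gateway", "effect": "Auth Service"})],
    )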
@ -0,0 +1,253 @@
from __future__ import annotations

import json
import logging
import re
from typing import Dict, List, Set

from anthropic import Anthropic, BadRequestError

from ..config import get_settings
from ..models import CausalRelation

logger = logging.getLogger(__name__)


class EntityResolver:
    """
    Resolve entity mentions using Claude AI as per README Stage 4.
    Identifies that different mentions refer to the same entity.
    """

    def __init__(self):
        settings = get_settings()
        self.api_key = settings.anthropic_api_key
        self.model = settings.claude_model
        self.max_output_tokens = settings.claude_max_output_tokens

        if not self.api_key:
            logger.warning("ANTHROPIC_API_KEY not set - Entity resolution will be skipped")
            self.client = None
        else:
            try:
                self.client = Anthropic(api_key=self.api_key)
                logger.info("EntityResolver initialized with Claude AI")
            except Exception as e:
                logger.warning("Failed to initialize Claude AI for entity resolution: %s", e)
                self.client = None

    def resolve_entities(self, relations: List[CausalRelation]) -> Dict[str, Dict]:
        """
        Resolve entity mentions across all documents as per README Step 4.

        Step 4.1: Collect all entities
        Step 4.2: Group by entity type
        Step 4.3: AI-powered resolution (Claude API)
        Step 4.4: Create canonical names

        Returns mapping: canonical_name -> {mentions, type, role, confidence}
        """
        if not self.client:
            logger.info("Entity resolution skipped (Claude AI not available)")
            return {}

        if not relations:
            return {}

        # Step 4.1: COLLECT ALL ENTITIES
        all_mentions: Set[str] = set()
        for rel in relations:
            all_mentions.add(rel.cause.strip())
            all_mentions.add(rel.effect.strip())

        if not all_mentions:
            return {}

        logger.info("Collecting %d entity mentions for resolution", len(all_mentions))

        # Step 4.2: GROUP BY ENTITY TYPE (simple heuristic)
        people_mentions = []
        project_mentions = []
        team_mentions = []
        other_mentions = []

        for mention in all_mentions:
            mention_lower = mention.lower()
            if any(word in mention_lower for word in ["team", "department", "group", "division"]):
                team_mentions.append(mention)
            elif any(word in mention_lower for word in ["project", "system", "application", "platform"]):
                project_mentions.append(mention)
            elif len(mention.split()) <= 3 and not any(char.isdigit() for char in mention):
                # Likely a person name (short, no numbers)
                people_mentions.append(mention)
            else:
                other_mentions.append(mention)

        # Step 4.3: AI-POWERED RESOLUTION (Claude API)
        resolved_entities = {}

        # Resolve people
        if people_mentions:
            people_resolved = self._resolve_with_claude(people_mentions, "Person")
            resolved_entities.update(people_resolved)

        # Resolve projects
        if project_mentions:
            projects_resolved = self._resolve_with_claude(project_mentions, "Project")
            resolved_entities.update(projects_resolved)

        # Resolve teams
        if team_mentions:
            teams_resolved = self._resolve_with_claude(team_mentions, "Team")
            resolved_entities.update(teams_resolved)

        # Resolve others
        if other_mentions:
            others_resolved = self._resolve_with_claude(other_mentions, "Entity")
            resolved_entities.update(others_resolved)

        logger.info("Resolved %d entities from %d mentions", len(resolved_entities), len(all_mentions))

        return resolved_entities

    def _resolve_with_claude(self, mentions: List[str], entity_type: str) -> Dict[str, Dict]:
        """Use Claude AI to resolve entity mentions."""
        if not self.client or not mentions:
            return {}

        try:
            system_prompt = """You are an expert at entity resolution. Your task is to identify which mentions refer to the same real-world entity.

Analyze the given list of entity mentions and group them by the actual entity they refer to.

Return a JSON object where:
- Key: Canonical name (best/most complete name)
- Value: Object with:
  - "mentions": List of all mentions that refer to this entity
  - "type": Entity type (Person, Project, Team, etc.)
  - "role": Role or description (if applicable)
  - "confidence": Confidence score (0.0 to 1.0)

Example:
{
  "John Smith": {
    "mentions": ["John", "J. Smith", "John Smith", "Smith"],
    "type": "Person",
    "role": "Project Lead",
    "confidence": 0.95
  },
  "Project Alpha": {
    "mentions": ["Project Alpha", "Alpha", "The Alpha Project"],
    "type": "Project",
    "role": null,
    "confidence": 0.90
  }
}

Be thorough and group all related mentions together."""

            user_prompt = f"""Analyze these {entity_type} entity mentions and resolve which ones refer to the same entity:

{json.dumps(mentions, indent=2)}

Return a JSON object mapping canonical names to their resolved mentions."""

            message = self.client.messages.create(
                model=self.model,
                max_tokens=self.max_output_tokens,
                temperature=0.2,  # Lower temperature for more consistent resolution
                system=system_prompt,
                messages=[{"role": "user", "content": user_prompt}]
            )

            response_text = "".join(
                block.text for block in message.content
                if hasattr(block, "text")
            )

            if not response_text:
                logger.warning("Empty response from Claude for entity resolution")
                return {}

            # Parse JSON response
            try:
                json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
                if json_match:
                    json_text = json_match.group(0)
                else:
                    json_text = response_text

                resolved = json.loads(json_text)

                # Validate and structure the response
                result = {}
                for canonical_name, entity_data in resolved.items():
                    if isinstance(entity_data, dict):
                        result[canonical_name] = {
                            "mentions": entity_data.get("mentions", [canonical_name]),
                            "type": entity_data.get("type", entity_type),
                            "role": entity_data.get("role"),
                            "confidence": float(entity_data.get("confidence", 0.85))
                        }
                    else:
                        # Fallback if structure is different
                        result[canonical_name] = {
                            "mentions": [canonical_name] if isinstance(entity_data, str) else entity_data,
                            "type": entity_type,
                            "role": None,
                            "confidence": 0.8
                        }

                return result

            except json.JSONDecodeError as e:
                logger.warning("Failed to parse Claude response as JSON: %s. Response: %s",
                               e, response_text[:500])
                return {}

        except BadRequestError as e:
            logger.warning("Claude API error during entity resolution: %s", e)
            return {}
        except Exception as e:
            logger.warning("Entity resolution failed: %s", e)
            return {}

    def apply_resolution_to_relations(
        self,
        relations: List[CausalRelation],
        resolved_entities: Dict[str, Dict]
    ) -> List[CausalRelation]:
        """
        Apply entity resolution to relationships.
        Replace mentions with canonical names.
        """
        if not resolved_entities:
            return relations

        # Create reverse mapping: mention -> canonical_name
        mention_to_canonical: Dict[str, str] = {}
        for canonical_name, entity_data in resolved_entities.items():
            mentions = entity_data.get("mentions", [])
            for mention in mentions:
                mention_to_canonical[mention.lower()] = canonical_name

        # Update relations with canonical names
        updated_relations = []
        for rel in relations:
            # Resolve cause
            cause_lower = rel.cause.strip().lower()
            if cause_lower in mention_to_canonical:
                rel.cause = mention_to_canonical[cause_lower]

            # Resolve effect
            effect_lower = rel.effect.strip().lower()
            if effect_lower in mention_to_canonical:
                rel.effect = mention_to_canonical[effect_lower]

            # Store resolution info in metadata
            rel.metadata["entity_resolved"] = True
            updated_relations.append(rel)

        logger.info("Applied entity resolution to %d relationships", len(updated_relations))
        return updated_relations
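A short sketch of the resolver in use; the sample relations are invented for illustration, and CausalRelation is assumed to be importable from the service's models module:

    resolver = EntityResolver()  # resolve_entities() returns {} when no API key is configured

    relations = [
        CausalRelation(cause="J. Smith", effect="Project Alpha", confidence=0.8),
        CausalRelation(cause="John Smith", effect="Alpha", confidence=0.85),
    ]

    resolved = resolver.resolve_entities(relations)   # canonical name -> {mentions, type, role, confidence}
    relations = resolver.apply_resolution_to_relations(relations, resolved)
    for rel in relations:
        print(rel.cause, "->", rel.effect)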
@ -1,23 +1,163 @@

 from __future__ import annotations

+import json
 import logging
-from typing import Iterable
+import re
+from typing import Dict, Iterable, List, Optional

+from anthropic import Anthropic, BadRequestError
 from neo4j import GraphDatabase, Transaction

+from ..config import get_settings
 from ..models import CausalRelation

 logger = logging.getLogger(__name__)

-MERGE_QUERY = """
-MERGE (cause:Concept {name: $cause})
-ON CREATE SET cause.created_at = timestamp(), cause.lastSeen = timestamp()
-ON MATCH SET cause.lastSeen = timestamp()
-MERGE (effect:Concept {name: $effect})
-ON CREATE SET effect.created_at = timestamp(), effect.lastSeen = timestamp()
-ON MATCH SET effect.lastSeen = timestamp()
-MERGE (cause)-[r:CAUSES]->(effect)
+# Query to create Document node
+CREATE_DOCUMENT_QUERY = """
+MERGE (doc:Document {filename: $filename})
+ON CREATE SET doc.uploaded_at = timestamp(),
+              doc.file_path = $file_path,
+              doc.job_id = $job_id,
+              doc.created_at = timestamp()
+ON MATCH SET doc.lastSeen = timestamp()
+"""
+
+# Query to create Entity nodes and relationship with dynamic type
+CREATE_ENTITY_RELATIONSHIP_QUERY = """
+MERGE (source:Entity:Concept {name: $source})
+ON CREATE SET source.created_at = timestamp(),
+              source.lastSeen = timestamp(),
+              source.type = COALESCE($source_type, 'Entity')
+ON MATCH SET source.lastSeen = timestamp()
+
+MERGE (target:Entity:Concept {name: $target})
+ON CREATE SET target.created_at = timestamp(),
+              target.lastSeen = timestamp(),
+              target.type = COALESCE($target_type, 'Entity')
+ON MATCH SET target.lastSeen = timestamp()
+
+WITH source, target
+CALL apoc.merge.relationship(
+    source,
+    $rel_type,
+    {confidence: $confidence,
+     explanation: $explanation,
+     source_file_id: $source_file_id,
+     source_snippet: $source_snippet,
+     job_id: $job_id,
+     model: $model,
+     created_at: timestamp(),
+     updated_at: timestamp()},
+    {confidence: $confidence,
+     explanation: $explanation,
+     source_file_id: $source_file_id,
+     source_snippet: $source_snippet,
+     job_id: $job_id,
+     model: $model,
+     updated_at: timestamp()},
+    target
+) YIELD rel
+RETURN rel
+"""
+
+
+class GraphWriter:
+    def __init__(self, uri: str, user: str, password: str):
+        self._driver = GraphDatabase.driver(uri, auth=(user, password))
+
+    def close(self) -> None:
+        self._driver.close()
+
+    def write_documents(self, job_id: str, files: Iterable) -> None:
+        """Create Document nodes for uploaded files."""
+        files_list = list(files)
+        if not files_list:
+            return
+
+        logger.info("Creating %d document nodes for job %s", len(files_list), job_id)
+
+        with self._driver.session() as session:
+            def _write_docs(tx: Transaction) -> None:
+                for file_record in files_list:
+                    try:
+                        tx.run(
+                            CREATE_DOCUMENT_QUERY,
+                            filename=file_record.filename,
+                            file_path=file_record.stored_path,
+                            job_id=job_id
+                        )
+                        logger.debug("Created document node: %s", file_record.filename)
+                    except Exception as exc:
+                        logger.warning("Failed to create document node for %s: %s", file_record.filename, exc)
+
+            session.execute_write(_write_docs)
+            logger.info("Created document nodes for job %s", job_id)
+
+    def write_relations(self, job_id: str, relations: Iterable[CausalRelation], files: Iterable = None) -> None:
+        """Write entities and relationships to Neo4j with multiple relationship types."""
+        relations_list = list(relations)
+        if not relations_list:
+            logger.warning("No relations to write for job %s", job_id)
+            return
+
+        # Create document nodes if files provided
+        if files:
+            self.write_documents(job_id, files)
+
+        logger.info("Writing %d relations to Neo4j for job %s", len(relations_list), job_id)
+
+        with self._driver.session() as session:
+            def _write(tx: Transaction) -> None:
+                count = 0
+                for relation in relations_list:
+                    if not relation.cause or not relation.effect:
+                        logger.warning("Skipping relation with empty cause or effect: %s -> %s", relation.cause, relation.effect)
+                        continue
+
+                    # Get relationship type (default to CAUSES for backward compatibility)
+                    rel_type = getattr(relation, 'relationship_type', None) or "CAUSES"
+
+                    # Sanitize relationship type (only allow alphanumeric and underscores)
+                    rel_type = re.sub(r'[^A-Z0-9_]', '', rel_type.upper())
+                    if not rel_type:
+                        rel_type = "CAUSES"
+
+                    # Infer entity types from names (simple heuristic)
+                    source_type = self._infer_entity_type(relation.cause)
+                    target_type = self._infer_entity_type(relation.effect)
+
+                    try:
+                        # Create source entity
+                        tx.run("""
+                            MERGE (source:Entity:Concept {name: $source})
+                            ON CREATE SET source.created_at = timestamp(),
+                                          source.lastSeen = timestamp(),
+                                          source.type = $source_type
+                            ON MATCH SET source.lastSeen = timestamp()
+                            """,
+                            source=relation.cause.strip(),
+                            source_type=source_type
+                        )
+
+                        # Create target entity
+                        tx.run("""
+                            MERGE (target:Entity:Concept {name: $target})
+                            ON CREATE SET target.created_at = timestamp(),
+                                          target.lastSeen = timestamp(),
+                                          target.type = $target_type
+                            ON MATCH SET target.lastSeen = timestamp()
+                            """,
+                            target=relation.effect.strip(),
+                            target_type=target_type
+                        )
+
+                        # Create relationship with dynamic type (sanitized)
+                        query = f"""
+                            MATCH (source:Entity {{name: $source}})
+                            MATCH (target:Entity {{name: $target}})
+                            MERGE (source)-[r:{rel_type}]->(target)
 ON CREATE SET r.confidence = $confidence,
     r.explanation = $explanation,
     r.source_file_id = $source_file_id,

@ -35,34 +175,10 @@ ON MATCH SET r.confidence = $confidence,

     r.updated_at = timestamp()
 """
-
-
-class GraphWriter:
-    def __init__(self, uri: str, user: str, password: str):
-        self._driver = GraphDatabase.driver(uri, auth=(user, password))
-
-    def close(self) -> None:
-        self._driver.close()
-
-    def write_relations(self, job_id: str, relations: Iterable[CausalRelation]) -> None:
-        relations_list = list(relations)
-        if not relations_list:
-            logger.warning("No relations to write for job %s", job_id)
-            return
-
-        logger.info("Writing %d relations to Neo4j for job %s", len(relations_list), job_id)
-
-        with self._driver.session() as session:
-            def _write(tx: Transaction) -> None:
-                count = 0
-                for relation in relations_list:
-                    if not relation.cause or not relation.effect:
-                        logger.warning("Skipping relation with empty cause or effect: %s -> %s", relation.cause, relation.effect)
-                        continue
-                    try:
                         result = tx.run(
-                            MERGE_QUERY,
+                            query,
-                            cause=relation.cause.strip(),
+                            source=relation.cause.strip(),
-                            effect=relation.effect.strip(),
+                            target=relation.effect.strip(),
                             confidence=float(relation.confidence) if relation.confidence else 0.0,
                             explanation=relation.explanation or "",
                             source_file_id=relation.source_file_id or "",

@ -70,12 +186,145 @@ class GraphWriter:

                             job_id=job_id,
                             model=relation.metadata.get("model") or "",
                         )
+
+                        # Link entities to documents if source_file_id is a filename
+                        if relation.source_file_id and relation.source_file_id != "combined_text":
+                            link_query = f"""
+                                MATCH (entity:Entity {{name: $entity_name}})
+                                MATCH (doc:Document {{filename: $filename}})
+                                MERGE (entity)-[:EXTRACTED_FROM]->(doc)
+                            """
+                            try:
+                                tx.run(link_query, entity_name=relation.cause.strip(), filename=relation.source_file_id)
+                                tx.run(link_query, entity_name=relation.effect.strip(), filename=relation.source_file_id)
+                            except Exception:
+                                pass  # Ignore if document doesn't exist
+
                         count += 1
-                        logger.debug("Wrote relation: %s -> %s (confidence: %s)", relation.cause, relation.effect, relation.confidence)
+                        logger.debug("Wrote relation: %s -[%s]-> %s (confidence: %s)",
+                                     relation.cause, rel_type, relation.effect, relation.confidence)
                     except Exception as exc:
                         logger.exception("Failed to write relation %s -> %s: %s", relation.cause, relation.effect, exc)
                 logger.info("Successfully wrote %d/%d relations to Neo4j", count, len(relations_list))
+
             session.execute_write(_write)
-            logger.info("Persisted causal relations for job %s", job_id)
+            logger.info("Persisted relations for job %s", job_id)
+
+    def _infer_entity_type(self, entity_name: str) -> str:
+        """Infer entity type from name (simple heuristic)."""
+        name_lower = entity_name.lower()
+
+        # Technology patterns
+        if any(tech in name_lower for tech in ['react', 'node', 'python', 'java', 'postgres', 'mysql', 'redis', 'mongodb', 'docker', 'kubernetes']):
+            return "Technology"
+
+        # Service patterns
+        if any(word in name_lower for word in ['service', 'api', 'gateway', 'auth', 'payment', 'notification']):
+            return "Service"
+
+        # Component patterns
+        if any(word in name_lower for word in ['component', 'module', 'system', 'application', 'platform']):
+            return "Component"
+
+        # Process patterns
+        if any(word in name_lower for word in ['flow', 'process', 'workflow', 'pipeline', 'procedure']):
+            return "Process"
+
+        # Default
+        return "Entity"
+
+    def query_causal_chains(
+        self,
+        job_id: str,
+        min_length: int = 2,
+        max_length: int = 4,
+        min_confidence: float = 0.8,
+        limit: int = 20
+    ) -> List[Dict]:
+        """
+        Query Neo4j for causal chains as per README Step 7.3.
+        Returns sequences of connected events.
+        """
+        # Query for causal chains - match any relationship type
+        query = f"""
+            MATCH path = (start:Entity)-[r*{min_length}..{max_length}]->(end:Entity)
+            WHERE ALL(rel in relationships(path) WHERE rel.job_id = $job_id AND rel.confidence >= $min_confidence)
+            WITH path,
+                 [node in nodes(path) | node.name] as chain,
+                 [rel in relationships(path) | rel.confidence] as confidences,
+                 [rel in relationships(path) | type(rel)] as rel_types,
+                 [rel in relationships(path) | rel.explanation] as explanations
+            RETURN chain, confidences, rel_types, explanations
+            ORDER BY reduce(conf = 0.0, c in confidences | conf + c) DESC
+            LIMIT $limit
+        """
+
+        try:
+            with self._driver.session() as session:
+                result = session.run(
+                    query,
+                    job_id=job_id,
+                    min_confidence=min_confidence,
+                    limit=limit
+                )
+
+                chains = []
+                for record in result:
+                    chain = record["chain"]
+                    confidences = record["confidences"]
+                    rel_types = record["rel_types"]
+                    explanations = record["explanations"]
+
+                    # Calculate average confidence
+                    avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
+
+                    chains.append({
+                        "chain": chain,
+                        "confidences": confidences,
+                        "rel_types": rel_types,
+                        "explanations": explanations,
+                        "avg_confidence": avg_confidence,
+                        "length": len(chain) - 1
+                    })
+
+                logger.info("Found %d causal chains for job %s", len(chains), job_id)
+                return chains
+        except Exception as exc:
+            logger.exception("Failed to query causal chains: %s", exc)
+            return []
+
+    def query_key_entities(self, job_id: str, limit: int = 20) -> List[Dict]:
+        """
+        Query Neo4j for key entities (most involved) as per README Step 7.3.
+        """
+        query = """
+            MATCH (e:Entity)-[r]->(target)
+            WHERE r.job_id = $job_id
+            WITH e, count(r) as relation_count, collect(DISTINCT type(r)) as rel_types
+            RETURN e.name as name,
+                   e.type as type,
+                   relation_count,
+                   rel_types
+            ORDER BY relation_count DESC
+            LIMIT $limit
+        """
+
+        try:
+            with self._driver.session() as session:
+                result = session.run(query, job_id=job_id, limit=limit)
+
+                entities = []
+                for record in result:
+                    entities.append({
+                        "name": record["name"],
+                        "type": record.get("type", "Entity"),
+                        "relation_count": record["relation_count"],
+                        "relation_types": record["rel_types"]
+                    })
+
+                logger.info("Found %d key entities for job %s", len(entities), job_id)
+                return entities
+        except Exception as exc:
+            logger.exception("Failed to query key entities: %s", exc)
+            return []
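An illustrative read path for the new query helpers; the Bolt URI, credentials, and job id below are placeholders:

    writer = GraphWriter("bolt://localhost:7687", "neo4j", "password")
    try:
        for item in writer.query_causal_chains("example-job-id", min_confidence=0.8, limit=5):
            print(" -> ".join(item["chain"]), f"(avg confidence {item['avg_confidence']:.2f})")

        for entity in writer.query_key_entities("example-job-id", limit=10):
            print(entity["name"], entity["type"], entity["relation_count"], entity["relation_types"])
    finally:
        writer.close()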
|||||||
@ -0,0 +1,625 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
|
||||||
|
from anthropic import Anthropic, BadRequestError
|
||||||
|
|
||||||
|
from ..config import get_settings
|
||||||
|
from ..models import CausalRelation
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Try to import SpaCy
|
||||||
|
try:
|
||||||
|
import spacy
|
||||||
|
from spacy.lang.en import English
|
||||||
|
HAS_SPACY = True
|
||||||
|
except ImportError:
|
||||||
|
HAS_SPACY = False
|
||||||
|
logger.warning("spacy not available - NLP detection will be skipped")
|
||||||
|
|
||||||
|
|
||||||
|
class RelationshipExtractor:
|
||||||
|
"""Extract potential cause-effect relationships from text using NLP (SpaCy) + Claude AI."""
|
||||||
|
|
||||||
|
# Causal keywords for NLP detection (Step 3.1)
|
||||||
|
CAUSAL_KEYWORDS = [
|
||||||
|
"because", "due to", "as a result", "led to", "caused", "therefore",
|
||||||
|
"consequently", "hence", "thus", "so", "since", "owing to",
|
||||||
|
"resulted in", "brought about", "gave rise to", "triggered",
|
||||||
|
"provoked", "induced", "generated", "produced", "created"
|
||||||
|
]
|
||||||
|
|
||||||
|
# Common cause-effect patterns (expanded for architecture/technical documents)
|
||||||
|
CAUSE_EFFECT_PATTERNS = [
|
||||||
|
# Direct causal patterns
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+causes?\s+(\w+(?:\s+\w+){0,15})", "causes"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+leads?\s+to\s+(\w+(?:\s+\w+){0,15})", "leads_to"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+results?\s+in\s+(\w+(?:\s+\w+){0,15})", "results_in"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+triggers?\s+(\w+(?:\s+\w+){0,15})", "triggers"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+produces?\s+(\w+(?:\s+\w+){0,15})", "produces"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+enables?\s+(\w+(?:\s+\w+){0,15})", "enables"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+allows?\s+(\w+(?:\s+\w+){0,15})", "allows"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+facilitates?\s+(\w+(?:\s+\w+){0,15})", "facilitates"),
|
||||||
|
|
||||||
|
# Dependency patterns
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+depends?\s+on\s+(\w+(?:\s+\w+){0,15})", "depends_on"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+requires?\s+(\w+(?:\s+\w+){0,15})", "requires"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+needs?\s+(\w+(?:\s+\w+){0,15})", "needs"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+relies?\s+on\s+(\w+(?:\s+\w+){0,15})", "relies_on"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+uses?\s+(\w+(?:\s+\w+){0,15})", "uses"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+utilizes?\s+(\w+(?:\s+\w+){0,15})", "utilizes"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+leverages?\s+(\w+(?:\s+\w+){0,15})", "leverages"),
|
||||||
|
|
||||||
|
# Architectural/System patterns
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+connects?\s+to\s+(\w+(?:\s+\w+){0,15})", "connects_to"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+communicates?\s+with\s+(\w+(?:\s+\w+){0,15})", "communicates_with"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+interacts?\s+with\s+(\w+(?:\s+\w+){0,15})", "interacts_with"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+integrates?\s+with\s+(\w+(?:\s+\w+){0,15})", "integrates_with"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+provides?\s+(\w+(?:\s+\w+){0,15})", "provides"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+supports?\s+(\w+(?:\s+\w+){0,15})", "supports"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+handles?\s+(\w+(?:\s+\w+){0,15})", "handles"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+manages?\s+(\w+(?:\s+\w+){0,15})", "manages"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+controls?\s+(\w+(?:\s+\w+){0,15})", "controls"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+processes?\s+(\w+(?:\s+\w+){0,15})", "processes"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+generates?\s+(\w+(?:\s+\w+){0,15})", "generates"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+creates?\s+(\w+(?:\s+\w+){0,15})", "creates"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+implements?\s+(\w+(?:\s+\w+){0,15})", "implements"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+delivers?\s+(\w+(?:\s+\w+){0,15})", "delivers"),
|
||||||
|
|
||||||
|
# Flow patterns
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+flows?\s+to\s+(\w+(?:\s+\w+){0,15})", "flows_to"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+sends?\s+to\s+(\w+(?:\s+\w+){0,15})", "sends_to"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+transmits?\s+to\s+(\w+(?:\s+\w+){0,15})", "transmits_to"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+receives?\s+from\s+(\w+(?:\s+\w+){0,15})", "receives_from"),
|
||||||
|
|
||||||
|
# Conditional patterns
|
||||||
|
(r"if\s+(\w+(?:\s+\w+){0,15}),\s+then\s+(\w+(?:\s+\w+){0,15})", "if_then"),
|
||||||
|
(r"when\s+(\w+(?:\s+\w+){0,15}),\s+(\w+(?:\s+\w+){0,15})\s+occurs?", "when_then"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+implies?\s+(\w+(?:\s+\w+){0,15})", "implies"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+ensures?\s+(\w+(?:\s+\w+){0,15})", "ensures"),
|
||||||
|
|
||||||
|
# Sequential patterns
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+follows?\s+(\w+(?:\s+\w+){0,15})", "follows"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+comes?\s+after\s+(\w+(?:\s+\w+){0,15})", "comes_after"),
|
||||||
|
(r"first\s+(\w+(?:\s+\w+){0,15}),\s+then\s+(\w+(?:\s+\w+){0,15})", "first_then"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+precedes?\s+(\w+(?:\s+\w+){0,15})", "precedes"),
|
||||||
|
|
||||||
|
# Containment patterns
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+contains?\s+(\w+(?:\s+\w+){0,15})", "contains"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+includes?\s+(\w+(?:\s+\w+){0,15})", "includes"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+consists?\s+of\s+(\w+(?:\s+\w+){0,15})", "consists_of"),
|
||||||
|
|
||||||
|
# Influence patterns
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+affects?\s+(\w+(?:\s+\w+){0,15})", "affects"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+impacts?\s+(\w+(?:\s+\w+){0,15})", "impacts"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+influences?\s+(\w+(?:\s+\w+){0,15})", "influences"),
|
||||||
|
]
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
"""Initialize NLP and Claude AI components."""
|
||||||
|
settings = get_settings()
|
||||||
|
|
||||||
|
# Initialize SpaCy NLP model (Step 3.1)
|
||||||
|
self.nlp = None
|
||||||
|
if HAS_SPACY:
|
||||||
|
try:
|
||||||
|
# Try to load English model, fallback to blank if not available
|
||||||
|
try:
|
||||||
|
self.nlp = spacy.load("en_core_web_sm")
|
||||||
|
except OSError:
|
||||||
|
logger.warning("en_core_web_sm model not found, using blank English model")
|
||||||
|
self.nlp = English()
|
||||||
|
self.nlp.add_pipe("sentencizer")
|
||||||
|
logger.info("SpaCy NLP model loaded")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Failed to load SpaCy model: %s", e)
|
||||||
|
self.nlp = None
|
||||||
|
|
||||||
|
# Initialize Claude AI client (Step 3.2)
|
||||||
|
self.claude_client = None
|
||||||
|
self.claude_model = settings.claude_model
|
||||||
|
self.claude_max_input_tokens = settings.claude_max_input_tokens
|
||||||
|
self.claude_max_output_tokens = settings.claude_max_output_tokens
|
||||||
|
|
||||||
|
if settings.anthropic_api_key:
|
||||||
|
try:
|
||||||
|
self.claude_client = Anthropic(api_key=settings.anthropic_api_key)
|
||||||
|
logger.info("Claude AI client initialized")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Failed to initialize Claude AI client: %s", e)
|
||||||
|
else:
|
||||||
|
logger.warning("ANTHROPIC_API_KEY not set - Claude AI extraction will be skipped")
|
||||||
|
|
||||||
|
def extract_from_text(self, text: str, source_file_id: str) -> List[CausalRelation]:
|
||||||
|
"""
|
||||||
|
Extract cause-effect relationships using NLP (SpaCy) + Claude AI.
|
||||||
|
Implements Step 3.1 (NLP Detection) and Step 3.2 (Claude AI Extraction).
|
||||||
|
"""
|
||||||
|
if not text or not text.strip():
|
||||||
|
return []
|
||||||
|
|
||||||
|
all_relationships: List[CausalRelation] = []
|
||||||
|
|
||||||
|
# Step 3.1: BASIC NLP DETECTION (SpaCy)
|
||||||
|
nlp_relationships = self._extract_with_nlp(text, source_file_id)
|
||||||
|
all_relationships.extend(nlp_relationships)
|
||||||
|
logger.info("NLP (SpaCy) extracted %d candidate relationships (low confidence)",
|
||||||
|
len(nlp_relationships))
|
||||||
|
|
||||||
|
# Step 3.2: AI-POWERED EXTRACTION (Claude API)
|
||||||
|
if self.claude_client:
|
||||||
|
claude_relationships = self._extract_with_claude(text, source_file_id)
|
||||||
|
all_relationships.extend(claude_relationships)
|
||||||
|
logger.info("Claude AI extracted %d relationships (high confidence)",
|
||||||
|
len(claude_relationships))
|
||||||
|
else:
|
||||||
|
logger.info("Claude AI extraction skipped (API key not configured)")
|
||||||
|
|
||||||
|
# Also run pattern matching as fallback
|
||||||
|
pattern_relationships = self._extract_with_patterns(text, source_file_id)
|
||||||
|
all_relationships.extend(pattern_relationships)
|
||||||
|
logger.info("Pattern matching extracted %d relationships", len(pattern_relationships))
|
||||||
|
|
||||||
|
# Deduplicate relationships
|
||||||
|
seen = set()
|
||||||
|
unique_relationships = []
|
||||||
|
for rel in all_relationships:
|
||||||
|
key = (rel.cause.lower().strip(), rel.effect.lower().strip())
|
||||||
|
if key not in seen:
|
||||||
|
seen.add(key)
|
||||||
|
unique_relationships.append(rel)
|
||||||
|
|
||||||
|
logger.info("Total unique relationships extracted: %d (from %d total)",
|
||||||
|
len(unique_relationships), len(all_relationships))
|
||||||
|
return unique_relationships
|
||||||
|
|
||||||
|
def _extract_with_nlp(self, text: str, source_file_id: str) -> List[CausalRelation]:
|
||||||
|
"""
|
||||||
|
Step 3.1: Basic NLP Detection using SpaCy.
|
||||||
|
Look for causal keywords and find sentences containing these patterns.
|
||||||
|
Returns potential causal relationships (low confidence).
|
||||||
|
"""
|
||||||
|
if not self.nlp:
|
||||||
|
return []
|
||||||
|
|
||||||
|
relationships: List[CausalRelation] = []
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Process text with SpaCy
|
||||||
|
doc = self.nlp(text)
|
||||||
|
|
||||||
|
# Find sentences containing causal keywords
|
||||||
|
for sent in doc.sents:
|
||||||
|
sent_text = sent.text.strip()
|
||||||
|
if len(sent_text) < 10:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Check if sentence contains causal keywords
|
||||||
|
sent_lower = sent_text.lower()
|
||||||
|
has_causal_keyword = any(keyword in sent_lower for keyword in self.CAUSAL_KEYWORDS)
|
||||||
|
|
||||||
|
if has_causal_keyword:
|
||||||
|
# Try to extract cause-effect using dependency parsing
|
||||||
|
cause = None
|
||||||
|
effect = None
|
||||||
|
|
||||||
|
# Look for causal conjunctions
|
||||||
|
for token in sent:
|
||||||
|
if token.text.lower() in ["because", "due", "since", "as"]:
|
||||||
|
# Find the clause after the causal conjunction
|
||||||
|
if token.dep_ in ["mark", "prep"]:
|
||||||
|
# Try to extract cause and effect
|
||||||
|
cause_span = None
|
||||||
|
effect_span = None
|
||||||
|
|
||||||
|
# Simple heuristic: text before "because/due to" is effect, after is cause
|
||||||
|
if "because" in sent_lower or "since" in sent_lower:
|
||||||
|
parts = re.split(r'\b(because|since)\b', sent_text, flags=re.IGNORECASE)
|
||||||
|
if len(parts) >= 3:
|
||||||
|
effect = parts[0].strip()
|
||||||
|
cause = parts[2].strip()
|
||||||
|
elif "due to" in sent_lower:
|
||||||
|
parts = re.split(r'\bdue to\b', sent_text, flags=re.IGNORECASE)
|
||||||
|
if len(parts) >= 2:
|
||||||
|
effect = parts[0].strip()
|
||||||
|
cause = parts[1].strip()
|
||||||
|
|
||||||
|
if cause and effect:
|
||||||
|
# Clean up cause and effect
|
||||||
|
cause = re.sub(r'^[,\s]+|[,\s]+$', '', cause)
|
||||||
|
effect = re.sub(r'^[,\s]+|[,\s]+$', '', effect)
|
||||||
|
|
||||||
|
if len(cause) >= 3 and len(effect) >= 3:
|
||||||
|
relationships.append(CausalRelation(
|
||||||
|
cause=cause,
|
||||||
|
effect=effect,
|
||||||
|
confidence=0.5, # Low confidence for NLP
|
||||||
|
explanation=f"Extracted using NLP (SpaCy) - found causal keyword",
|
||||||
|
source_file_id=source_file_id,
|
||||||
|
source_snippet=sent_text[:200],
|
||||||
|
relationship_type="CAUSES",
|
||||||
|
metadata={
|
||||||
|
"extraction_method": "spacy_nlp",
|
||||||
|
"sentence": sent_text
|
||||||
|
}
|
||||||
|
))
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("NLP extraction failed: %s", e)
|
||||||
|
|
||||||
|
return relationships
|
||||||
|
|
||||||
|
    def _extract_with_claude(self, text: str, source_file_id: str) -> List[CausalRelation]:
        """
        Step 3.2: AI-Powered Extraction using Claude API.
        Send full document text to Claude AI and ask it to find ALL causal relationships.
        Returns high-quality causal relationships (high confidence).
        """
        if not self.claude_client:
            return []

        relationships: List[CausalRelation] = []

        try:
            # Prepare prompt for Claude
            system_prompt = """You are an expert at analyzing text and extracting cause-effect relationships.
Your task is to identify ALL causal relationships in the given text, including both explicit and implicit ones.

For each causal relationship, extract:
- Cause: What triggered or led to this?
- Effect: What was the result or outcome?
- Context: Additional background information
- Entities: Who or what is involved (people, teams, projects, systems)
- Confidence: How certain are you? (0.0 to 1.0)
- Source sentence: The sentence or passage where this relationship was found
- Date: When did this happen (if mentioned)

Return the results as a JSON array of objects with this structure:
[
  {
    "cause": "string",
    "effect": "string",
    "context": "string (optional)",
    "entities": ["string"],
    "confidence": 0.0-1.0,
    "source_sentence": "string",
    "date": "string (optional)"
  }
]

Focus on:
- Explicit relationships ("because X, therefore Y")
- Implicit relationships (strongly implied cause-effect)
- Technical and architectural dependencies
- Business decisions and their impacts
- Process flows and sequences"""

            # Truncate text to fit within token limits (rough estimate: 1 token ≈ 4 characters)
            max_chars = (self.claude_max_input_tokens - 1000) * 4
            truncated_text = text[:max_chars] if len(text) > max_chars else text

            user_prompt = f"""Analyze the following text and extract ALL causal relationships.

Text:
{truncated_text}

Return a JSON array of causal relationships. Be thorough and find both explicit and implicit relationships."""

            # Call Claude API
            message = self.claude_client.messages.create(
                model=self.claude_model,
                max_tokens=self.claude_max_output_tokens,
                temperature=0.3,  # Lower temperature for more focused extraction
                system=system_prompt,
                messages=[
                    {
                        "role": "user",
                        "content": user_prompt
                    }
                ]
            )

            # Extract response text
            content_blocks = message.content or []
            response_text = "".join(
                block.text for block in content_blocks
                if hasattr(block, "text")
            )

            if not response_text:
                logger.warning("Empty response from Claude AI")
                return []

            # Parse JSON response
            try:
                # Try to extract JSON from response (might have markdown code blocks)
                json_match = re.search(r'\[.*\]', response_text, re.DOTALL)
                if json_match:
                    json_text = json_match.group(0)
                else:
                    json_text = response_text

                claude_results = json.loads(json_text)

                # Convert Claude results to CausalRelation objects
                for result in claude_results:
                    cause = result.get("cause", "").strip()
                    effect = result.get("effect", "").strip()
                    context = result.get("context", "")
                    entities = result.get("entities", [])
                    confidence = float(result.get("confidence", 0.85))
                    source_sentence = result.get("source_sentence", "")
                    date = result.get("date", "")

                    if not cause or not effect:
                        continue

                    # Map to Neo4j relationship type (default to CAUSES)
                    relationship_type = "CAUSES"

                    explanation = context or "Extracted by Claude AI"
                    if entities:
                        explanation += f" (Entities: {', '.join(entities)})"

                    relationships.append(CausalRelation(
                        cause=cause,
                        effect=effect,
                        confidence=min(confidence, 0.95),  # Cap at 0.95
                        explanation=explanation,
                        source_file_id=source_file_id,
                        source_snippet=source_sentence[:200] if source_sentence else "",
                        relationship_type=relationship_type,
                        metadata={
                            "extraction_method": "claude_ai",
                            "context": context,
                            "entities": entities,
                            "date": date,
                            "source_sentence": source_sentence
                        }
                    ))

                logger.info("Claude AI successfully extracted %d relationships", len(relationships))

            except json.JSONDecodeError as e:
                logger.warning("Failed to parse Claude AI response as JSON: %s. Response: %s",
                               e, response_text[:500])
            except Exception as e:
                logger.warning("Error processing Claude AI response: %s", e)

        except BadRequestError as e:
            logger.warning("Claude API error: %s", e)
        except Exception as e:
            logger.warning("Claude AI extraction failed: %s", e)

        return relationships

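    # Illustrative sketch (not part of this commit): why the greedy r'\[.*\]'
    # search with re.DOTALL is enough to pull the JSON array out of a response
    # that Claude wraps in a markdown code fence:
    #
    #   >>> import json, re
    #   >>> response_text = '```json\n[{"cause": "disk full", "effect": "backup failed", "confidence": 0.9}]\n```'
    #   >>> json_text = re.search(r'\[.*\]', response_text, re.DOTALL).group(0)
    #   >>> json.loads(json_text)[0]["effect"]
    #   'backup failed'
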
    def _extract_with_patterns(self, text: str, source_file_id: str) -> List[CausalRelation]:
        """
        Fallback: Pattern-based extraction (original method).
        Returns candidate relationships for DoWhy validation.
        """
        if not text or not text.strip():
            return []

        relationships: List[CausalRelation] = []
        seen = set()  # Avoid duplicates

        # Normalize text
        text = re.sub(r'\s+', ' ', text)
        sentences = re.split(r'[.!?]\s+', text)

        for sentence in sentences:
            sentence = sentence.strip()
            if len(sentence) < 10:  # Skip very short sentences
                continue

            for pattern, rel_type in self.CAUSE_EFFECT_PATTERNS:
                matches = re.finditer(pattern, sentence, re.IGNORECASE)

                for match in matches:
                    cause = match.group(1).strip()
                    effect = match.group(2).strip()

                    # Filter out very short or very long phrases (increased limit for technical terms)
                    if len(cause) < 3 or len(cause) > 150:
                        continue
                    if len(effect) < 3 or len(effect) > 150:
                        continue

                    # Skip common false positives
                    if cause.lower() in ["this", "that", "it", "they", "we"]:
                        continue
                    if effect.lower() in ["this", "that", "it", "they", "we"]:
                        continue

                    # Create unique key
                    key = (cause.lower(), effect.lower())
                    if key in seen:
                        continue
                    seen.add(key)

                    # Calculate confidence based on pattern type
                    confidence = self._calculate_confidence(rel_type, sentence)

                    # Map pattern type to Neo4j relationship type (uppercase with underscores)
                    neo4j_rel_type = self._map_to_neo4j_relationship_type(rel_type)

                    relationships.append(CausalRelation(
                        cause=cause,
                        effect=effect,
                        confidence=confidence,
                        explanation=f"Extracted from text using pattern: {rel_type}",
                        source_file_id=source_file_id,
                        source_snippet=sentence[:200],  # First 200 chars
                        relationship_type=neo4j_rel_type,
                        metadata={
                            "extraction_method": "pattern_matching",
                            "pattern_type": rel_type,
                            "sentence": sentence
                        }
                    ))

        logger.info("Extracted %d candidate relationships from text (source: %s)",
                    len(relationships), source_file_id)
        return relationships

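    # Illustrative sketch (not part of this commit): one match/capture round of
    # the pattern loop above. The actual CAUSE_EFFECT_PATTERNS list is defined
    # elsewhere in this class; the regex below is a hypothetical stand-in with
    # the same two-group (cause, effect) shape:
    #
    #   >>> import re
    #   >>> pattern = r'(.+?)\s+leads to\s+(.+)'          # hypothetical pattern
    #   >>> sentence = "A cache miss leads to a slow response"
    #   >>> m = next(re.finditer(pattern, sentence, re.IGNORECASE))
    #   >>> m.group(1).strip(), m.group(2).strip()
    #   ('A cache miss', 'a slow response')
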
    def _calculate_confidence(self, rel_type: str, sentence: str) -> float:
        """Calculate confidence score based on pattern type and sentence quality."""
        base_confidence = {
            "causes": 0.8, "leads_to": 0.75, "results_in": 0.75, "triggers": 0.7,
            "produces": 0.7, "depends_on": 0.65, "requires": 0.65, "needs": 0.6,
            "if_then": 0.8, "when_then": 0.75, "implies": 0.7, "follows": 0.6,
            "comes_after": 0.6, "first_then": 0.7, "enables": 0.7, "allows": 0.65,
            "facilitates": 0.65, "relies_on": 0.65, "uses": 0.6, "utilizes": 0.6,
            "leverages": 0.6, "connects_to": 0.7, "communicates_with": 0.7,
            "interacts_with": 0.7, "integrates_with": 0.7, "provides": 0.7,
            "supports": 0.7, "handles": 0.65, "manages": 0.65, "controls": 0.65,
            "processes": 0.65, "generates": 0.7, "creates": 0.7, "implements": 0.7,
            "delivers": 0.7, "flows_to": 0.7, "sends_to": 0.7, "transmits_to": 0.7,
            "receives_from": 0.7, "ensures": 0.75, "precedes": 0.6, "contains": 0.6,
            "includes": 0.6, "consists_of": 0.6, "affects": 0.65, "impacts": 0.65,
            "influences": 0.65,
        }.get(rel_type, 0.5)

        # Adjust based on sentence length (longer sentences might be more descriptive)
        if len(sentence) > 50:
            base_confidence += 0.05

        return min(base_confidence, 0.95)

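    # Worked example (illustrative, not part of this commit): a "causes" match in
    # a sentence longer than 50 characters scores 0.8 + 0.05 = 0.85, while an
    # unknown pattern type in a short sentence falls back to the 0.5 default.
    # ("extractor" below is a hypothetical RelationshipExtractor instance.)
    #
    #   >>> round(extractor._calculate_confidence("causes", "x" * 60), 2)
    #   0.85
    #   >>> extractor._calculate_confidence("unknown_type", "short sentence")
    #   0.5
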
    def _map_to_neo4j_relationship_type(self, pattern_type: str) -> str:
        """Map pattern type to Neo4j relationship type (uppercase with underscores)."""
        # Map lowercase pattern types to Neo4j relationship types
        mapping = {
            "causes": "CAUSES", "leads_to": "LEADS_TO", "results_in": "RESULTS_IN",
            "triggers": "TRIGGERS", "produces": "PRODUCES", "depends_on": "DEPENDS_ON",
            "requires": "REQUIRES", "needs": "NEEDS", "relies_on": "RELIES_ON",
            "uses": "USES", "utilizes": "UTILIZES", "leverages": "LEVERAGES",
            "connects_to": "CONNECTS_TO", "communicates_with": "COMMUNICATES_WITH",
            "interacts_with": "INTERACTS_WITH", "integrates_with": "INTEGRATES_WITH",
            "provides": "PROVIDES", "supports": "SUPPORTS", "handles": "HANDLES",
            "manages": "MANAGES", "controls": "CONTROLS", "processes": "PROCESSES",
            "generates": "GENERATES", "creates": "CREATES", "implements": "IMPLEMENTS",
            "delivers": "DELIVERS", "flows_to": "FLOWS_TO", "sends_to": "SENDS_TO",
            "transmits_to": "TRANSMITS_TO", "receives_from": "RECEIVES_FROM",
            "if_then": "IF_THEN", "when_then": "WHEN_THEN", "implies": "IMPLIES",
            "ensures": "ENSURES", "follows": "FOLLOWS", "comes_after": "COMES_AFTER",
            "first_then": "FIRST_THEN", "precedes": "PRECEDES", "contains": "CONTAINS",
            "includes": "INCLUDES", "consists_of": "CONSISTS_OF", "affects": "AFFECTS",
            "impacts": "IMPACTS", "influences": "INFLUENCES", "enables": "ENABLES",
            "allows": "ALLOWS", "facilitates": "FACILITATES",
        }
        return mapping.get(pattern_type, "CAUSES")  # Default to CAUSES if not found

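    # Illustrative sketch (not part of this commit): the mapping simply uppercases
    # known pattern types and falls back to CAUSES for anything unrecognized.
    # ("extractor" is a hypothetical RelationshipExtractor instance.)
    #
    #   >>> extractor._map_to_neo4j_relationship_type("depends_on")
    #   'DEPENDS_ON'
    #   >>> extractor._map_to_neo4j_relationship_type("is_related_to")
    #   'CAUSES'
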
    def extract_from_qwen_results(self, qwen_results: List[Dict], source_file_id: str) -> List[CausalRelation]:
        """Convert Qwen2.5-VL extraction results to CausalRelation objects."""
        relationships: List[CausalRelation] = []

        for result in qwen_results:
            entity1 = result.get("entity1", "").strip()
            entity2 = result.get("entity2", "").strip()
            rel_type = result.get("relationship_type", "").strip()
            description = result.get("description", "").strip()
            confidence = float(result.get("confidence", 0.7))

            if not entity1 or not entity2:
                continue

            # Map relationship type to cause-effect
            # For most types, entity1 is cause, entity2 is effect
            cause = entity1
            effect = entity2

            # Some relationship types might need reversal
            if rel_type in ["depends_on", "requires", "needs"]:
                # If A depends on B, then B is the cause, A is the effect
                cause, effect = effect, cause

            # Map Qwen relationship type to Neo4j format
            neo4j_rel_type = self._map_to_neo4j_relationship_type(rel_type.lower().replace("-", "_"))

            relationships.append(CausalRelation(
                cause=cause,
                effect=effect,
                confidence=confidence,
                explanation=description or f"Extracted from diagram: {rel_type}",
                source_file_id=source_file_id,
                source_snippet=description,
                relationship_type=neo4j_rel_type,
                metadata={
                    "extraction_method": "qwen2.5-vl",
                    "relationship_type": rel_type,
                    "original_entity1": entity1,
                    "original_entity2": entity2
                }
            ))

        return relationships

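    # Illustrative sketch (not part of this commit): a single Qwen2.5-VL result
    # and how extract_from_qwen_results() reverses "depends_on" so the dependency
    # target becomes the cause. ("extractor" is a hypothetical instance.)
    #
    #   >>> qwen_results = [{"entity1": "API Gateway", "entity2": "Auth Service",
    #   ...                  "relationship_type": "depends_on",
    #   ...                  "description": "Gateway calls Auth for tokens",
    #   ...                  "confidence": 0.8}]
    #   >>> rels = extractor.extract_from_qwen_results(qwen_results, "diagram-1")
    #   >>> rels[0].cause, rels[0].effect, rels[0].relationship_type
    #   ('Auth Service', 'API Gateway', 'DEPENDS_ON')
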
@ -0,0 +1,570 @@
from __future__ import annotations

import json
import logging
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Set

from anthropic import Anthropic, BadRequestError

from ..config import get_settings
from ..models import CausalRelation, ProjectReport

logger = logging.getLogger(__name__)

# Try to import PDF generation libraries
try:
    import markdown
    from markdown.extensions import codehilite, fenced_code, tables
    HAS_MARKDOWN = True
except ImportError:
    HAS_MARKDOWN = False
    logger.warning("markdown library not available - PDF conversion will be limited")

try:
    from weasyprint import HTML, CSS
    from weasyprint.text.fonts import FontConfiguration
    HAS_WEASYPRINT = True
except ImportError:
    HAS_WEASYPRINT = False
    logger.warning("weasyprint not available - PDF conversion will be skipped")


class ReportGenerator:
    """Generate beginner-friendly onboarding reports from knowledge graph."""

    def __init__(self, api_key: str | None = None, model: str | None = None):
        settings = get_settings()
        self.api_key = api_key or settings.anthropic_api_key
        self.model = model or settings.claude_model
        self.max_output_tokens = settings.claude_max_output_tokens

        if not self.api_key:
            raise ValueError("Anthropic API key is required for report generation")

        self.client = Anthropic(api_key=self.api_key)

def generate_onboarding_report(
|
||||||
|
self,
|
||||||
|
job_id: str,
|
||||||
|
relations: List[CausalRelation],
|
||||||
|
vector_store,
|
||||||
|
embedder,
|
||||||
|
graph_writer=None,
|
||||||
|
kg_summary: Dict | None = None
|
||||||
|
) -> ProjectReport:
|
||||||
|
"""
|
||||||
|
Generate a beginner-friendly onboarding report from the knowledge graph.
|
||||||
|
"""
|
||||||
|
logger.info("Generating onboarding report for job %s", job_id)
|
||||||
|
|
||||||
|
# Step 1: Analyze KG structure
|
||||||
|
key_concepts = self._analyze_kg_structure(relations)
|
||||||
|
|
||||||
|
# Step 2: Semantic search for different topics
|
||||||
|
overview_content = self._search_topic(
|
||||||
|
"project overview main purpose goals objectives",
|
||||||
|
vector_store, embedder, job_id, top_k=10
|
||||||
|
)
|
||||||
|
|
||||||
|
concepts_content = self._search_topic(
|
||||||
|
"core concepts definitions key terms important ideas",
|
||||||
|
vector_store, embedder, job_id, top_k=15
|
||||||
|
)
|
||||||
|
|
||||||
|
processes_content = self._search_topic(
|
||||||
|
"how system works processes flows procedures steps",
|
||||||
|
vector_store, embedder, job_id, top_k=15
|
||||||
|
)
|
||||||
|
|
||||||
|
relationships_content = self._search_topic(
|
||||||
|
"cause effect dependencies relationships connections",
|
||||||
|
vector_store, embedder, job_id, top_k=20
|
||||||
|
)
|
||||||
|
|
||||||
|
components_content = self._search_topic(
|
||||||
|
"components modules systems parts architecture",
|
||||||
|
vector_store, embedder, job_id, top_k=15
|
||||||
|
)
|
||||||
|
|
||||||
|
# Step 3: Query Neo4j for causal chains (as per README Step 7.3)
|
||||||
|
causal_chains = []
|
||||||
|
key_entities = []
|
||||||
|
if graph_writer:
|
||||||
|
try:
|
||||||
|
# Query 1: Get critical causal chains
|
||||||
|
causal_chains = graph_writer.query_causal_chains(
|
||||||
|
job_id=job_id,
|
||||||
|
min_length=2,
|
||||||
|
max_length=4,
|
||||||
|
min_confidence=0.8,
|
||||||
|
limit=20
|
||||||
|
)
|
||||||
|
logger.info("Retrieved %d causal chains from Neo4j", len(causal_chains))
|
||||||
|
|
||||||
|
# Query 2: Get key entities
|
||||||
|
key_entities = graph_writer.query_key_entities(job_id=job_id, limit=20)
|
||||||
|
logger.info("Retrieved %d key entities from Neo4j", len(key_entities))
|
||||||
|
except Exception as neo4j_exc:
|
||||||
|
logger.warning("Failed to query Neo4j: %s", neo4j_exc)
|
||||||
|
|
||||||
|
# Step 4: Organize content hierarchically
|
||||||
|
organized_content = self._organize_content(
|
||||||
|
key_concepts,
|
||||||
|
overview_content,
|
||||||
|
concepts_content,
|
||||||
|
processes_content,
|
||||||
|
relationships_content,
|
||||||
|
components_content,
|
||||||
|
causal_chains,
|
||||||
|
key_entities
|
||||||
|
)
|
||||||
|
|
||||||
|
# Step 5: Generate report with Claude
|
||||||
|
report_content = self._claude_generate_report(
|
||||||
|
job_id=job_id,
|
||||||
|
relations=relations,
|
||||||
|
organized_content=organized_content,
|
||||||
|
kg_summary=kg_summary or {}
|
||||||
|
)
|
||||||
|
|
||||||
|
# Step 6: Parse sections
|
||||||
|
sections = self._parse_sections(report_content)
|
||||||
|
|
||||||
|
# Step 7: Convert to PDF (as per README Step 7.8)
|
||||||
|
pdf_path = None
|
||||||
|
if HAS_WEASYPRINT and HAS_MARKDOWN:
|
||||||
|
try:
|
||||||
|
pdf_path = self._convert_to_pdf(report_content, job_id)
|
||||||
|
logger.info("Generated PDF report: %s", pdf_path)
|
||||||
|
except Exception as pdf_exc:
|
||||||
|
logger.warning("PDF conversion failed: %s", pdf_exc)
|
||||||
|
|
||||||
|
# Estimate pages (rough: ~500 words per page)
|
||||||
|
word_count = len(report_content.split())
|
||||||
|
estimated_pages = max(1, word_count // 500)
|
||||||
|
|
||||||
|
return ProjectReport(
|
||||||
|
job_id=job_id,
|
||||||
|
title="Project Onboarding Guide",
|
||||||
|
content=report_content,
|
||||||
|
sections=sections,
|
||||||
|
key_concepts=list(key_concepts)[:20], # Top 20 concepts
|
||||||
|
total_pages=estimated_pages,
|
||||||
|
generated_at=datetime.utcnow(),
|
||||||
|
metadata={
|
||||||
|
"total_relations": len(relations),
|
||||||
|
"total_concepts": len(key_concepts),
|
||||||
|
"causal_chains_count": len(causal_chains),
|
||||||
|
"key_entities_count": len(key_entities),
|
||||||
|
"model": self.model,
|
||||||
|
"pdf_path": str(pdf_path) if pdf_path else None
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
def _analyze_kg_structure(self, relations: List[CausalRelation]) -> Set[str]:
|
||||||
|
"""Identify key concepts from the knowledge graph."""
|
||||||
|
concepts = set()
|
||||||
|
|
||||||
|
for rel in relations:
|
||||||
|
concepts.add(rel.cause)
|
||||||
|
concepts.add(rel.effect)
|
||||||
|
|
||||||
|
# Identify high-degree nodes (concepts involved in many relationships)
|
||||||
|
cause_counts: Dict[str, int] = {}
|
||||||
|
effect_counts: Dict[str, int] = {}
|
||||||
|
|
||||||
|
for rel in relations:
|
||||||
|
cause_counts[rel.cause] = cause_counts.get(rel.cause, 0) + 1
|
||||||
|
effect_counts[rel.effect] = effect_counts.get(rel.effect, 0) + 1
|
||||||
|
|
||||||
|
# Key concepts are those with high degree (appear in many relationships)
|
||||||
|
all_counts = {**cause_counts, **effect_counts}
|
||||||
|
threshold = max(1, len(relations) // 10) # Top 10% most connected
|
||||||
|
|
||||||
|
key_concepts = {
|
||||||
|
concept for concept, count in all_counts.items()
|
||||||
|
if count >= threshold
|
||||||
|
}
|
||||||
|
|
||||||
|
# If threshold is too high, use top N concepts
|
||||||
|
if len(key_concepts) < 5:
|
||||||
|
sorted_concepts = sorted(all_counts.items(), key=lambda x: x[1], reverse=True)
|
||||||
|
key_concepts = {concept for concept, _ in sorted_concepts[:20]}
|
||||||
|
|
||||||
|
logger.info("Identified %d key concepts from %d relationships",
|
||||||
|
len(key_concepts), len(relations))
|
||||||
|
return key_concepts
|
||||||
|
|
||||||
|
def _search_topic(
|
||||||
|
self,
|
||||||
|
query: str,
|
||||||
|
vector_store,
|
||||||
|
embedder,
|
||||||
|
job_id: str,
|
||||||
|
top_k: int = 10
|
||||||
|
) -> List[Dict]:
|
||||||
|
"""Search for content related to a topic."""
|
||||||
|
try:
|
||||||
|
results = vector_store.search_by_text(
|
||||||
|
query_text=query,
|
||||||
|
embedder=embedder,
|
||||||
|
job_id=job_id,
|
||||||
|
top_k=top_k
|
||||||
|
)
|
||||||
|
return results
|
||||||
|
except Exception as exc:
|
||||||
|
logger.warning("Search failed for topic '%s': %s", query, exc)
|
||||||
|
return []
|
||||||
|
|
||||||
|
def _organize_content(
|
||||||
|
self,
|
||||||
|
key_concepts: Set[str],
|
||||||
|
overview_content: List[Dict],
|
||||||
|
concepts_content: List[Dict],
|
||||||
|
processes_content: List[Dict],
|
||||||
|
relationships_content: List[Dict],
|
||||||
|
components_content: List[Dict],
|
||||||
|
causal_chains: List[Dict] = None,
|
||||||
|
key_entities: List[Dict] = None
|
||||||
|
) -> Dict:
|
||||||
|
"""Organize retrieved content into a structured format."""
|
||||||
|
return {
|
||||||
|
"key_concepts": list(key_concepts),
|
||||||
|
"overview": [r.get("payload", {}) for r in overview_content],
|
||||||
|
"concepts": [r.get("payload", {}) for r in concepts_content],
|
||||||
|
"processes": [r.get("payload", {}) for r in processes_content],
|
||||||
|
"relationships": [r.get("payload", {}) for r in relationships_content],
|
||||||
|
"components": [r.get("payload", {}) for r in components_content],
|
||||||
|
"causal_chains": causal_chains or [],
|
||||||
|
"key_entities": key_entities or [],
|
||||||
|
}
|
||||||
|
|
||||||
|
def _claude_generate_report(
|
||||||
|
self,
|
||||||
|
job_id: str,
|
||||||
|
relations: List[CausalRelation],
|
||||||
|
organized_content: Dict,
|
||||||
|
kg_summary: Dict
|
||||||
|
) -> str:
|
||||||
|
"""Generate report using Claude AI."""
|
||||||
|
|
||||||
|
# Build KG summary text
|
||||||
|
kg_summary_text = self._build_kg_summary(relations, organized_content)
|
||||||
|
|
||||||
|
# Build system prompt
|
||||||
|
system_prompt = """You are an expert technical writer specializing in creating beginner-friendly onboarding documentation for new team members.
|
||||||
|
|
||||||
|
Your goal is to explain complex project information in simple, clear language that anyone can understand, even without technical background.
|
||||||
|
|
||||||
|
Guidelines:
|
||||||
|
- Use simple, clear language - avoid jargon or explain it when necessary
|
||||||
|
- Use examples and analogies to make concepts relatable
|
||||||
|
- Structure information logically (basics first, then advanced)
|
||||||
|
- Make it engaging and easy to follow
|
||||||
|
- Cover all important aspects comprehensively
|
||||||
|
- Write in a friendly, welcoming tone
|
||||||
|
- Use headings, bullet points, and clear sections
|
||||||
|
- Explain "why" not just "what"
|
||||||
|
|
||||||
|
Generate a comprehensive onboarding document that helps a new team member understand the entire project."""
|
||||||
|
|
||||||
|
# Format causal chains from Neo4j
|
||||||
|
causal_chains_text = self._format_causal_chains(organized_content.get('causal_chains', []))
|
||||||
|
key_entities_text = self._format_key_entities(organized_content.get('key_entities', []))
|
||||||
|
|
||||||
|
# Build user prompt
|
||||||
|
user_prompt = f"""Generate a comprehensive, beginner-friendly onboarding document for this project.
|
||||||
|
|
||||||
|
KNOWLEDGE GRAPH SUMMARY:
|
||||||
|
{kg_summary_text}
|
||||||
|
|
||||||
|
IMPORTANT RELATIONSHIPS:
|
||||||
|
{self._format_relationships(relations[:50])} # Top 50 relationships
|
||||||
|
|
||||||
|
CAUSAL CHAINS (from Knowledge Graph):
|
||||||
|
{causal_chains_text}
|
||||||
|
|
||||||
|
KEY ENTITIES (from Knowledge Graph):
|
||||||
|
{key_entities_text}
|
||||||
|
|
||||||
|
KEY CONCEPTS:
|
||||||
|
{', '.join(organized_content.get('key_concepts', [])[:30])}
|
||||||
|
|
||||||
|
REQUIRED SECTIONS:
|
||||||
|
1. Project Overview
|
||||||
|
- What is this project about?
|
||||||
|
- Main purpose and goals
|
||||||
|
- Key stakeholders or users
|
||||||
|
|
||||||
|
2. Core Concepts (Explained Simply)
|
||||||
|
- Explain each important concept in simple terms
|
||||||
|
- Why each concept matters
|
||||||
|
- How concepts relate to each other
|
||||||
|
|
||||||
|
3. How Things Work Together
|
||||||
|
- System flow (simple explanation)
|
||||||
|
- Key processes and workflows
|
||||||
|
- Dependencies explained simply
|
||||||
|
|
||||||
|
4. Important Relationships
|
||||||
|
- Cause → Effect relationships (explained in plain language)
|
||||||
|
- "When X happens, Y occurs because..."
|
||||||
|
- Visual flow if possible (describe it)
|
||||||
|
|
||||||
|
5. Key Components
|
||||||
|
- Main modules/systems/components
|
||||||
|
- What each does (beginner-friendly)
|
||||||
|
- How they interact
|
||||||
|
|
||||||
|
6. Getting Started
|
||||||
|
- Where to start learning
|
||||||
|
- What to understand first
|
||||||
|
- Recommended learning path
|
||||||
|
|
||||||
|
7. Common Questions
|
||||||
|
- FAQ based on the knowledge graph
|
||||||
|
- Answers in simple terms
|
||||||
|
|
||||||
|
Generate the complete onboarding document in Markdown format. Make it comprehensive, beginner-friendly, and easy to follow."""
|
||||||
|
|
||||||
|
try:
|
||||||
|
message = self.client.messages.create(
|
||||||
|
model=self.model,
|
||||||
|
max_tokens=self.max_output_tokens,
|
||||||
|
temperature=0.3, # Slightly creative but focused
|
||||||
|
system=system_prompt,
|
||||||
|
messages=[
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": user_prompt
|
||||||
|
}
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
content_blocks = message.content or []
|
||||||
|
report_text = "".join(
|
||||||
|
block.text for block in content_blocks
|
||||||
|
if hasattr(block, "text")
|
||||||
|
)
|
||||||
|
|
||||||
|
if not report_text:
|
||||||
|
logger.warning("Empty report generated")
|
||||||
|
return "# Project Onboarding Guide\n\nNo content available."
|
||||||
|
|
||||||
|
logger.info("Generated onboarding report (%d characters)", len(report_text))
|
||||||
|
return report_text
|
||||||
|
|
||||||
|
except BadRequestError as e:
|
||||||
|
# Handle API credit/authentication errors gracefully
|
||||||
|
error_msg = str(e)
|
||||||
|
if "credit balance" in error_msg.lower() or "too low" in error_msg.lower():
|
||||||
|
logger.error("Claude API credit balance too low. Cannot generate report.")
|
||||||
|
raise ValueError("Claude API credit balance is too low. Please add credits to your Anthropic account to generate reports.")
|
||||||
|
elif "invalid_request_error" in error_msg.lower():
|
||||||
|
logger.error("Claude API invalid request: %s", error_msg)
|
||||||
|
raise ValueError(f"Claude API request failed: {error_msg}")
|
||||||
|
else:
|
||||||
|
raise
|
||||||
|
except Exception as e:
|
||||||
|
logger.exception("Failed to generate report: %s", e)
|
||||||
|
raise
|
||||||
|
|
||||||
|
def _build_kg_summary(
|
||||||
|
self,
|
||||||
|
relations: List[CausalRelation],
|
||||||
|
organized_content: Dict
|
||||||
|
) -> str:
|
||||||
|
"""Build a text summary of the knowledge graph."""
|
||||||
|
summary_parts = [
|
||||||
|
f"Total Relationships: {len(relations)}",
|
||||||
|
f"Total Concepts: {len(organized_content.get('key_concepts', []))}",
|
||||||
|
"",
|
||||||
|
"Top Relationships:",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Show top relationships by confidence
|
||||||
|
top_relations = sorted(relations, key=lambda r: r.confidence, reverse=True)[:20]
|
||||||
|
for i, rel in enumerate(top_relations, 1):
|
||||||
|
summary_parts.append(
|
||||||
|
f"{i}. {rel.cause} → {rel.effect} "
|
||||||
|
f"(confidence: {rel.confidence:.2f})"
|
||||||
|
)
|
||||||
|
|
||||||
|
return "\n".join(summary_parts)
|
||||||
|
|
||||||
|
def _format_relationships(self, relations: List[CausalRelation]) -> str:
|
||||||
|
"""Format relationships for the prompt."""
|
||||||
|
if not relations:
|
||||||
|
return "No relationships found."
|
||||||
|
|
||||||
|
lines = []
|
||||||
|
for rel in relations[:50]: # Limit to 50
|
||||||
|
line = f"- {rel.cause} → {rel.effect}"
|
||||||
|
if rel.explanation:
|
||||||
|
line += f" ({rel.explanation[:100]})"
|
||||||
|
lines.append(line)
|
||||||
|
|
||||||
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
def _parse_sections(self, content: str) -> Dict[str, str]:
|
||||||
|
"""Parse markdown content into sections."""
|
||||||
|
sections = {}
|
||||||
|
current_section = None
|
||||||
|
current_content = []
|
||||||
|
|
||||||
|
lines = content.split('\n')
|
||||||
|
|
||||||
|
for line in lines:
|
||||||
|
# Check if it's a heading (starts with #)
|
||||||
|
if line.strip().startswith('#'):
|
||||||
|
# Save previous section
|
||||||
|
if current_section:
|
||||||
|
sections[current_section] = '\n'.join(current_content).strip()
|
||||||
|
|
||||||
|
# Start new section
|
||||||
|
current_section = line.strip().lstrip('#').strip()
|
||||||
|
current_content = [line]
|
||||||
|
else:
|
||||||
|
if current_section:
|
||||||
|
current_content.append(line)
|
||||||
|
else:
|
||||||
|
# Content before first heading
|
||||||
|
if 'introduction' not in sections:
|
||||||
|
sections['introduction'] = line
|
||||||
|
else:
|
||||||
|
sections['introduction'] += '\n' + line
|
||||||
|
|
||||||
|
# Save last section
|
||||||
|
if current_section:
|
||||||
|
sections[current_section] = '\n'.join(current_content).strip()
|
||||||
|
|
||||||
|
return sections
|
||||||
|
|
||||||
|
def _format_causal_chains(self, causal_chains: List[Dict]) -> str:
|
||||||
|
"""Format causal chains from Neo4j for the prompt."""
|
||||||
|
if not causal_chains:
|
||||||
|
return "No causal chains found in knowledge graph."
|
||||||
|
|
||||||
|
lines = []
|
||||||
|
for i, chain_data in enumerate(causal_chains[:20], 1): # Top 20 chains
|
||||||
|
chain = chain_data.get("chain", [])
|
||||||
|
avg_confidence = chain_data.get("avg_confidence", 0.0)
|
||||||
|
|
||||||
|
if len(chain) >= 2:
|
||||||
|
chain_text = " → ".join(chain)
|
||||||
|
lines.append(f"{i}. {chain_text} (confidence: {avg_confidence:.2f})")
|
||||||
|
|
||||||
|
return "\n".join(lines) if lines else "No causal chains found."
|
||||||
|
|
||||||
|
def _format_key_entities(self, key_entities: List[Dict]) -> str:
|
||||||
|
"""Format key entities from Neo4j for the prompt."""
|
||||||
|
if not key_entities:
|
||||||
|
return "No key entities found in knowledge graph."
|
||||||
|
|
||||||
|
lines = []
|
||||||
|
for entity in key_entities[:20]: # Top 20 entities
|
||||||
|
name = entity.get("name", "")
|
||||||
|
entity_type = entity.get("type", "Entity")
|
||||||
|
relation_count = entity.get("relation_count", 0)
|
||||||
|
lines.append(f"- {name} ({entity_type}): involved in {relation_count} relationships")
|
||||||
|
|
||||||
|
return "\n".join(lines) if lines else "No key entities found."
|
||||||
|
|
||||||
|
def _convert_to_pdf(self, markdown_content: str, job_id: str) -> Optional[Path]:
|
||||||
|
"""
|
||||||
|
Convert Markdown report to PDF as per README Step 7.8.
|
||||||
|
Uses markdown + weasyprint for PDF generation.
|
||||||
|
"""
|
||||||
|
if not HAS_MARKDOWN or not HAS_WEASYPRINT:
|
||||||
|
return None
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Convert Markdown to HTML
|
||||||
|
html_content = markdown.markdown(
|
||||||
|
markdown_content,
|
||||||
|
extensions=['codehilite', 'fenced_code', 'tables']
|
||||||
|
)
|
||||||
|
|
||||||
|
# Add CSS styling
|
||||||
|
css_style = """
|
||||||
|
@page {
|
||||||
|
size: A4;
|
||||||
|
margin: 2cm;
|
||||||
|
}
|
||||||
|
body {
|
||||||
|
font-family: 'Georgia', serif;
|
||||||
|
line-height: 1.6;
|
||||||
|
color: #333;
|
||||||
|
}
|
||||||
|
h1, h2, h3, h4 {
|
||||||
|
color: #2c3e50;
|
||||||
|
margin-top: 1.5em;
|
||||||
|
margin-bottom: 0.5em;
|
||||||
|
}
|
||||||
|
h1 { font-size: 2em; border-bottom: 2px solid #3498db; padding-bottom: 0.3em; }
|
||||||
|
h2 { font-size: 1.5em; border-bottom: 1px solid #95a5a6; padding-bottom: 0.2em; }
|
||||||
|
h3 { font-size: 1.2em; }
|
||||||
|
code {
|
||||||
|
background-color: #f4f4f4;
|
||||||
|
padding: 2px 4px;
|
||||||
|
border-radius: 3px;
|
||||||
|
font-family: 'Courier New', monospace;
|
||||||
|
}
|
||||||
|
pre {
|
||||||
|
background-color: #f4f4f4;
|
||||||
|
padding: 1em;
|
||||||
|
border-radius: 5px;
|
||||||
|
overflow-x: auto;
|
||||||
|
}
|
||||||
|
table {
|
||||||
|
border-collapse: collapse;
|
||||||
|
width: 100%;
|
||||||
|
margin: 1em 0;
|
||||||
|
}
|
||||||
|
th, td {
|
||||||
|
border: 1px solid #ddd;
|
||||||
|
padding: 8px;
|
||||||
|
text-align: left;
|
||||||
|
}
|
||||||
|
th {
|
||||||
|
background-color: #3498db;
|
||||||
|
color: white;
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Create full HTML document
|
||||||
|
full_html = f"""
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<title>Project Onboarding Guide</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
{html_content}
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Generate PDF
|
||||||
|
settings = get_settings()
|
||||||
|
storage_root = Path(settings.storage_root)
|
||||||
|
reports_dir = storage_root / "reports"
|
||||||
|
reports_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
pdf_path = reports_dir / f"report_{job_id}.pdf"
|
||||||
|
|
||||||
|
HTML(string=full_html).write_pdf(
|
||||||
|
pdf_path,
|
||||||
|
stylesheets=[CSS(string=css_style)]
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.info("PDF report generated: %s", pdf_path)
|
||||||
|
return pdf_path
|
||||||
|
|
||||||
|
except Exception as exc:
|
||||||
|
logger.exception("Failed to convert Markdown to PDF: %s", exc)
|
||||||
|
return None
|
||||||
|
|
||||||
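# Illustrative sketch (not part of this commit): the markdown + WeasyPrint
# conversion used by ReportGenerator._convert_to_pdf above, reduced to its core.
# Both libraries are optional imports in this module, so this only applies when
# HAS_MARKDOWN and HAS_WEASYPRINT are True; the output path is hypothetical.
#
#   >>> import markdown
#   >>> from weasyprint import HTML
#   >>> html_body = markdown.markdown("# Guide\n\nHello **world**", extensions=["tables"])
#   >>> HTML(string=f"<html><body>{html_body}</body></html>").write_pdf("/tmp/guide.pdf")
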
@ -0,0 +1,269 @@
from __future__ import annotations

import logging
from typing import Dict, List, Optional
from uuid import uuid4

from ..config import get_settings
from ..models import CausalRelation

logger = logging.getLogger(__name__)

try:
    from qdrant_client import QdrantClient
    from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue
    HAS_QDRANT = True
except ImportError:
    HAS_QDRANT = False
    logger.warning("qdrant-client not available")


class VectorStore:
    """Qdrant vector database client for storing KG embeddings."""

    def __init__(
        self,
        url: str | None = None,
        collection_name: str | None = None,
        vector_size: int | None = None
    ):
        if not HAS_QDRANT:
            raise ImportError("qdrant-client is required for vector storage")

        settings = get_settings()
        self.url = url or settings.qdrant_url
        self.collection_name = collection_name or settings.qdrant_collection_name
        self.vector_size = vector_size or settings.qdrant_vector_size

        logger.info("Connecting to Qdrant at %s", self.url)
        try:
            self.client = QdrantClient(url=self.url)
            logger.info("Connected to Qdrant")
        except Exception as exc:
            logger.exception("Failed to connect to Qdrant: %s", exc)
            raise

        # Ensure collection exists
        self._ensure_collection()

    def _ensure_collection(self) -> None:
        """Create collection if it doesn't exist."""
        try:
            collections = self.client.get_collections()
            collection_names = [col.name for col in collections.collections]

            if self.collection_name not in collection_names:
                logger.info("Creating Qdrant collection: %s", self.collection_name)
                try:
                    self.client.create_collection(
                        collection_name=self.collection_name,
                        vectors_config=VectorParams(
                            size=self.vector_size,
                            distance=Distance.COSINE
                        )
                    )
                    logger.info("Created collection: %s", self.collection_name)
                except Exception as create_exc:
                    # Collection might have been created by another instance
                    if "already exists" in str(create_exc).lower() or "409" in str(create_exc):
                        logger.info("Collection %s already exists (created by another instance)", self.collection_name)
                    else:
                        raise
            else:
                logger.debug("Collection %s already exists", self.collection_name)
        except Exception as exc:
            logger.exception("Failed to ensure collection: %s", exc)
            raise

    def store_relation(
        self,
        relation: CausalRelation,
        embedding: List[float],
        job_id: str
    ) -> str:
        """Store a relationship embedding in Qdrant."""
        point_id = str(uuid4())

        payload = {
            "job_id": job_id,
            "cause": relation.cause,
            "effect": relation.effect,
            "confidence": relation.confidence,
            "source_file_id": relation.source_file_id or "",
            "source_snippet": relation.source_snippet or "",
            "explanation": relation.explanation or "",
        }

        point = PointStruct(
            id=point_id,
            vector=embedding,
            payload=payload
        )

        try:
            self.client.upsert(
                collection_name=self.collection_name,
                points=[point]
            )
            logger.debug("Stored relation embedding: %s -> %s", relation.cause, relation.effect)
            return point_id
        except Exception as exc:
            logger.warning("Failed to store relation: %s", exc)
            return ""

    def store_concept(
        self,
        concept_name: str,
        embedding: List[float],
        job_id: str,
        description: str | None = None
    ) -> str:
        """Store a concept/node embedding in Qdrant."""
        point_id = str(uuid4())

        payload = {
            "job_id": job_id,
            "concept_name": concept_name,
            "description": description or "",
            "type": "concept"
        }

        point = PointStruct(
            id=point_id,
            vector=embedding,
            payload=payload
        )

        try:
            self.client.upsert(
                collection_name=self.collection_name,
                points=[point]
            )
            logger.debug("Stored concept embedding: %s", concept_name)
            return point_id
        except Exception as exc:
            logger.warning("Failed to store concept: %s", exc)
            return ""

    def search(
        self,
        query_embedding: List[float],
        job_id: str | None = None,
        top_k: int = 10,
        score_threshold: float = 0.5
    ) -> List[Dict]:
        """Search for similar vectors in Qdrant."""
        try:
            # Build filter if job_id is provided
            query_filter = None
            if job_id:
                query_filter = Filter(
                    must=[
                        FieldCondition(
                            key="job_id",
                            match=MatchValue(value=job_id)
                        )
                    ]
                )

            # Use the collections API for search
            # Check if client has search method (newer versions) or use query_points (older)
            if hasattr(self.client, 'search'):
                results = self.client.search(
                    collection_name=self.collection_name,
                    query_vector=query_embedding,
                    query_filter=query_filter,
                    limit=top_k,
                    score_threshold=score_threshold
                )
            elif hasattr(self.client, 'query_points'):
                # Fallback for older API
                results = self.client.query_points(
                    collection_name=self.collection_name,
                    query=query_embedding,
                    query_filter=query_filter,
                    top=top_k,
                    score_threshold=score_threshold
                )
            else:
                # Try using the collection directly
                collection = self.client.get_collection(self.collection_name)
                if hasattr(collection, 'search'):
                    results = collection.search(
                        query_vector=query_embedding,
                        query_filter=query_filter,
                        limit=top_k,
                        score_threshold=score_threshold
                    )
                else:
                    logger.error("QdrantClient does not have search or query_points method")
                    return []

            # Convert to list of dicts
            search_results = []
            for result in results:
                search_results.append({
                    "id": str(result.id),
                    "score": result.score,
                    "payload": result.payload
                })

            return search_results

        except Exception as exc:
            logger.warning("Vector search failed: %s", exc)
            import traceback
            logger.debug("Search error traceback: %s", traceback.format_exc())
            return []

    def search_by_text(
        self,
        query_text: str,
        embedder,
        job_id: str | None = None,
        top_k: int = 10
    ) -> List[Dict]:
        """Search using text query (embeds it first)."""
        query_embedding = embedder.embed_text(query_text)
        return self.search(query_embedding, job_id=job_id, top_k=top_k)

    def delete_job_vectors(self, job_id: str) -> int:
        """Delete all vectors for a specific job."""
        try:
            # Qdrant doesn't have a direct delete by filter here, so we:
            # 1. Scroll for all points with this job_id
            # 2. Delete them by ID
            # For very large jobs you might want to page through the scroll API in batches.
            query_filter = Filter(
                must=[
                    FieldCondition(
                        key="job_id",
                        match=MatchValue(value=job_id)
                    )
                ]
            )

            # Scroll to get all points
            points, _ = self.client.scroll(
                collection_name=self.collection_name,
                scroll_filter=query_filter,
                limit=10000  # Adjust based on expected size
            )

            if points:
                point_ids = [str(point.id) for point in points]
                self.client.delete(
                    collection_name=self.collection_name,
                    points_selector=point_ids
                )
                logger.info("Deleted %d vectors for job %s", len(point_ids), job_id)
                return len(point_ids)

            return 0

        except Exception as exc:
            logger.warning("Failed to delete job vectors: %s", exc)
            return 0

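# Illustrative usage sketch (not part of this commit): indexing one validated
# relation and querying it back. "Embedder" and "relation" below are hypothetical
# stand-ins for the pipeline's embedder instance and a previously extracted
# CausalRelation; any embedder returning vectors of settings.qdrant_vector_size
# would fit the same calls.
#
#   >>> store = VectorStore()                  # reads qdrant_url from settings
#   >>> embedder = Embedder()                  # hypothetical embedder instance
#   >>> vec = embedder.embed_text(f"{relation.cause} -> {relation.effect}")
#   >>> point_id = store.store_relation(relation, vec, job_id="job-42")
#   >>> hits = store.search_by_text("why did the backup fail?", embedder, job_id="job-42", top_k=3)
#   >>> [h["payload"]["effect"] for h in hits]   # payloads carry cause/effect/snippet
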
@ -4,14 +4,19 @@ import logging
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, List
|
from typing import Iterable, List
|
||||||
|
|
||||||
from ..claude_client import ClaudeCausalExtractor
|
|
||||||
from ..config import get_settings
|
from ..config import get_settings
|
||||||
from ..extractors.auto import extract_text
|
|
||||||
from ..extractors.image_extractor import extract_images_from_file
|
from ..extractors.image_extractor import extract_images_from_file
|
||||||
|
from ..extractors.pymupdf_extractor import extract_all_text, extract_text_with_context
|
||||||
|
from ..extractors.qwen_vision import QwenVisionClient
|
||||||
from ..jobs import JobStore
|
from ..jobs import JobStore
|
||||||
from ..models import CausalRelation, JobStage
|
from ..models import CausalRelation, JobStage
|
||||||
from ..processors.chunker import TextChunker
|
from ..processors.dowhy_analyzer import DoWhyAnalyzer
|
||||||
|
from ..processors.embedder import Embedder
|
||||||
|
from ..processors.entity_resolver import EntityResolver
|
||||||
from ..processors.graph_writer import GraphWriter
|
from ..processors.graph_writer import GraphWriter
|
||||||
|
from ..processors.relationship_extractor import RelationshipExtractor
|
||||||
|
from ..processors.report_generator import ReportGenerator
|
||||||
|
from ..processors.vector_store import VectorStore
|
||||||
from ..storage import StorageManager
|
from ..storage import StorageManager
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@ -23,31 +28,60 @@ class JobPipeline:
|
|||||||
job_store: JobStore,
|
job_store: JobStore,
|
||||||
storage: StorageManager,
|
storage: StorageManager,
|
||||||
graph_writer: GraphWriter,
|
graph_writer: GraphWriter,
|
||||||
claude_extractor: ClaudeCausalExtractor,
|
|
||||||
):
|
):
|
||||||
self.job_store = job_store
|
self.job_store = job_store
|
||||||
self.storage = storage
|
self.storage = storage
|
||||||
self.graph_writer = graph_writer
|
self.graph_writer = graph_writer
|
||||||
self.claude_extractor = claude_extractor
|
|
||||||
settings = get_settings()
|
settings = get_settings()
|
||||||
self.chunker = TextChunker(
|
|
||||||
model_name=settings.claude_model,
|
# Initialize extractors
|
||||||
token_target=settings.chunk_token_target,
|
self.qwen_client = QwenVisionClient() # Only for images/diagrams
|
||||||
overlap=settings.chunk_token_overlap,
|
self.relationship_extractor = RelationshipExtractor() # NLP (SpaCy) + Claude AI for text (as per README)
|
||||||
)
|
self.entity_resolver = EntityResolver() # Claude AI entity resolution (as per README Stage 4)
|
||||||
|
|
||||||
|
# Initialize processors
|
||||||
|
try:
|
||||||
|
self.dowhy_analyzer = DoWhyAnalyzer() if settings.dowhy_enabled else None
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("DoWhy not available: %s", e)
|
||||||
|
self.dowhy_analyzer = None
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.embedder = Embedder()
|
||||||
|
self.vector_store = VectorStore()
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Vector store not available: %s", e)
|
||||||
|
self.embedder = None
|
||||||
|
self.vector_store = None
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.report_generator = ReportGenerator()
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Report generator not available: %s", e)
|
||||||
|
self.report_generator = None
|
||||||
|
|
||||||
def process_job(self, job_id: str, saved_files: Iterable[str]) -> None:
|
def process_job(self, job_id: str, saved_files: Iterable[str]) -> None:
|
||||||
job = self.job_store.get(job_id)
|
job = self.job_store.get(job_id)
|
||||||
logger.info("Processing job %s with %d files", job_id, job.total_files)
|
logger.info("Processing job %s with %d files", job_id, job.total_files)
|
||||||
|
|
||||||
relations: List[CausalRelation] = []
|
all_text_content: List[str] = []
|
||||||
|
all_relations: List[CausalRelation] = []
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.job_store.update(job_id, stage=JobStage.EXTRACTING, status_message="Extracting content")
|
# ============================================================
|
||||||
|
# STEP 1: CONTENT EXTRACTION (PyMuPDF + Qwen2.5-VL)
|
||||||
|
# ============================================================
|
||||||
|
self.job_store.update(
|
||||||
|
job_id,
|
||||||
|
stage=JobStage.EXTRACTING,
|
||||||
|
status_message="Extracting content from documents"
|
||||||
|
)
|
||||||
|
|
||||||
for count, file_path in enumerate(saved_files, start=1):
|
for count, file_path in enumerate(saved_files, start=1):
|
||||||
file_path_obj = Path(file_path)
|
file_path_obj = Path(file_path)
|
||||||
file_record = next((f for f in job.files if f.stored_path == file_path), None)
|
file_record = next((f for f in job.files if f.stored_path == file_path), None)
|
||||||
logger.info("Processing %s", file_path_obj.name)
|
logger.info("Processing %s (%d/%d)", file_path_obj.name, count, job.total_files)
|
||||||
source_file_id = file_record.id if file_record else file_path_obj.name
|
source_file_id = file_record.id if file_record else file_path_obj.name
|
||||||
suffix = file_path_obj.suffix.lower()
|
suffix = file_path_obj.suffix.lower()
|
||||||
|
|
||||||
@ -55,27 +89,36 @@ class JobPipeline:
|
|||||||
is_direct_image = suffix in {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}
|
is_direct_image = suffix in {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Extract text from document (if not a direct image)
|
# Step 2.1: IDENTIFY FILE TYPE and route to appropriate extractor
|
||||||
|
# Step 2.2: Extract text based on file type (as per README)
|
||||||
text = ""
|
text = ""
|
||||||
if not is_direct_image:
|
if not is_direct_image:
|
||||||
try:
|
try:
|
||||||
text = extract_text(file_path_obj)
|
# extract_all_text() handles routing:
|
||||||
|
# - PDF → PyMuPDF (Step 2.2a)
|
||||||
|
# - DOCX → python-docx (Step 2.2b)
|
||||||
|
# - PPTX → python-pptx (Step 2.2c)
|
||||||
|
# - CSV/XLSX → pandas (Step 2.2d)
|
||||||
|
# - Text files → direct read
|
||||||
|
# Also performs Step 2.3: Text cleaning
|
||||||
|
text = extract_all_text(file_path_obj)
|
||||||
|
|
||||||
# Process text if available
|
|
||||||
if text and text.strip():
|
if text and text.strip():
|
||||||
# Validate text is readable
|
# Validate text is readable (basic check)
|
||||||
printable_chars = sum(1 for c in text if c.isprintable() or c.isspace())
|
printable_chars = sum(1 for c in text if c.isprintable() or c.isspace())
|
||||||
total_chars = len(text)
|
total_chars = len(text)
|
||||||
if total_chars > 100 and printable_chars / total_chars < 0.3:
|
if total_chars > 100 and printable_chars / total_chars < 0.3:
|
||||||
logger.warning("Text from %s appears to be binary, skipping text processing", file_path_obj.name)
|
logger.warning("Text from %s appears to be binary, skipping", file_path_obj.name)
|
||||||
text = ""
|
text = ""
|
||||||
else:
|
else:
|
||||||
|
# Step 2.4: STORE EXTRACTED TEXT
|
||||||
|
all_text_content.append(text)
|
||||||
extracted_path = self.storage.stage_extracted_content(job_id, file_path_obj.name, text)
|
extracted_path = self.storage.stage_extracted_content(job_id, file_path_obj.name, text)
|
||||||
if file_record:
|
if file_record:
|
||||||
file_record.extracted_path = str(extracted_path)
|
file_record.extracted_path = str(extracted_path)
|
||||||
logger.info("Successfully extracted %d characters from %s", len(text), file_path_obj.name)
|
logger.info("Extracted %d characters from %s", len(text), file_path_obj.name)
|
||||||
except Exception as text_exc:
|
except Exception as text_exc:
|
||||||
logger.warning("Text extraction failed for %s: %s. Will continue with image extraction if available.", file_path_obj.name, text_exc)
|
logger.warning("Text extraction failed for %s: %s", file_path_obj.name, text_exc)
|
||||||
text = ""
|
text = ""
|
||||||
|
|
||||||
# Extract images from documents (PDF, DOCX, PPTX)
|
# Extract images from documents (PDF, DOCX, PPTX)
|
||||||
@ -93,7 +136,25 @@ class JobPipeline:
|
|||||||
extracted_images = [file_path_obj]
|
extracted_images = [file_path_obj]
|
||||||
logger.info("Direct image upload detected: %s", file_path_obj.name)
|
logger.info("Direct image upload detected: %s", file_path_obj.name)
|
||||||
|
|
||||||
except Exception as exc: # noqa: BLE001
|
# Process images with Qwen2.5-VL
|
||||||
|
if extracted_images:
|
||||||
|
for image_path in extracted_images:
|
||||||
|
try:
|
||||||
|
qwen_results = self.qwen_client.extract_relationships_from_image(
|
||||||
|
image_path, source_file_id
|
||||||
|
)
|
||||||
|
if qwen_results:
|
||||||
|
# Convert Qwen results to CausalRelation objects
|
||||||
|
qwen_relations = self.relationship_extractor.extract_from_qwen_results(
|
||||||
|
qwen_results, source_file_id
|
||||||
|
)
|
||||||
|
all_relations.extend(qwen_relations)
|
||||||
|
logger.info("Extracted %d relations from image %s using Qwen2.5-VL",
|
||||||
|
len(qwen_relations), image_path.name)
|
||||||
|
except Exception as img_exc:
|
||||||
|
logger.warning("Failed to analyze image %s with Qwen: %s", image_path, img_exc)
|
||||||
|
|
||||||
|
except Exception as exc:
|
||||||
logger.exception("Extraction failed for %s", file_path_obj)
|
logger.exception("Extraction failed for %s", file_path_obj)
|
||||||
if file_record:
|
if file_record:
|
||||||
file_record.error = str(exc)
|
file_record.error = str(exc)
|
||||||
@ -103,62 +164,188 @@ class JobPipeline:
                 job_id,
                 files=job.files,
                 processed_files=count,
-                status_message=f"Analyzing causal relations ({count}/{job.total_files})",
-                stage=JobStage.ANALYZING,
+                status_message=f"Extracting content ({count}/{job.total_files})",
             )

-            # Process text content
-            if text and text.strip():
-                chunks = self.chunker.chunk(text)
-                text_relations = self.claude_extractor.analyze(chunks, source_file_id=source_file_id)
-                relations.extend(text_relations)
-                logger.info("Extracted %d relations from text in %s", len(text_relations), file_path_obj.name)
-
-            # Process images (extracted from documents or direct uploads)
-            if extracted_images:
-                for image_path in extracted_images:
-                    try:
-                        image_relations = self.claude_extractor.analyze_image(image_path, source_file_id=source_file_id)
-                        relations.extend(image_relations)
-                        logger.info("Extracted %d relations from image %s", len(image_relations), image_path.name)
-                    except Exception as img_exc:
-                        logger.warning("Failed to analyze image %s: %s", image_path, img_exc)
-                        # Continue with other images
-            elif not text or not text.strip():
-                # No text and no images - file might be empty or unsupported
-                logger.warning("File %s has no extractable text or images", file_path_obj.name)
-                if file_record:
-                    file_record.error = "No extractable content found (no text or images)"
-
-            # Write relations to Neo4j if any were found
-            if relations:
-                self.job_store.update(job_id, status_message="Writing to knowledge graph", stage=JobStage.BUILDING_GRAPH)
+            # ============================================================
+            # STEP 2: RELATIONSHIP EXTRACTION (NLP + Claude AI as per README)
+            # ============================================================
+            logger.info("Extracting relationships from text content using NLP (SpaCy) + Claude AI")
+            combined_text = "\n\n".join(all_text_content)
+
+            if combined_text.strip():
+                # Extract relationships using NLP (Step 3.1) + Claude AI (Step 3.2)
+                # This implements the flow described in README.md
+                text_relations = self.relationship_extractor.extract_from_text(
+                    combined_text,
+                    source_file_id="combined_text"
+                )
+                all_relations.extend(text_relations)
+                logger.info("NLP + Claude AI extracted %d relationships from text", len(text_relations))
+
+            # ============================================================
+            # STEP 3: ENTITY RESOLUTION (Claude AI as per README Stage 4)
+            # ============================================================
+            if all_relations and self.entity_resolver.client:
+                logger.info("Resolving entities using Claude AI")
+                resolved_entities = self.entity_resolver.resolve_entities(all_relations)
+                if resolved_entities:
+                    # Apply resolution to relationships
+                    all_relations = self.entity_resolver.apply_resolution_to_relations(
+                        all_relations, resolved_entities
+                    )
+                    logger.info("Entity resolution completed: %d canonical entities", len(resolved_entities))
+                else:
+                    logger.info("Entity resolution returned no results")
+            else:
+                if not self.entity_resolver.client:
+                    logger.info("Entity resolution skipped (Claude AI not available)")
+
+            # ============================================================
+            # STEP 4: DOWHY VALIDATION
+            # ============================================================
+            if self.dowhy_analyzer and all_relations:
+                self.job_store.update(
+                    job_id,
+                    status_message="Validating relationships with DoWhy",
+                    stage=JobStage.BUILDING_GRAPH
+                )
+                logger.info("Validating %d relationships with DoWhy", len(all_relations))
+                validated_relations = self.dowhy_analyzer.validate_relationships(
+                    all_relations,
+                    text_data=combined_text
+                )
+                all_relations = validated_relations
+                logger.info("DoWhy validated %d relationships", len(all_relations))
+            else:
+                if not self.dowhy_analyzer:
+                    logger.info("DoWhy validation skipped (not available)")
+                self.job_store.update(
+                    job_id,
+                    status_message="Building knowledge graph",
+                    stage=JobStage.BUILDING_GRAPH
+                )
+
+            # ============================================================
+            # STEP 5: WRITE TO NEO4J (Documents, Entities, Relationships)
+            # ============================================================
+            if all_relations:
                 try:
-                    self.graph_writer.write_relations(job_id, relations)
-                    logger.info("Wrote %d relations to Neo4j for job %s", len(relations), job_id)
-                    status_message = f"Completed with {len(relations)} causal relationship(s) written to Neo4j"
+                    # Write documents, entities, and relationships with types
+                    self.graph_writer.write_relations(job_id, all_relations, files=job.files)
+                    logger.info("Wrote %d relations to Neo4j for job %s", len(all_relations), job_id)
                 except Exception as graph_exc:
-                    logger.exception("Failed to write relations to Neo4j for job %s: %s", job_id, graph_exc)
-                    status_message = f"Completed with {len(relations)} relations extracted, but failed to write to Neo4j: {graph_exc}"
-            else:
-                logger.warning("Job %s completed with 0 relations - no causal relationships found", job_id)
-                # Check if any files failed to extract
-                failed_files = [f for f in job.files if f.error]
-                if failed_files:
-                    status_message = f"Completed but {len(failed_files)} file(s) failed to extract. No relations found."
-                else:
-                    status_message = "Completed but no causal relationships were found in the documents."
-
-            # Final update
+                    logger.exception("Failed to write relations to Neo4j: %s", graph_exc)
+                    raise
+
+            # ============================================================
+            # STEP 6: VECTOR DATABASE INDEXING (Qdrant)
+            # ============================================================
+            if self.vector_store and self.embedder and all_relations:
+                self.job_store.update(
+                    job_id,
+                    status_message="Indexing knowledge graph in vector database",
+                    stage=JobStage.INDEXING_VECTORS
+                )
+                logger.info("Indexing %d relationships in Qdrant", len(all_relations))
+
+                indexed_count = 0
+                for relation in all_relations:
+                    try:
+                        # Generate embedding for the relationship
+                        embedding = self.embedder.embed_relation(
+                            relation.cause,
+                            relation.effect,
+                            relation.explanation
+                        )
+
+                        # Store in Qdrant
+                        self.vector_store.store_relation(relation, embedding, job_id)
+                        indexed_count += 1
+                    except Exception as e:
+                        logger.warning("Failed to index relation %s -> %s: %s",
+                                       relation.cause, relation.effect, e)
+
+                # Also index concepts (nodes)
+                concepts = set()
+                for rel in all_relations:
+                    concepts.add(rel.cause)
+                    concepts.add(rel.effect)
+
+                for concept in concepts:
+                    try:
+                        embedding = self.embedder.embed_concept(concept)
+                        self.vector_store.store_concept(concept, embedding, job_id)
+                    except Exception as e:
+                        logger.warning("Failed to index concept %s: %s", concept, e)
+
+                logger.info("Indexed %d relationships and %d concepts in Qdrant",
+                            indexed_count, len(concepts))
+
+            # ============================================================
+            # STEP 7: GENERATE ONBOARDING REPORT
+            # ============================================================
+            if self.report_generator and self.vector_store and self.embedder:
+                self.job_store.update(
+                    job_id,
+                    status_message="Generating beginner-friendly onboarding report",
+                    stage=JobStage.GENERATING_REPORT
+                )
+                logger.info("Generating onboarding report for job %s", job_id)
+
+                try:
+                    kg_summary = {
+                        "total_relations": len(all_relations),
+                        "total_files": job.total_files,
+                        "processed_files": job.processed_files
+                    }
+
+                    report = self.report_generator.generate_onboarding_report(
+                        job_id=job_id,
+                        relations=all_relations,
+                        vector_store=self.vector_store,
+                        embedder=self.embedder,
+                        graph_writer=self.graph_writer,  # Pass graph_writer for Neo4j queries
+                        kg_summary=kg_summary
+                    )
+
+                    logger.info("Generated onboarding report: %d sections, %d pages",
+                                len(report.sections), report.total_pages)
+
+                except Exception as report_exc:
+                    logger.exception("Failed to generate report: %s", report_exc)
+                    report = None
+                    # Store report generation error in job metadata
+                    report_error_msg = str(report_exc)
+                    if "credit balance" in report_error_msg.lower() or "too low" in report_error_msg.lower():
+                        report_error_msg = "Report generation failed: Claude API credit balance is too low. Please add credits to your Anthropic account."
+                    self.job_store.update(
+                        job_id,
+                        error=f"Report generation failed: {report_error_msg}"
+                    )
+            else:
+                logger.warning("Report generation skipped (components not available)")
+                report = None
+
+            # ============================================================
+            # FINAL UPDATE
+            # ============================================================
+            status_message = f"Completed successfully"
+            if all_relations:
+                status_message += f" with {len(all_relations)} relationships"
+            if report:
+                status_message += f" and generated onboarding report"
+
             self.job_store.update(
                 job_id,
                 stage=JobStage.COMPLETED,
                 status_message=status_message,
-                relations=relations,
+                relations=all_relations,
+                report=report,
                 processed_files=job.total_files,
             )
-            logger.info("Job %s completed with %d relations", job_id, len(relations))
-        except Exception as exc:  # noqa: BLE001
+            logger.info("Job %s completed successfully", job_id)
+
+        except Exception as exc:
             logger.exception("Job %s failed: %s", job_id, exc)
             self.job_store.mark_error(job_id, f"Pipeline failed: {exc}")