Added Qdrant DB in multi-document service

This commit is contained in:
Pradeep 2025-12-01 09:04:09 +05:30
parent 603e9b4b20
commit 72fea0dee8
35 changed files with 5398 additions and 1765 deletions

View File

@ -196,27 +196,45 @@ services:
# retries: 5
# start_period: 60s
chromadb:
image: chromadb/chroma:latest
container_name: pipeline_chromadb
# chromadb:
# image: chromadb/chroma:latest
# container_name: pipeline_chromadb
# ports:
# - "8010:8000"
# environment:
# - CHROMA_SERVER_HOST=0.0.0.0
# - CHROMA_SERVER_HTTP_PORT=8000
# - IS_PERSISTENT=TRUE
# - PERSIST_DIRECTORY=/chroma/chroma
# - ANONYMIZED_TELEMETRY=TRUE
# volumes:
# - chromadb_data:/chroma/chroma
# networks:
# - pipeline_network
# healthcheck:
# test: ["CMD-SHELL", "timeout 5 bash -c '</dev/tcp/127.0.0.1/8000' || exit 1"]
# interval: 15s
# timeout: 10s
# retries: 3
# start_period: 30s
qdrant:
image: qdrant/qdrant:latest
container_name: pipeline_qdrant
ports:
- "8010:8000"
environment:
- CHROMA_SERVER_HOST=0.0.0.0
- CHROMA_SERVER_HTTP_PORT=8000
- IS_PERSISTENT=TRUE
- PERSIST_DIRECTORY=/chroma/chroma
- ANONYMIZED_TELEMETRY=TRUE
- "6333:6333"
- "6334:6334"
volumes:
- chromadb_data:/chroma/chroma
- qdrant_data:/qdrant/storage
networks:
- pipeline_network
healthcheck:
test: ["CMD-SHELL", "timeout 5 bash -c '</dev/tcp/127.0.0.1/8000' || exit 1"]
interval: 15s
test: ["CMD-SHELL", "timeout 2 bash -c '</dev/tcp/127.0.0.1/6333' || exit 1"]
interval: 30s
timeout: 10s
retries: 3
retries: 5
start_period: 30s
restart: unless-stopped
@ -294,97 +312,97 @@ services:
start_period: 40s
restart: unless-stopped
requirement-processor:
build: ./services/requirement-processor
container_name: pipeline_requirement_processor
ports:
- "8001:8001"
environment:
- POSTGRES_HOST=postgres
- POSTGRES_PORT=5432
- POSTGRES_DB=dev_pipeline
- POSTGRES_USER=pipeline_admin
- POSTGRES_PASSWORD=secure_pipeline_2024
- DATABASE_URL=postgresql://pipeline_admin:secure_pipeline_2024@postgres:5432/dev_pipeline
- REDIS_HOST=redis
- REDIS_PORT=6379
- REDIS_PASSWORD=redis_secure_2024
- MONGODB_HOST=mongodb
- MONGODB_PORT=27017
- NEO4J_URI=bolt://neo4j:7687
- NEO4J_USER=neo4j
- NEO4J_PASSWORD=password
- CHROMA_HOST=chromadb
- CHROMA_PORT=8000
- REDIS_URL=redis://:redis_secure_2024@redis:6379
networks:
- pipeline_network
depends_on:
postgres:
condition: service_healthy
redis:
condition: service_healthy
mongodb:
condition: service_started
migrations:
condition: service_completed_successfully
# requirement-processor:
# build: ./services/requirement-processor
# container_name: pipeline_requirement_processor
# ports:
# - "8001:8001"
# environment:
# - POSTGRES_HOST=postgres
# - POSTGRES_PORT=5432
# - POSTGRES_DB=dev_pipeline
# - POSTGRES_USER=pipeline_admin
# - POSTGRES_PASSWORD=secure_pipeline_2024
# - DATABASE_URL=postgresql://pipeline_admin:secure_pipeline_2024@postgres:5432/dev_pipeline
# - REDIS_HOST=redis
# - REDIS_PORT=6379
# - REDIS_PASSWORD=redis_secure_2024
# - MONGODB_HOST=mongodb
# - MONGODB_PORT=27017
# - NEO4J_URI=bolt://neo4j:7687
# - NEO4J_USER=neo4j
# - NEO4J_PASSWORD=password
# - CHROMA_HOST=chromadb
# - CHROMA_PORT=8000
# - REDIS_URL=redis://:redis_secure_2024@redis:6379
# networks:
# - pipeline_network
# depends_on:
# postgres:
# condition: service_healthy
# redis:
# condition: service_healthy
# mongodb:
# condition: service_started
# migrations:
# condition: service_completed_successfully
tech-stack-selector:
build: ./services/tech-stack-selector
container_name: pipeline_tech_stack_selector
ports:
- "8002:8002"
environment:
- POSTGRES_HOST=postgres
- POSTGRES_PORT=5432
- POSTGRES_DB=dev_pipeline
- POSTGRES_USER=pipeline_admin
- POSTGRES_PASSWORD=secure_pipeline_2024
- REDIS_HOST=redis
- REDIS_PORT=6379
- REDIS_PASSWORD=redis_secure_2024
- CLAUDE_API_KEY=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
networks:
- pipeline_network
depends_on:
postgres:
condition: service_healthy
redis:
condition: service_healthy
migrations:
condition: service_completed_successfully
# tech-stack-selector:
# build: ./services/tech-stack-selector
# container_name: pipeline_tech_stack_selector
# ports:
# - "8002:8002"
# environment:
# - POSTGRES_HOST=postgres
# - POSTGRES_PORT=5432
# - POSTGRES_DB=dev_pipeline
# - POSTGRES_USER=pipeline_admin
# - POSTGRES_PASSWORD=secure_pipeline_2024
# - REDIS_HOST=redis
# - REDIS_PORT=6379
# - REDIS_PASSWORD=redis_secure_2024
# - CLAUDE_API_KEY=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
# networks:
# - pipeline_network
# depends_on:
# postgres:
# condition: service_healthy
# redis:
# condition: service_healthy
# migrations:
# condition: service_completed_successfully
architecture-designer:
build: ./services/architecture-designer
container_name: pipeline_architecture_designer
ports:
- "8003:8003"
environment:
- PORT=8003
- HOST=0.0.0.0
- CLAUDE_API_KEY=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
- ANTHROPIC_API_KEY=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
- POSTGRES_HOST=postgres
- POSTGRES_PORT=5432
- POSTGRES_DB=dev_pipeline
- POSTGRES_USER=pipeline_admin
- POSTGRES_PASSWORD=secure_pipeline_2024
- MONGODB_HOST=mongodb
- MONGODB_PORT=27017
networks:
- pipeline_network
depends_on:
postgres:
condition: service_healthy
mongodb:
condition: service_started
migrations:
condition: service_completed_successfully
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8003/health"]
interval: 30s
timeout: 10s
retries: 3
# architecture-designer:
# build: ./services/architecture-designer
# container_name: pipeline_architecture_designer
# ports:
# - "8003:8003"
# environment:
# - PORT=8003
# - HOST=0.0.0.0
# - CLAUDE_API_KEY=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
# - ANTHROPIC_API_KEY=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
# - POSTGRES_HOST=postgres
# - POSTGRES_PORT=5432
# - POSTGRES_DB=dev_pipeline
# - POSTGRES_USER=pipeline_admin
# - POSTGRES_PASSWORD=secure_pipeline_2024
# - MONGODB_HOST=mongodb
# - MONGODB_PORT=27017
# networks:
# - pipeline_network
# depends_on:
# postgres:
# condition: service_healthy
# mongodb:
# condition: service_started
# migrations:
# condition: service_completed_successfully
# healthcheck:
# test: ["CMD", "curl", "-f", "http://localhost:8003/health"]
# interval: 30s
# timeout: 10s
# retries: 3
# code-generator:
# build: ./services/code-generator
@ -461,34 +479,34 @@ services:
migrations:
condition: service_completed_successfully
deployment-manager:
build: ./services/deployment-manager
container_name: pipeline_deployment_manager
ports:
- "8006:8006"
environment:
- POSTGRES_HOST=postgres
- POSTGRES_PORT=5432
- POSTGRES_DB=dev_pipeline
- POSTGRES_USER=pipeline_admin
- POSTGRES_PASSWORD=secure_pipeline_2024
- MONGODB_HOST=mongodb
- MONGODB_PORT=27017
- RABBITMQ_HOST=rabbitmq
- RABBITMQ_PORT=5672
- RABBITMQ_USER=pipeline_admin
- RABBITMQ_PASSWORD=rabbit_secure_2024
networks:
- pipeline_network
depends_on:
postgres:
condition: service_healthy
rabbitmq:
condition: service_healthy
mongodb:
condition: service_started
migrations:
condition: service_completed_successfully
# deployment-manager:
# build: ./services/deployment-manager
# container_name: pipeline_deployment_manager
# ports:
# - "8006:8006"
# environment:
# - POSTGRES_HOST=postgres
# - POSTGRES_PORT=5432
# - POSTGRES_DB=dev_pipeline
# - POSTGRES_USER=pipeline_admin
# - POSTGRES_PASSWORD=secure_pipeline_2024
# - MONGODB_HOST=mongodb
# - MONGODB_PORT=27017
# - RABBITMQ_HOST=rabbitmq
# - RABBITMQ_PORT=5672
# - RABBITMQ_USER=pipeline_admin
# - RABBITMQ_PASSWORD=rabbit_secure_2024
# networks:
# - pipeline_network
# depends_on:
# postgres:
# condition: service_healthy
# rabbitmq:
# condition: service_healthy
# mongodb:
# condition: service_started
# migrations:
# condition: service_completed_successfully
user-auth:
build: ./services/user-auth
@ -583,38 +601,38 @@ services:
restart: unless-stopped
# AI Mockup / Wireframe Generation Service
ai-mockup-service:
build: ./services/ai-mockup-service
container_name: pipeline_ai_mockup_service
ports:
- "8021:8021"
environment:
- PORT=8021
- HOST=0.0.0.0
- CLAUDE_API_KEY=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
- POSTGRES_HOST=postgres
- POSTGRES_PORT=5432
- POSTGRES_DB=dev_pipeline
- POSTGRES_USER=pipeline_admin
- POSTGRES_PASSWORD=secure_pipeline_2024
- REDIS_HOST=redis
- REDIS_PORT=6379
- REDIS_PASSWORD=redis_secure_2024
- JWT_ACCESS_SECRET=access-secret-key-2024-tech4biz-secure_pipeline_2024
- USER_AUTH_SERVICE_URL=http://user-auth:8011
- FLASK_ENV=development
networks:
- pipeline_network
depends_on:
postgres:
condition: service_healthy
user-auth:
condition: service_healthy
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8021/health"]
interval: 30s
timeout: 10s
retries: 3
# ai-mockup-service:
# build: ./services/ai-mockup-service
# container_name: pipeline_ai_mockup_service
# ports:
# - "8021:8021"
# environment:
# - PORT=8021
# - HOST=0.0.0.0
# - CLAUDE_API_KEY=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
# - POSTGRES_HOST=postgres
# - POSTGRES_PORT=5432
# - POSTGRES_DB=dev_pipeline
# - POSTGRES_USER=pipeline_admin
# - POSTGRES_PASSWORD=secure_pipeline_2024
# - REDIS_HOST=redis
# - REDIS_PORT=6379
# - REDIS_PASSWORD=redis_secure_2024
# - JWT_ACCESS_SECRET=access-secret-key-2024-tech4biz-secure_pipeline_2024
# - USER_AUTH_SERVICE_URL=http://user-auth:8011
# - FLASK_ENV=development
# networks:
# - pipeline_network
# depends_on:
# postgres:
# condition: service_healthy
# user-auth:
# condition: service_healthy
# healthcheck:
# test: ["CMD", "curl", "-f", "http://localhost:8021/health"]
# interval: 30s
# timeout: 10s
# retries: 3
git-integration:
build: ./services/git-integration
@ -731,7 +749,7 @@ services:
environment:
- PORT=8022
- HOST=0.0.0.0
- ANTHROPIC_API_KEY=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
- ANTHROPIC_API_KEY=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
# Neo4j Configuration
- USE_NEO4J_KG=true
@ -790,17 +808,37 @@ services:
environment:
- PORT=8024
- HOST=0.0.0.0
- ANTHROPIC_API_KEY=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
# Claude/Anthropic Configuration
- ANTHROPIC_API_KEY=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
- MULTI_DOC_CLAUDE_MODEL=claude-3-5-haiku-latest
- CLAUDE_MODEL=claude-3-5-haiku-latest
# Qwen2.5-VL API Configuration
- QWEN_API_KEY=${QWEN_API_KEY:-}
- QWEN_API_URL=${QWEN_API_URL:-https://api.example.com/v1/chat/completions}
- QWEN_MODEL=qwen2.5-vl
# Neo4j Configuration
- NEO4J_URI=bolt://neo4j:7687
- NEO4J_USER=neo4j
- NEO4J_PASSWORD=password
- NEO4J_DATABASE=neo4j
# Qdrant Configuration
- QDRANT_URL=http://qdrant:6333
- QDRANT_COLLECTION_NAME=kg_embeddings
# DoWhy Configuration
- DOWHY_ENABLED=true
- DOWHY_CONFIDENCE_THRESHOLD=0.05
# Embedding Configuration
- EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
- EMBEDDING_DIMENSION=384
# Storage Configuration
- STORAGE_DIR=/app/storage
- MULTI_DOC_STORAGE_ROOT=/app/storage
# Database configurations (optional, for job tracking)
- POSTGRES_HOST=pipeline_postgres
@ -817,6 +855,8 @@ services:
depends_on:
neo4j:
condition: service_healthy
qdrant:
condition: service_healthy
postgres:
condition: service_healthy
redis:
@ -958,6 +998,8 @@ volumes:
driver: local
multi_document_storage:
driver: local
qdrant_data:
driver: local
# =====================================
# Networks
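
Note: the compose change above only wires the new settings (QDRANT_URL, QDRANT_COLLECTION_NAME, EMBEDDING_MODEL, EMBEDDING_DIMENSION) into the multi-document service; the client code that consumes them is not part of this hunk. A minimal sketch of how those variables could be used, assuming the official qdrant-client and sentence-transformers packages; the helper names ensure_collection and index_chunk are illustrative, not taken from the service:

import os
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, PointStruct, VectorParams
from sentence_transformers import SentenceTransformer

QDRANT_URL = os.getenv("QDRANT_URL", "http://qdrant:6333")
COLLECTION = os.getenv("QDRANT_COLLECTION_NAME", "kg_embeddings")
DIMENSION = int(os.getenv("EMBEDDING_DIMENSION", "384"))

client = QdrantClient(url=QDRANT_URL)
encoder = SentenceTransformer(os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2"))

def ensure_collection() -> None:
    # Create the collection once; 384-dim cosine vectors match all-MiniLM-L6-v2.
    existing = {c.name for c in client.get_collections().collections}
    if COLLECTION not in existing:
        client.create_collection(
            collection_name=COLLECTION,
            vectors_config=VectorParams(size=DIMENSION, distance=Distance.COSINE),
        )

def index_chunk(point_id: int, text: str, payload: dict) -> None:
    # Embed the text and upsert it together with its metadata payload.
    vector = encoder.encode(text).tolist()
    client.upsert(
        collection_name=COLLECTION,
        points=[PointStruct(id=point_id, vector=vector, payload={"text": text, **payload})],
    )

ensure_collection()
index_chunk(1, "Example requirement text", {"doc_id": "demo"})  # illustrative data

The collection dimension must stay in sync with the embedding model, which is why both values travel as environment variables in the compose file.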

View File

@ -7094,8 +7094,29 @@ async def main():
js_files = [fa for fa in frontend_files if fa.path.lower().endswith(('.js', '.jsx', '.mjs', '.cjs'))]
ts_files = [fa for fa in frontend_files if fa.path.lower().endswith(('.ts', '.tsx'))]
# Allocate frontend persona
from persona_system import allocate_code_persona, build_code_analysis_persona_prompt
# Determine if it's UI or state management focused
has_state_files = len(state_files) > 0
sample_file = frontend_files[0] if frontend_files else None
sample_path = sample_file.path if sample_file else ""
sample_content = getattr(sample_file, 'content', '')[:1000] if sample_file else ""
# Allocate persona - prefer state management if state files exist
if has_state_files:
# Try to get state management persona
persona = allocate_code_persona("store/state.ts", sample_content, "frontend_state")
if "state" not in persona.get("role", "").lower():
# Fallback to UI persona
persona = allocate_code_persona(sample_path, sample_content, "frontend_ui")
else:
persona = allocate_code_persona(sample_path, sample_content, "frontend_ui")
assignment_context = f"CTO has assigned you to analyze the frontend codebase for this project. You are analyzing {len(frontend_files)} frontend files including components, routing, state management, and configuration."
front_end_prompt = f"""
You are a Senior Frontend Architect and Technical Writer with 20+ years of experience. Analyze this frontend codebase and produce a comprehensive, technically precise report. The audience includes senior engineers and stakeholders who expect evidence-based, objective findings.
Analyze this frontend codebase and produce a comprehensive, technically precise report. The audience includes senior engineers and stakeholders who expect evidence-based, objective findings.
STRICT STYLE RULES:
- Use professional, technical language only. Do not use analogies, metaphors, storytelling, or colloquial comparisons.
@ -7211,6 +7232,9 @@ FINAL REQUIREMENTS:
- Ensure total length between 2000-3000 words.
"""
# Enhance prompt with persona
enhanced_prompt = build_code_analysis_persona_prompt(front_end_prompt, persona, assignment_context)
try:
print(f"🤖 [FRONTEND AI] Calling Claude API for comprehensive frontend analysis...")
print(f"🤖 [FRONTEND AI] Analyzing {len(frontend_files)} frontend files...")
@ -7220,7 +7244,7 @@ FINAL REQUIREMENTS:
model=os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest"),
max_tokens=8000, # Increased from 6000 to 8000 for more detailed analysis
temperature=0.1,
messages=[{"role": "user", "content": front_end_prompt}]
messages=[{"role": "user", "content": enhanced_prompt}]
)
ai_analysis = message.content[0].text.strip()
@ -7230,7 +7254,7 @@ FINAL REQUIREMENTS:
if not ai_analysis or len(ai_analysis) < 100:
print("⚠️ [FRONTEND AI] AI analysis too short, regenerating...")
# Retry with more emphasis on detail
retry_prompt = front_end_prompt + "\n\nIMPORTANT: Provide a VERY DETAILED analysis. The previous response was too short. Please provide at least 2000 words of detailed explanation."
retry_prompt = enhanced_prompt + "\n\nIMPORTANT: Provide a VERY DETAILED analysis. The previous response was too short. Please provide at least 2000 words of detailed explanation."
message = self.client.messages.create(
model=os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest"),
max_tokens=8000,

View File

@ -524,7 +524,11 @@ class ChunkAnalyzer:
def _build_chunk_analysis_prompt(self, file_path: str, chunk: ChunkInfo,
chunk_index: int, total_chunks: int,
context_memories: Dict[str, Any]) -> str:
"""Build comprehensive analysis prompt for a chunk."""
"""Build comprehensive analysis prompt for a chunk with persona."""
from persona_system import allocate_code_persona, build_code_analysis_persona_prompt
# Allocate persona based on file path and chunk content
persona = allocate_code_persona(file_path, chunk.content, chunk.chunk_type)
# Build context information
context_info = ""
@ -538,8 +542,10 @@ class ChunkAnalyzer:
for practice in context_memories['best_practices'][:3]:
context_info += f"- {practice['content'][:100]}...\n"
assignment_context = f"CTO has assigned you to analyze chunk {chunk_index + 1} of {total_chunks} from file: {file_path}. This is a {chunk.chunk_type} chunk covering lines {chunk.start_line}-{chunk.end_line}."
prompt = f"""
You are a senior software engineer analyzing chunk {chunk_index + 1} of {total_chunks} from file: {file_path}
Analyzing chunk {chunk_index + 1} of {total_chunks} from file: {file_path}
CHUNK INFORMATION:
- Chunk Type: {chunk.chunk_type}
@ -564,7 +570,10 @@ Provide a focused analysis of this specific chunk, considering:
Focus on actionable insights for this specific code section.
"""
return prompt
# Enhance with persona
enhanced_prompt = build_code_analysis_persona_prompt(prompt, persona, assignment_context)
return enhanced_prompt
def _detect_language_from_path(self, file_path: str) -> str:
"""Detect language from file path."""

View File

@ -0,0 +1,755 @@
"""
World-Class Persona System for AI Analysis
Simulates real-world team allocation with domain-specific experts from top companies.
"""
from typing import Dict, List, Optional, Tuple
import re
# ============================================================================
# CODE ANALYSIS PERSONAS (for AI Analysis Service)
# ============================================================================
CODE_ANALYSIS_PERSONAS = {
# BACKEND DOMAINS
"backend_api": {
"role": "Senior Backend API Architect",
"companies": ["Google", "Amazon", "Stripe"],
"expertise": ["REST APIs", "GraphQL", "gRPC", "API Gateway", "Microservices"],
"experience_years": "18+",
"achievements": [
"Designed APIs at Google Cloud Platform handling 10M+ requests/day",
"Built scalable API infrastructure at Amazon AWS serving millions of customers",
"Led API architecture at Stripe processing billions in transactions"
],
"detection_keywords": ["api", "controller", "route", "endpoint", "service", "rest", "graphql"],
"focus_areas": [
"API design patterns and best practices",
"API versioning and backward compatibility",
"Rate limiting and throttling strategies",
"API documentation quality",
"Security vulnerabilities in API endpoints"
]
},
"backend_database": {
"role": "Senior Database Architect",
"companies": ["Amazon", "Oracle", "MongoDB"],
"expertise": ["SQL", "NoSQL", "Database Design", "Query Optimization", "Data Modeling"],
"experience_years": "20+",
"achievements": [
"Designed database systems at Amazon handling petabytes of data",
"Optimized databases at Oracle for enterprise-scale applications",
"Built distributed databases at MongoDB for global scale"
],
"detection_keywords": ["database", "db", "model", "schema", "migration", "repository", "orm", "query"],
"focus_areas": [
"Database schema design and normalization",
"Query performance and optimization",
"Data integrity and constraints",
"Indexing strategies",
"Transaction management"
]
},
"backend_business": {
"role": "Senior Backend Business Logic Architect",
"companies": ["Microsoft", "Salesforce", "SAP"],
"expertise": ["Business Logic", "Domain Modeling", "Design Patterns", "Service Layer"],
"experience_years": "17+",
"achievements": [
"Architected business logic systems at Microsoft for enterprise applications",
"Designed domain models at Salesforce for CRM platforms",
"Built service layers at SAP for ERP systems"
],
"detection_keywords": ["service", "business", "logic", "domain", "entity", "dto", "handler"],
"focus_areas": [
"Code organization and structure",
"Design patterns implementation",
"Business logic maintainability",
"Domain modeling quality",
"Service layer architecture"
]
},
# FRONTEND DOMAINS
"frontend_ui": {
"role": "Senior Frontend UI Architect",
"companies": ["Apple", "Meta", "Netflix"],
"expertise": ["React", "Vue", "Angular", "Component Design", "UI/UX"],
"experience_years": "15+",
"achievements": [
"Built user interfaces at Apple used by millions daily",
"Led React architecture at Meta (Facebook) for large-scale applications",
"Designed performance-optimized UIs at Netflix for 200M+ users"
],
"detection_keywords": ["component", "ui", "view", "page", "jsx", "tsx", "vue", "template"],
"focus_areas": [
"Component architecture and reusability",
"User experience and accessibility",
"UI performance optimization",
"Design system consistency",
"Responsive design implementation"
]
},
"frontend_state": {
"role": "Senior Frontend State Management Architect",
"companies": ["Meta", "Netflix", "Airbnb"],
"expertise": ["Redux", "Zustand", "Context API", "State Management", "Data Flow"],
"experience_years": "14+",
"achievements": [
"Architected state management at Meta for complex applications",
"Designed data flow patterns at Netflix for real-time updates",
"Built state systems at Airbnb for booking platforms"
],
"detection_keywords": ["store", "state", "redux", "context", "recoil", "zustand", "mobx"],
"focus_areas": [
"State architecture and patterns",
"Data flow optimization",
"State synchronization",
"Performance in state updates",
"State management best practices"
]
},
# DEVOPS DOMAINS
"devops_ci_cd": {
"role": "Senior DevOps CI/CD Architect",
"companies": ["Google", "Netflix", "Uber"],
"expertise": ["CI/CD", "Jenkins", "GitHub Actions", "GitLab CI", "Deployment Automation"],
"experience_years": "12+",
"achievements": [
"Built CI/CD pipelines at Google handling 50K+ deployments/day",
"Designed deployment systems at Netflix for zero-downtime releases",
"Architected automation at Uber for global scale"
],
"detection_keywords": ["ci", "cd", "pipeline", "jenkins", "github-actions", "gitlab", "deploy"],
"focus_areas": [
"CI/CD pipeline efficiency",
"Deployment strategy and automation",
"Quality gates and testing",
"Rollback strategies",
"Build optimization"
]
},
"devops_infrastructure": {
"role": "Senior Infrastructure Architect",
"companies": ["Amazon", "Google", "Microsoft"],
"expertise": ["Kubernetes", "Docker", "Terraform", "Cloud Infrastructure", "Scalability"],
"experience_years": "16+",
"achievements": [
"Designed infrastructure at Amazon AWS for global scale",
"Built container orchestration at Google for millions of containers",
"Architected cloud systems at Microsoft Azure with 99.99% uptime"
],
"detection_keywords": ["docker", "kubernetes", "terraform", "infrastructure", "cloud", "aws", "gcp", "azure"],
"focus_areas": [
"Infrastructure scalability",
"System reliability and uptime",
"Cost optimization",
"Security in infrastructure",
"Monitoring and observability"
]
},
# SECURITY DOMAINS
"security_engineer": {
"role": "Senior Security Engineer",
"companies": ["Google", "Microsoft", "Cloudflare"],
"expertise": ["Security", "Vulnerability Assessment", "Penetration Testing", "Security Architecture"],
"experience_years": "15+",
"achievements": [
"Led security initiatives at Google protecting billions of users",
"Designed security systems at Microsoft for enterprise applications",
"Built security infrastructure at Cloudflare for DDoS protection"
],
"detection_keywords": ["security", "auth", "encryption", "jwt", "oauth", "ssl", "tls", "cors"],
"focus_areas": [
"Security vulnerabilities and threats",
"Authentication and authorization",
"Data encryption and protection",
"Security best practices",
"Compliance and regulations"
]
},
# DATA DOMAINS
"data_engineer": {
"role": "Senior Data Engineer",
"companies": ["Google", "Netflix", "Uber"],
"expertise": ["Data Pipelines", "ETL", "Big Data", "Data Warehousing", "Spark"],
"experience_years": "13+",
"achievements": [
"Built data pipelines at Google processing petabytes daily",
"Designed ETL systems at Netflix for real-time analytics",
"Architected data infrastructure at Uber for millions of rides"
],
"detection_keywords": ["data", "pipeline", "etl", "warehouse", "spark", "hadoop", "kafka"],
"focus_areas": [
"Data architecture and pipelines",
"ETL performance and optimization",
"Data quality and validation",
"Scalability in data processing",
"Data governance"
]
},
"ml_engineer": {
"role": "Senior ML/AI Engineer",
"companies": ["OpenAI", "Anthropic", "Google DeepMind"],
"expertise": ["Machine Learning", "Deep Learning", "AI Systems", "Model Training"],
"experience_years": "12+",
"achievements": [
"Developed ML models at OpenAI for language understanding",
"Built AI systems at Anthropic for safety-critical applications",
"Designed training pipelines at Google DeepMind for large-scale models"
],
"detection_keywords": ["ml", "ai", "model", "training", "neural", "tensorflow", "pytorch", "learning"],
"focus_areas": [
"ML model architecture",
"Training pipeline optimization",
"Model performance and accuracy",
"Scalability in ML systems",
"AI safety and ethics"
]
},
# TESTING DOMAINS
"qa_automation": {
"role": "Senior QA Automation Architect",
"companies": ["Google", "Microsoft", "Amazon"],
"expertise": ["Test Automation", "Selenium", "Cypress", "Jest", "Testing Strategy"],
"experience_years": "14+",
"achievements": [
"Built test automation at Google for thousands of test cases",
"Designed testing frameworks at Microsoft for enterprise software",
"Architected QA systems at Amazon for e-commerce platforms"
],
"detection_keywords": ["test", "spec", "jest", "cypress", "selenium", "pytest", "testing"],
"focus_areas": [
"Test coverage and quality",
"Automation strategy",
"Test maintainability",
"Performance testing",
"Testing best practices"
]
},
"performance_engineer": {
"role": "Senior Performance Engineer",
"companies": ["Google", "Netflix", "Amazon"],
"expertise": ["Performance Optimization", "Load Testing", "Profiling", "Scalability"],
"experience_years": "16+",
"achievements": [
"Optimized systems at Google handling billions of requests",
"Designed performance solutions at Netflix for streaming at scale",
"Built performance infrastructure at Amazon for peak traffic"
],
"detection_keywords": ["performance", "load", "stress", "benchmark", "profiling", "optimization"],
"focus_areas": [
"Performance bottlenecks",
"Optimization strategies",
"Scalability concerns",
"Resource utilization",
"Performance testing"
]
},
# CTO (for synthesis)
"cto": {
"role": "Chief Technology Officer",
"companies": ["Google", "Microsoft", "Amazon"],
"expertise": ["Strategic Planning", "System Architecture", "Team Leadership", "Technology Strategy"],
"experience_years": "25+",
"achievements": [
"Former VP of Engineering at Google, leading teams of 500+ engineers",
"CTO at Microsoft Azure, responsible for cloud infrastructure strategy",
"Strategic advisor at Amazon Web Services for enterprise architecture"
],
"focus_areas": [
"Strategic technology insights",
"System-wide risk assessment",
"Architectural recommendations",
"Cross-domain synthesis",
"Executive-level analysis"
]
}
}
# ============================================================================
# DOCUMENT ANALYSIS PERSONAS (for Multi-Document Upload Service)
# ============================================================================
DOCUMENT_ANALYSIS_PERSONAS = {
"technical_doc_analyst": {
"role": "Senior Technical Documentation Analyst",
"companies": ["Google", "Stripe", "Microsoft"],
"expertise_domain": "technical documentation and API specifications",
"document_types": ["API docs", "technical specs", "developer guides"],
"experience_years": "15+",
"achievements": [
"Analyzed technical documentation at Google for millions of API integrations",
"Led documentation analysis at Stripe for developer experience",
"Mapped technical relationships at Microsoft for enterprise systems"
],
"focus_areas": [
"Technical dependencies and relationships",
"System integration points",
"API contract relationships",
"Technical process flows",
"Code-to-documentation mappings"
],
"visual_focus_areas": [
"API flow diagrams",
"System integration diagrams",
"Technical architecture flows"
],
"detection_keywords": ["api", "technical", "specification", "documentation", "guide", "reference", "developer"]
},
"business_process_analyst": {
"role": "Senior Business Process Analyst",
"companies": ["McKinsey", "Deloitte", "Accenture"],
"expertise_domain": "business processes and stakeholder requirements",
"document_types": ["business requirements", "user stories", "business plans"],
"experience_years": "18+",
"achievements": [
"Analyzed business processes at McKinsey for Fortune 500 companies",
"Led process mapping at Deloitte for enterprise transformations",
"Mapped stakeholder relationships at Accenture for global projects"
],
"focus_areas": [
"Business process flows",
"Requirement dependencies",
"Stakeholder impact chains",
"Business decision consequences",
"Organizational impact analysis"
],
"visual_focus_areas": [
"Business process diagrams",
"Stakeholder impact maps",
"Decision flowcharts"
],
"detection_keywords": ["business", "requirement", "stakeholder", "user story", "process", "workflow", "business plan"]
},
"system_architecture_analyst": {
"role": "Senior System Architecture Document Analyst",
"companies": ["Google", "Amazon", "Microsoft"],
"expertise_domain": "system architecture and design documents",
"document_types": ["architecture docs", "design documents", "system designs"],
"experience_years": "20+",
"achievements": [
"Analyzed architecture documents at Google for large-scale distributed systems",
"Mapped system relationships at Amazon for cloud infrastructure",
"Led architecture analysis at Microsoft for enterprise solutions"
],
"focus_areas": [
"Architecture relationships",
"Component dependencies",
"System interaction flows",
"Design decision impacts",
"Scalability relationships"
],
"visual_focus_areas": [
"Architecture diagrams",
"Component interaction diagrams",
"System dependency maps"
],
"detection_keywords": ["architecture", "design", "system", "component", "diagram", "architectural"]
},
"requirements_analyst": {
"role": "Senior Requirements & Specification Analyst",
"companies": ["IBM", "Oracle", "SAP"],
"expertise_domain": "requirements and functional specifications",
"document_types": ["requirements docs", "functional specs", "feature specs"],
"experience_years": "17+",
"achievements": [
"Analyzed requirements at IBM for enterprise software implementations",
"Mapped specifications at Oracle for database systems",
"Led requirement analysis at SAP for ERP platforms"
],
"focus_areas": [
"Requirement dependencies",
"Feature relationships",
"Specification impacts",
"Change propagation",
"Implementation dependencies"
],
"visual_focus_areas": [
"Requirement traceability diagrams",
"Feature dependency maps",
"Impact analysis charts"
],
"detection_keywords": ["requirement", "specification", "feature", "functional", "traceability", "spec"]
},
"process_flow_analyst": {
"role": "Senior Process Flow Analyst",
"companies": ["Amazon", "Netflix", "Uber"],
"expertise_domain": "operational processes and workflows",
"document_types": ["process docs", "workflows", "operational manuals"],
"experience_years": "14+",
"achievements": [
"Analyzed processes at Amazon for fulfillment operations",
"Mapped workflows at Netflix for content delivery",
"Led process analysis at Uber for ride-sharing operations"
],
"focus_areas": [
"Process step relationships",
"Workflow dependencies",
"Sequential cause-effects",
"Decision impacts",
"Operational dependencies"
],
"visual_focus_areas": [
"Process flowcharts",
"Workflow diagrams",
"Decision trees",
"Operational flow maps"
],
"detection_keywords": ["process", "workflow", "procedure", "operational", "manual", "step", "flow"]
},
"visual_architecture_analyst": {
"role": "Senior Visual Architecture Analyst",
"companies": ["Google", "Microsoft", "Apple"],
"expertise_domain": "visual diagrams and architecture drawings",
"document_types": ["diagrams", "flowcharts", "architecture drawings"],
"experience_years": "16+",
"achievements": [
"Analyzed visual diagrams at Google for complex system mappings",
"Mapped architecture drawings at Microsoft for enterprise solutions",
"Led visual analysis at Apple for product architecture"
],
"focus_areas": [
"Visual relationship extraction",
"Diagram dependency mapping",
"Flow analysis",
"Component interactions",
"Visual pattern recognition"
],
"visual_focus_areas": [
"All types of visual diagrams",
"Architecture drawings",
"Flowcharts and process diagrams",
"Component and sequence diagrams"
],
"detection_keywords": ["diagram", "flowchart", "visual", "drawing", "chart", "map", "image"]
}
}
# ============================================================================
# DOCUMENT TYPE MAPPING
# ============================================================================
DOCUMENT_PERSONA_MAPPING = {
# Technical Documents
"api_documentation": "technical_doc_analyst",
"technical_specification": "technical_doc_analyst",
"code_documentation": "technical_doc_analyst",
"developer_guide": "technical_doc_analyst",
# Business Documents
"business_requirements": "business_process_analyst",
"user_stories": "business_process_analyst",
"business_plan": "business_process_analyst",
"product_specification": "business_process_analyst",
"stakeholder_document": "business_process_analyst",
# Architecture Documents
"architecture_document": "system_architecture_analyst",
"system_design": "system_architecture_analyst",
"design_document": "system_architecture_analyst",
"technical_design": "system_architecture_analyst",
# Requirements Documents
"requirements_document": "requirements_analyst",
"functional_specification": "requirements_analyst",
"feature_specification": "requirements_analyst",
# Process Documents
"process_document": "process_flow_analyst",
"workflow_document": "process_flow_analyst",
"procedure_guide": "process_flow_analyst",
"operational_manual": "process_flow_analyst",
# Visual/Diagram Documents
"architecture_diagram": "visual_architecture_analyst",
"flowchart": "visual_architecture_analyst",
"sequence_diagram": "visual_architecture_analyst",
"component_diagram": "visual_architecture_analyst",
"process_diagram": "visual_architecture_analyst",
"system_diagram": "visual_architecture_analyst",
}
# ============================================================================
# PERSONA ALLOCATION FUNCTIONS
# ============================================================================
def allocate_code_persona(file_path: str, content: str, chunk_type: str = "module") -> Dict:
"""
Intelligently allocates code analysis persona based on file path, content, and type.
Returns persona config with prompt context.
"""
file_lower = file_path.lower()
content_lower = content.lower()[:2000] if content else "" # Sample content
# Score each persona based on detection rules
persona_scores = {}
for persona_id, persona_config in CODE_ANALYSIS_PERSONAS.items():
if persona_id == "cto": # Skip CTO for individual analysis
continue
score = 0
detection_keywords = persona_config.get("detection_keywords", [])
# Check file path (higher weight)
for keyword in detection_keywords:
if keyword in file_lower:
score += 15
# Check content (medium weight)
for keyword in detection_keywords:
if keyword in content_lower:
score += 8
# Check chunk type
if chunk_type and chunk_type.lower() in detection_keywords:
score += 10
# Domain-specific boosts
if "test" in file_lower and "qa" in persona_id:
score += 20
if "security" in file_lower and "security" in persona_id:
score += 20
if "performance" in file_lower and "performance" in persona_id:
score += 20
if score > 0:
persona_scores[persona_id] = score
# Select top persona
if persona_scores:
selected_id = max(persona_scores, key=persona_scores.get)
return CODE_ANALYSIS_PERSONAS[selected_id]
# Default fallback to backend business logic
return CODE_ANALYSIS_PERSONAS.get("backend_business", {})
def allocate_document_persona(file_path: str, content: str, file_type: str = "text") -> Dict:
"""
Intelligently allocates document analysis persona based on file path, content, and type.
Returns persona config for document analysis.
"""
file_lower = file_path.lower()
content_lower = content.lower()[:2000] if content else ""
# Check if it's an image/diagram
if file_type == "image" or any(ext in file_lower for ext in [".png", ".jpg", ".jpeg", ".gif", ".svg", ".pdf"]):
return DOCUMENT_ANALYSIS_PERSONAS.get("visual_architecture_analyst", {})
# Score each persona based on detection rules
persona_scores = {}
for persona_id, persona_config in DOCUMENT_ANALYSIS_PERSONAS.items():
score = 0
detection_keywords = persona_config.get("detection_keywords", [])
# Check file path (higher weight)
for keyword in detection_keywords:
if keyword in file_lower:
score += 15
# Check content (medium weight)
for keyword in detection_keywords:
if keyword in content_lower:
score += 8
# Check document type mapping
for doc_type, mapped_persona in DOCUMENT_PERSONA_MAPPING.items():
if doc_type in file_lower and mapped_persona == persona_id:
score += 20
if score > 0:
persona_scores[persona_id] = score
# Select top persona
if persona_scores:
selected_id = max(persona_scores, key=persona_scores.get)
return DOCUMENT_ANALYSIS_PERSONAS[selected_id]
# Default fallback to technical doc analyst
return DOCUMENT_ANALYSIS_PERSONAS.get("technical_doc_analyst", {})
def get_cto_persona() -> Dict:
"""Returns CTO persona for synthesis and high-level analysis."""
return CODE_ANALYSIS_PERSONAS.get("cto", {})
# ============================================================================
# PROMPT BUILDING FUNCTIONS
# ============================================================================
def build_persona_intro(persona: Dict, assignment_context: str = "", analysis_type: str = "code") -> str:
"""
Builds persona introduction section for prompts.
Works for both code and document analysis.
"""
if not persona:
return ""
role = persona.get("role", "Senior Engineer")
companies = persona.get("companies", [])
experience = persona.get("experience_years", "15+")
achievements = persona.get("achievements", [])
focus_areas = persona.get("focus_areas", [])
# Build company background
company_bg = ""
if companies:
company_bg = f"- Previously worked at {', '.join(companies[:2])}"
if len(companies) > 2:
company_bg += f" and {companies[2]}"
# Build achievements section
achievements_text = ""
if achievements:
achievements_text = "\n".join([f"- {achievement}" for achievement in achievements[:2]])
# Build focus areas
focus_text = ""
if focus_areas:
focus_text = "\n".join([f"- {focus}" for focus in focus_areas[:5]])
intro = f"""You are {role} with {experience} years of experience.
COMPANY BACKGROUND:
{company_bg}
KEY ACHIEVEMENTS:
{achievements_text}
YOUR ASSIGNMENT:
{assignment_context if assignment_context else 'Analyze the provided code/document for quality, issues, and recommendations.'}
YOUR FOCUS AREAS:
{focus_text}
---
"""
return intro
def build_code_analysis_persona_prompt(base_prompt: str, persona: Dict,
assignment_context: str = "") -> str:
"""
Enhances code analysis prompt with persona context.
"""
if not persona:
return base_prompt
persona_intro = build_persona_intro(persona, assignment_context, "code")
return persona_intro + base_prompt
def build_document_analysis_persona_prompt(base_prompt: str, persona: Dict,
document_type: str = "document",
assignment_context: str = "") -> str:
"""
Enhances document analysis prompt with persona context.
"""
if not persona:
return base_prompt
role = persona.get("role", "Senior Analyst")
companies = persona.get("companies", [])
expertise_domain = persona.get("expertise_domain", "document analysis")
experience = persona.get("experience_years", "15+")
achievements = persona.get("achievements", [])
focus_areas = persona.get("focus_areas", [])
company_bg = f"- Previously worked at {', '.join(companies[:2])}" if companies else ""
achievements_text = "\n".join([f"- {achievement}" for achievement in achievements[:2]]) if achievements else ""
focus_text = "\n".join([f"- {focus}" for focus in focus_areas[:5]]) if focus_areas else ""
intro = f"""You are {role}, a specialist in analyzing {expertise_domain} with {experience} years of experience.
COMPANY BACKGROUND:
{company_bg}
KEY ACHIEVEMENTS:
{achievements_text}
YOUR SPECIALIZATION:
You excel at identifying:
{focus_text}
YOUR ASSIGNMENT:
{assignment_context if assignment_context else f'Analyze this {document_type} to extract causal relationships and dependencies.'}
---
"""
return intro + base_prompt
def build_cto_synthesis_prompt(base_prompt: str, team_findings: List[Dict] = None) -> str:
"""
Builds CTO-level synthesis prompt with team allocation context.
"""
cto_persona = get_cto_persona()
if not cto_persona:
return base_prompt
role = cto_persona.get("role", "Chief Technology Officer")
companies = cto_persona.get("companies", [])
experience = cto_persona.get("experience_years", "25+")
achievements = cto_persona.get("achievements", [])
focus_areas = cto_persona.get("focus_areas", [])
company_bg = f"- Former VP of Engineering at {companies[0] if companies else 'Google'}, leading teams of 500+ engineers"
if len(companies) > 1:
company_bg += f"\n- CTO at {companies[1]}, responsible for cloud infrastructure strategy"
achievements_text = "\n".join([f"- {achievement}" for achievement in achievements[:2]]) if achievements else ""
focus_text = "\n".join([f"- {focus}" for focus in focus_areas[:5]]) if focus_areas else ""
team_allocation = ""
if team_findings:
team_allocation = "\n\nTEAM ALLOCATION:\n"
team_allocation += "You have allocated your expert team to analyze different domains:\n"
for finding in team_findings[:5]:
domain = finding.get("domain", "unknown")
team_allocation += f"- {domain}: Expert analysis completed\n"
intro = f"""You are {role} with {experience} years of experience.
COMPANY BACKGROUND:
{company_bg}
KEY ACHIEVEMENTS:
{achievements_text}
{team_allocation}
YOUR ROLE:
You have received this project and allocated your expert team to analyze different domains.
Now, synthesize all team findings into strategic recommendations.
YOUR FOCUS AREAS:
{focus_text}
---
"""
return intro + base_prompt
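
persona_system.py above is new in this commit. A short usage sketch of its two code-analysis entry points, matching the signatures shown above; the file path and content are hypothetical, and the comment reflects the scoring weights in allocate_code_persona (15 per path keyword, 8 per content keyword):

from persona_system import allocate_code_persona, build_code_analysis_persona_prompt

# Path keywords carry the most weight, so an API route/controller file
# should resolve to the backend_api persona; content keywords add smaller boosts.
persona = allocate_code_persona(
    file_path="src/api/routes/user_controller.py",  # hypothetical path
    content="from fastapi import APIRouter\nrouter = APIRouter()",
    chunk_type="module",
)

base_prompt = "Analyze this chunk for design issues and security risks."
enhanced = build_code_analysis_persona_prompt(
    base_prompt,
    persona,
    assignment_context="CTO has assigned you to analyze the API routing chunk.",
)
print(persona.get("role"), len(enhanced) > len(base_prompt))

The enhanced prompt is the persona introduction prepended to the base prompt, which is exactly how the chunk analyzer and frontend analysis hunks above consume it.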

View File

@ -2673,8 +2673,10 @@ def build_intelligent_chunk_prompt(chunk: Dict, analysis_state: Optional[Dict] =
"""
Build comprehensive prompt for analyzing a semantically grouped chunk.
Generates detailed module-level analysis with context awareness.
Now includes progressive context from previous chunks.
Now includes progressive context from previous chunks and world-class persona.
"""
from persona_system import allocate_code_persona, build_code_analysis_persona_prompt
chunk_name = chunk.get('name', 'unknown')
chunk_type = chunk.get('chunk_type', 'module')
files_batch = chunk.get('files', [])
@ -2694,15 +2696,22 @@ def build_intelligent_chunk_prompt(chunk: Dict, analysis_state: Optional[Dict] =
optimized_files.append((file_path, optimized_content))
# Allocate appropriate persona based on files in chunk
# Use the first file to determine persona (or combine if multiple domains)
primary_file_path = optimized_files[0][0] if optimized_files else ""
primary_content = optimized_files[0][1] if optimized_files else ""
persona = allocate_code_persona(primary_file_path, primary_content, chunk_type)
# Build context from previous analyses (progressive learning)
context_section = build_context_from_state(analysis_state, chunk)
# Build assignment context
assignment_context = f"CTO has assigned you to analyze the '{chunk_name}' module/chunk for this project. This is a {chunk_type} type chunk containing {len(optimized_files)} files."
# Build comprehensive prompt with module context
prompt_parts = [
f"# COMPREHENSIVE ANALYSIS: {chunk_name.upper()}",
f"Chunk Type: {chunk_type}",
"",
"You are a senior software architect with 30+ years of experience. Analyze this module/chunk comprehensively.",
""
]
@ -2794,7 +2803,12 @@ def build_intelligent_chunk_prompt(chunk: Dict, analysis_state: Optional[Dict] =
"Focus on providing detailed, actionable insights that help understand the complete module context."
])
return "\n".join(prompt_parts)
base_prompt = "\n".join(prompt_parts)
# Enhance with persona
enhanced_prompt = build_code_analysis_persona_prompt(base_prompt, persona, assignment_context)
return enhanced_prompt
def build_smart_batch_prompt(files_batch: List[Tuple[str, str]]) -> str:
"""Legacy function: Build prompt for simple batch (backward compatibility)."""
@ -4719,13 +4733,13 @@ def build_synthesis_prompt(analysis_state: Dict, all_chunk_analyses: List[Dict]
"""
Build comprehensive prompt for cross-module synthesis analysis.
Synthesizes all individual module analyses into system-level insights.
Uses CTO persona for executive-level synthesis.
"""
from persona_system import get_cto_persona, build_cto_synthesis_prompt
prompt_parts = [
"# CROSS-MODULE SYNTHESIS ANALYSIS",
"",
"You are a senior software architect with 30+ years of experience. Your task is to synthesize",
"findings from multiple module-level analyses into comprehensive system-level insights.",
"",
"## CONTEXT: PREVIOUSLY ANALYZED MODULES",
""
]
@ -4842,7 +4856,19 @@ def build_synthesis_prompt(analysis_state: Dict, all_chunk_analyses: List[Dict]
"across all analyzed modules, not just repeating individual module findings."
])
return "\n".join(prompt_parts)
base_prompt = "\n".join(prompt_parts)
# Get team findings for CTO context
team_findings = []
if all_chunk_analyses:
for chunk_analysis in all_chunk_analyses:
module_name = chunk_analysis.get('module_name', 'unknown')
team_findings.append({"domain": module_name, "analysis": chunk_analysis})
# Enhance with CTO persona
enhanced_prompt = build_cto_synthesis_prompt(base_prompt, team_findings)
return enhanced_prompt
def parse_synthesis_response(response_text: str) -> Dict:
"""Parse synthesis response from Claude API."""

View File

@ -141,17 +141,19 @@ router.get('/auth/github/callback', async (req, res) => {
setImmediate(async () => {
try {
console.log('[GitHub OAuth] Starting background repository attachment for:', repoContext.repoUrl);
console.log('[GitHub OAuth] Using newly stored token for user:', user_id);
const GitHubIntegrationService = require('../services/github-integration.service');
const database = require('../config/database');
const githubService = new GitHubIntegrationService();
const { owner, repo, branch } = githubService.parseGitHubUrl(repoContext.repoUrl);
// Get metadata using authenticated Octokit
const repositoryData = await githubService.fetchRepositoryMetadata(owner, repo);
// Get metadata using authenticated Octokit with the specific user's token
// Pass userId to ensure we use the newly stored token
const repositoryData = await githubService.fetchRepositoryMetadata(owner, repo, false, user_id);
let actualBranch = repoContext.branchName || branch || repositoryData.default_branch || 'main';
// Attempt analysis and sync with fallback
const codebaseAnalysis = await githubService.analyzeCodebase(owner, repo, actualBranch, false);
// Attempt analysis and sync with fallback - use userId to ensure correct token
const codebaseAnalysis = await githubService.analyzeCodebase(owner, repo, actualBranch, false, user_id);
const insertQuery = `
INSERT INTO all_repositories (
repository_url, repository_name, owner_name,
@ -170,14 +172,14 @@ router.get('/auth/github/callback', async (req, res) => {
JSON.stringify(codebaseAnalysis),
'syncing',
repositoryData.visibility === 'private',
repoContext.userId || null,
user_id || repoContext.userId || null, // Use user_id from OAuth callback (most reliable)
'github' // This is GitHub OAuth callback, so provider is always github
];
const insertResult = await database.query(insertQuery, insertValues);
const repositoryRecord = insertResult.rows[0];
// Clone repository
const downloadResult = await githubService.syncRepositoryWithFallback(owner, repo, actualBranch, repositoryRecord.id, repositoryData.visibility !== 'private');
// Clone repository - use userId to ensure correct token
const downloadResult = await githubService.syncRepositoryWithFallback(owner, repo, actualBranch, repositoryRecord.id, repositoryData.visibility !== 'private', user_id);
const finalSyncStatus = downloadResult.success ? 'synced' : 'error';
await database.query('UPDATE all_repositories SET sync_status = $1, updated_at = NOW() WHERE id = $2', [finalSyncStatus, repositoryRecord.id]);

View File

@ -163,12 +163,28 @@ router.post('/:provider/attach-repository', async (req, res) => {
const { template_id, repository_url, branch_name } = req.body;
const userId = req.headers['x-user-id'] || req.query.user_id || req.body.user_id || (req.user && (req.user.id || req.user.userId));
console.log(`[VCS Attach] Extracted userId:`, userId, `from headers:`, req.headers['x-user-id'], `query:`, req.query.user_id, `body:`, req.body.user_id);
// Validate input - only repository_url is required (like GitHub)
if (!repository_url) {
return res.status(400).json({ success: false, message: 'Repository URL is required' });
}
const { owner, repo, branch } = provider.parseRepoUrl(repository_url);
// Clean and normalize the repository URL (trim whitespace, decode URL encoding)
let cleanedUrl = repository_url.trim();
// Decode URL-encoded characters (like %20 for spaces)
try {
cleanedUrl = decodeURIComponent(cleanedUrl);
} catch (e) {
// If decoding fails, use original URL
console.warn(`[VCS Attach] Failed to decode URL, using original: ${cleanedUrl}`);
}
// Trim again after decoding
cleanedUrl = cleanedUrl.trim();
console.log(`[VCS Attach] Original URL: ${repository_url}, Cleaned URL: ${cleanedUrl}`);
const { owner, repo, branch } = provider.parseRepoUrl(cleanedUrl);
// Enhanced flow: Detect private repos and redirect to OAuth immediately
const providerKey = (req.params.provider || '').toLowerCase();
@ -248,7 +264,44 @@ router.post('/:provider/attach-repository', async (req, res) => {
// For public repos or authenticated private repos, proceed with normal flow
const accessCheck = await provider.checkRepositoryAccess(owner, repo, userId);
console.log(`[VCS Attach] Access check result for ${owner}/${repo}:`, {
hasAccess: accessCheck.hasAccess,
requiresAuth: accessCheck.requiresAuth,
authError: accessCheck.authError,
error: accessCheck.error,
exists: accessCheck.exists,
github_username: accessCheck.github_username
});
if (!accessCheck.hasAccess) {
// If access check failed but requires auth, trigger OAuth flow
if (accessCheck.requiresAuth || accessCheck.authError) {
const oauthService = getOAuthService(providerKey);
if (oauthService) {
console.log(`🔒 [VCS Attach] Token exists but cannot access repository (or no valid token), redirecting to OAuth: ${repository_url}`);
console.log(`🔒 [VCS Attach] Reason: ${accessCheck.error || 'Authentication required'}, userId: ${userId}`);
// Generate OAuth URL with repository context in state
const stateBase = Math.random().toString(36).substring(7);
const state = `${stateBase}|uid=${userId || 'unknown'}|repo=${encodeURIComponent(repository_url)}|branch=${encodeURIComponent(branch_name || 'main')}|private_repo=true`;
const authUrl = oauthService.getAuthUrl(state, userId);
console.log(`🔒 [VCS Attach] Generated OAuth URL for ${providerKey}, returning requires_auth response`);
return res.json({
success: false,
message: `${providerKey.charAt(0).toUpperCase() + providerKey.slice(1)} authentication required for private repository`,
requires_auth: true,
is_private_repo: true,
auth_url: authUrl,
state: state
});
}
}
// If it's not an auth issue, return 404
console.log(`[VCS Attach] Access check failed without auth requirement, returning 404`);
return res.status(404).json({ success: false, message: accessCheck.error || 'Repository not accessible' });
}

View File

@ -21,8 +21,8 @@ class GitHubIntegrationService {
}
// Get authenticated Octokit instance
async getAuthenticatedOctokit() {
return await this.oauthService.getAuthenticatedOctokit();
async getAuthenticatedOctokit(userId = null) {
return await this.oauthService.getAuthenticatedOctokit(userId);
}
// Extract owner, repo, and branch from GitHub URL using parse-github-url library
@ -31,8 +31,15 @@ class GitHubIntegrationService {
throw new Error('URL must be a non-empty string');
}
// Normalize the URL first
// Normalize the URL first - trim and decode URL encoding
let normalizedUrl = url.trim();
// Decode URL-encoded characters (like %20 for spaces)
try {
normalizedUrl = decodeURIComponent(normalizedUrl).trim();
} catch (e) {
// If decoding fails, just trim
normalizedUrl = normalizedUrl.trim();
}
// Remove trailing slashes and .git extensions
normalizedUrl = normalizedUrl.replace(/\/+$/, '').replace(/\.git$/, '');
@ -216,7 +223,7 @@ class GitHubIntegrationService {
};
}
// No token found - try unauthenticated access first to check if it's public
// No token found that can access this repo - try unauthenticated access to check if it's public
try {
const unauthenticatedOctokit = new Octokit({
userAgent: 'CodeNuk-GitIntegration/1.0.0',
@ -234,13 +241,18 @@ class GitHubIntegrationService {
};
} catch (unauthenticatedError) {
if (unauthenticatedError.status === 404) {
// Repository truly doesn't exist
// 404 from unauthenticated access could mean:
// 1. Repository truly doesn't exist
// 2. Repository is private and requires authentication
// Since we already tried to find a token and none could access it,
// and we're being called from a private repo flow, assume it requires auth
console.log(`🔒 [GitHub] 404 from unauthenticated access - assuming private repo requires authentication`);
return {
exists: false,
exists: null, // Unknown - could be missing or private
isPrivate: null,
hasAccess: false,
requiresAuth: false,
error: 'Repository not found'
requiresAuth: true, // Changed from false to true - trigger OAuth
error: 'Repository not found or requires authentication'
};
} else if (unauthenticatedError.status === 401 || unauthenticatedError.status === 403) {
// Repository exists but requires authentication (private) - generate auth URL
@ -289,13 +301,13 @@ class GitHubIntegrationService {
}
// Get repository information from GitHub
async fetchRepositoryMetadata(owner, repo, skipAuth = false) {
async fetchRepositoryMetadata(owner, repo, skipAuth = false, userId = null) {
// If skipAuth is true, try with unauthenticated octokit first to check visibility
let octokit;
if (skipAuth) {
octokit = this.octokit; // Use unauthenticated instance
} else {
octokit = await this.getAuthenticatedOctokit();
octokit = await this.getAuthenticatedOctokit(userId);
}
const safe = async (fn, fallback) => {
@ -309,26 +321,41 @@ class GitHubIntegrationService {
let repoData;
try {
console.log(`🔍 [GitHub] fetchRepositoryMetadata: skipAuth=${skipAuth}, calling octokit.repos.get for ${owner}/${repo}`);
const response = await octokit.repos.get({ owner, repo });
if (skipAuth) {
if (response.status === 401 || response.status === 403) {
throw new Error('Authentication required to access repository');
} else if (response.status === 404) {
throw new Error('Repository not found');
}
}
repoData = response.data;
console.log(`✅ [GitHub] Successfully fetched repository data: ${repoData?.full_name || 'no full_name'}`);
// Validate we got real data
if (!repoData || !repoData.full_name) {
console.log(`❌ [GitHub] Invalid repository data received, throwing error`);
throw new Error('Invalid repository data received');
}
} catch (error) {
console.log(`🔍 [GitHub] Error in fetchRepositoryMetadata:`, error.message, error.status);
// Check error status from various possible locations
const status = error.status || error.response?.status || error.code;
const errorMessage = error.message || '';
const is404 = status === 404 || status === '404' || errorMessage.includes('404') || errorMessage.includes('Not Found');
const isAuthError = status === 401 || status === 403 || status === '401' || status === '403';
console.log(`🔍 [GitHub] Error in fetchRepositoryMetadata CATCH BLOCK:`, errorMessage, `Status: ${status || 'unknown'}`, `is404: ${is404}`, `isAuthError: ${isAuthError}`, `skipAuth: ${skipAuth}`);
console.log(`🔍 [GitHub] Error object:`, JSON.stringify({
status: error.status,
responseStatus: error.response?.status,
code: error.code,
message: error.message,
name: error.name
}));
if (skipAuth) {
// For GitHub, any error when skipAuth=true likely means private repo
if (error.status === 401 || error.status === 403 || error.status === 404) {
// For GitHub, any error when skipAuth=true means private repo or doesn't exist
// Always throw authentication required - let the caller decide if it's truly missing or private
console.log(`🔒 [GitHub] skipAuth=true, THROWING authentication required error - NOT using safe fallback`);
throw new Error('Authentication required to access repository');
}
// For other errors, also assume private repo
throw new Error('Authentication required to access repository');
}
// For other errors, use safe fallback
// For authenticated requests, use safe fallback (but only if skipAuth is false)
console.log(`⚠️ [GitHub] skipAuth=false, using safe fallback`);
repoData = await safe(
async () => {
const response = await octokit.repos.get({ owner, repo });
@ -336,6 +363,12 @@ class GitHubIntegrationService {
},
{}
);
// If safe fallback also failed, throw
if (!repoData || !repoData.full_name) {
console.log(`❌ [GitHub] Safe fallback also failed, throwing Repository not found`);
throw new Error('Repository not found');
}
}
const languages = await safe(
@ -364,7 +397,7 @@ class GitHubIntegrationService {
}
// Analyze codebase structure
async analyzeCodebase(owner, repo, branch, isPublicRepo = false) {
async analyzeCodebase(owner, repo, branch, isPublicRepo = false, userId = null) {
try {
// Use appropriate octokit instance based on repository type
let octokit;
@ -374,8 +407,8 @@ class GitHubIntegrationService {
userAgent: 'CodeNuk-GitIntegration/1.0.0',
});
} else {
// For private repos, use authenticated octokit
octokit = await this.getAuthenticatedOctokit();
// For private repos, use authenticated octokit with userId
octokit = await this.getAuthenticatedOctokit(userId);
}
// Get the commit SHA for the branch
@ -519,7 +552,7 @@ class GitHubIntegrationService {
}
// Git-based: clone or update local repo and re-index into DB
async syncRepositoryWithGit(owner, repo, branch, repositoryId, isPublicRepo = false) {
async syncRepositoryWithGit(owner, repo, branch, repositoryId, isPublicRepo = false, userId = null) {
const database = require('../config/database');
const localPath = this.gitRepoService.getLocalRepoPath(owner, repo, branch);
let storageRecord = null;
@ -544,7 +577,7 @@ class GitHubIntegrationService {
console.warn(`Failed to clone public repo without auth: ${error.message}`);
// Fallback to authenticated clone if available
try {
const tokenRecord = await this.oauthService.getToken();
const tokenRecord = userId ? await this.oauthService.getTokenForUser(userId) : await this.oauthService.getToken();
if (tokenRecord?.access_token) {
repoPath = await this.gitRepoService.cloneIfMissingWithAuth(
owner,
@ -560,7 +593,7 @@ class GitHubIntegrationService {
} else {
// For private repos, try authenticated clone first
try {
const tokenRecord = await this.oauthService.getToken();
const tokenRecord = userId ? await this.oauthService.getTokenForUser(userId) : await this.oauthService.getToken();
if (tokenRecord?.access_token) {
repoPath = await this.gitRepoService.cloneIfMissingWithAuth(
owner,
@ -628,7 +661,7 @@ class GitHubIntegrationService {
try {
// Try to ensure repo exists for the preferred branch
try {
const tokenRecord = await this.oauthService.getToken().catch(() => null);
const tokenRecord = userId ? await this.oauthService.getTokenForUser(userId).catch(() => null) : await this.oauthService.getToken().catch(() => null);
if (tokenRecord?.access_token) {
repoPath = await this.gitRepoService.cloneIfMissingWithAuth(owner, repo, preferredBranch, 'github.com', tokenRecord.access_token, 'oauth2');
} else {
@ -637,7 +670,7 @@ class GitHubIntegrationService {
} catch (cloneErr) {
// If the branch doesn't exist (e.g., refs/heads not found), try the alternate branch
try {
const tokenRecordAlt = await this.oauthService.getToken().catch(() => null);
const tokenRecordAlt = userId ? await this.oauthService.getTokenForUser(userId).catch(() => null) : await this.oauthService.getToken().catch(() => null);
repoPath = tokenRecordAlt?.access_token
? await this.gitRepoService.cloneIfMissingWithAuth(owner, repo, alternateBranch, 'github.com', tokenRecordAlt.access_token, 'oauth2')
: await this.gitRepoService.cloneIfMissing(owner, repo, alternateBranch);
@ -679,7 +712,7 @@ class GitHubIntegrationService {
try {
// Ensure repo exists similarly to diff flow
try {
const tokenRecord = await this.oauthService.getToken().catch(() => null);
const tokenRecord = userId ? await this.oauthService.getTokenForUser(userId).catch(() => null) : await this.oauthService.getToken().catch(() => null);
if (tokenRecord?.access_token) {
repoPath = await this.gitRepoService.cloneIfMissingWithAuth(owner, repo, preferredBranch, 'github.com', tokenRecord.access_token, 'oauth2');
} else {
@ -687,7 +720,7 @@ class GitHubIntegrationService {
}
} catch (_) {
try {
const tokenRecordAlt = await this.oauthService.getToken().catch(() => null);
const tokenRecordAlt = userId ? await this.oauthService.getTokenForUser(userId).catch(() => null) : await this.oauthService.getToken().catch(() => null);
repoPath = tokenRecordAlt?.access_token
? await this.gitRepoService.cloneIfMissingWithAuth(owner, repo, alternateBranch, 'github.com', tokenRecordAlt.access_token, 'oauth2')
: await this.gitRepoService.cloneIfMissing(owner, repo, alternateBranch);
@ -720,15 +753,15 @@ class GitHubIntegrationService {
}
// Try git-based sync first, fall back to GitHub API download on failure
async syncRepositoryWithFallback(owner, repo, branch, repositoryId, isPublicRepo = false) {
async syncRepositoryWithFallback(owner, repo, branch, repositoryId, isPublicRepo = false, userId = null) {
// First attempt: full git clone/fetch and index
const gitResult = await this.syncRepositoryWithGit(owner, repo, branch, repositoryId, isPublicRepo);
const gitResult = await this.syncRepositoryWithGit(owner, repo, branch, repositoryId, isPublicRepo, userId);
if (gitResult && gitResult.success) {
return { method: 'git', ...gitResult };
}
// Fallback: API-based download and storage
const apiResult = await this.downloadRepositoryWithStorage(owner, repo, branch, repositoryId, isPublicRepo);
const apiResult = await this.downloadRepositoryWithStorage(owner, repo, branch, repositoryId, isPublicRepo, userId);
if (apiResult && apiResult.success) {
return { method: 'api', ...apiResult, git_error: gitResult?.error };
}
@ -737,7 +770,7 @@ class GitHubIntegrationService {
}
// Download repository files locally and store in database
async downloadRepositoryWithStorage(owner, repo, branch, repositoryId, isPublicRepo = false) {
async downloadRepositoryWithStorage(owner, repo, branch, repositoryId, isPublicRepo = false, userId = null) {
const targetDir = path.join(
process.env.ATTACHED_REPOS_DIR,
`${owner}__${repo}__${branch}`
@ -765,8 +798,8 @@ class GitHubIntegrationService {
userAgent: 'CodeNuk-GitIntegration/1.0.0',
});
} else {
// For private repos, use authenticated octokit
octokit = await this.getAuthenticatedOctokit();
// For private repos, use authenticated octokit with userId
octokit = await this.getAuthenticatedOctokit(userId);
}
// Get the commit SHA for the branch

View File

@ -199,8 +199,16 @@ class GitHubOAuthService {
}
// Create authenticated Octokit instance
async getAuthenticatedOctokit() {
const tokenRecord = await this.getToken();
async getAuthenticatedOctokit(userId = null) {
// If userId is provided, get the newest token for that user
// Otherwise, get the newest token overall
let tokenRecord;
if (userId) {
tokenRecord = await this.getTokenForUser(userId);
console.log(`[GitHub OAuth] Using token for user ${userId}: ${tokenRecord?.github_username || 'none'}`);
} else {
tokenRecord = await this.getToken();
}
if (!tokenRecord) {
throw new Error('No GitHub token found. Please authenticate with GitHub first.');

View File

@ -15,7 +15,11 @@ class GithubAdapter {
return this.impl.parseGitHubUrl(url);
}
async checkRepositoryAccess(owner, repo) {
async checkRepositoryAccess(owner, repo, userId = null) {
// Use user-specific method if userId is provided
if (userId) {
return await this.impl.checkRepositoryAccessWithUser(owner, repo, userId);
}
return await this.impl.checkRepositoryAccess(owner, repo);
}

View File

@ -0,0 +1,58 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
*.egg-info/
dist/
build/
*.egg
# Virtual environments
venv/
env/
ENV/
.venv
# IDE
.vscode/
.idea/
*.swp
*.swo
*~
# Documentation
*.md
!README.md
# Testing
.pytest_cache/
.coverage
htmlcov/
*.log
# Storage and temporary files
storage/
*.tmp
*.temp
# Git
.git/
.gitignore
# Docker
Dockerfile*
docker-compose*.yml
.dockerignore
# Environment files
.env
.env.local
*.env
# OS
.DS_Store
Thumbs.db

View File

@ -1,29 +1,60 @@
FROM python:3.11-slim
# Build stage - install dependencies that require compilation
FROM python:3.11-slim as builder
ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1
WORKDIR /app
# Install build dependencies only
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential \
curl \
&& rm -rf /var/lib/apt/lists/*
# Copy and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir --user -r requirements.txt && \
pip cache purge
# Download SpaCy English model
RUN python -m spacy download en_core_web_sm
# Runtime stage - minimal image with only runtime dependencies
FROM python:3.11-slim
ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \
PYTHONPATH=/app/src \
PATH=/root/.local/bin:$PATH \
MULTI_DOC_STORAGE_ROOT=/app/storage \
MULTI_DOC_CLAUDE_MODEL=claude-3-5-haiku-latest \
CLAUDE_MODEL=claude-3-5-haiku-latest \
PORT=8024
WORKDIR /app
# Install only runtime dependencies (no build tools)
RUN apt-get update && \
apt-get install -y --no-install-recommends \
poppler-utils \
tesseract-ocr \
ffmpeg \
libmagic1 \
&& rm -rf /var/lib/apt/lists/*
curl \
# Required for some Python packages at runtime
libgomp1 \
libglib2.0-0 \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy Python packages from builder stage (includes spacy model)
COPY --from=builder /root/.local /root/.local
# Copy application code
COPY src ./src
ENV PYTHONPATH=/app/src \
MULTI_DOC_STORAGE_ROOT=/app/storage \
MULTI_DOC_CLAUDE_MODEL=claude-3-5-sonnet-20241022 \
PORT=8024
EXPOSE 8024
CMD ["sh", "-c", "uvicorn multi_document_upload_service.main:app --host 0.0.0.0 --port ${PORT:-8024}"]

View File

@ -1,144 +0,0 @@
# Fix: Empty Graph in Neo4j (No Relationships Found)
## Problem
When querying Neo4j for `CAUSES` relationships, you get "(no changes, no records)" because:
1. **PDF extraction failed** - Missing dependencies (`unstructured[pdf]`)
2. **0 relations extracted** - No text was extracted, so no analysis happened
3. **0 relations written** - Nothing was written to Neo4j (correct behavior)
## Root Cause
The service completed with 0 relations because:
- PDF file extraction failed: `partition_pdf() is not available because one or more dependencies are not installed`
- No text was extracted from the PDF
- No chunks were created
- No Claude analysis happened
- 0 relations were extracted
- 0 relations were written to Neo4j
## Solution
### Step 1: Update Dependencies
The `requirements.txt` has been updated to include:
```
unstructured[pdf]>=0.15.0
unstructured[docx]>=0.15.0
unstructured[pptx]>=0.15.0
unstructured[xlsx]>=0.15.0
```
### Step 2: Rebuild the Service
```bash
cd /home/tech4biz/Desktop/prakash/codenuk/backend_new1/codenuk_backend_mine
# Rebuild the service with new dependencies
docker-compose build multi-document-upload-service
# Restart the service
docker-compose restart multi-document-upload-service
# Check logs to verify it's working
docker-compose logs -f multi-document-upload-service
```
### Step 3: Verify Dependencies
```bash
# Check if unstructured[pdf] is installed
docker-compose exec multi-document-upload-service pip list | grep unstructured
```
### Step 4: Re-upload Documents
1. Go to Project Builder in the frontend
2. Click on "Upload Documents for Knowledge Graph"
3. Upload a PDF or other document
4. Wait for processing to complete
5. Check Neo4j for relationships
### Step 5: Check Neo4j
Run these queries in Neo4j Browser:
```cypher
// Check if any nodes exist
MATCH (n)
RETURN count(n) as node_count
// Check for CAUSES relationships
MATCH (n:Concept)-[r:CAUSES]->(m:Concept)
RETURN n.name as cause, m.name as effect, r.confidence as confidence
LIMIT 50
```
## Expected Behavior After Fix
1. **PDF extraction succeeds** - Text is extracted from PDF files
2. **Text is chunked** - Document is split into manageable chunks
3. **Claude analyzes** - Causal relationships are extracted
4. **Relations are written** - Relationships are stored in Neo4j
5. **Query returns results** - Neo4j query shows relationships
## Verification Steps
1. **Check service logs**:
```bash
docker-compose logs multi-document-upload-service | grep -i "extracted\|relation\|neo4j"
```
2. **Check job status**:
```bash
curl http://localhost:8000/api/multi-docs/jobs/{job_id}
```
Should show: `"processed_files": 1` and relations count > 0
3. **Check Neo4j**:
```cypher
MATCH (n:Concept)-[r:CAUSES]->(m:Concept)
RETURN count(r) as relation_count
```
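If you want to script this check, here is a minimal sketch using the official `neo4j` Python driver (`pip install neo4j`, v5+). The Bolt endpoint and credentials below are assumptions based on the defaults used elsewhere in this guide (`localhost:7687`, `neo4j` / `password`); adjust them to match your compose setup.
```python
from neo4j import GraphDatabase

# Assumed Bolt endpoint and credentials; adjust to your docker-compose setup.
URI = "bolt://localhost:7687"
AUTH = ("neo4j", "password")

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    records, _, _ = driver.execute_query(
        "MATCH (:Concept)-[r:CAUSES]->(:Concept) RETURN count(r) AS relation_count"
    )
    print("CAUSES relationships:", records[0]["relation_count"])
```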
## Improvements Made
1. ✅ **Added PDF dependencies** - `unstructured[pdf]`, `unstructured[docx]`, etc.
2. ✅ **Added fallback extractors** - Uses `pdfplumber` if unstructured fails
3. ✅ **Better error handling** - Shows actual errors in job status
4. ✅ **Improved logging** - More detailed logs for debugging
5. ✅ **Better Neo4j query** - Validates data before writing
## Troubleshooting
If you still see 0 relations after rebuilding:
1. **Check extraction logs**:
```bash
docker-compose logs multi-document-upload-service | grep -i "extract"
```
2. **Check Claude analysis**:
```bash
docker-compose logs multi-document-upload-service | grep -i "claude\|analyze"
```
3. **Check Neo4j connection**:
```bash
docker-compose logs multi-document-upload-service | grep -i "neo4j\|graph"
```
4. **Verify document has causal language**:
- Not all documents contain causal relationships
- Try uploading a document with clear cause-effect statements
- Example: "Smoking causes lung cancer" or "Rain causes flooding"
## Next Steps
1. Rebuild the service with new dependencies
2. Re-upload documents
3. Check Neo4j for relationships
4. If still no results, check service logs for errors
5. Verify the document contains causal language

View File

@ -1,176 +0,0 @@
# Neo4j Diagnostic Queries
## Issue: No relationships found in Neo4j
If you're seeing "(no changes, no records)" when querying for `CAUSES` relationships, here are diagnostic queries to check what's actually in the database.
## Diagnostic Queries
### 1. Check if any nodes exist
```cypher
MATCH (n)
RETURN count(n) as node_count
LIMIT 1
```
### 2. Check if Concept nodes exist
```cypher
MATCH (n:Concept)
RETURN count(n) as concept_count,
collect(DISTINCT labels(n)) as labels,
collect(DISTINCT keys(n)) as properties
LIMIT 10
```
### 3. Check all relationship types
```cypher
CALL db.relationshipTypes() YIELD relationshipType
RETURN relationshipType
```
### 4. Check all node labels
```cypher
CALL db.labels() YIELD label
RETURN label
```
### 5. Check all relationships (any type)
```cypher
MATCH (n)-[r]->(m)
RETURN type(r) as relationship_type,
count(r) as count,
labels(n) as from_labels,
labels(m) as to_labels
LIMIT 50
```
### 6. Check for CAUSES relationships specifically
```cypher
MATCH (n)-[r:CAUSES]->(m)
RETURN n, r, m
LIMIT 50
```
### 7. Check for relationships with lowercase "causes"
```cypher
MATCH (n)-[r]->(m)
WHERE type(r) =~ '(?i)causes'
RETURN type(r) as relationship_type, n, r, m
LIMIT 50
```
### 8. Check all nodes and their relationships
```cypher
MATCH (n)
OPTIONAL MATCH (n)-[r]->(m)
RETURN n, labels(n) as node_labels,
type(r) as relationship_type,
m, labels(m) as target_labels
LIMIT 50
```
### 9. Check for nodes created by the service (by job_id property)
```cypher
MATCH (n)-[r]->(m)
WHERE r.job_id IS NOT NULL
RETURN n, r, m, r.job_id as job_id
LIMIT 50
```
### 10. Check database statistics
```cypher
MATCH (n)
OPTIONAL MATCH (n)-[r]->()
RETURN count(DISTINCT n) as total_nodes,
count(r) as total_relationships
```
## Common Issues and Solutions
### Issue 1: No nodes at all
**Symptom**: Query 1 returns 0 nodes
**Cause**: Service hasn't written anything to Neo4j, or connection failed
**Solution**:
- Check service logs: `docker-compose logs multi-document-upload-service`
- Verify Neo4j connection in service configuration
- Check if job completed with 0 relations (extraction failed)
### Issue 2: Nodes exist but no relationships
**Symptom**: Query 1 returns nodes, but Query 6 returns no relationships
**Cause**: Relationships weren't created, or different relationship type
**Solution**:
- Check Query 5 to see what relationship types actually exist
- Check service logs for graph writing errors
- Verify the job actually extracted relations (check job status)
### Issue 3: Different relationship type
**Symptom**: Query 5 shows relationships but not `CAUSES`
**Cause**: Service might be using a different relationship type
**Solution**:
- Check Query 3 to see all relationship types
- Update query to use the correct relationship type
### Issue 4: Different node labels
**Symptom**: Query 6 returns no results, but Query 2 shows different labels
**Cause**: Service might be using different node labels
**Solution**:
- Check Query 2 to see what labels exist
- Update query to match actual labels
## Expected Structure
After a successful upload, you should see:
### Nodes
- **Label**: `Concept`
- **Properties**: `name`, `lastSeen`
### Relationships
- **Type**: `CAUSES`
- **Properties**: `confidence`, `explanation`, `source_file_id`, `source_snippet`, `job_id`, `model`, `updated_at`
### Example Query
```cypher
MATCH (cause:Concept)-[r:CAUSES]->(effect:Concept)
RETURN cause.name as cause,
effect.name as effect,
r.confidence as confidence,
r.job_id as job_id,
r.source_file_id as source_file
LIMIT 50
```
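To run the key diagnostic queries from a script instead of the Neo4j Browser, here is a minimal sketch with the `neo4j` Python driver (`pip install neo4j`, v5+); the connection details are assumptions, adjust them to your setup:
```python
from neo4j import GraphDatabase

URI = "bolt://localhost:7687"   # assumed Bolt endpoint
AUTH = ("neo4j", "password")    # assumed credentials

DIAGNOSTICS = {
    "node labels": "CALL db.labels() YIELD label RETURN collect(label) AS value",
    "relationship types": "CALL db.relationshipTypes() YIELD relationshipType RETURN collect(relationshipType) AS value",
    "total nodes": "MATCH (n) RETURN count(n) AS value",
    "CAUSES relationships": "MATCH ()-[r:CAUSES]->() RETURN count(r) AS value",
}

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    for name, query in DIAGNOSTICS.items():
        records, _, _ = driver.execute_query(query)
        print(f"{name}: {records[0]['value']}")
```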
## Troubleshooting Steps
1. **Check service logs**:
```bash
docker-compose logs -f multi-document-upload-service
```
2. **Check if job completed successfully**:
```bash
curl http://localhost:8000/api/multi-docs/jobs/{job_id}
```
3. **Check Neo4j connection**:
```bash
docker-compose logs neo4j | grep -i error
```
4. **Verify Neo4j is running**:
```bash
docker-compose ps neo4j
```
5. **Test Neo4j connection manually**:
```bash
docker-compose exec neo4j cypher-shell -u neo4j -p password "MATCH (n) RETURN count(n)"
```
## Next Steps
1. Run the diagnostic queries above
2. Check the service logs for errors
3. Verify the job status via API
4. Re-upload documents after fixing dependencies
5. Check if relations were actually extracted (job status should show relation count)

View File

@ -1,85 +0,0 @@
# Quick Testing Guide - Multi-Document Upload
## 🚀 Quick Start Testing
### 1. Start Services
```bash
cd /home/tech4biz/Desktop/prakash/codenuk/backend_new1/codenuk_backend_mine
docker-compose up -d multi-document-upload-service neo4j redis postgres api-gateway
```
### 2. Verify Services
```bash
# Check health
curl http://localhost:8024/health
curl http://localhost:8000/api/multi-docs/health
```
### 3. Test via Frontend
1. **Open Frontend**: `http://localhost:3001`
2. **Login** (if required)
3. **Go to Project Builder**
4. **Complete Steps 1-2** (Project Type & Features)
5. **Step 3: Multi Docs Upload** appears
6. **Upload files**:
- Click upload area
- Select multiple files (PDF, DOCX, etc.)
- Click "Start Upload"
7. **Watch Progress**:
- Progress bar updates
- Status messages appear
- Polls every 4 seconds
8. **Auto-proceeds** when completed
### 4. Verify in Neo4j
Open Neo4j Browser at `http://localhost:7474` and log in with `neo4j` / `password`, then query causal relationships:
```cypher
MATCH (n)-[r:CAUSES]->(m)
RETURN n, r, m
LIMIT 50
```
## 📝 Test Checklist
- [ ] Service starts successfully
- [ ] Health endpoint works
- [ ] Frontend component renders
- [ ] File upload works
- [ ] Progress updates correctly
- [ ] Job completes successfully
- [ ] Neo4j graph contains relationships
- [ ] Error handling works
- [ ] Skip button works
## 🔍 Debug Commands
```bash
# View service logs
docker-compose logs -f multi-document-upload-service
# Check job status (replace {job_id})
curl http://localhost:8000/api/multi-docs/jobs/{job_id}
# Check graph summary
curl http://localhost:8000/api/multi-docs/jobs/{job_id}/graph
```
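To mirror the frontend's 4-second polling from a script, here is a minimal sketch with `requests` (`pip install requests`); the terminal stage names `completed` / `failed` are assumptions based on the stages listed in this guide:
```python
import time
import requests

GATEWAY = "http://localhost:8000"  # API Gateway base URL

def poll_job(job_id: str, interval: float = 4.0) -> dict:
    """Poll the multi-docs job status until it reaches a terminal stage."""
    while True:
        resp = requests.get(f"{GATEWAY}/api/multi-docs/jobs/{job_id}", timeout=30)
        resp.raise_for_status()
        status = resp.json()
        print(f"{status['stage']}: {status.get('status_message')} "
              f"({status.get('processed_files', 0)}/{status.get('total_files', 0)} files)")
        if status["stage"] in {"completed", "failed"}:  # assumed terminal stage names
            return status
        time.sleep(interval)
```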
## ⚠️ Common Issues
1. **502 Bad Gateway**: Service not running → `docker-compose ps`
2. **413 Too Large**: File too big → Reduce file size
3. **No progress**: Check browser console → Check network tab
4. **No relationships**: Check Claude API key → Check service logs
## 🎯 Expected Flow
```
Upload Files → Job Created → Files Saved → Content Extracted →
Claude Analysis → Graph Built → Completed → Auto-proceed to Next Step
```

File diff suppressed because it is too large

View File

@ -1,152 +0,0 @@
# Rebuild Instructions - Multi-Document Upload Service
## Issue: Empty Graph in Neo4j
**Problem**: Query returns "(no changes, no records)" because the job completed with 0 relations.
**Root Cause**: PDF extraction failed due to missing dependencies (`unstructured[pdf]`).
## Fixes Applied
1. ✅ Added PDF dependencies (`unstructured[pdf]`, `unstructured[docx]`, etc.)
2. ✅ Added fallback extractors (pdfplumber, python-docx, python-pptx)
3. ✅ Improved error handling and logging
4. ✅ Fixed Neo4j query syntax
5. ✅ Better status messages
## Rebuild Steps
### Step 1: Rebuild the Service
```bash
cd /home/tech4biz/Desktop/prakash/codenuk/backend_new1/codenuk_backend_mine
# Stop the service
docker-compose stop multi-document-upload-service
# Rebuild with new dependencies
docker-compose build --no-cache multi-document-upload-service
# Start the service
docker-compose up -d multi-document-upload-service
# Check logs to verify it's starting correctly
docker-compose logs -f multi-document-upload-service
```
### Step 2: Verify Dependencies
```bash
# Check if unstructured[pdf] is installed
docker-compose exec multi-document-upload-service pip list | grep unstructured
# You should see:
# unstructured
# unstructured-pdf
# unstructured-docx
# etc.
```
### Step 3: Test the Service
```bash
# Check health endpoint
curl http://localhost:8024/health
# Should return:
# {
# "status": "ok",
# "claude_model": "claude-3-5-haiku-latest",
# ...
# }
```
### Step 4: Re-upload Documents
1. Open frontend: `http://localhost:3001/project-builder`
2. Go to Step 1: Project Type
3. Find "Upload Documents for Knowledge Graph" section
4. Upload a PDF or other document
5. Wait for processing to complete
6. Check status - should show relation count > 0
### Step 5: Verify in Neo4j
Run these queries in Neo4j Browser (`http://localhost:7474`):
```cypher
// Check if any nodes exist
MATCH (n)
RETURN count(n) as node_count
// Check for CAUSES relationships
MATCH (n:Concept)-[r:CAUSES]->(m:Concept)
RETURN n.name as cause,
m.name as effect,
r.confidence as confidence,
r.job_id as job_id
LIMIT 50
```
## Expected Results
After rebuilding and re-uploading:
1. **PDF extraction succeeds**
2. **Text is extracted**
3. **Relations are extracted**
4. **Relations are written to Neo4j**
5. **Query returns results**
## Troubleshooting
If you still see 0 relations:
1. **Check service logs**:
```bash
docker-compose logs multi-document-upload-service | tail -50
```
2. **Check extraction logs**:
```bash
docker-compose logs multi-document-upload-service | grep -i "extract\|pdf"
```
3. **Check Claude analysis**:
```bash
docker-compose logs multi-document-upload-service | grep -i "claude\|analyze\|relation"
```
4. **Check Neo4j connection**:
```bash
docker-compose logs multi-document-upload-service | grep -i "neo4j\|graph\|write"
```
5. **Verify document has causal language**:
- Not all documents contain causal relationships
- Try uploading a document with clear cause-effect statements
- Example: "Smoking causes lung cancer"
## Quick Test
Test with a simple text file:
1. Create a test file `test_causal.txt`:
```
Smoking cigarettes causes lung cancer.
Heavy rain causes flooding.
Exercise improves health.
```
2. Upload it via the frontend
3. Check Neo4j for relationships
4. You should see up to 3 causal relationships (one per statement)
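If you prefer to skip the frontend, you can create the job straight through the API Gateway; here is a minimal sketch with `requests` using the documented multipart endpoint (`POST /api/multi-docs/jobs`, `files` plus optional `job_name`):
```python
import requests

GATEWAY = "http://localhost:8000"  # API Gateway base URL

def upload_documents(paths: list[str], job_name: str = "quick-test") -> str:
    """Create a multi-docs job for the given files and return its job_id."""
    files = [("files", (p.split("/")[-1], open(p, "rb"))) for p in paths]
    try:
        resp = requests.post(
            f"{GATEWAY}/api/multi-docs/jobs",
            files=files,
            data={"job_name": job_name},
            timeout=120,
        )
        resp.raise_for_status()
        return resp.json()["job_id"]
    finally:
        for _, (_, fh) in files:
            fh.close()

print("job_id:", upload_documents(["test_causal.txt"]))
```
Poll `GET /api/multi-docs/jobs/{job_id}` afterwards and then check Neo4j as above.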
## Next Steps
1. Rebuild the service
2. Re-upload documents
3. Check Neo4j for relationships
4. If still no results, check service logs
5. Verify the document contains causal language

View File

@ -1,300 +0,0 @@
# Multi-Document Upload Service - Frontend Testing Guide
## Prerequisites
1. **Backend Services Running**:
```bash
cd /home/tech4biz/Desktop/prakash/codenuk/backend_new1/codenuk_backend_mine
docker-compose up -d
```
2. **Verify Services are Running**:
- API Gateway: `http://localhost:8000/health`
- Multi-Document Upload Service: `http://localhost:8024/health`
- Neo4j: `http://localhost:7474` (Browser interface)
- Frontend: `http://localhost:3001` (or your frontend port)
3. **Check Service Health**:
```bash
# Check API Gateway
curl http://localhost:8000/health
# Check Multi-Document Upload Service directly
curl http://localhost:8024/health
# Check via API Gateway proxy
curl http://localhost:8000/api/multi-docs/health
```
## Frontend Testing Steps
### Step 1: Navigate to Project Builder
1. Open your browser and go to: `http://localhost:3001` (or your frontend URL)
2. Log in if required
3. Click on **"Project Builder"** in the navigation
### Step 2: Go to Multi Docs Upload Step
1. In the Project Builder, you should see the workflow steps:
- **Step 1**: Project Type
- **Step 2**: Features
- **Step 3**: Multi Docs Upload ← **This is the new step**
- **Step 4**: Business Context
- **Step 5**: Generate
- **Step 6**: Architecture
2. Complete Steps 1 and 2 (Project Type and Features selection)
3. You will automatically be taken to **Step 3: Multi Docs Upload**
### Step 3: Upload Documents
1. **Click on the upload area** or **drag and drop files**
2. **Select multiple files** (you can mix different formats):
- PDF files (`.pdf`)
- Word documents (`.doc`, `.docx`)
- PowerPoint (`.ppt`, `.pptx`)
- Excel files (`.xls`, `.xlsx`)
- JSON files (`.json`)
- XML files (`.xml`)
- Markdown files (`.md`)
- Images (`.png`, `.jpg`, `.jpeg`) - will use OCR
- Audio files (`.mp3`, `.wav`) - will be transcribed
- Video files (`.mp4`, `.avi`) - will be transcribed
3. **View selected files**: You should see a list of all selected files with:
- File icon
- File name
- Remove button for each file
4. **Click "Start Upload"** button
### Step 4: Monitor Upload Progress
After clicking "Start Upload", you should see:
1. **Upload Status**:
- Button shows "Uploading..." with spinner
- Progress bar appears
- Stage messages appear:
- "Job received"
- "Saving files"
- "Extracting document content"
- "Calling Claude for causal relations"
- "Writing to Neo4j knowledge graph"
- "Completed"
2. **Progress Indicators**:
- Progress percentage (0-100%)
- Status message showing current stage
- Processed files count vs total files count
3. **Polling**: The frontend automatically polls the job status every 4 seconds
### Step 5: Verify Results
Once the job is completed:
1. **Check Neo4j Graph**:
- Open Neo4j Browser: `http://localhost:7474`
- Login with:
- Username: `neo4j`
- Password: `password`
- Run Cypher query to see the graph:
```cypher
MATCH (n)-[r:CAUSES]->(m)
RETURN n, r, m
LIMIT 50
```
2. **Check Job Status via API**:
```bash
# Replace {job_id} with the actual job ID from the frontend
curl http://localhost:8000/api/multi-docs/jobs/{job_id}
```
3. **Get Graph Summary**:
```bash
curl http://localhost:8000/api/multi-docs/jobs/{job_id}/graph
```
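Checks 2 and 3 can also be scripted; here is a minimal sketch with `requests` (the shape of the graph summary payload is not documented here, so it is simply printed as-is):
```python
import requests

GATEWAY = "http://localhost:8000"            # API Gateway base URL
job_id = "<job-id-from-the-upload-response>"

status = requests.get(f"{GATEWAY}/api/multi-docs/jobs/{job_id}", timeout=30).json()
print("stage:", status["stage"], "| processed:",
      status.get("processed_files"), "of", status.get("total_files"))

graph = requests.get(f"{GATEWAY}/api/multi-docs/jobs/{job_id}/graph", timeout=30).json()
print("graph summary:", graph)
```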
## Testing Different Scenarios
### Scenario 1: Single PDF File
- Upload one PDF file
- Verify it processes correctly
- Check Neo4j for causal relationships
### Scenario 2: Multiple Mixed Format Files
- Upload 3-5 files of different formats (PDF, DOCX, JSON, image)
- Verify all files are processed
- Check that progress updates correctly
### Scenario 3: Large Files
- Upload a large PDF (10+ MB)
- Verify it handles large files correctly
- Check processing time
### Scenario 4: Error Handling
- Try uploading an unsupported file type
- Verify error message appears
- Check that the error is displayed clearly
### Scenario 5: Skip Option
- Upload files
- Click "Skip" button before completion
- Verify you can proceed to the next step
- Job continues processing in the background
## Browser Developer Tools
### Check Network Requests
1. **Open Developer Tools** (F12)
2. **Go to Network tab**
3. **Filter by "multi-docs"**
4. **Monitor requests**:
- `POST /api/multi-docs/jobs` - Upload files
- `GET /api/multi-docs/jobs/{job_id}` - Poll job status
- `GET /api/multi-docs/jobs/{job_id}/graph` - Get graph summary
### Check Console Logs
1. **Open Console tab**
2. **Look for**:
- Upload progress logs
- Job status updates
- Any error messages
### Check Response Data
Verify the API responses:
```javascript
// Upload response should be:
{
"job_id": "uuid-here",
"stage": "received",
"total_files": 3,
"created_at": "2024-01-01T00:00:00Z"
}
// Status response should be:
{
"job_id": "uuid-here",
"stage": "extracting",
"status_message": "Extracting document content",
"total_files": 3,
"processed_files": 1,
"error": null,
"created_at": "2024-01-01T00:00:00Z",
"updated_at": "2024-01-01T00:01:00Z",
"files": [...]
}
```
## Troubleshooting
### Issue: Upload fails with 502 Bad Gateway
**Solution**:
- Check if multi-document-upload-service is running:
```bash
docker-compose ps multi-document-upload-service
```
- Check service logs:
```bash
docker-compose logs multi-document-upload-service
```
### Issue: Upload fails with 413 Request Entity Too Large
**Solution**:
- Check file sizes (max 500MB total per job)
- Reduce number of files or file sizes
- Check API Gateway body size limits
### Issue: Status polling stops working
**Solution**:
- Check browser console for errors
- Verify job ID is correct
- Check if job completed or failed
- Check network tab for failed requests
### Issue: No causal relationships found
**Solution**:
- Check Claude API key is configured correctly
- Check service logs for Claude API errors
- Verify documents contain causal language
- Check Neo4j connection
### Issue: Frontend shows "Failed" status
**Solution**:
- Check the error message in the frontend
- Check backend service logs:
```bash
docker-compose logs -f multi-document-upload-service
```
- Verify all dependencies are running (Neo4j, Redis, Postgres)
## Expected Behavior
### Successful Flow:
1. ✅ Files upload successfully
2. ✅ Job ID is returned
3. ✅ Status polling starts automatically
4. ✅ Progress updates every 4 seconds
5. ✅ Stage changes are displayed
6. ✅ Progress bar updates
7. ✅ Job completes successfully
8. ✅ Frontend automatically proceeds to next step
9. ✅ Neo4j contains causal relationships
### Error Flow:
1. ✅ Error message is displayed clearly
2. ✅ User can retry upload
3. ✅ User can skip and proceed
4. ✅ Error details are logged in console
## API Endpoints Reference
### Upload Files
```bash
POST /api/multi-docs/jobs
Content-Type: multipart/form-data
Form Data:
- files: File[] (multiple files)
- job_name: string (optional)
```
### Get Job Status
```bash
GET /api/multi-docs/jobs/{job_id}
```
### Get Graph Summary
```bash
GET /api/multi-docs/jobs/{job_id}/graph
```
### Health Check
```bash
GET /api/multi-docs/health
```
## Next Steps After Testing
1. **Verify Neo4j Graph**: Check that causal relationships are stored correctly
2. **Check Storage**: Verify files are stored in the persistent volume
3. **Monitor Performance**: Check processing times for different file types
4. **Test Error Scenarios**: Verify error handling works correctly
5. **Test Large Batches**: Upload 50+ files to test scalability
## Support
If you encounter issues:
1. Check service logs: `docker-compose logs multi-document-upload-service`
2. Check API Gateway logs: `docker-compose logs api-gateway`
3. Check Neo4j logs: `docker-compose logs neo4j`
4. Verify all environment variables are set correctly
5. Check network connectivity between services

View File

@ -8,10 +8,6 @@ pydantic-settings>=2.2.1
aiofiles>=23.2.1
tenacity>=8.2.3
python-dotenv>=1.0.1
unstructured[pdf]>=0.15.0
unstructured[docx]>=0.15.0
unstructured[pptx]>=0.15.0
unstructured[xlsx]>=0.15.0
pdfplumber>=0.11.0
python-docx>=1.1.0
python-pptx>=0.6.23
@ -30,5 +26,13 @@ beautifulsoup4>=4.12.3
lxml>=5.2.1
sqlalchemy>=2.0.25
httpx>=0.27.0
tiktoken>=0.7.0
dowhy>=0.11.0
qdrant-client>=1.7.0
sentence-transformers>=2.2.0
numpy>=1.24.0
scipy>=1.11.0
networkx>=3.1
spacy>=3.7.0
markdown>=3.5.0
weasyprint>=60.0

View File

@ -1,328 +0,0 @@
from __future__ import annotations
import base64
import json
import logging
import re
from pathlib import Path
from typing import Iterable, List
from anthropic import Anthropic, BadRequestError
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential, RetryCallState
from .models import CausalRelation
logger = logging.getLogger(__name__)
def is_billing_error(exception: Exception) -> bool:
"""Check if the exception is a billing/credit related error that shouldn't be retried."""
if isinstance(exception, BadRequestError):
error_message = str(exception).lower()
billing_keywords = ["credit", "balance", "too low", "billing", "upgrade", "purchase credits"]
return any(keyword in error_message for keyword in billing_keywords)
return False
def should_retry_exception(retry_state: RetryCallState) -> bool:
"""Custom retry condition that excludes billing errors."""
exception = retry_state.outcome.exception()
if exception is None:
return False
# Don't retry billing errors - they won't be resolved by retrying
if is_billing_error(exception):
return False
# Retry other exceptions
return True
CLAUDE_PROMPT_TEMPLATE = """You are an expert analyst extracting causal relationships from documents.
Given the following text chunk, identify all explicit or strongly implied cause and effect pairs.
Return JSON with the schema:
[
{
"cause": "<short phrase>",
"effect": "<short phrase>",
"confidence": 0-1 float,
"explanation": "<why this is causal>",
"source_snippet": "<exact quote or paraphrase>"
}
]
Only include items when the causal direction is clear.
If none are found, return an empty list [].
Text chunk:
```
<<<CHUNK_PLACEHOLDER>>>
```"""
IMAGE_PROMPT_TEMPLATE = """You are an expert analyst extracting causal relationships from images, diagrams, and visual content.
Analyze this image/diagram for causal relationships. Look for:
- Architecture flows (A → B → C)
- Dependency relationships
- Cause-effect chains in diagrams
- Process flows
- System interactions
- Data flows
- Sequential relationships
- Visual connections between components
Return JSON with the schema:
[
{
"cause": "<short phrase describing the cause>",
"effect": "<short phrase describing the effect>",
"confidence": 0-1 float,
"explanation": "<why this is causal, referencing visual elements>",
"source_snippet": "<description of what you see in the image that shows this relationship>"
}
]
Only include items when the causal direction is clear from the visual structure.
If none are found, return an empty list []."""
class ClaudeCausalExtractor:
def __init__(self, api_key: str, model: str, max_output_tokens: int = 4000):
self.client = Anthropic(api_key=api_key)
self.model = model
self.max_output_tokens = max_output_tokens
@retry(
retry=should_retry_exception,
wait=wait_exponential(multiplier=1, min=1, max=10),
stop=stop_after_attempt(3),
reraise=True,
)
def analyze_chunk(self, chunk: str, source_file_id: str) -> List[CausalRelation]:
logger.debug("Analyzing chunk with Claude model %s", self.model)
# Validate chunk is not empty and is readable text
if not chunk or not chunk.strip():
logger.warning("Empty or whitespace-only chunk, skipping")
return []
# Check if chunk contains mostly readable text (not binary data)
# Simple heuristic: if >50% of characters are non-printable or control chars, skip it
printable_chars = sum(1 for c in chunk if c.isprintable() or c.isspace())
if len(chunk) > 100 and printable_chars / len(chunk) < 0.5:
logger.warning("Chunk appears to contain binary data, skipping analysis")
return []
# Use string replacement with a unique placeholder to avoid KeyError with braces in content
# This prevents Python's .format() from interpreting braces in the chunk text as format placeholders
prompt_text = CLAUDE_PROMPT_TEMPLATE.replace("<<<CHUNK_PLACEHOLDER>>>", chunk)
try:
message = self.client.messages.create(
model=self.model,
max_tokens=self.max_output_tokens,
temperature=0.0,
system="You extract causal (cause→effect) relations with high precision.",
messages=[
{
"role": "user",
"content": [{"type": "text", "text": prompt_text}],
}
],
)
except BadRequestError as e:
# Check if it's a billing error
if is_billing_error(e):
error_msg = (
"Anthropic API credit balance is too low. "
"Please go to Plans & Billing to upgrade or purchase credits. "
f"Error: {str(e)}"
)
logger.error(error_msg)
raise RuntimeError(error_msg) from e
# Re-raise other BadRequestErrors
raise
content_blocks = message.content or []
raw_text = "".join(block.text for block in content_blocks if hasattr(block, "text")) # type: ignore[attr-defined]
if not raw_text:
return []
# Try to extract JSON from markdown code blocks if present
json_text = raw_text.strip()
# Look for JSON in markdown code blocks (```json ... ```)
json_match = re.search(r'```(?:json)?\s*(\[.*?\])\s*```', json_text, re.DOTALL)
if json_match:
json_text = json_match.group(1)
else:
# Look for JSON array/object at the start or end
json_match = re.search(r'(\[.*?\]|{.*?})', json_text, re.DOTALL)
if json_match:
json_text = json_match.group(1)
try:
data = json.loads(json_text)
if not isinstance(data, list):
logger.warning("Claude response is not a list: %s", type(data))
return []
relations: List[CausalRelation] = []
for item in data:
if not isinstance(item, dict):
continue
cause = item.get("cause", "").strip()
effect = item.get("effect", "").strip()
if not cause or not effect:
continue # Skip invalid relations
relations.append(
CausalRelation(
cause=cause,
effect=effect,
confidence=float(item.get("confidence", 0.0)),
explanation=item.get("explanation"),
source_file_id=source_file_id,
source_snippet=item.get("source_snippet"),
metadata={"model": self.model},
)
)
logger.info("Extracted %d relations from Claude response", len(relations))
return relations
except json.JSONDecodeError as e:
logger.warning("Failed to parse Claude response as JSON: %s. Raw text: %s", e, raw_text[:200])
return []
def analyze(self, chunks: Iterable[str], source_file_id: str) -> List[CausalRelation]:
relations: List[CausalRelation] = []
for chunk in chunks:
relations.extend(self.analyze_chunk(chunk, source_file_id=source_file_id))
return relations
@retry(
retry=should_retry_exception,
wait=wait_exponential(multiplier=1, min=1, max=10),
stop=stop_after_attempt(3),
reraise=True,
)
def analyze_image(self, image_path: Path, source_file_id: str) -> List[CausalRelation]:
"""
Analyze an image using Claude Vision API to extract causal relationships.
Sends image directly to Claude (no OCR).
"""
logger.info("Analyzing image with Claude Vision: %s", image_path.name)
try:
# Read and encode image as base64
with open(image_path, "rb") as image_file:
image_data = image_file.read()
# Determine media type
suffix = image_path.suffix.lower()
media_type_map = {
".png": "image/png",
".jpg": "image/jpeg",
".jpeg": "image/jpeg",
".gif": "image/gif",
".webp": "image/webp",
}
media_type = media_type_map.get(suffix, "image/png")
# Encode to base64
base64_image = base64.b64encode(image_data).decode("utf-8")
# Prepare content for Claude Vision API
content = [
{
"type": "image",
"source": {
"type": "base64",
"media_type": media_type,
"data": base64_image,
},
},
{
"type": "text",
"text": IMAGE_PROMPT_TEMPLATE,
},
]
# Call Claude Vision API
try:
message = self.client.messages.create(
model=self.model, # Claude models support vision
max_tokens=self.max_output_tokens,
temperature=0.0,
system="You extract causal (cause→effect) relations from visual content with high precision.",
messages=[
{
"role": "user",
"content": content,
}
],
)
except BadRequestError as e:
# Check if it's a billing error
if is_billing_error(e):
error_msg = (
"Anthropic API credit balance is too low. "
"Please go to Plans & Billing to upgrade or purchase credits. "
f"Error: {str(e)}"
)
logger.error(error_msg)
raise RuntimeError(error_msg) from e
# Re-raise other BadRequestErrors
raise
# Parse response
content_blocks = message.content or []
raw_text = "".join(block.text for block in content_blocks if hasattr(block, "text")) # type: ignore[attr-defined]
if not raw_text:
logger.warning("No text response from Claude Vision for image %s", image_path.name)
return []
# Extract JSON from response
json_text = raw_text.strip()
json_match = re.search(r'```(?:json)?\s*(\[.*?\])\s*```', json_text, re.DOTALL)
if json_match:
json_text = json_match.group(1)
else:
json_match = re.search(r'(\[.*?\]|{.*?})', json_text, re.DOTALL)
if json_match:
json_text = json_match.group(1)
try:
data = json.loads(json_text)
if not isinstance(data, list):
logger.warning("Claude Vision response is not a list: %s", type(data))
return []
relations: List[CausalRelation] = []
for item in data:
if not isinstance(item, dict):
continue
cause = item.get("cause", "").strip()
effect = item.get("effect", "").strip()
if not cause or not effect:
continue
relations.append(
CausalRelation(
cause=cause,
effect=effect,
confidence=float(item.get("confidence", 0.0)),
explanation=item.get("explanation"),
source_file_id=source_file_id,
source_snippet=item.get("source_snippet") or f"Image: {image_path.name}",
metadata={"model": self.model, "content_type": "image", "image_path": str(image_path)},
)
)
logger.info("Extracted %d relations from image %s", len(relations), image_path.name)
return relations
except json.JSONDecodeError as e:
logger.warning("Failed to parse Claude Vision response as JSON: %s. Raw text: %s", e, raw_text[:200])
return []
except Exception as exc:
logger.exception("Failed to analyze image %s: %s", image_path, exc)
return []

View File

@ -20,7 +20,7 @@ class Settings(BaseSettings):
model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore")
anthropic_api_key: str | None = Field(default=None, validation_alias="ANTHROPIC_API_KEY")
claude_model: str = Field(default=os.getenv("MULTI_DOC_CLAUDE_MODEL", "claude-3-5-sonnet-20241022"))
claude_model: str = Field(default=os.getenv("MULTI_DOC_CLAUDE_MODEL", os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest")))
claude_max_input_tokens: int = Field(default=200_000)
claude_max_output_tokens: int = Field(default=16_000)
@ -37,6 +37,27 @@ class Settings(BaseSettings):
job_retention_days: int = Field(default=30)
# Qwen2.5-VL API configuration
qwen_api_key: str | None = Field(default=None, validation_alias="QWEN_API_KEY")
qwen_api_url: str = Field(default=os.getenv("QWEN_API_URL", "https://api.example.com/v1/chat/completions"))
qwen_model: str = Field(default=os.getenv("QWEN_MODEL", "qwen2.5-vl"))
# DoWhy configuration
dowhy_enabled: bool = Field(default=True)
dowhy_confidence_threshold: float = Field(default=0.05)
# Embedding configuration
embedding_model: str = Field(default="sentence-transformers/all-MiniLM-L6-v2")
embedding_dimension: int = Field(default=384)
# Qdrant configuration
qdrant_url: str = Field(default=os.getenv("QDRANT_URL", "http://localhost:6333"))
qdrant_collection_name: str = Field(default="kg_embeddings")
qdrant_vector_size: int = Field(default=384)
# Report generation configuration
report_format: str = Field(default="markdown")
def ensure_storage_dirs(self) -> None:
(self.storage_root / "jobs").mkdir(parents=True, exist_ok=True)
(self.storage_root / "uploads").mkdir(parents=True, exist_ok=True)

View File

@ -1,168 +0,0 @@
from __future__ import annotations
import logging
from pathlib import Path
from typing import List
logger = logging.getLogger(__name__)
# Try to import unstructured, but fall back to alternatives if not available
try:
from unstructured.partition.auto import partition
HAS_UNSTRUCTURED = True
except ImportError:
HAS_UNSTRUCTURED = False
logger.warning("unstructured not available, will use fallback extractors")
# Fallback extractors
try:
import pdfplumber
HAS_PDFPLUMBER = True
except ImportError:
HAS_PDFPLUMBER = False
try:
from docx import Document as DocxDocument
HAS_DOCX = True
except ImportError:
HAS_DOCX = False
try:
from pptx import Presentation
HAS_PPTX = True
except ImportError:
HAS_PPTX = False
# Image processing libraries
try:
from PIL import Image
import pytesseract
HAS_OCR = True
except ImportError:
HAS_OCR = False
logger.warning("OCR libraries not available, image extraction will be limited")
def extract_text(path: Path) -> str:
"""
Extract text from a file using multiple strategies.
Falls back through: unstructured -> format-specific -> plain text read.
"""
suffix = path.suffix.lower()
# Validate PDF file before processing
if suffix == ".pdf":
# Quick validation: check if file starts with PDF magic bytes
try:
with path.open("rb") as f:
header = f.read(4)
if header != b"%PDF":
raise ValueError(
f"File {path.name} does not appear to be a valid PDF. "
f"PDF files must start with '%PDF' magic bytes. "
f"Got: {header[:20] if len(header) > 0 else 'empty file'}"
)
except Exception as exc:
if isinstance(exc, ValueError):
raise
logger.warning("Could not validate PDF header: %s", exc)
# Image files - return empty text (will be processed directly with Claude Vision)
# We skip OCR and send images directly to Claude Vision API
if suffix in {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}:
logger.info("Image file detected: %s. Will be processed directly with Claude Vision (no OCR)", path.name)
# Return empty string - images will be handled separately in pipeline
return ""
# Plain text files - direct read
if suffix in {".txt", ".md", ".json", ".xml", ".html", ".csv"}:
try:
return path.read_text(encoding="utf-8", errors="ignore")
except Exception as exc:
logger.warning("Failed to read %s as text: %s", path, exc)
raise
# Try unstructured first (if available)
if HAS_UNSTRUCTURED:
try:
elements = partition(filename=str(path))
lines: List[str] = []
for element in elements:
text = getattr(element, "text", None)
if text:
lines.append(text.strip())
if lines:
logger.info("Extracted %d lines using unstructured", len(lines))
return "\n".join(lines)
except Exception as exc:
logger.warning("unstructured extraction failed for %s: %s", path, exc)
# Continue to fallback methods
# Fallback: PDF with pdfplumber
if suffix == ".pdf" and HAS_PDFPLUMBER:
try:
with pdfplumber.open(path) as pdf:
text_parts = []
for page in pdf.pages:
page_text = page.extract_text()
if page_text:
text_parts.append(page_text)
if text_parts:
logger.info("Extracted PDF using pdfplumber")
return "\n".join(text_parts)
except Exception as exc:
logger.warning("pdfplumber extraction failed for %s: %s", path, exc)
# Fallback: DOCX
if suffix == ".docx" and HAS_DOCX:
try:
doc = DocxDocument(path)
paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
if paragraphs:
logger.info("Extracted DOCX using python-docx")
return "\n".join(paragraphs)
except Exception as exc:
logger.warning("python-docx extraction failed for %s: %s", path, exc)
# Fallback: PPTX
if suffix in {".pptx", ".ppt"} and HAS_PPTX:
try:
prs = Presentation(path)
text_parts = []
for slide in prs.slides:
for shape in slide.shapes:
if hasattr(shape, "text") and shape.text:
text_parts.append(shape.text.strip())
if text_parts:
logger.info("Extracted PPTX using python-pptx")
return "\n".join(text_parts)
except Exception as exc:
logger.warning("python-pptx extraction failed for %s: %s", path, exc)
# Last resort: try to read as text anyway, but validate it's readable
try:
content = path.read_text(encoding="utf-8", errors="ignore")
if content.strip():
# Check if content is actually readable text (not binary data)
# Simple heuristic: if >30% of characters are printable, consider it text
printable_chars = sum(1 for c in content if c.isprintable() or c.isspace())
total_chars = len(content)
if total_chars > 0 and printable_chars / total_chars > 0.3:
logger.warning("Read %s as plain text (may contain binary data)", path)
return content
else:
logger.error("Content from %s appears to be binary data, cannot extract text", path)
raise ValueError(f"File {path} appears to be binary or corrupted. Cannot extract readable text.")
except Exception as exc:
if isinstance(exc, ValueError):
raise
logger.warning("Failed to read %s as text: %s", path, exc)
# If all else fails, raise an error
raise ValueError(
f"Could not extract text from {path}. "
f"File type may not be supported, file may be corrupted, or dependencies are missing. "
f"Supported formats: PDF, DOCX, PPTX, XLSX, TXT, MD, JSON, XML, HTML, CSV, PNG, JPG, JPEG (with OCR)"
)

View File

@ -0,0 +1,320 @@
from __future__ import annotations
import logging
import re
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional
logger = logging.getLogger(__name__)
try:
import fitz # PyMuPDF
HAS_PYMUPDF = True
except ImportError:
HAS_PYMUPDF = False
logger.warning("PyMuPDF not available")
try:
from docx import Document as DocxDocument
HAS_DOCX = True
except ImportError:
HAS_DOCX = False
logger.warning("python-docx not available")
try:
from pptx import Presentation
HAS_PPTX = True
except ImportError:
HAS_PPTX = False
logger.warning("python-pptx not available")
try:
import pandas as pd
HAS_PANDAS = True
except ImportError:
HAS_PANDAS = False
logger.warning("pandas not available")
@dataclass
class ExtractedText:
"""Structured text extraction with context."""
text: str
page_number: int
metadata: dict
context: Optional[str] = None # Surrounding context
def extract_text_with_context(path: Path) -> List[ExtractedText]:
"""
Extract text from PDF using PyMuPDF with page-level context.
Returns structured text with metadata.
"""
if not HAS_PYMUPDF:
raise ImportError("PyMuPDF is required for text extraction")
if not path.exists():
raise FileNotFoundError(f"File not found: {path}")
if path.suffix.lower() != ".pdf":
# For non-PDF files, fall back to simple text reading
try:
text = path.read_text(encoding="utf-8", errors="ignore")
return [ExtractedText(
text=text,
page_number=1,
metadata={"file_type": path.suffix, "filename": path.name},
context=None
)]
except Exception as exc:
logger.warning("Failed to read %s as text: %s", path, exc)
raise
extracted_pages: List[ExtractedText] = []
try:
doc = fitz.open(path)
for page_num in range(len(doc)):
page = doc[page_num]
# Extract text
text = page.get_text()
# Extract metadata
metadata = {
"page_number": page_num + 1,
"page_count": len(doc),
"filename": path.name,
"file_type": "pdf",
"page_rect": {
"width": page.rect.width,
"height": page.rect.height
}
}
# Extract context (surrounding pages for better understanding)
context = None
if page_num > 0:
prev_page = doc[page_num - 1]
prev_text = prev_page.get_text()[:500] # Last 500 chars of previous page
context = f"Previous page context: {prev_text}"
if text.strip():
extracted_pages.append(ExtractedText(
text=text,
page_number=page_num + 1,
metadata=metadata,
context=context
))
doc.close()
logger.info("Extracted text from %d pages in %s", len(extracted_pages), path.name)
return extracted_pages
except Exception as exc:
logger.exception("Failed to extract text from PDF %s: %s", path, exc)
raise
def extract_text_from_docx(path: Path) -> str:
"""
Extract text from DOCX file using python-docx.
Reads paragraphs and tables as per README Step 2.2b.
"""
if not HAS_DOCX:
raise ImportError("python-docx is required for DOCX extraction")
try:
doc = DocxDocument(path)
text_parts = []
# Extract paragraphs
for paragraph in doc.paragraphs:
if paragraph.text.strip():
text_parts.append(paragraph.text.strip())
# Extract tables
for table in doc.tables:
table_text = []
for row in table.rows:
row_text = []
for cell in row.cells:
if cell.text.strip():
row_text.append(cell.text.strip())
if row_text:
table_text.append(" | ".join(row_text))
if table_text:
text_parts.append("\n".join(table_text))
result = "\n\n".join(text_parts)
logger.info("Extracted %d characters from DOCX %s", len(result), path.name)
return result
except Exception as exc:
logger.exception("Failed to extract text from DOCX %s: %s", path, exc)
raise
def extract_text_from_pptx(path: Path) -> str:
"""
Extract text from PPTX file using python-pptx.
Reads slides, titles, and notes as per README Step 2.2c.
"""
if not HAS_PPTX:
raise ImportError("python-pptx is required for PPTX extraction")
try:
prs = Presentation(path)
text_parts = []
for slide_num, slide in enumerate(prs.slides, 1):
slide_text = []
# Extract slide title
if slide.shapes.title and slide.shapes.title.text:
slide_text.append(f"Slide {slide_num} Title: {slide.shapes.title.text.strip()}")
# Extract content from shapes
for shape in slide.shapes:
if hasattr(shape, "text") and shape.text.strip():
# Skip title (already extracted)
if not (slide.shapes.title and shape == slide.shapes.title):
slide_text.append(shape.text.strip())
# Extract notes (if available)
if hasattr(slide, "notes_slide") and slide.notes_slide:
notes_text = ""
for shape in slide.notes_slide.shapes:
if hasattr(shape, "text") and shape.text.strip():
notes_text += shape.text.strip() + " "
if notes_text.strip():
slide_text.append(f"Notes: {notes_text.strip()}")
if slide_text:
text_parts.append("\n".join(slide_text))
result = "\n\n".join(text_parts)
logger.info("Extracted %d characters from PPTX %s (%d slides)",
len(result), path.name, len(prs.slides))
return result
except Exception as exc:
logger.exception("Failed to extract text from PPTX %s: %s", path, exc)
raise
def extract_text_from_spreadsheet(path: Path) -> str:
"""
Extract text from CSV/XLSX file using pandas.
Reads rows and columns, converts to text representation as per README Step 2.2d.
"""
if not HAS_PANDAS:
raise ImportError("pandas is required for spreadsheet extraction")
try:
suffix = path.suffix.lower()
text_parts = []
if suffix == ".csv":
df = pd.read_csv(path, encoding="utf-8", encoding_errors="ignore")
elif suffix in {".xlsx", ".xls"}:
# Read first sheet by default
df = pd.read_excel(path, engine="openpyxl" if suffix == ".xlsx" else None)
else:
raise ValueError(f"Unsupported spreadsheet format: {suffix}")
# Convert DataFrame to text representation
# Add column headers
headers = " | ".join(str(col) for col in df.columns)
text_parts.append(f"Columns: {headers}")
# Add rows (limit to first 1000 rows to avoid huge output)
max_rows = min(1000, len(df))
for idx, row in df.head(max_rows).iterrows():
row_values = " | ".join(str(val) if pd.notna(val) else "" for val in row)
text_parts.append(f"Row {idx + 1}: {row_values}")
if len(df) > max_rows:
text_parts.append(f"... ({len(df) - max_rows} more rows)")
result = "\n".join(text_parts)
logger.info("Extracted %d characters from spreadsheet %s (%d rows)",
len(result), path.name, len(df))
return result
except Exception as exc:
logger.exception("Failed to extract text from spreadsheet %s: %s", path, exc)
raise
def clean_text(text: str) -> str:
"""
Clean extracted text as per README Step 2.3.
- Remove extra whitespace
- Fix encoding issues
- Preserve important structure
"""
if not text:
return ""
# Fix encoding issues (remove non-printable characters except newlines and tabs)
cleaned = "".join(char for char in text if char.isprintable() or char in "\n\t\r")
# Remove extra whitespace (but preserve paragraph breaks)
# Replace multiple spaces with single space
cleaned = re.sub(r'[ \t]+', ' ', cleaned)
# Normalize line breaks (preserve double newlines for paragraphs)
cleaned = re.sub(r'\r\n', '\n', cleaned) # Windows line breaks
cleaned = re.sub(r'\r', '\n', cleaned) # Old Mac line breaks
# Preserve paragraph structure (double newlines)
# But remove excessive blank lines (more than 2 consecutive)
cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
# Remove leading/trailing whitespace from each line
lines = [line.strip() for line in cleaned.split('\n')]
cleaned = '\n'.join(lines)
# Remove leading/trailing whitespace overall
cleaned = cleaned.strip()
return cleaned
def extract_all_text(path: Path) -> str:
"""
Extract all text from a file based on type (as per README Step 2).
Routes to appropriate extractor: PDF, DOCX, PPTX, CSV/XLSX, or plain text.
"""
suffix = path.suffix.lower()
# Step 2.2a: PDF
if suffix == ".pdf" and HAS_PYMUPDF:
extracted_pages = extract_text_with_context(path)
text = "\n\n".join([page.text for page in extracted_pages])
# Step 2.2b: DOCX (Word)
elif suffix == ".docx" and HAS_DOCX:
text = extract_text_from_docx(path)
# Step 2.2c: PPTX (PowerPoint)
elif suffix in {".pptx", ".ppt"} and HAS_PPTX:
text = extract_text_from_pptx(path)
# Step 2.2d: CSV/XLSX (Spreadsheet)
elif suffix in {".csv", ".xlsx", ".xls"} and HAS_PANDAS:
text = extract_text_from_spreadsheet(path)
# Fallback: Plain text files
else:
try:
text = path.read_text(encoding="utf-8", errors="ignore")
except Exception as exc:
logger.warning("Failed to read %s as text: %s", path, exc)
raise
# Step 2.3: TEXT CLEANING
text = clean_text(text)
return text
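A quick usage sketch for the routing function above (file names are placeholders and the import path is an assumption; adjust it to wherever this module lives in the package):
```python
from pathlib import Path

# Import path is an assumption; point it at the module that defines extract_all_text.
from multi_document_upload_service.processors.text_extractor import extract_all_text

for name in ["spec.pdf", "notes.docx", "deck.pptx", "metrics.csv"]:  # placeholder files
    text = extract_all_text(Path(name))
    print(f"{name}: {len(text)} cleaned characters")
```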

View File

@ -0,0 +1,153 @@
from __future__ import annotations
import base64
import json
import logging
from pathlib import Path
from typing import Dict, List, Optional
import httpx
from ..config import get_settings
logger = logging.getLogger(__name__)
class QwenVisionClient:
"""Client for Qwen2.5-VL API to extract relationships from diagrams and ERDs."""
def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None, model: Optional[str] = None):
settings = get_settings()
self.api_key = api_key or settings.qwen_api_key
self.api_url = api_url or settings.qwen_api_url
self.model = model or settings.qwen_model
if not self.api_key:
logger.warning("Qwen API key not configured")
def extract_relationships_from_image(self, image_path: Path, source_file_id: str) -> List[Dict]:
"""
Extract relationships (entities, connections, flows) from an image using Qwen2.5-VL.
Returns list of extracted relationships.
"""
if not self.api_key:
logger.warning("Qwen API key not configured, skipping image analysis")
return []
try:
# Read and encode image
with open(image_path, "rb") as img_file:
image_data = img_file.read()
base64_image = base64.b64encode(image_data).decode("utf-8")
# Determine media type
suffix = image_path.suffix.lower()
media_type_map = {
".png": "image/png",
".jpg": "image/jpeg",
".jpeg": "image/jpeg",
".gif": "image/gif",
".webp": "image/webp",
}
media_type = media_type_map.get(suffix, "image/png")
# Prepare prompt for relationship extraction
prompt = """Analyze this diagram/ERD/image and extract all relationships, entities, and connections.
Extract:
1. Entities (boxes, nodes, components)
2. Relationships between entities (arrows, connections, flows)
3. Data flows and dependencies
4. Process flows
5. Architecture patterns
Return JSON with this structure:
[
{
"entity1": "name of first entity",
"entity2": "name of second entity",
"relationship_type": "causes|depends_on|flows_to|contains|uses",
"description": "description of the relationship",
"confidence": 0.0-1.0
}
]
Focus on cause-effect relationships, dependencies, and flows."""
# Prepare API request
payload = {
"model": self.model,
"messages": [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": f"data:{media_type};base64,{base64_image}"
}
},
{
"type": "text",
"text": prompt
}
]
}
],
"max_tokens": 4000,
"temperature": 0.0
}
headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json"
}
# Make API call
with httpx.Client(timeout=60.0) as client:
response = client.post(self.api_url, json=payload, headers=headers)
response.raise_for_status()
result = response.json()
# Parse response
content = result.get("choices", [{}])[0].get("message", {}).get("content", "")
if not content:
logger.warning("Empty response from Qwen API for image %s", image_path.name)
return []
# Extract JSON from response
json_text = content.strip()
# Try to find JSON in markdown code blocks
if "```json" in json_text:
json_text = json_text.split("```json")[1].split("```")[0].strip()
elif "```" in json_text:
json_text = json_text.split("```")[1].split("```")[0].strip()
# Parse JSON
try:
relationships = json.loads(json_text)
if not isinstance(relationships, list):
relationships = [relationships]
# Add source metadata
for rel in relationships:
rel["source_file_id"] = source_file_id
rel["source_image"] = str(image_path.name)
rel["extraction_method"] = "qwen2.5-vl"
logger.info("Extracted %d relationships from image %s using Qwen2.5-VL",
len(relationships), image_path.name)
return relationships
except json.JSONDecodeError as e:
logger.warning("Failed to parse Qwen response as JSON: %s. Content: %s",
e, content[:200])
return []
except Exception as exc:
logger.exception("Failed to extract relationships from image %s: %s", image_path, exc)
return []
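A minimal usage sketch for the client above (assumes qwen_api_key, qwen_api_url and qwen_model are configured; the diagram path is hypothetical):

from pathlib import Path

client = QwenVisionClient()
relations = client.extract_relationships_from_image(
    Path("/tmp/uploads/erd.png"),               # hypothetical diagram
    source_file_id="erd.png",
)
for rel in relations:
    print(rel.get("entity1"), "->", rel.get("entity2"), rel.get("relationship_type"))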

View File

@ -2,15 +2,16 @@ from __future__ import annotations
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional
from fastapi import BackgroundTasks, Depends, FastAPI, File, Form, HTTPException, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse
from .claude_client import ClaudeCausalExtractor
from .config import Settings, get_settings
from .jobs import JobStore
from .models import CreateJobResponse, JobGraphSummary, JobStage, JobStatusResponse
from .models import CreateJobResponse, JobGraphSummary, JobStage, JobStatusResponse, ProjectReport
from .processors.graph_writer import GraphWriter
from .storage import StorageManager
from .workflows.pipeline import JobPipeline
@ -20,8 +21,8 @@ logging.basicConfig(level=logging.INFO)
app = FastAPI(
title="Multi Document Upload Service",
version="0.1.0",
description="Processes multi-format documents to build causal knowledge graphs using Claude.",
version="0.2.0",
description="Processes multi-format documents to build knowledge graphs and generate beginner-friendly onboarding reports.",
)
@ -40,7 +41,6 @@ class ServiceContainer:
storage: StorageManager
job_store: JobStore
graph_writer: GraphWriter
claude_extractor: ClaudeCausalExtractor
pipeline: JobPipeline
@ -51,29 +51,24 @@ def get_container() -> ServiceContainer:
global _container
if _container is None:
settings = get_settings()
if not settings.anthropic_api_key:
raise HTTPException(status_code=500, detail="ANTHROPIC_API_KEY is not configured")
# Anthropic API key is only needed for report generation, not required at startup
# if not settings.anthropic_api_key:
# raise HTTPException(status_code=500, detail="ANTHROPIC_API_KEY is not configured")
storage = StorageManager(settings.storage_root)
job_store = JobStore(settings.storage_root)
graph_writer = GraphWriter(settings.neo4j_uri, settings.neo4j_user, settings.neo4j_password)
claude_extractor = ClaudeCausalExtractor(
api_key=settings.anthropic_api_key,
model=settings.claude_model,
max_output_tokens=min(settings.claude_max_output_tokens, 4000),
)
pipeline = JobPipeline(
job_store=job_store,
storage=storage,
graph_writer=graph_writer,
claude_extractor=claude_extractor,
)
_container = ServiceContainer(
settings=settings,
storage=storage,
job_store=job_store,
graph_writer=graph_writer,
claude_extractor=claude_extractor,
pipeline=pipeline,
)
return _container
@ -170,14 +165,86 @@ async def get_job_graph(job_id: str, container: ServiceContainer = Depends(get_d
)
@app.get("/jobs/{job_id}/report", response_model=ProjectReport)
async def get_job_report(job_id: str, container: ServiceContainer = Depends(get_dependencies)) -> ProjectReport:
"""Get the generated beginner-friendly onboarding report."""
job_store = container.job_store
if not job_store.exists(job_id):
raise HTTPException(status_code=404, detail="Job not found")
job = job_store.get(job_id)
if job.stage != JobStage.COMPLETED:
raise HTTPException(
status_code=409,
detail="Report not ready yet. Job is still processing."
)
if not job.report:
# Check if there was an error during report generation
error_msg = "Report not found. "
if job.error:
# Check if error is specifically about report generation
if "report generation" in job.error.lower() or "claude" in job.error.lower():
error_msg = job.error
else:
error_msg += f"Error during generation: {job.error}"
else:
error_msg += "Report generation may have failed (check logs for details)."
raise HTTPException(
status_code=404,
detail=error_msg
)
return job.report
@app.get("/jobs/{job_id}/report/pdf")
async def get_job_report_pdf(job_id: str, container: ServiceContainer = Depends(get_dependencies)):
"""Download the PDF version of the onboarding report (as per README Step 7.9)."""
job_store = container.job_store
if not job_store.exists(job_id):
raise HTTPException(status_code=404, detail="Job not found")
job = job_store.get(job_id)
if job.stage != JobStage.COMPLETED:
raise HTTPException(
status_code=409,
detail="Report not ready yet. Job is still processing."
)
if not job.report:
raise HTTPException(
status_code=404,
detail="Report not found. Job may have completed without generating report."
)
# Get PDF path from report metadata
pdf_path_str = job.report.metadata.get("pdf_path")
if not pdf_path_str:
raise HTTPException(
status_code=404,
detail="PDF not available. Report may have been generated without PDF conversion."
)
pdf_path = Path(pdf_path_str)
if not pdf_path.exists():
raise HTTPException(
status_code=404,
detail="PDF file not found on server."
)
return FileResponse(
path=pdf_path,
media_type="application/pdf",
filename=f"onboarding_report_{job_id}.pdf"
)
@app.get("/health")
async def healthcheck(container: ServiceContainer = Depends(get_dependencies)):
settings = container.settings
return {
"status": "ok",
"claude_model": settings.claude_model,
"max_input_tokens_per_min": settings.claude_max_input_tokens,
"max_output_tokens_per_min": settings.claude_max_output_tokens,
"qwen_model": settings.qwen_model,
"embedding_model": settings.embedding_model,
"qdrant_url": settings.qdrant_url,
"dowhy_enabled": settings.dowhy_enabled,
}
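A minimal client-side sketch for the new report endpoints (host and port are assumptions; the job id comes from the upload response):

import httpx

BASE = "http://localhost:8000"                  # assumed host/port
job_id = "your-job-id"                          # returned when the job was created

resp = httpx.get(f"{BASE}/jobs/{job_id}/report")
if resp.status_code == 200:
    report = resp.json()                        # ProjectReport as JSON
    print(report["title"], "-", report["total_pages"], "estimated pages")
    pdf = httpx.get(f"{BASE}/jobs/{job_id}/report/pdf")
    if pdf.status_code == 200:                  # 404 if PDF conversion was skipped
        open(f"onboarding_report_{job_id}.pdf", "wb").write(pdf.content)
elif resp.status_code == 409:
    print("Job is still processing")
else:
    print("Report unavailable:", resp.json().get("detail"))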

View File

@ -10,9 +10,10 @@ from pydantic import BaseModel, Field
class JobStage(str, Enum):
RECEIVED = "received"
SAVING_FILES = "saving_files"
EXTRACTING = "extracting"
ANALYZING = "analyzing"
BUILDING_GRAPH = "building_graph"
EXTRACTING = "extracting" # PyMuPDF + Qwen2.5-VL
BUILDING_GRAPH = "building_graph" # DoWhy + Neo4j
INDEXING_VECTORS = "indexing_vectors" # Qdrant
GENERATING_REPORT = "generating_report" # Claude onboarding doc
COMPLETED = "completed"
FAILED = "failed"
@ -34,6 +35,7 @@ class CausalRelation(BaseModel):
explanation: Optional[str] = None
source_file_id: Optional[str] = None
source_snippet: Optional[str] = None
relationship_type: str = Field(default="CAUSES") # DEPENDS_ON, USES, IMPLEMENTS, etc.
metadata: Dict[str, Any] = Field(default_factory=dict)
@ -46,6 +48,7 @@ class JobRecord(BaseModel):
total_files: int = 0
processed_files: int = 0
relations: List[CausalRelation] = Field(default_factory=list)
report: Optional[ProjectReport] = None # Generated onboarding report
created_at: datetime = Field(default_factory=datetime.utcnow)
updated_at: datetime = Field(default_factory=datetime.utcnow)
error: str | None = None
@ -82,3 +85,15 @@ class JobGraphSummary(BaseModel):
edge_count: int
generated_at: datetime
class ProjectReport(BaseModel):
"""Beginner-friendly onboarding report generated from project documents."""
job_id: str
title: str = "Project Onboarding Guide"
content: str # Markdown content
sections: Dict[str, str] = Field(default_factory=dict) # Section name -> content
key_concepts: List[str] = Field(default_factory=list) # Important concepts covered
total_pages: int = 0 # Estimated pages
generated_at: datetime = Field(default_factory=datetime.utcnow)
metadata: Dict[str, Any] = Field(default_factory=dict)
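A minimal sketch of the new ProjectReport model in isolation (field values are illustrative only; model_dump_json is Pydantic v2, use .json() on v1):

report = ProjectReport(
    job_id="job-123",
    content="# Project Onboarding Guide\n\n## Overview\n...",
    sections={"Overview": "High-level summary of the project."},
    key_concepts=["API Gateway", "Payment Service"],
    total_pages=3,
)
print(report.model_dump_json(indent=2))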

View File

@ -1,24 +0,0 @@
from __future__ import annotations
from typing import Iterable, List
import tiktoken
class TextChunker:
def __init__(self, model_name: str, token_target: int = 800, overlap: int = 200):
self.encoder = tiktoken.encoding_for_model("gpt-4o") if "claude" not in model_name else tiktoken.get_encoding("cl100k_base")
self.token_target = token_target
self.overlap = overlap
def chunk(self, text: str) -> Iterable[str]:
tokens = self.encoder.encode(text)
step = max(self.token_target - self.overlap, 1)
chunks: List[str] = []
for start in range(0, len(tokens), step):
end = min(start + self.token_target, len(tokens))
chunk_tokens = tokens[start:end]
chunk_text = self.encoder.decode(chunk_tokens)
chunks.append(chunk_text)
return chunks

View File

@ -0,0 +1,187 @@
from __future__ import annotations
import logging
from typing import List, Optional
import pandas as pd
from ..config import get_settings
from ..models import CausalRelation
logger = logging.getLogger(__name__)
try:
import dowhy
from dowhy import CausalModel
HAS_DOWHY = True
except ImportError:
HAS_DOWHY = False
logger.warning("DoWhy not available")
class DoWhyAnalyzer:
"""Validate causal relationships using DoWhy Structural Causal Models."""
def __init__(self, confidence_threshold: Optional[float] = None):
if not HAS_DOWHY:
raise ImportError("DoWhy is required for causal analysis")
settings = get_settings()
self.confidence_threshold = confidence_threshold or settings.dowhy_confidence_threshold
self.enabled = settings.dowhy_enabled
def validate_relationships(
self,
relationships: List[CausalRelation],
text_data: Optional[str] = None
) -> List[CausalRelation]:
"""
Validate causal relationships using DoWhy SCM.
Filters out relationships that don't pass validation.
"""
if not self.enabled:
logger.info("DoWhy validation is disabled, returning all relationships")
return relationships
if not relationships:
return []
validated: List[CausalRelation] = []
# Group relationships by cause to build SCM
cause_groups = {}
for rel in relationships:
cause = rel.cause
if cause not in cause_groups:
cause_groups[cause] = []
cause_groups[cause].append(rel)
# Validate each group
for cause, effects in cause_groups.items():
for rel in effects:
try:
is_valid = self._validate_single_relationship(rel, relationships, text_data)
if is_valid:
# Update confidence with validation score
rel.confidence = min(rel.confidence + 0.1, 0.95) # Boost validated relationships
rel.metadata["dowhy_validated"] = True
validated.append(rel)
else:
logger.debug("DoWhy validation failed for: %s -> %s", rel.cause, rel.effect)
except Exception as exc:
logger.warning("DoWhy validation error for %s -> %s: %s",
rel.cause, rel.effect, exc)
# If validation fails, keep the relationship but mark it
rel.metadata["dowhy_validated"] = False
rel.metadata["dowhy_error"] = str(exc)
validated.append(rel)  # Keep it, but flag it as unvalidated in metadata
logger.info("DoWhy validated %d/%d relationships", len(validated), len(relationships))
return validated
def _validate_single_relationship(
self,
relationship: CausalRelation,
all_relationships: List[CausalRelation],
text_data: Optional[str] = None
) -> bool:
"""
Validate a single relationship using DoWhy.
Returns True if relationship is valid, False otherwise.
"""
try:
# Build a simple causal graph from relationships
# Extract unique variables (causes and effects)
variables = set()
for rel in all_relationships:
variables.add(rel.cause)
variables.add(rel.effect)
# Create a simple dataset for DoWhy
# Since we don't have actual data, we'll use a heuristic approach
# based on relationship frequency and structure
# Check if there's a path from cause to effect in the graph
has_path = self._check_causal_path(
relationship.cause,
relationship.effect,
all_relationships
)
if not has_path:
return False
# Additional validation: check for confounders
# If there are many relationships involving both cause and effect,
# it's more likely to be valid
related_count = sum(
1 for rel in all_relationships
if rel.cause == relationship.cause or rel.effect == relationship.effect
)
# If there are multiple relationships involving these concepts,
# it's more likely to be a valid causal relationship
if related_count >= 2:
return True
# For single relationships, use confidence threshold
return relationship.confidence >= 0.6
except Exception as exc:
logger.warning("DoWhy validation error: %s", exc)
return False
def _check_causal_path(
self,
cause: str,
effect: str,
relationships: List[CausalRelation],
max_depth: int = 3
) -> bool:
"""Check if there's a causal path from cause to effect."""
if max_depth == 0:
return False
# Direct relationship
for rel in relationships:
if rel.cause == cause and rel.effect == effect:
return True
# Indirect relationship (transitive)
for rel in relationships:
if rel.cause == cause:
# Check if rel.effect leads to the target effect
if self._check_causal_path(rel.effect, effect, relationships, max_depth - 1):
return True
return False
def build_scm_from_relationships(
self,
relationships: List[CausalRelation]
) -> Optional[CausalModel]:
"""
Build a Structural Causal Model from relationships.
This is a simplified version for text-based causal inference.
"""
if not relationships:
return None
try:
# Extract all unique variables
variables = set()
for rel in relationships:
variables.add(rel.cause)
variables.add(rel.effect)
# Create a simple adjacency matrix representation
# This is a heuristic approach since we don't have actual data
# For now, return None as building a full SCM requires actual data
# The validation uses graph-based heuristics instead
return None
except Exception as exc:
logger.warning("Failed to build SCM: %s", exc)
return None
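A minimal sketch of the validation pass above (requires the dowhy package and dowhy_enabled in settings; CausalRelation fields follow the model shown earlier):

relations = [
    CausalRelation(cause="Payment Service outage", effect="Order failures", confidence=0.7),
    CausalRelation(cause="Order failures", effect="Customer complaints", confidence=0.65),
]
analyzer = DoWhyAnalyzer()
validated = analyzer.validate_relationships(relations)
for rel in validated:
    print(rel.cause, "->", rel.effect, "validated:", rel.metadata.get("dowhy_validated"))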

View File

@ -0,0 +1,85 @@
from __future__ import annotations
import logging
from typing import List
from ..config import get_settings
logger = logging.getLogger(__name__)
try:
from sentence_transformers import SentenceTransformer
HAS_SENTENCE_TRANSFORMERS = True
except ImportError:
HAS_SENTENCE_TRANSFORMERS = False
logger.warning("sentence-transformers not available")
class Embedder:
"""Generate embeddings using sentence-transformers."""
def __init__(self, model_name: str | None = None):
if not HAS_SENTENCE_TRANSFORMERS:
raise ImportError("sentence-transformers is required for embeddings")
settings = get_settings()
self.model_name = model_name or settings.embedding_model
logger.info("Loading embedding model: %s", self.model_name)
try:
self.model = SentenceTransformer(self.model_name)
self.dimension = self.model.get_sentence_embedding_dimension()
logger.info("Loaded embedding model with dimension: %d", self.dimension)
except Exception as exc:
logger.exception("Failed to load embedding model %s: %s", self.model_name, exc)
raise
def embed_text(self, text: str) -> List[float]:
"""Generate embedding for a single text."""
if not text or not text.strip():
# Return zero vector for empty text
return [0.0] * self.dimension
try:
embedding = self.model.encode(text, normalize_embeddings=True)
return embedding.tolist()
except Exception as exc:
logger.warning("Failed to embed text: %s", exc)
return [0.0] * self.dimension
def embed_batch(self, texts: List[str], batch_size: int = 32) -> List[List[float]]:
"""Generate embeddings for a batch of texts."""
if not texts:
return []
try:
embeddings = self.model.encode(
texts,
batch_size=batch_size,
normalize_embeddings=True,
show_progress_bar=False
)
return embeddings.tolist()
except Exception as exc:
logger.warning("Failed to embed batch: %s", exc)
return [[0.0] * self.dimension for _ in texts]  # independent zero vectors for each input
def embed_relation(self, cause: str, effect: str, explanation: str | None = None) -> List[float]:
"""Generate embedding for a cause-effect relationship."""
# Combine cause, effect, and explanation into a single text
parts = [cause, "causes", effect]
if explanation:
parts.append(explanation)
text = " ".join(parts)
return self.embed_text(text)
def embed_concept(self, concept_name: str, description: str | None = None) -> List[float]:
"""Generate embedding for a concept/node."""
if description:
text = f"{concept_name}: {description}"
else:
text = concept_name
return self.embed_text(text)
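A minimal sketch of the embedding helpers above (the configured sentence-transformers model is downloaded on first use):

embedder = Embedder()                           # uses settings.embedding_model
vec = embedder.embed_relation("API Gateway", "Auth Service", "routes login requests")
print(len(vec), "dimensions")
batch = embedder.embed_batch(["payment flow", "notification pipeline"])
print(len(batch), "vectors")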

View File

@ -0,0 +1,253 @@
from __future__ import annotations
import json
import logging
import re
from typing import Dict, List, Set
from anthropic import Anthropic, BadRequestError
from ..config import get_settings
from ..models import CausalRelation
logger = logging.getLogger(__name__)
class EntityResolver:
"""
Resolve entity mentions using Claude AI as per README Stage 4.
Identifies that different mentions refer to the same entity.
"""
def __init__(self):
settings = get_settings()
self.api_key = settings.anthropic_api_key
self.model = settings.claude_model
self.max_output_tokens = settings.claude_max_output_tokens
if not self.api_key:
logger.warning("ANTHROPIC_API_KEY not set - Entity resolution will be skipped")
self.client = None
else:
try:
self.client = Anthropic(api_key=self.api_key)
logger.info("EntityResolver initialized with Claude AI")
except Exception as e:
logger.warning("Failed to initialize Claude AI for entity resolution: %s", e)
self.client = None
def resolve_entities(self, relations: List[CausalRelation]) -> Dict[str, Dict]:
"""
Resolve entity mentions across all documents as per README Step 4.
Step 4.1: Collect all entities
Step 4.2: Group by entity type
Step 4.3: AI-powered resolution (Claude API)
Step 4.4: Create canonical names
Returns mapping: canonical_name -> {mentions, type, role, confidence}
"""
if not self.client:
logger.info("Entity resolution skipped (Claude AI not available)")
return {}
if not relations:
return {}
# Step 4.1: COLLECT ALL ENTITIES
all_mentions: Set[str] = set()
for rel in relations:
all_mentions.add(rel.cause.strip())
all_mentions.add(rel.effect.strip())
if not all_mentions:
return {}
logger.info("Collecting %d entity mentions for resolution", len(all_mentions))
# Step 4.2: GROUP BY ENTITY TYPE (simple heuristic)
people_mentions = []
project_mentions = []
team_mentions = []
other_mentions = []
for mention in all_mentions:
mention_lower = mention.lower()
if any(word in mention_lower for word in ["team", "department", "group", "division"]):
team_mentions.append(mention)
elif any(word in mention_lower for word in ["project", "system", "application", "platform"]):
project_mentions.append(mention)
elif len(mention.split()) <= 3 and not any(char.isdigit() for char in mention):
# Likely a person name (short, no numbers)
people_mentions.append(mention)
else:
other_mentions.append(mention)
# Step 4.3: AI-POWERED RESOLUTION (Claude API)
resolved_entities = {}
# Resolve people
if people_mentions:
people_resolved = self._resolve_with_claude(people_mentions, "Person")
resolved_entities.update(people_resolved)
# Resolve projects
if project_mentions:
projects_resolved = self._resolve_with_claude(project_mentions, "Project")
resolved_entities.update(projects_resolved)
# Resolve teams
if team_mentions:
teams_resolved = self._resolve_with_claude(team_mentions, "Team")
resolved_entities.update(teams_resolved)
# Resolve others
if other_mentions:
others_resolved = self._resolve_with_claude(other_mentions, "Entity")
resolved_entities.update(others_resolved)
logger.info("Resolved %d entities from %d mentions", len(resolved_entities), len(all_mentions))
return resolved_entities
def _resolve_with_claude(self, mentions: List[str], entity_type: str) -> Dict[str, Dict]:
"""Use Claude AI to resolve entity mentions."""
if not self.client or not mentions:
return {}
try:
system_prompt = """You are an expert at entity resolution. Your task is to identify which mentions refer to the same real-world entity.
Analyze the given list of entity mentions and group them by the actual entity they refer to.
Return a JSON object where:
- Key: Canonical name (best/most complete name)
- Value: Object with:
- "mentions": List of all mentions that refer to this entity
- "type": Entity type (Person, Project, Team, etc.)
- "role": Role or description (if applicable)
- "confidence": Confidence score (0.0 to 1.0)
Example:
{
"John Smith": {
"mentions": ["John", "J. Smith", "John Smith", "Smith"],
"type": "Person",
"role": "Project Lead",
"confidence": 0.95
},
"Project Alpha": {
"mentions": ["Project Alpha", "Alpha", "The Alpha Project"],
"type": "Project",
"role": null,
"confidence": 0.90
}
}
Be thorough and group all related mentions together."""
user_prompt = f"""Analyze these {entity_type} entity mentions and resolve which ones refer to the same entity:
{json.dumps(mentions, indent=2)}
Return a JSON object mapping canonical names to their resolved mentions."""
message = self.client.messages.create(
model=self.model,
max_tokens=self.max_output_tokens,
temperature=0.2, # Lower temperature for more consistent resolution
system=system_prompt,
messages=[{"role": "user", "content": user_prompt}]
)
response_text = "".join(
block.text for block in message.content
if hasattr(block, "text")
)
if not response_text:
logger.warning("Empty response from Claude for entity resolution")
return {}
# Parse JSON response
try:
json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
if json_match:
json_text = json_match.group(0)
else:
json_text = response_text
resolved = json.loads(json_text)
# Validate and structure the response
result = {}
for canonical_name, entity_data in resolved.items():
if isinstance(entity_data, dict):
result[canonical_name] = {
"mentions": entity_data.get("mentions", [canonical_name]),
"type": entity_data.get("type", entity_type),
"role": entity_data.get("role"),
"confidence": float(entity_data.get("confidence", 0.85))
}
else:
# Fallback if structure is different
result[canonical_name] = {
"mentions": [canonical_name] if isinstance(entity_data, str) else entity_data,
"type": entity_type,
"role": None,
"confidence": 0.8
}
return result
except json.JSONDecodeError as e:
logger.warning("Failed to parse Claude response as JSON: %s. Response: %s",
e, response_text[:500])
return {}
except BadRequestError as e:
logger.warning("Claude API error during entity resolution: %s", e)
return {}
except Exception as e:
logger.warning("Entity resolution failed: %s", e)
return {}
def apply_resolution_to_relations(
self,
relations: List[CausalRelation],
resolved_entities: Dict[str, Dict]
) -> List[CausalRelation]:
"""
Apply entity resolution to relationships.
Replace mentions with canonical names.
"""
if not resolved_entities:
return relations
# Create reverse mapping: mention -> canonical_name
mention_to_canonical: Dict[str, str] = {}
for canonical_name, entity_data in resolved_entities.items():
mentions = entity_data.get("mentions", [])
for mention in mentions:
mention_to_canonical[mention.lower()] = canonical_name
# Update relations with canonical names
updated_relations = []
for rel in relations:
# Resolve cause
cause_lower = rel.cause.strip().lower()
if cause_lower in mention_to_canonical:
rel.cause = mention_to_canonical[cause_lower]
# Resolve effect
effect_lower = rel.effect.strip().lower()
if effect_lower in mention_to_canonical:
rel.effect = mention_to_canonical[effect_lower]
# Store resolution info in metadata
rel.metadata["entity_resolved"] = True
updated_relations.append(rel)
logger.info("Applied entity resolution to %d relationships", len(updated_relations))
return updated_relations
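A minimal sketch of the resolution flow above (requires ANTHROPIC_API_KEY; without it resolve_entities returns an empty mapping and the relations pass through unchanged; the sample relation is illustrative):

resolver = EntityResolver()
relations = [CausalRelation(cause="J. Smith", effect="Project Alpha delay", confidence=0.8)]
resolved = resolver.resolve_entities(relations)
relations = resolver.apply_resolution_to_relations(relations, resolved)
for rel in relations:
    print(rel.cause, "->", rel.effect)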

View File

@ -1,38 +1,65 @@
from __future__ import annotations
import json
import logging
from typing import Iterable
import re
from typing import Dict, Iterable, List, Optional
from anthropic import Anthropic, BadRequestError
from neo4j import GraphDatabase, Transaction
from ..config import get_settings
from ..models import CausalRelation
logger = logging.getLogger(__name__)
MERGE_QUERY = """
MERGE (cause:Concept {name: $cause})
ON CREATE SET cause.created_at = timestamp(), cause.lastSeen = timestamp()
ON MATCH SET cause.lastSeen = timestamp()
MERGE (effect:Concept {name: $effect})
ON CREATE SET effect.created_at = timestamp(), effect.lastSeen = timestamp()
ON MATCH SET effect.lastSeen = timestamp()
MERGE (cause)-[r:CAUSES]->(effect)
ON CREATE SET r.confidence = $confidence,
r.explanation = $explanation,
r.source_file_id = $source_file_id,
r.source_snippet = $source_snippet,
r.job_id = $job_id,
r.model = $model,
r.created_at = timestamp(),
r.updated_at = timestamp()
ON MATCH SET r.confidence = $confidence,
r.explanation = $explanation,
r.source_file_id = $source_file_id,
r.source_snippet = $source_snippet,
r.job_id = $job_id,
r.model = $model,
r.updated_at = timestamp()
# Query to create Document node
CREATE_DOCUMENT_QUERY = """
MERGE (doc:Document {filename: $filename})
ON CREATE SET doc.uploaded_at = timestamp(),
doc.file_path = $file_path,
doc.job_id = $job_id,
doc.created_at = timestamp()
ON MATCH SET doc.lastSeen = timestamp()
"""
# Query to create Entity nodes and relationship with dynamic type
CREATE_ENTITY_RELATIONSHIP_QUERY = """
MERGE (source:Entity:Concept {name: $source})
ON CREATE SET source.created_at = timestamp(),
source.lastSeen = timestamp(),
source.type = COALESCE($source_type, 'Entity')
ON MATCH SET source.lastSeen = timestamp()
MERGE (target:Entity:Concept {name: $target})
ON CREATE SET target.created_at = timestamp(),
target.lastSeen = timestamp(),
target.type = COALESCE($target_type, 'Entity')
ON MATCH SET target.lastSeen = timestamp()
WITH source, target
CALL apoc.merge.relationship(
source,
$rel_type,
{confidence: $confidence,
explanation: $explanation,
source_file_id: $source_file_id,
source_snippet: $source_snippet,
job_id: $job_id,
model: $model,
created_at: timestamp(),
updated_at: timestamp()},
{confidence: $confidence,
explanation: $explanation,
source_file_id: $source_file_id,
source_snippet: $source_snippet,
job_id: $job_id,
model: $model,
updated_at: timestamp()},
target
) YIELD rel
RETURN rel
"""
@ -43,12 +70,42 @@ class GraphWriter:
def close(self) -> None:
self._driver.close()
def write_relations(self, job_id: str, relations: Iterable[CausalRelation]) -> None:
def write_documents(self, job_id: str, files: Iterable) -> None:
"""Create Document nodes for uploaded files."""
files_list = list(files)
if not files_list:
return
logger.info("Creating %d document nodes for job %s", len(files_list), job_id)
with self._driver.session() as session:
def _write_docs(tx: Transaction) -> None:
for file_record in files_list:
try:
tx.run(
CREATE_DOCUMENT_QUERY,
filename=file_record.filename,
file_path=file_record.stored_path,
job_id=job_id
)
logger.debug("Created document node: %s", file_record.filename)
except Exception as exc:
logger.warning("Failed to create document node for %s: %s", file_record.filename, exc)
session.execute_write(_write_docs)
logger.info("Created document nodes for job %s", job_id)
def write_relations(self, job_id: str, relations: Iterable[CausalRelation], files: Iterable = None) -> None:
"""Write entities and relationships to Neo4j with multiple relationship types."""
relations_list = list(relations)
if not relations_list:
logger.warning("No relations to write for job %s", job_id)
return
# Create document nodes if files provided
if files:
self.write_documents(job_id, files)
logger.info("Writing %d relations to Neo4j for job %s", len(relations_list), job_id)
with self._driver.session() as session:
@ -58,11 +115,70 @@ class GraphWriter:
if not relation.cause or not relation.effect:
logger.warning("Skipping relation with empty cause or effect: %s -> %s", relation.cause, relation.effect)
continue
# Get relationship type (default to CAUSES for backward compatibility)
rel_type = getattr(relation, 'relationship_type', None) or "CAUSES"
# Sanitize relationship type (only allow alphanumeric and underscores)
rel_type = re.sub(r'[^A-Z0-9_]', '', rel_type.upper())
if not rel_type:
rel_type = "CAUSES"
# Infer entity types from names (simple heuristic)
source_type = self._infer_entity_type(relation.cause)
target_type = self._infer_entity_type(relation.effect)
try:
# Create source entity
tx.run("""
MERGE (source:Entity:Concept {name: $source})
ON CREATE SET source.created_at = timestamp(),
source.lastSeen = timestamp(),
source.type = $source_type
ON MATCH SET source.lastSeen = timestamp()
""",
source=relation.cause.strip(),
source_type=source_type
)
# Create target entity
tx.run("""
MERGE (target:Entity:Concept {name: $target})
ON CREATE SET target.created_at = timestamp(),
target.lastSeen = timestamp(),
target.type = $target_type
ON MATCH SET target.lastSeen = timestamp()
""",
target=relation.effect.strip(),
target_type=target_type
)
# Create relationship with dynamic type (sanitized)
query = f"""
MATCH (source:Entity {{name: $source}})
MATCH (target:Entity {{name: $target}})
MERGE (source)-[r:{rel_type}]->(target)
ON CREATE SET r.confidence = $confidence,
r.explanation = $explanation,
r.source_file_id = $source_file_id,
r.source_snippet = $source_snippet,
r.job_id = $job_id,
r.model = $model,
r.created_at = timestamp(),
r.updated_at = timestamp()
ON MATCH SET r.confidence = $confidence,
r.explanation = $explanation,
r.source_file_id = $source_file_id,
r.source_snippet = $source_snippet,
r.job_id = $job_id,
r.model = $model,
r.updated_at = timestamp()
"""
result = tx.run(
MERGE_QUERY,
cause=relation.cause.strip(),
effect=relation.effect.strip(),
query,
source=relation.cause.strip(),
target=relation.effect.strip(),
confidence=float(relation.confidence) if relation.confidence else 0.0,
explanation=relation.explanation or "",
source_file_id=relation.source_file_id or "",
@ -70,12 +186,145 @@ class GraphWriter:
job_id=job_id,
model=relation.metadata.get("model") or "",
)
# Link entities to documents if source_file_id is a filename
if relation.source_file_id and relation.source_file_id != "combined_text":
link_query = f"""
MATCH (entity:Entity {{name: $entity_name}})
MATCH (doc:Document {{filename: $filename}})
MERGE (entity)-[:EXTRACTED_FROM]->(doc)
"""
try:
tx.run(link_query, entity_name=relation.cause.strip(), filename=relation.source_file_id)
tx.run(link_query, entity_name=relation.effect.strip(), filename=relation.source_file_id)
except Exception:
pass  # Ignore if the document node doesn't exist
count += 1
logger.debug("Wrote relation: %s -> %s (confidence: %s)", relation.cause, relation.effect, relation.confidence)
logger.debug("Wrote relation: %s -[%s]-> %s (confidence: %s)",
relation.cause, rel_type, relation.effect, relation.confidence)
except Exception as exc:
logger.exception("Failed to write relation %s -> %s: %s", relation.cause, relation.effect, exc)
logger.info("Successfully wrote %d/%d relations to Neo4j", count, len(relations_list))
session.execute_write(_write)
logger.info("Persisted causal relations for job %s", job_id)
logger.info("Persisted relations for job %s", job_id)
def _infer_entity_type(self, entity_name: str) -> str:
"""Infer entity type from name (simple heuristic)."""
name_lower = entity_name.lower()
# Technology patterns
if any(tech in name_lower for tech in ['react', 'node', 'python', 'java', 'postgres', 'mysql', 'redis', 'mongodb', 'docker', 'kubernetes']):
return "Technology"
# Service patterns
if any(word in name_lower for word in ['service', 'api', 'gateway', 'auth', 'payment', 'notification']):
return "Service"
# Component patterns
if any(word in name_lower for word in ['component', 'module', 'system', 'application', 'platform']):
return "Component"
# Process patterns
if any(word in name_lower for word in ['flow', 'process', 'workflow', 'pipeline', 'procedure']):
return "Process"
# Default
return "Entity"
def query_causal_chains(
self,
job_id: str,
min_length: int = 2,
max_length: int = 4,
min_confidence: float = 0.8,
limit: int = 20
) -> List[Dict]:
"""
Query Neo4j for causal chains as per README Step 7.3.
Returns sequences of connected events.
"""
# Query for causal chains - match any relationship type
query = f"""
MATCH path = (start:Entity)-[r*{min_length}..{max_length}]->(end:Entity)
WHERE ALL(rel in relationships(path) WHERE rel.job_id = $job_id AND rel.confidence >= $min_confidence)
WITH path,
[node in nodes(path) | node.name] as chain,
[rel in relationships(path) | rel.confidence] as confidences,
[rel in relationships(path) | type(rel)] as rel_types,
[rel in relationships(path) | rel.explanation] as explanations
RETURN chain, confidences, rel_types, explanations
ORDER BY reduce(conf = 0.0, c in confidences | conf + c) DESC
LIMIT $limit
"""
try:
with self._driver.session() as session:
result = session.run(
query,
job_id=job_id,
min_confidence=min_confidence,
limit=limit
)
chains = []
for record in result:
chain = record["chain"]
confidences = record["confidences"]
rel_types = record["rel_types"]
explanations = record["explanations"]
# Calculate average confidence
avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
chains.append({
"chain": chain,
"confidences": confidences,
"rel_types": rel_types,
"explanations": explanations,
"avg_confidence": avg_confidence,
"length": len(chain) - 1
})
logger.info("Found %d causal chains for job %s", len(chains), job_id)
return chains
except Exception as exc:
logger.exception("Failed to query causal chains: %s", exc)
return []
def query_key_entities(self, job_id: str, limit: int = 20) -> List[Dict]:
"""
Query Neo4j for key entities (those involved in the most relationships) as per README Step 7.3.
"""
query = """
MATCH (e:Entity)-[r]->(target)
WHERE r.job_id = $job_id
WITH e, count(r) as relation_count, collect(DISTINCT type(r)) as rel_types
RETURN e.name as name,
e.type as type,
relation_count,
rel_types
ORDER BY relation_count DESC
LIMIT $limit
"""
try:
with self._driver.session() as session:
result = session.run(query, job_id=job_id, limit=limit)
entities = []
for record in result:
entities.append({
"name": record["name"],
"type": record.get("type", "Entity"),
"relation_count": record["relation_count"],
"relation_types": record["rel_types"]
})
logger.info("Found %d key entities for job %s", len(entities), job_id)
return entities
except Exception as exc:
logger.exception("Failed to query key entities: %s", exc)
return []
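A minimal sketch tying the writer and the new query helpers together (Neo4j credentials are illustrative; relations is the List[CausalRelation] produced by extraction):

writer = GraphWriter("bolt://localhost:7687", "neo4j", "password")
writer.write_relations("job-123", relations)
chains = writer.query_causal_chains("job-123", min_confidence=0.8)
entities = writer.query_key_entities("job-123", limit=10)
writer.close()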

View File

@ -0,0 +1,625 @@
from __future__ import annotations
import json
import logging
import re
from typing import Dict, List, Optional
from anthropic import Anthropic, BadRequestError
from ..config import get_settings
from ..models import CausalRelation
logger = logging.getLogger(__name__)
# Try to import SpaCy
try:
import spacy
from spacy.lang.en import English
HAS_SPACY = True
except ImportError:
HAS_SPACY = False
logger.warning("spacy not available - NLP detection will be skipped")
class RelationshipExtractor:
"""Extract potential cause-effect relationships from text using NLP (SpaCy) + Claude AI."""
# Causal keywords for NLP detection (Step 3.1)
CAUSAL_KEYWORDS = [
"because", "due to", "as a result", "led to", "caused", "therefore",
"consequently", "hence", "thus", "so", "since", "owing to",
"resulted in", "brought about", "gave rise to", "triggered",
"provoked", "induced", "generated", "produced", "created"
]
# Common cause-effect patterns (expanded for architecture/technical documents)
CAUSE_EFFECT_PATTERNS = [
# Direct causal patterns
(r"(\w+(?:\s+\w+){0,15})\s+causes?\s+(\w+(?:\s+\w+){0,15})", "causes"),
(r"(\w+(?:\s+\w+){0,15})\s+leads?\s+to\s+(\w+(?:\s+\w+){0,15})", "leads_to"),
(r"(\w+(?:\s+\w+){0,15})\s+results?\s+in\s+(\w+(?:\s+\w+){0,15})", "results_in"),
(r"(\w+(?:\s+\w+){0,15})\s+triggers?\s+(\w+(?:\s+\w+){0,15})", "triggers"),
(r"(\w+(?:\s+\w+){0,15})\s+produces?\s+(\w+(?:\s+\w+){0,15})", "produces"),
(r"(\w+(?:\s+\w+){0,15})\s+enables?\s+(\w+(?:\s+\w+){0,15})", "enables"),
(r"(\w+(?:\s+\w+){0,15})\s+allows?\s+(\w+(?:\s+\w+){0,15})", "allows"),
(r"(\w+(?:\s+\w+){0,15})\s+facilitates?\s+(\w+(?:\s+\w+){0,15})", "facilitates"),
# Dependency patterns
(r"(\w+(?:\s+\w+){0,15})\s+depends?\s+on\s+(\w+(?:\s+\w+){0,15})", "depends_on"),
(r"(\w+(?:\s+\w+){0,15})\s+requires?\s+(\w+(?:\s+\w+){0,15})", "requires"),
(r"(\w+(?:\s+\w+){0,15})\s+needs?\s+(\w+(?:\s+\w+){0,15})", "needs"),
(r"(\w+(?:\s+\w+){0,15})\s+relies?\s+on\s+(\w+(?:\s+\w+){0,15})", "relies_on"),
(r"(\w+(?:\s+\w+){0,15})\s+uses?\s+(\w+(?:\s+\w+){0,15})", "uses"),
(r"(\w+(?:\s+\w+){0,15})\s+utilizes?\s+(\w+(?:\s+\w+){0,15})", "utilizes"),
(r"(\w+(?:\s+\w+){0,15})\s+leverages?\s+(\w+(?:\s+\w+){0,15})", "leverages"),
# Architectural/System patterns
(r"(\w+(?:\s+\w+){0,15})\s+connects?\s+to\s+(\w+(?:\s+\w+){0,15})", "connects_to"),
(r"(\w+(?:\s+\w+){0,15})\s+communicates?\s+with\s+(\w+(?:\s+\w+){0,15})", "communicates_with"),
(r"(\w+(?:\s+\w+){0,15})\s+interacts?\s+with\s+(\w+(?:\s+\w+){0,15})", "interacts_with"),
(r"(\w+(?:\s+\w+){0,15})\s+integrates?\s+with\s+(\w+(?:\s+\w+){0,15})", "integrates_with"),
(r"(\w+(?:\s+\w+){0,15})\s+provides?\s+(\w+(?:\s+\w+){0,15})", "provides"),
(r"(\w+(?:\s+\w+){0,15})\s+supports?\s+(\w+(?:\s+\w+){0,15})", "supports"),
(r"(\w+(?:\s+\w+){0,15})\s+handles?\s+(\w+(?:\s+\w+){0,15})", "handles"),
(r"(\w+(?:\s+\w+){0,15})\s+manages?\s+(\w+(?:\s+\w+){0,15})", "manages"),
(r"(\w+(?:\s+\w+){0,15})\s+controls?\s+(\w+(?:\s+\w+){0,15})", "controls"),
(r"(\w+(?:\s+\w+){0,15})\s+processes?\s+(\w+(?:\s+\w+){0,15})", "processes"),
(r"(\w+(?:\s+\w+){0,15})\s+generates?\s+(\w+(?:\s+\w+){0,15})", "generates"),
(r"(\w+(?:\s+\w+){0,15})\s+creates?\s+(\w+(?:\s+\w+){0,15})", "creates"),
(r"(\w+(?:\s+\w+){0,15})\s+implements?\s+(\w+(?:\s+\w+){0,15})", "implements"),
(r"(\w+(?:\s+\w+){0,15})\s+delivers?\s+(\w+(?:\s+\w+){0,15})", "delivers"),
# Flow patterns
(r"(\w+(?:\s+\w+){0,15})\s+flows?\s+to\s+(\w+(?:\s+\w+){0,15})", "flows_to"),
(r"(\w+(?:\s+\w+){0,15})\s+sends?\s+to\s+(\w+(?:\s+\w+){0,15})", "sends_to"),
(r"(\w+(?:\s+\w+){0,15})\s+transmits?\s+to\s+(\w+(?:\s+\w+){0,15})", "transmits_to"),
(r"(\w+(?:\s+\w+){0,15})\s+receives?\s+from\s+(\w+(?:\s+\w+){0,15})", "receives_from"),
# Conditional patterns
(r"if\s+(\w+(?:\s+\w+){0,15}),\s+then\s+(\w+(?:\s+\w+){0,15})", "if_then"),
(r"when\s+(\w+(?:\s+\w+){0,15}),\s+(\w+(?:\s+\w+){0,15})\s+occurs?", "when_then"),
(r"(\w+(?:\s+\w+){0,15})\s+implies?\s+(\w+(?:\s+\w+){0,15})", "implies"),
(r"(\w+(?:\s+\w+){0,15})\s+ensures?\s+(\w+(?:\s+\w+){0,15})", "ensures"),
# Sequential patterns
(r"(\w+(?:\s+\w+){0,15})\s+follows?\s+(\w+(?:\s+\w+){0,15})", "follows"),
(r"(\w+(?:\s+\w+){0,15})\s+comes?\s+after\s+(\w+(?:\s+\w+){0,15})", "comes_after"),
(r"first\s+(\w+(?:\s+\w+){0,15}),\s+then\s+(\w+(?:\s+\w+){0,15})", "first_then"),
(r"(\w+(?:\s+\w+){0,15})\s+precedes?\s+(\w+(?:\s+\w+){0,15})", "precedes"),
# Containment patterns
(r"(\w+(?:\s+\w+){0,15})\s+contains?\s+(\w+(?:\s+\w+){0,15})", "contains"),
(r"(\w+(?:\s+\w+){0,15})\s+includes?\s+(\w+(?:\s+\w+){0,15})", "includes"),
(r"(\w+(?:\s+\w+){0,15})\s+consists?\s+of\s+(\w+(?:\s+\w+){0,15})", "consists_of"),
# Influence patterns
(r"(\w+(?:\s+\w+){0,15})\s+affects?\s+(\w+(?:\s+\w+){0,15})", "affects"),
(r"(\w+(?:\s+\w+){0,15})\s+impacts?\s+(\w+(?:\s+\w+){0,15})", "impacts"),
(r"(\w+(?:\s+\w+){0,15})\s+influences?\s+(\w+(?:\s+\w+){0,15})", "influences"),
]
def __init__(self):
"""Initialize NLP and Claude AI components."""
settings = get_settings()
# Initialize SpaCy NLP model (Step 3.1)
self.nlp = None
if HAS_SPACY:
try:
# Try to load English model, fallback to blank if not available
try:
self.nlp = spacy.load("en_core_web_sm")
except OSError:
logger.warning("en_core_web_sm model not found, using blank English model")
self.nlp = English()
self.nlp.add_pipe("sentencizer")
logger.info("SpaCy NLP model loaded")
except Exception as e:
logger.warning("Failed to load SpaCy model: %s", e)
self.nlp = None
# Initialize Claude AI client (Step 3.2)
self.claude_client = None
self.claude_model = settings.claude_model
self.claude_max_input_tokens = settings.claude_max_input_tokens
self.claude_max_output_tokens = settings.claude_max_output_tokens
if settings.anthropic_api_key:
try:
self.claude_client = Anthropic(api_key=settings.anthropic_api_key)
logger.info("Claude AI client initialized")
except Exception as e:
logger.warning("Failed to initialize Claude AI client: %s", e)
else:
logger.warning("ANTHROPIC_API_KEY not set - Claude AI extraction will be skipped")
def extract_from_text(self, text: str, source_file_id: str) -> List[CausalRelation]:
"""
Extract cause-effect relationships using NLP (SpaCy) + Claude AI.
Implements Step 3.1 (NLP Detection) and Step 3.2 (Claude AI Extraction).
"""
if not text or not text.strip():
return []
all_relationships: List[CausalRelation] = []
# Step 3.1: BASIC NLP DETECTION (SpaCy)
nlp_relationships = self._extract_with_nlp(text, source_file_id)
all_relationships.extend(nlp_relationships)
logger.info("NLP (SpaCy) extracted %d candidate relationships (low confidence)",
len(nlp_relationships))
# Step 3.2: AI-POWERED EXTRACTION (Claude API)
if self.claude_client:
claude_relationships = self._extract_with_claude(text, source_file_id)
all_relationships.extend(claude_relationships)
logger.info("Claude AI extracted %d relationships (high confidence)",
len(claude_relationships))
else:
logger.info("Claude AI extraction skipped (API key not configured)")
# Also run pattern matching as fallback
pattern_relationships = self._extract_with_patterns(text, source_file_id)
all_relationships.extend(pattern_relationships)
logger.info("Pattern matching extracted %d relationships", len(pattern_relationships))
# Deduplicate relationships
seen = set()
unique_relationships = []
for rel in all_relationships:
key = (rel.cause.lower().strip(), rel.effect.lower().strip())
if key not in seen:
seen.add(key)
unique_relationships.append(rel)
logger.info("Total unique relationships extracted: %d (from %d total)",
len(unique_relationships), len(all_relationships))
return unique_relationships
def _extract_with_nlp(self, text: str, source_file_id: str) -> List[CausalRelation]:
"""
Step 3.1: Basic NLP Detection using SpaCy.
Look for causal keywords and find sentences containing these patterns.
Returns potential causal relationships (low confidence).
"""
if not self.nlp:
return []
relationships: List[CausalRelation] = []
try:
# Process text with SpaCy
doc = self.nlp(text)
# Find sentences containing causal keywords
for sent in doc.sents:
sent_text = sent.text.strip()
if len(sent_text) < 10:
continue
# Check if sentence contains causal keywords
sent_lower = sent_text.lower()
has_causal_keyword = any(keyword in sent_lower for keyword in self.CAUSAL_KEYWORDS)
if has_causal_keyword:
# Try to extract cause-effect using dependency parsing
cause = None
effect = None
# Look for causal conjunctions
for token in sent:
if token.text.lower() in ["because", "due", "since", "as"]:
# Find the clause after the causal conjunction
if token.dep_ in ["mark", "prep"]:
# Try to extract cause and effect
cause_span = None
effect_span = None
# Simple heuristic: text before "because/due to" is effect, after is cause
if "because" in sent_lower or "since" in sent_lower:
parts = re.split(r'\b(because|since)\b', sent_text, flags=re.IGNORECASE)
if len(parts) >= 3:
effect = parts[0].strip()
cause = parts[2].strip()
elif "due to" in sent_lower:
parts = re.split(r'\bdue to\b', sent_text, flags=re.IGNORECASE)
if len(parts) >= 2:
effect = parts[0].strip()
cause = parts[1].strip()
if cause and effect:
# Clean up cause and effect
cause = re.sub(r'^[,\s]+|[,\s]+$', '', cause)
effect = re.sub(r'^[,\s]+|[,\s]+$', '', effect)
if len(cause) >= 3 and len(effect) >= 3:
relationships.append(CausalRelation(
cause=cause,
effect=effect,
confidence=0.5, # Low confidence for NLP
explanation=f"Extracted using NLP (SpaCy) - found causal keyword",
source_file_id=source_file_id,
source_snippet=sent_text[:200],
relationship_type="CAUSES",
metadata={
"extraction_method": "spacy_nlp",
"sentence": sent_text
}
))
except Exception as e:
logger.warning("NLP extraction failed: %s", e)
return relationships
def _extract_with_claude(self, text: str, source_file_id: str) -> List[CausalRelation]:
"""
Step 3.2: AI-Powered Extraction using Claude API.
Send full document text to Claude AI and ask it to find ALL causal relationships.
Returns high-quality causal relationships (high confidence).
"""
if not self.claude_client:
return []
relationships: List[CausalRelation] = []
try:
# Prepare prompt for Claude
system_prompt = """You are an expert at analyzing text and extracting cause-effect relationships.
Your task is to identify ALL causal relationships in the given text, including both explicit and implicit ones.
For each causal relationship, extract:
- Cause: What triggered or led to this?
- Effect: What was the result or outcome?
- Context: Additional background information
- Entities: Who or what is involved (people, teams, projects, systems)
- Confidence: How certain are you? (0.0 to 1.0)
- Source sentence: The sentence or passage where this relationship was found
- Date: When did this happen (if mentioned)
Return the results as a JSON array of objects with this structure:
[
{
"cause": "string",
"effect": "string",
"context": "string (optional)",
"entities": ["string"],
"confidence": 0.0-1.0,
"source_sentence": "string",
"date": "string (optional)"
}
]
Focus on:
- Explicit relationships ("because X, therefore Y")
- Implicit relationships (strongly implied cause-effect)
- Technical and architectural dependencies
- Business decisions and their impacts
- Process flows and sequences"""
# Truncate text to fit within token limits (rough estimate: 1 token ≈ 4 characters)
max_chars = (self.claude_max_input_tokens - 1000) * 4
truncated_text = text[:max_chars] if len(text) > max_chars else text
user_prompt = f"""Analyze the following text and extract ALL causal relationships.
Text:
{truncated_text}
Return a JSON array of causal relationships. Be thorough and find both explicit and implicit relationships."""
# Call Claude API
message = self.claude_client.messages.create(
model=self.claude_model,
max_tokens=self.claude_max_output_tokens,
temperature=0.3, # Lower temperature for more focused extraction
system=system_prompt,
messages=[
{
"role": "user",
"content": user_prompt
}
]
)
# Extract response text
content_blocks = message.content or []
response_text = "".join(
block.text for block in content_blocks
if hasattr(block, "text")
)
if not response_text:
logger.warning("Empty response from Claude AI")
return []
# Parse JSON response
try:
# Try to extract JSON from response (might have markdown code blocks)
json_match = re.search(r'\[.*\]', response_text, re.DOTALL)
if json_match:
json_text = json_match.group(0)
else:
json_text = response_text
claude_results = json.loads(json_text)
# Convert Claude results to CausalRelation objects
for result in claude_results:
cause = result.get("cause", "").strip()
effect = result.get("effect", "").strip()
context = result.get("context", "")
entities = result.get("entities", [])
confidence = float(result.get("confidence", 0.85))
source_sentence = result.get("source_sentence", "")
date = result.get("date", "")
if not cause or not effect:
continue
# Map to Neo4j relationship type (default to CAUSES)
relationship_type = "CAUSES"
explanation = context or f"Extracted by Claude AI"
if entities:
explanation += f" (Entities: {', '.join(entities)})"
relationships.append(CausalRelation(
cause=cause,
effect=effect,
confidence=min(confidence, 0.95), # Cap at 0.95
explanation=explanation,
source_file_id=source_file_id,
source_snippet=source_sentence[:200] if source_sentence else "",
relationship_type=relationship_type,
metadata={
"extraction_method": "claude_ai",
"context": context,
"entities": entities,
"date": date,
"source_sentence": source_sentence
}
))
logger.info("Claude AI successfully extracted %d relationships", len(relationships))
except json.JSONDecodeError as e:
logger.warning("Failed to parse Claude AI response as JSON: %s. Response: %s",
e, response_text[:500])
except Exception as e:
logger.warning("Error processing Claude AI response: %s", e)
except BadRequestError as e:
logger.warning("Claude API error: %s", e)
except Exception as e:
logger.warning("Claude AI extraction failed: %s", e)
return relationships
def _extract_with_patterns(self, text: str, source_file_id: str) -> List[CausalRelation]:
"""
Fallback: Pattern-based extraction (original method).
Returns candidate relationships for DoWhy validation.
"""
if not text or not text.strip():
return []
relationships: List[CausalRelation] = []
seen = set() # Avoid duplicates
# Normalize text
text = re.sub(r'\s+', ' ', text)
sentences = re.split(r'[.!?]\s+', text)
for sentence in sentences:
sentence = sentence.strip()
if len(sentence) < 10: # Skip very short sentences
continue
for pattern, rel_type in self.CAUSE_EFFECT_PATTERNS:
matches = re.finditer(pattern, sentence, re.IGNORECASE)
for match in matches:
cause = match.group(1).strip()
effect = match.group(2).strip()
# Filter out very short or very long phrases (increased limit for technical terms)
if len(cause) < 3 or len(cause) > 150:
continue
if len(effect) < 3 or len(effect) > 150:
continue
# Skip common false positives
if cause.lower() in ["this", "that", "it", "they", "we"]:
continue
if effect.lower() in ["this", "that", "it", "they", "we"]:
continue
# Create unique key
key = (cause.lower(), effect.lower())
if key in seen:
continue
seen.add(key)
# Calculate confidence based on pattern type
confidence = self._calculate_confidence(rel_type, sentence)
# Map pattern type to Neo4j relationship type (uppercase with underscores)
neo4j_rel_type = self._map_to_neo4j_relationship_type(rel_type)
relationships.append(CausalRelation(
cause=cause,
effect=effect,
confidence=confidence,
explanation=f"Extracted from text using pattern: {rel_type}",
source_file_id=source_file_id,
source_snippet=sentence[:200], # First 200 chars
relationship_type=neo4j_rel_type,
metadata={
"extraction_method": "pattern_matching",
"pattern_type": rel_type,
"sentence": sentence
}
))
logger.info("Extracted %d candidate relationships from text (source: %s)",
len(relationships), source_file_id)
return relationships
def _calculate_confidence(self, rel_type: str, sentence: str) -> float:
"""Calculate confidence score based on pattern type and sentence quality."""
base_confidence = {
"causes": 0.8,
"leads_to": 0.75,
"results_in": 0.75,
"triggers": 0.7,
"produces": 0.7,
"depends_on": 0.65,
"requires": 0.65,
"needs": 0.6,
"if_then": 0.8,
"when_then": 0.75,
"implies": 0.7,
"follows": 0.6,
"comes_after": 0.6,
"first_then": 0.7,
"enables": 0.7,
"allows": 0.65,
"facilitates": 0.65,
"relies_on": 0.65,
"uses": 0.6,
"utilizes": 0.6,
"leverages": 0.6,
"connects_to": 0.7,
"communicates_with": 0.7,
"interacts_with": 0.7,
"integrates_with": 0.7,
"provides": 0.7,
"supports": 0.7,
"handles": 0.65,
"manages": 0.65,
"controls": 0.65,
"processes": 0.65,
"generates": 0.7,
"creates": 0.7,
"implements": 0.7,
"delivers": 0.7,
"flows_to": 0.7,
"sends_to": 0.7,
"transmits_to": 0.7,
"receives_from": 0.7,
"ensures": 0.75,
"precedes": 0.6,
"contains": 0.6,
"includes": 0.6,
"consists_of": 0.6,
"affects": 0.65,
"impacts": 0.65,
"influences": 0.65,
}.get(rel_type, 0.5)
# Adjust based on sentence length (longer sentences might be more descriptive)
if len(sentence) > 50:
base_confidence += 0.05
return min(base_confidence, 0.95)
def _map_to_neo4j_relationship_type(self, pattern_type: str) -> str:
"""Map pattern type to Neo4j relationship type (uppercase with underscores)."""
# Map lowercase pattern types to Neo4j relationship types
mapping = {
"causes": "CAUSES",
"leads_to": "LEADS_TO",
"results_in": "RESULTS_IN",
"triggers": "TRIGGERS",
"produces": "PRODUCES",
"depends_on": "DEPENDS_ON",
"requires": "REQUIRES",
"needs": "NEEDS",
"relies_on": "RELIES_ON",
"uses": "USES",
"utilizes": "UTILIZES",
"leverages": "LEVERAGES",
"connects_to": "CONNECTS_TO",
"communicates_with": "COMMUNICATES_WITH",
"interacts_with": "INTERACTS_WITH",
"integrates_with": "INTEGRATES_WITH",
"provides": "PROVIDES",
"supports": "SUPPORTS",
"handles": "HANDLES",
"manages": "MANAGES",
"controls": "CONTROLS",
"processes": "PROCESSES",
"generates": "GENERATES",
"creates": "CREATES",
"implements": "IMPLEMENTS",
"delivers": "DELIVERS",
"flows_to": "FLOWS_TO",
"sends_to": "SENDS_TO",
"transmits_to": "TRANSMITS_TO",
"receives_from": "RECEIVES_FROM",
"if_then": "IF_THEN",
"when_then": "WHEN_THEN",
"implies": "IMPLIES",
"ensures": "ENSURES",
"follows": "FOLLOWS",
"comes_after": "COMES_AFTER",
"first_then": "FIRST_THEN",
"precedes": "PRECEDES",
"contains": "CONTAINS",
"includes": "INCLUDES",
"consists_of": "CONSISTS_OF",
"affects": "AFFECTS",
"impacts": "IMPACTS",
"influences": "INFLUENCES",
"enables": "ENABLES",
"allows": "ALLOWS",
"facilitates": "FACILITATES",
}
return mapping.get(pattern_type, "CAUSES") # Default to CAUSES if not found
def extract_from_qwen_results(self, qwen_results: List[Dict], source_file_id: str) -> List[CausalRelation]:
"""Convert Qwen2.5-VL extraction results to CausalRelation objects."""
relationships: List[CausalRelation] = []
for result in qwen_results:
entity1 = result.get("entity1", "").strip()
entity2 = result.get("entity2", "").strip()
rel_type = result.get("relationship_type", "").strip()
description = result.get("description", "").strip()
confidence = float(result.get("confidence", 0.7))
if not entity1 or not entity2:
continue
# Map relationship type to cause-effect
# For most types, entity1 is cause, entity2 is effect
cause = entity1
effect = entity2
# Some relationship types might need reversal
if rel_type in ["depends_on", "requires", "needs"]:
# If A depends on B, then B is the cause, A is the effect
cause, effect = effect, cause
# Map Qwen relationship type to Neo4j format
neo4j_rel_type = self._map_to_neo4j_relationship_type(rel_type.lower().replace("-", "_"))
relationships.append(CausalRelation(
cause=cause,
effect=effect,
confidence=confidence,
explanation=description or f"Extracted from diagram: {rel_type}",
source_file_id=source_file_id,
source_snippet=description,
relationship_type=neo4j_rel_type,
metadata={
"extraction_method": "qwen2.5-vl",
"relationship_type": rel_type,
"original_entity1": entity1,
"original_entity2": entity2
}
))
return relationships
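A minimal sketch of the three-stage extractor above (SpaCy heuristics, Claude, and the pattern fallback; Claude is skipped automatically when ANTHROPIC_API_KEY is unset, and the sample text is illustrative):

extractor = RelationshipExtractor()
sample = "The API Gateway depends on the Auth Service. Payment failures caused customer complaints."
relations = extractor.extract_from_text(sample, source_file_id="notes.txt")
for rel in relations:
    print(rel.cause, f"-[{rel.relationship_type}]->", rel.effect, round(rel.confidence, 2))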

View File

@ -0,0 +1,570 @@
from __future__ import annotations
import json
import logging
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Set
from anthropic import Anthropic, BadRequestError
from ..config import get_settings
from ..models import CausalRelation, ProjectReport
logger = logging.getLogger(__name__)
# Try to import PDF generation libraries
try:
import markdown
from markdown.extensions import codehilite, fenced_code, tables
HAS_MARKDOWN = True
except ImportError:
HAS_MARKDOWN = False
logger.warning("markdown library not available - PDF conversion will be limited")
try:
from weasyprint import HTML, CSS
from weasyprint.text.fonts import FontConfiguration
HAS_WEASYPRINT = True
except ImportError:
HAS_WEASYPRINT = False
logger.warning("weasyprint not available - PDF conversion will be skipped")
class ReportGenerator:
"""Generate beginner-friendly onboarding reports from knowledge graph."""
def __init__(self, api_key: str | None = None, model: str | None = None):
settings = get_settings()
self.api_key = api_key or settings.anthropic_api_key
self.model = model or settings.claude_model
self.max_output_tokens = settings.claude_max_output_tokens
if not self.api_key:
raise ValueError("Anthropic API key is required for report generation")
self.client = Anthropic(api_key=self.api_key)
def generate_onboarding_report(
self,
job_id: str,
relations: List[CausalRelation],
vector_store,
embedder,
graph_writer=None,
kg_summary: Dict | None = None
) -> ProjectReport:
"""
Generate a beginner-friendly onboarding report from the knowledge graph.
"""
logger.info("Generating onboarding report for job %s", job_id)
# Step 1: Analyze KG structure
key_concepts = self._analyze_kg_structure(relations)
# Step 2: Semantic search for different topics
overview_content = self._search_topic(
"project overview main purpose goals objectives",
vector_store, embedder, job_id, top_k=10
)
concepts_content = self._search_topic(
"core concepts definitions key terms important ideas",
vector_store, embedder, job_id, top_k=15
)
processes_content = self._search_topic(
"how system works processes flows procedures steps",
vector_store, embedder, job_id, top_k=15
)
relationships_content = self._search_topic(
"cause effect dependencies relationships connections",
vector_store, embedder, job_id, top_k=20
)
components_content = self._search_topic(
"components modules systems parts architecture",
vector_store, embedder, job_id, top_k=15
)
# Step 3: Query Neo4j for causal chains (as per README Step 7.3)
causal_chains = []
key_entities = []
if graph_writer:
try:
# Query 1: Get critical causal chains
causal_chains = graph_writer.query_causal_chains(
job_id=job_id,
min_length=2,
max_length=4,
min_confidence=0.8,
limit=20
)
logger.info("Retrieved %d causal chains from Neo4j", len(causal_chains))
# Query 2: Get key entities
key_entities = graph_writer.query_key_entities(job_id=job_id, limit=20)
logger.info("Retrieved %d key entities from Neo4j", len(key_entities))
except Exception as neo4j_exc:
logger.warning("Failed to query Neo4j: %s", neo4j_exc)
# Step 4: Organize content hierarchically
organized_content = self._organize_content(
key_concepts,
overview_content,
concepts_content,
processes_content,
relationships_content,
components_content,
causal_chains,
key_entities
)
# Step 5: Generate report with Claude
report_content = self._claude_generate_report(
job_id=job_id,
relations=relations,
organized_content=organized_content,
kg_summary=kg_summary or {}
)
# Step 6: Parse sections
sections = self._parse_sections(report_content)
# Step 7: Convert to PDF (as per README Step 7.8)
pdf_path = None
if HAS_WEASYPRINT and HAS_MARKDOWN:
try:
pdf_path = self._convert_to_pdf(report_content, job_id)
logger.info("Generated PDF report: %s", pdf_path)
except Exception as pdf_exc:
logger.warning("PDF conversion failed: %s", pdf_exc)
# Estimate pages (rough: ~500 words per page)
word_count = len(report_content.split())
estimated_pages = max(1, word_count // 500)
return ProjectReport(
job_id=job_id,
title="Project Onboarding Guide",
content=report_content,
sections=sections,
key_concepts=list(key_concepts)[:20], # Top 20 concepts
total_pages=estimated_pages,
generated_at=datetime.utcnow(),
metadata={
"total_relations": len(relations),
"total_concepts": len(key_concepts),
"causal_chains_count": len(causal_chains),
"key_entities_count": len(key_entities),
"model": self.model,
"pdf_path": str(pdf_path) if pdf_path else None
}
)
def _analyze_kg_structure(self, relations: List[CausalRelation]) -> Set[str]:
"""Identify key concepts from the knowledge graph."""
concepts = set()
for rel in relations:
concepts.add(rel.cause)
concepts.add(rel.effect)
# Identify high-degree nodes (concepts involved in many relationships)
cause_counts: Dict[str, int] = {}
effect_counts: Dict[str, int] = {}
for rel in relations:
cause_counts[rel.cause] = cause_counts.get(rel.cause, 0) + 1
effect_counts[rel.effect] = effect_counts.get(rel.effect, 0) + 1
# Key concepts are those with high degree (appear in many relationships)
all_counts = {**cause_counts, **effect_counts}
threshold = max(1, len(relations) // 10) # Keep concepts appearing in at least ~10% of relationships
key_concepts = {
concept for concept, count in all_counts.items()
if count >= threshold
}
# If threshold is too high, use top N concepts
if len(key_concepts) < 5:
sorted_concepts = sorted(all_counts.items(), key=lambda x: x[1], reverse=True)
key_concepts = {concept for concept, _ in sorted_concepts[:20]}
logger.info("Identified %d key concepts from %d relationships",
len(key_concepts), len(relations))
return key_concepts
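# Worked example of the threshold above (hypothetical numbers): with 40 relationships,
# threshold = max(1, 40 // 10) = 4, so only concepts taking part in at least 4
# relationships are kept; if that leaves fewer than 5 concepts, the 20 most-connected
# concepts are used instead.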
def _search_topic(
self,
query: str,
vector_store,
embedder,
job_id: str,
top_k: int = 10
) -> List[Dict]:
"""Search for content related to a topic."""
try:
results = vector_store.search_by_text(
query_text=query,
embedder=embedder,
job_id=job_id,
top_k=top_k
)
return results
except Exception as exc:
logger.warning("Search failed for topic '%s': %s", query, exc)
return []
def _organize_content(
self,
key_concepts: Set[str],
overview_content: List[Dict],
concepts_content: List[Dict],
processes_content: List[Dict],
relationships_content: List[Dict],
components_content: List[Dict],
causal_chains: Optional[List[Dict]] = None,
key_entities: Optional[List[Dict]] = None
) -> Dict:
"""Organize retrieved content into a structured format."""
return {
"key_concepts": list(key_concepts),
"overview": [r.get("payload", {}) for r in overview_content],
"concepts": [r.get("payload", {}) for r in concepts_content],
"processes": [r.get("payload", {}) for r in processes_content],
"relationships": [r.get("payload", {}) for r in relationships_content],
"components": [r.get("payload", {}) for r in components_content],
"causal_chains": causal_chains or [],
"key_entities": key_entities or [],
}
def _claude_generate_report(
self,
job_id: str,
relations: List[CausalRelation],
organized_content: Dict,
kg_summary: Dict
) -> str:
"""Generate report using Claude AI."""
# Build KG summary text
kg_summary_text = self._build_kg_summary(relations, organized_content)
# Build system prompt
system_prompt = """You are an expert technical writer specializing in creating beginner-friendly onboarding documentation for new team members.
Your goal is to explain complex project information in simple, clear language that anyone can understand, even without technical background.
Guidelines:
- Use simple, clear language - avoid jargon or explain it when necessary
- Use examples and analogies to make concepts relatable
- Structure information logically (basics first, then advanced)
- Make it engaging and easy to follow
- Cover all important aspects comprehensively
- Write in a friendly, welcoming tone
- Use headings, bullet points, and clear sections
- Explain "why" not just "what"
Generate a comprehensive onboarding document that helps a new team member understand the entire project."""
# Format causal chains from Neo4j
causal_chains_text = self._format_causal_chains(organized_content.get('causal_chains', []))
key_entities_text = self._format_key_entities(organized_content.get('key_entities', []))
# Build user prompt
user_prompt = f"""Generate a comprehensive, beginner-friendly onboarding document for this project.
KNOWLEDGE GRAPH SUMMARY:
{kg_summary_text}
IMPORTANT RELATIONSHIPS:
{self._format_relationships(relations[:50])}
CAUSAL CHAINS (from Knowledge Graph):
{causal_chains_text}
KEY ENTITIES (from Knowledge Graph):
{key_entities_text}
KEY CONCEPTS:
{', '.join(organized_content.get('key_concepts', [])[:30])}
REQUIRED SECTIONS:
1. Project Overview
- What is this project about?
- Main purpose and goals
- Key stakeholders or users
2. Core Concepts (Explained Simply)
- Explain each important concept in simple terms
- Why each concept matters
- How concepts relate to each other
3. How Things Work Together
- System flow (simple explanation)
- Key processes and workflows
- Dependencies explained simply
4. Important Relationships
- Cause → Effect relationships (explained in plain language)
- "When X happens, Y occurs because..."
- Visual flow if possible (describe it)
5. Key Components
- Main modules/systems/components
- What each does (beginner-friendly)
- How they interact
6. Getting Started
- Where to start learning
- What to understand first
- Recommended learning path
7. Common Questions
- FAQ based on the knowledge graph
- Answers in simple terms
Generate the complete onboarding document in Markdown format. Make it comprehensive, beginner-friendly, and easy to follow."""
try:
message = self.client.messages.create(
model=self.model,
max_tokens=self.max_output_tokens,
temperature=0.3, # Slightly creative but focused
system=system_prompt,
messages=[
{
"role": "user",
"content": user_prompt
}
]
)
content_blocks = message.content or []
report_text = "".join(
block.text for block in content_blocks
if hasattr(block, "text")
)
if not report_text:
logger.warning("Empty report generated")
return "# Project Onboarding Guide\n\nNo content available."
logger.info("Generated onboarding report (%d characters)", len(report_text))
return report_text
except BadRequestError as e:
# Handle API credit/authentication errors gracefully
error_msg = str(e)
if "credit balance" in error_msg.lower() or "too low" in error_msg.lower():
logger.error("Claude API credit balance too low. Cannot generate report.")
raise ValueError("Claude API credit balance is too low. Please add credits to your Anthropic account to generate reports.")
elif "invalid_request_error" in error_msg.lower():
logger.error("Claude API invalid request: %s", error_msg)
raise ValueError(f"Claude API request failed: {error_msg}")
else:
raise
except Exception as e:
logger.exception("Failed to generate report: %s", e)
raise
def _build_kg_summary(
self,
relations: List[CausalRelation],
organized_content: Dict
) -> str:
"""Build a text summary of the knowledge graph."""
summary_parts = [
f"Total Relationships: {len(relations)}",
f"Total Concepts: {len(organized_content.get('key_concepts', []))}",
"",
"Top Relationships:",
]
# Show top relationships by confidence
top_relations = sorted(relations, key=lambda r: r.confidence, reverse=True)[:20]
for i, rel in enumerate(top_relations, 1):
summary_parts.append(
f"{i}. {rel.cause}{rel.effect} "
f"(confidence: {rel.confidence:.2f})"
)
return "\n".join(summary_parts)
def _format_relationships(self, relations: List[CausalRelation]) -> str:
"""Format relationships for the prompt."""
if not relations:
return "No relationships found."
lines = []
for rel in relations[:50]: # Limit to 50
line = f"- {rel.cause}{rel.effect}"
if rel.explanation:
line += f" ({rel.explanation[:100]})"
lines.append(line)
return "\n".join(lines)
def _parse_sections(self, content: str) -> Dict[str, str]:
"""Parse markdown content into sections."""
sections = {}
current_section = None
current_content = []
lines = content.split('\n')
for line in lines:
# Check if it's a heading (starts with #)
if line.strip().startswith('#'):
# Save previous section
if current_section:
sections[current_section] = '\n'.join(current_content).strip()
# Start new section
current_section = line.strip().lstrip('#').strip()
current_content = [line]
else:
if current_section:
current_content.append(line)
else:
# Content before first heading
if 'introduction' not in sections:
sections['introduction'] = line
else:
sections['introduction'] += '\n' + line
# Save last section
if current_section:
sections[current_section] = '\n'.join(current_content).strip()
return sections
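# Illustrative example (hypothetical markdown): given the content
#   "Welcome!\n# Project Overview\nThe system ...\n## Goals\nShip fast"
# the parser returns:
#   {"introduction": "Welcome!",
#    "Project Overview": "# Project Overview\nThe system ...",
#    "Goals": "## Goals\nShip fast"}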
def _format_causal_chains(self, causal_chains: List[Dict]) -> str:
"""Format causal chains from Neo4j for the prompt."""
if not causal_chains:
return "No causal chains found in knowledge graph."
lines = []
for i, chain_data in enumerate(causal_chains[:20], 1): # Top 20 chains
chain = chain_data.get("chain", [])
avg_confidence = chain_data.get("avg_confidence", 0.0)
if len(chain) >= 2:
chain_text = "".join(chain)
lines.append(f"{i}. {chain_text} (confidence: {avg_confidence:.2f})")
return "\n".join(lines) if lines else "No causal chains found."
def _format_key_entities(self, key_entities: List[Dict]) -> str:
"""Format key entities from Neo4j for the prompt."""
if not key_entities:
return "No key entities found in knowledge graph."
lines = []
for entity in key_entities[:20]: # Top 20 entities
name = entity.get("name", "")
entity_type = entity.get("type", "Entity")
relation_count = entity.get("relation_count", 0)
lines.append(f"- {name} ({entity_type}): involved in {relation_count} relationships")
return "\n".join(lines) if lines else "No key entities found."
def _convert_to_pdf(self, markdown_content: str, job_id: str) -> Optional[Path]:
"""
Convert Markdown report to PDF as per README Step 7.8.
Uses markdown + weasyprint for PDF generation.
"""
if not HAS_MARKDOWN or not HAS_WEASYPRINT:
return None
try:
# Convert Markdown to HTML
html_content = markdown.markdown(
markdown_content,
extensions=['codehilite', 'fenced_code', 'tables']
)
# Add CSS styling
css_style = """
@page {
size: A4;
margin: 2cm;
}
body {
font-family: 'Georgia', serif;
line-height: 1.6;
color: #333;
}
h1, h2, h3, h4 {
color: #2c3e50;
margin-top: 1.5em;
margin-bottom: 0.5em;
}
h1 { font-size: 2em; border-bottom: 2px solid #3498db; padding-bottom: 0.3em; }
h2 { font-size: 1.5em; border-bottom: 1px solid #95a5a6; padding-bottom: 0.2em; }
h3 { font-size: 1.2em; }
code {
background-color: #f4f4f4;
padding: 2px 4px;
border-radius: 3px;
font-family: 'Courier New', monospace;
}
pre {
background-color: #f4f4f4;
padding: 1em;
border-radius: 5px;
overflow-x: auto;
}
table {
border-collapse: collapse;
width: 100%;
margin: 1em 0;
}
th, td {
border: 1px solid #ddd;
padding: 8px;
text-align: left;
}
th {
background-color: #3498db;
color: white;
}
"""
# Create full HTML document
full_html = f"""
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Project Onboarding Guide</title>
</head>
<body>
{html_content}
</body>
</html>
"""
# Generate PDF
settings = get_settings()
storage_root = Path(settings.storage_root)
reports_dir = storage_root / "reports"
reports_dir.mkdir(parents=True, exist_ok=True)
pdf_path = reports_dir / f"report_{job_id}.pdf"
HTML(string=full_html).write_pdf(
pdf_path,
stylesheets=[CSS(string=css_style)]
)
logger.info("PDF report generated: %s", pdf_path)
return pdf_path
except Exception as exc:
logger.exception("Failed to convert Markdown to PDF: %s", exc)
return None
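# Minimal usage sketch (assumes an Anthropic key is configured via get_settings() and
# that the job's relations, VectorStore, Embedder and GraphWriter already exist):
#
# generator = ReportGenerator()
# report = generator.generate_onboarding_report(
#     job_id="job-123",                 # hypothetical job id
#     relations=relations,
#     vector_store=vector_store,
#     embedder=embedder,
#     graph_writer=graph_writer,        # optional: enables the Neo4j chain/entity queries
# )
# print(report.title, report.total_pages, report.metadata.get("pdf_path"))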

View File

@ -0,0 +1,269 @@
from __future__ import annotations
import logging
from typing import Dict, List, Optional
from uuid import uuid4
from ..config import get_settings
from ..models import CausalRelation
logger = logging.getLogger(__name__)
try:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue
HAS_QDRANT = True
except ImportError:
HAS_QDRANT = False
logger.warning("qdrant-client not available")
class VectorStore:
"""Qdrant vector database client for storing KG embeddings."""
def __init__(
self,
url: str | None = None,
collection_name: str | None = None,
vector_size: int | None = None
):
if not HAS_QDRANT:
raise ImportError("qdrant-client is required for vector storage")
settings = get_settings()
self.url = url or settings.qdrant_url
self.collection_name = collection_name or settings.qdrant_collection_name
self.vector_size = vector_size or settings.qdrant_vector_size
logger.info("Connecting to Qdrant at %s", self.url)
try:
self.client = QdrantClient(url=self.url)
logger.info("Connected to Qdrant")
except Exception as exc:
logger.exception("Failed to connect to Qdrant: %s", exc)
raise
# Ensure collection exists
self._ensure_collection()
def _ensure_collection(self) -> None:
"""Create collection if it doesn't exist."""
try:
collections = self.client.get_collections()
collection_names = [col.name for col in collections.collections]
if self.collection_name not in collection_names:
logger.info("Creating Qdrant collection: %s", self.collection_name)
try:
self.client.create_collection(
collection_name=self.collection_name,
vectors_config=VectorParams(
size=self.vector_size,
distance=Distance.COSINE
)
)
logger.info("Created collection: %s", self.collection_name)
except Exception as create_exc:
# Collection might have been created by another instance
if "already exists" in str(create_exc).lower() or "409" in str(create_exc):
logger.info("Collection %s already exists (created by another instance)", self.collection_name)
else:
raise
else:
logger.debug("Collection %s already exists", self.collection_name)
except Exception as exc:
logger.exception("Failed to ensure collection: %s", exc)
raise
def store_relation(
self,
relation: CausalRelation,
embedding: List[float],
job_id: str
) -> str:
"""Store a relationship embedding in Qdrant."""
point_id = str(uuid4())
payload = {
"job_id": job_id,
"cause": relation.cause,
"effect": relation.effect,
"confidence": relation.confidence,
"source_file_id": relation.source_file_id or "",
"source_snippet": relation.source_snippet or "",
"explanation": relation.explanation or "",
}
point = PointStruct(
id=point_id,
vector=embedding,
payload=payload
)
try:
self.client.upsert(
collection_name=self.collection_name,
points=[point]
)
logger.debug("Stored relation embedding: %s -> %s", relation.cause, relation.effect)
return point_id
except Exception as exc:
logger.warning("Failed to store relation: %s", exc)
return ""
def store_concept(
self,
concept_name: str,
embedding: List[float],
job_id: str,
description: str | None = None
) -> str:
"""Store a concept/node embedding in Qdrant."""
point_id = str(uuid4())
payload = {
"job_id": job_id,
"concept_name": concept_name,
"description": description or "",
"type": "concept"
}
point = PointStruct(
id=point_id,
vector=embedding,
payload=payload
)
try:
self.client.upsert(
collection_name=self.collection_name,
points=[point]
)
logger.debug("Stored concept embedding: %s", concept_name)
return point_id
except Exception as exc:
logger.warning("Failed to store concept: %s", exc)
return ""
def search(
self,
query_embedding: List[float],
job_id: str | None = None,
top_k: int = 10,
score_threshold: float = 0.5
) -> List[Dict]:
"""Search for similar vectors in Qdrant."""
try:
# Build filter if job_id is provided
query_filter = None
if job_id:
query_filter = Filter(
must=[
FieldCondition(
key="job_id",
match=MatchValue(value=job_id)
)
]
)
# Prefer the newer query_points API (qdrant-client >= 1.10); fall back to the
# older (deprecated) search method on earlier client versions.
if hasattr(self.client, 'query_points'):
response = self.client.query_points(
collection_name=self.collection_name,
query=query_embedding,
query_filter=query_filter,
limit=top_k,
score_threshold=score_threshold
)
# query_points wraps the hits in a QueryResponse; unwrap to the list of scored points
results = response.points
elif hasattr(self.client, 'search'):
results = self.client.search(
collection_name=self.collection_name,
query_vector=query_embedding,
query_filter=query_filter,
limit=top_k,
score_threshold=score_threshold
)
else:
logger.error("QdrantClient exposes neither query_points nor search")
return []
# Convert to list of dicts
search_results = []
for result in results:
search_results.append({
"id": str(result.id),
"score": result.score,
"payload": result.payload
})
return search_results
except Exception as exc:
logger.warning("Vector search failed: %s", exc)
import traceback
logger.debug("Search error traceback: %s", traceback.format_exc())
return []
def search_by_text(
self,
query_text: str,
embedder,
job_id: str | None = None,
top_k: int = 10
) -> List[Dict]:
"""Search using text query (embeds it first)."""
query_embedding = embedder.embed_text(query_text)
return self.search(query_embedding, job_id=job_id, top_k=top_k)
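# Usage sketch (illustrative; assumes a running Qdrant instance and an Embedder exposing
# embed_relation/embed_text as used by the pipeline):
#
# store = VectorStore()
# vector = embedder.embed_relation(relation.cause, relation.effect, relation.explanation)
# store.store_relation(relation, vector, job_id="job-123")
# hits = store.search_by_text("what causes payment failures", embedder, job_id="job-123", top_k=5)
# for hit in hits:
#     print(hit["score"], hit["payload"].get("cause"), "->", hit["payload"].get("effect"))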
def delete_job_vectors(self, job_id: str) -> int:
"""Delete all vectors for a specific job."""
try:
# Collect the IDs of all points tagged with this job_id via the scroll API,
# then delete them by ID so we can report an accurate count. Qdrant can also
# delete directly by filter (see the sketch after this method). The single
# scroll call below is capped at 10,000 points; larger jobs would need a
# paginated scroll.
query_filter = Filter(
must=[
FieldCondition(
key="job_id",
match=MatchValue(value=job_id)
)
]
)
# Scroll to get all points
points, _ = self.client.scroll(
collection_name=self.collection_name,
scroll_filter=query_filter,
limit=10000 # Adjust based on expected size
)
if points:
point_ids = [str(point.id) for point in points]
self.client.delete(
collection_name=self.collection_name,
points_selector=point_ids
)
logger.info("Deleted %d vectors for job %s", len(point_ids), job_id)
return len(point_ids)
return 0
except Exception as exc:
logger.warning("Failed to delete job vectors: %s", exc)
return 0
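# Sketch of the filter-based alternative mentioned above (recent qdrant-client versions;
# not used by the pipeline, shown for reference):
#
# from qdrant_client.models import FilterSelector
# self.client.delete(
#     collection_name=self.collection_name,
#     points_selector=FilterSelector(filter=query_filter),
# )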

View File

@ -4,14 +4,19 @@ import logging
from pathlib import Path
from typing import Iterable, List
from ..claude_client import ClaudeCausalExtractor
from ..config import get_settings
from ..extractors.auto import extract_text
from ..extractors.image_extractor import extract_images_from_file
from ..extractors.pymupdf_extractor import extract_all_text, extract_text_with_context
from ..extractors.qwen_vision import QwenVisionClient
from ..jobs import JobStore
from ..models import CausalRelation, JobStage
from ..processors.chunker import TextChunker
from ..processors.dowhy_analyzer import DoWhyAnalyzer
from ..processors.embedder import Embedder
from ..processors.entity_resolver import EntityResolver
from ..processors.graph_writer import GraphWriter
from ..processors.relationship_extractor import RelationshipExtractor
from ..processors.report_generator import ReportGenerator
from ..processors.vector_store import VectorStore
from ..storage import StorageManager
logger = logging.getLogger(__name__)
@ -23,31 +28,60 @@ class JobPipeline:
job_store: JobStore,
storage: StorageManager,
graph_writer: GraphWriter,
claude_extractor: ClaudeCausalExtractor,
):
self.job_store = job_store
self.storage = storage
self.graph_writer = graph_writer
self.claude_extractor = claude_extractor
settings = get_settings()
self.chunker = TextChunker(
model_name=settings.claude_model,
token_target=settings.chunk_token_target,
overlap=settings.chunk_token_overlap,
)
# Initialize extractors
self.qwen_client = QwenVisionClient() # Only for images/diagrams
self.relationship_extractor = RelationshipExtractor() # NLP (SpaCy) + Claude AI for text (as per README)
self.entity_resolver = EntityResolver() # Claude AI entity resolution (as per README Stage 4)
# Initialize processors
try:
self.dowhy_analyzer = DoWhyAnalyzer() if settings.dowhy_enabled else None
except Exception as e:
logger.warning("DoWhy not available: %s", e)
self.dowhy_analyzer = None
try:
self.embedder = Embedder()
self.vector_store = VectorStore()
except Exception as e:
logger.warning("Vector store not available: %s", e)
self.embedder = None
self.vector_store = None
try:
self.report_generator = ReportGenerator()
except Exception as e:
logger.warning("Report generator not available: %s", e)
self.report_generator = None
def process_job(self, job_id: str, saved_files: Iterable[str]) -> None:
job = self.job_store.get(job_id)
logger.info("Processing job %s with %d files", job_id, job.total_files)
relations: List[CausalRelation] = []
all_text_content: List[str] = []
all_relations: List[CausalRelation] = []
try:
self.job_store.update(job_id, stage=JobStage.EXTRACTING, status_message="Extracting content")
# ============================================================
# STEP 1: CONTENT EXTRACTION (PyMuPDF + Qwen2.5-VL)
# ============================================================
self.job_store.update(
job_id,
stage=JobStage.EXTRACTING,
status_message="Extracting content from documents"
)
for count, file_path in enumerate(saved_files, start=1):
file_path_obj = Path(file_path)
file_record = next((f for f in job.files if f.stored_path == file_path), None)
logger.info("Processing %s", file_path_obj.name)
logger.info("Processing %s (%d/%d)", file_path_obj.name, count, job.total_files)
source_file_id = file_record.id if file_record else file_path_obj.name
suffix = file_path_obj.suffix.lower()
@ -55,27 +89,36 @@ class JobPipeline:
is_direct_image = suffix in {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}
try:
# Extract text from document (if not a direct image)
# Step 2.1: IDENTIFY FILE TYPE and route to appropriate extractor
# Step 2.2: Extract text based on file type (as per README)
text = ""
if not is_direct_image:
try:
text = extract_text(file_path_obj)
# extract_all_text() handles routing:
# - PDF → PyMuPDF (Step 2.2a)
# - DOCX → python-docx (Step 2.2b)
# - PPTX → python-pptx (Step 2.2c)
# - CSV/XLSX → pandas (Step 2.2d)
# - Text files → direct read
# Also performs Step 2.3: Text cleaning
text = extract_all_text(file_path_obj)
# Process text if available
if text and text.strip():
# Validate text is readable
# Validate text is readable (basic check)
printable_chars = sum(1 for c in text if c.isprintable() or c.isspace())
total_chars = len(text)
if total_chars > 100 and printable_chars / total_chars < 0.3:
logger.warning("Text from %s appears to be binary, skipping text processing", file_path_obj.name)
logger.warning("Text from %s appears to be binary, skipping", file_path_obj.name)
text = ""
else:
# Step 2.4: STORE EXTRACTED TEXT
all_text_content.append(text)
extracted_path = self.storage.stage_extracted_content(job_id, file_path_obj.name, text)
if file_record:
file_record.extracted_path = str(extracted_path)
logger.info("Successfully extracted %d characters from %s", len(text), file_path_obj.name)
logger.info("Extracted %d characters from %s", len(text), file_path_obj.name)
except Exception as text_exc:
logger.warning("Text extraction failed for %s: %s. Will continue with image extraction if available.", file_path_obj.name, text_exc)
logger.warning("Text extraction failed for %s: %s", file_path_obj.name, text_exc)
text = ""
# Extract images from documents (PDF, DOCX, PPTX)
@ -93,7 +136,25 @@ class JobPipeline:
extracted_images = [file_path_obj]
logger.info("Direct image upload detected: %s", file_path_obj.name)
except Exception as exc: # noqa: BLE001
# Process images with Qwen2.5-VL
if extracted_images:
for image_path in extracted_images:
try:
qwen_results = self.qwen_client.extract_relationships_from_image(
image_path, source_file_id
)
if qwen_results:
# Convert Qwen results to CausalRelation objects
qwen_relations = self.relationship_extractor.extract_from_qwen_results(
qwen_results, source_file_id
)
all_relations.extend(qwen_relations)
logger.info("Extracted %d relations from image %s using Qwen2.5-VL",
len(qwen_relations), image_path.name)
except Exception as img_exc:
logger.warning("Failed to analyze image %s with Qwen: %s", image_path, img_exc)
except Exception as exc:
logger.exception("Extraction failed for %s", file_path_obj)
if file_record:
file_record.error = str(exc)
@ -103,62 +164,188 @@ class JobPipeline:
job_id,
files=job.files,
processed_files=count,
status_message=f"Analyzing causal relations ({count}/{job.total_files})",
stage=JobStage.ANALYZING,
status_message=f"Extracting content ({count}/{job.total_files})",
)
# Process text content
if text and text.strip():
chunks = self.chunker.chunk(text)
text_relations = self.claude_extractor.analyze(chunks, source_file_id=source_file_id)
relations.extend(text_relations)
logger.info("Extracted %d relations from text in %s", len(text_relations), file_path_obj.name)
# ============================================================
# STEP 2: RELATIONSHIP EXTRACTION (NLP + Claude AI as per README)
# ============================================================
logger.info("Extracting relationships from text content using NLP (SpaCy) + Claude AI")
combined_text = "\n\n".join(all_text_content)
# Process images (extracted from documents or direct uploads)
if extracted_images:
for image_path in extracted_images:
try:
image_relations = self.claude_extractor.analyze_image(image_path, source_file_id=source_file_id)
relations.extend(image_relations)
logger.info("Extracted %d relations from image %s", len(image_relations), image_path.name)
except Exception as img_exc:
logger.warning("Failed to analyze image %s: %s", image_path, img_exc)
# Continue with other images
elif not text or not text.strip():
# No text and no images - file might be empty or unsupported
logger.warning("File %s has no extractable text or images", file_path_obj.name)
if file_record:
file_record.error = "No extractable content found (no text or images)"
if combined_text.strip():
# Extract relationships using NLP (Step 3.1) + Claude AI (Step 3.2)
# This implements the flow described in README.md
text_relations = self.relationship_extractor.extract_from_text(
combined_text,
source_file_id="combined_text"
)
all_relations.extend(text_relations)
logger.info("NLP + Claude AI extracted %d relationships from text", len(text_relations))
# Write relations to Neo4j if any were found
if relations:
self.job_store.update(job_id, status_message="Writing to knowledge graph", stage=JobStage.BUILDING_GRAPH)
# ============================================================
# STEP 3: ENTITY RESOLUTION (Claude AI as per README Stage 4)
# ============================================================
if all_relations and self.entity_resolver.client:
logger.info("Resolving entities using Claude AI")
resolved_entities = self.entity_resolver.resolve_entities(all_relations)
if resolved_entities:
# Apply resolution to relationships
all_relations = self.entity_resolver.apply_resolution_to_relations(
all_relations, resolved_entities
)
logger.info("Entity resolution completed: %d canonical entities", len(resolved_entities))
else:
logger.info("Entity resolution returned no results")
else:
if not self.entity_resolver.client:
logger.info("Entity resolution skipped (Claude AI not available)")
# ============================================================
# STEP 4: DOWHY VALIDATION
# ============================================================
if self.dowhy_analyzer and all_relations:
self.job_store.update(
job_id,
status_message="Validating relationships with DoWhy",
stage=JobStage.BUILDING_GRAPH
)
logger.info("Validating %d relationships with DoWhy", len(all_relations))
validated_relations = self.dowhy_analyzer.validate_relationships(
all_relations,
text_data=combined_text
)
all_relations = validated_relations
logger.info("DoWhy validated %d relationships", len(all_relations))
else:
if not self.dowhy_analyzer:
logger.info("DoWhy validation skipped (not available)")
self.job_store.update(
job_id,
status_message="Building knowledge graph",
stage=JobStage.BUILDING_GRAPH
)
# ============================================================
# STEP 5: WRITE TO NEO4J (Documents, Entities, Relationships)
# ============================================================
if all_relations:
try:
self.graph_writer.write_relations(job_id, relations)
logger.info("Wrote %d relations to Neo4j for job %s", len(relations), job_id)
status_message = f"Completed with {len(relations)} causal relationship(s) written to Neo4j"
# Write documents, entities, and relationships with types
self.graph_writer.write_relations(job_id, all_relations, files=job.files)
logger.info("Wrote %d relations to Neo4j for job %s", len(all_relations), job_id)
except Exception as graph_exc:
logger.exception("Failed to write relations to Neo4j for job %s: %s", job_id, graph_exc)
status_message = f"Completed with {len(relations)} relations extracted, but failed to write to Neo4j: {graph_exc}"
else:
logger.warning("Job %s completed with 0 relations - no causal relationships found", job_id)
# Check if any files failed to extract
failed_files = [f for f in job.files if f.error]
if failed_files:
status_message = f"Completed but {len(failed_files)} file(s) failed to extract. No relations found."
else:
status_message = "Completed but no causal relationships were found in the documents."
logger.exception("Failed to write relations to Neo4j: %s", graph_exc)
raise
# ============================================================
# STEP 6: VECTOR DATABASE INDEXING (Qdrant)
# ============================================================
if self.vector_store and self.embedder and all_relations:
self.job_store.update(
job_id,
status_message="Indexing knowledge graph in vector database",
stage=JobStage.INDEXING_VECTORS
)
logger.info("Indexing %d relationships in Qdrant", len(all_relations))
indexed_count = 0
for relation in all_relations:
try:
# Generate embedding for the relationship
embedding = self.embedder.embed_relation(
relation.cause,
relation.effect,
relation.explanation
)
# Store in Qdrant
self.vector_store.store_relation(relation, embedding, job_id)
indexed_count += 1
except Exception as e:
logger.warning("Failed to index relation %s -> %s: %s",
relation.cause, relation.effect, e)
# Also index concepts (nodes)
concepts = set()
for rel in all_relations:
concepts.add(rel.cause)
concepts.add(rel.effect)
for concept in concepts:
try:
embedding = self.embedder.embed_concept(concept)
self.vector_store.store_concept(concept, embedding, job_id)
except Exception as e:
logger.warning("Failed to index concept %s: %s", concept, e)
logger.info("Indexed %d relationships and %d concepts in Qdrant",
indexed_count, len(concepts))
# ============================================================
# STEP 7: GENERATE ONBOARDING REPORT
# ============================================================
if self.report_generator and self.vector_store and self.embedder:
self.job_store.update(
job_id,
status_message="Generating beginner-friendly onboarding report",
stage=JobStage.GENERATING_REPORT
)
logger.info("Generating onboarding report for job %s", job_id)
try:
kg_summary = {
"total_relations": len(all_relations),
"total_files": job.total_files,
"processed_files": job.processed_files
}
report = self.report_generator.generate_onboarding_report(
job_id=job_id,
relations=all_relations,
vector_store=self.vector_store,
embedder=self.embedder,
graph_writer=self.graph_writer, # Pass graph_writer for Neo4j queries
kg_summary=kg_summary
)
logger.info("Generated onboarding report: %d sections, %d pages",
len(report.sections), report.total_pages)
except Exception as report_exc:
logger.exception("Failed to generate report: %s", report_exc)
report = None
# Store report generation error in job metadata
report_error_msg = str(report_exc)
if "credit balance" in report_error_msg.lower() or "too low" in report_error_msg.lower():
report_error_msg = "Report generation failed: Claude API credit balance is too low. Please add credits to your Anthropic account."
self.job_store.update(
job_id,
error=f"Report generation failed: {report_error_msg}"
)
else:
logger.warning("Report generation skipped (components not available)")
report = None
# ============================================================
# FINAL UPDATE
# ============================================================
status_message = f"Completed successfully"
if all_relations:
status_message += f" with {len(all_relations)} relationships"
if report:
status_message += f" and generated onboarding report"
# Final update
self.job_store.update(
job_id,
stage=JobStage.COMPLETED,
status_message=status_message,
relations=relations,
relations=all_relations,
report=report,
processed_files=job.total_files,
)
logger.info("Job %s completed with %d relations", job_id, len(relations))
except Exception as exc: # noqa: BLE001
logger.info("Job %s completed successfully", job_id)
except Exception as exc:
logger.exception("Job %s failed: %s", job_id, exc)
self.job_store.mark_error(job_id, f"Pipeline failed: {exc}")
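# Minimal wiring sketch (assumes job_store, storage, graph_writer and claude_extractor are
# constructed elsewhere in the service, e.g. at application startup):
#
# pipeline = JobPipeline(
#     job_store=job_store,
#     storage=storage,
#     graph_writer=graph_writer,
#     claude_extractor=claude_extractor,
# )
# pipeline.process_job(job_id="job-123", saved_files=["/data/uploads/architecture.pdf"])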