added qdrant db in multi doc service
This commit is contained in:
parent
603e9b4b20
commit
72fea0dee8
@ -196,27 +196,45 @@ services:
|
|||||||
# retries: 5
|
# retries: 5
|
||||||
# start_period: 60s
|
# start_period: 60s
|
||||||
|
|
||||||
chromadb:
|
# chromadb:
|
||||||
image: chromadb/chroma:latest
|
# image: chromadb/chroma:latest
|
||||||
container_name: pipeline_chromadb
|
# container_name: pipeline_chromadb
|
||||||
|
# ports:
|
||||||
|
# - "8010:8000"
|
||||||
|
# environment:
|
||||||
|
# - CHROMA_SERVER_HOST=0.0.0.0
|
||||||
|
# - CHROMA_SERVER_HTTP_PORT=8000
|
||||||
|
# - IS_PERSISTENT=TRUE
|
||||||
|
# - PERSIST_DIRECTORY=/chroma/chroma
|
||||||
|
# - ANONYMIZED_TELEMETRY=TRUE
|
||||||
|
# volumes:
|
||||||
|
# - chromadb_data:/chroma/chroma
|
||||||
|
# networks:
|
||||||
|
# - pipeline_network
|
||||||
|
# healthcheck:
|
||||||
|
# test: ["CMD-SHELL", "timeout 5 bash -c '</dev/tcp/127.0.0.1/8000' || exit 1"]
|
||||||
|
# interval: 15s
|
||||||
|
# timeout: 10s
|
||||||
|
# retries: 3
|
||||||
|
# start_period: 30s
|
||||||
|
|
||||||
|
qdrant:
|
||||||
|
image: qdrant/qdrant:latest
|
||||||
|
container_name: pipeline_qdrant
|
||||||
ports:
|
ports:
|
||||||
- "8010:8000"
|
- "6333:6333"
|
||||||
environment:
|
- "6334:6334"
|
||||||
- CHROMA_SERVER_HOST=0.0.0.0
|
|
||||||
- CHROMA_SERVER_HTTP_PORT=8000
|
|
||||||
- IS_PERSISTENT=TRUE
|
|
||||||
- PERSIST_DIRECTORY=/chroma/chroma
|
|
||||||
- ANONYMIZED_TELEMETRY=TRUE
|
|
||||||
volumes:
|
volumes:
|
||||||
- chromadb_data:/chroma/chroma
|
- qdrant_data:/qdrant/storage
|
||||||
networks:
|
networks:
|
||||||
- pipeline_network
|
- pipeline_network
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: ["CMD-SHELL", "timeout 5 bash -c '</dev/tcp/127.0.0.1/8000' || exit 1"]
|
test: ["CMD-SHELL", "timeout 2 bash -c '</dev/tcp/127.0.0.1/6333' || exit 1"]
|
||||||
interval: 15s
|
interval: 30s
|
||||||
timeout: 10s
|
timeout: 10s
|
||||||
retries: 3
|
retries: 5
|
||||||
start_period: 30s
|
start_period: 30s
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -294,97 +312,97 @@ services:
|
|||||||
start_period: 40s
|
start_period: 40s
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
|
||||||
requirement-processor:
|
# requirement-processor:
|
||||||
build: ./services/requirement-processor
|
# build: ./services/requirement-processor
|
||||||
container_name: pipeline_requirement_processor
|
# container_name: pipeline_requirement_processor
|
||||||
ports:
|
# ports:
|
||||||
- "8001:8001"
|
# - "8001:8001"
|
||||||
environment:
|
# environment:
|
||||||
- POSTGRES_HOST=postgres
|
# - POSTGRES_HOST=postgres
|
||||||
- POSTGRES_PORT=5432
|
# - POSTGRES_PORT=5432
|
||||||
- POSTGRES_DB=dev_pipeline
|
# - POSTGRES_DB=dev_pipeline
|
||||||
- POSTGRES_USER=pipeline_admin
|
# - POSTGRES_USER=pipeline_admin
|
||||||
- POSTGRES_PASSWORD=secure_pipeline_2024
|
# - POSTGRES_PASSWORD=secure_pipeline_2024
|
||||||
- DATABASE_URL=postgresql://pipeline_admin:secure_pipeline_2024@postgres:5432/dev_pipeline
|
# - DATABASE_URL=postgresql://pipeline_admin:secure_pipeline_2024@postgres:5432/dev_pipeline
|
||||||
- REDIS_HOST=redis
|
# - REDIS_HOST=redis
|
||||||
- REDIS_PORT=6379
|
# - REDIS_PORT=6379
|
||||||
- REDIS_PASSWORD=redis_secure_2024
|
# - REDIS_PASSWORD=redis_secure_2024
|
||||||
- MONGODB_HOST=mongodb
|
# - MONGODB_HOST=mongodb
|
||||||
- MONGODB_PORT=27017
|
# - MONGODB_PORT=27017
|
||||||
- NEO4J_URI=bolt://neo4j:7687
|
# - NEO4J_URI=bolt://neo4j:7687
|
||||||
- NEO4J_USER=neo4j
|
# - NEO4J_USER=neo4j
|
||||||
- NEO4J_PASSWORD=password
|
# - NEO4J_PASSWORD=password
|
||||||
- CHROMA_HOST=chromadb
|
# - CHROMA_HOST=chromadb
|
||||||
- CHROMA_PORT=8000
|
# - CHROMA_PORT=8000
|
||||||
- REDIS_URL=redis://:redis_secure_2024@redis:6379
|
# - REDIS_URL=redis://:redis_secure_2024@redis:6379
|
||||||
networks:
|
# networks:
|
||||||
- pipeline_network
|
# - pipeline_network
|
||||||
depends_on:
|
# depends_on:
|
||||||
postgres:
|
# postgres:
|
||||||
condition: service_healthy
|
# condition: service_healthy
|
||||||
redis:
|
# redis:
|
||||||
condition: service_healthy
|
# condition: service_healthy
|
||||||
mongodb:
|
# mongodb:
|
||||||
condition: service_started
|
# condition: service_started
|
||||||
migrations:
|
# migrations:
|
||||||
condition: service_completed_successfully
|
# condition: service_completed_successfully
|
||||||
|
|
||||||
tech-stack-selector:
|
# tech-stack-selector:
|
||||||
build: ./services/tech-stack-selector
|
# build: ./services/tech-stack-selector
|
||||||
container_name: pipeline_tech_stack_selector
|
# container_name: pipeline_tech_stack_selector
|
||||||
ports:
|
# ports:
|
||||||
- "8002:8002"
|
# - "8002:8002"
|
||||||
environment:
|
# environment:
|
||||||
- POSTGRES_HOST=postgres
|
# - POSTGRES_HOST=postgres
|
||||||
- POSTGRES_PORT=5432
|
# - POSTGRES_PORT=5432
|
||||||
- POSTGRES_DB=dev_pipeline
|
# - POSTGRES_DB=dev_pipeline
|
||||||
- POSTGRES_USER=pipeline_admin
|
# - POSTGRES_USER=pipeline_admin
|
||||||
- POSTGRES_PASSWORD=secure_pipeline_2024
|
# - POSTGRES_PASSWORD=secure_pipeline_2024
|
||||||
- REDIS_HOST=redis
|
# - REDIS_HOST=redis
|
||||||
- REDIS_PORT=6379
|
# - REDIS_PORT=6379
|
||||||
- REDIS_PASSWORD=redis_secure_2024
|
# - REDIS_PASSWORD=redis_secure_2024
|
||||||
- CLAUDE_API_KEY=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
|
# - CLAUDE_API_KEY=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
|
||||||
networks:
|
# networks:
|
||||||
- pipeline_network
|
# - pipeline_network
|
||||||
depends_on:
|
# depends_on:
|
||||||
postgres:
|
# postgres:
|
||||||
condition: service_healthy
|
# condition: service_healthy
|
||||||
redis:
|
# redis:
|
||||||
condition: service_healthy
|
# condition: service_healthy
|
||||||
migrations:
|
# migrations:
|
||||||
condition: service_completed_successfully
|
# condition: service_completed_successfully
|
||||||
|
|
||||||
architecture-designer:
|
# architecture-designer:
|
||||||
build: ./services/architecture-designer
|
# build: ./services/architecture-designer
|
||||||
container_name: pipeline_architecture_designer
|
# container_name: pipeline_architecture_designer
|
||||||
ports:
|
# ports:
|
||||||
- "8003:8003"
|
# - "8003:8003"
|
||||||
environment:
|
# environment:
|
||||||
- PORT=8003
|
# - PORT=8003
|
||||||
- HOST=0.0.0.0
|
# - HOST=0.0.0.0
|
||||||
- CLAUDE_API_KEY=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
|
# - CLAUDE_API_KEY=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
|
||||||
- ANTHROPIC_API_KEY=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
|
# - ANTHROPIC_API_KEY=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
|
||||||
- POSTGRES_HOST=postgres
|
# - POSTGRES_HOST=postgres
|
||||||
- POSTGRES_PORT=5432
|
# - POSTGRES_PORT=5432
|
||||||
- POSTGRES_DB=dev_pipeline
|
# - POSTGRES_DB=dev_pipeline
|
||||||
- POSTGRES_USER=pipeline_admin
|
# - POSTGRES_USER=pipeline_admin
|
||||||
- POSTGRES_PASSWORD=secure_pipeline_2024
|
# - POSTGRES_PASSWORD=secure_pipeline_2024
|
||||||
- MONGODB_HOST=mongodb
|
# - MONGODB_HOST=mongodb
|
||||||
- MONGODB_PORT=27017
|
# - MONGODB_PORT=27017
|
||||||
networks:
|
# networks:
|
||||||
- pipeline_network
|
# - pipeline_network
|
||||||
depends_on:
|
# depends_on:
|
||||||
postgres:
|
# postgres:
|
||||||
condition: service_healthy
|
# condition: service_healthy
|
||||||
mongodb:
|
# mongodb:
|
||||||
condition: service_started
|
# condition: service_started
|
||||||
migrations:
|
# migrations:
|
||||||
condition: service_completed_successfully
|
# condition: service_completed_successfully
|
||||||
healthcheck:
|
# healthcheck:
|
||||||
test: ["CMD", "curl", "-f", "http://localhost:8003/health"]
|
# test: ["CMD", "curl", "-f", "http://localhost:8003/health"]
|
||||||
interval: 30s
|
# interval: 30s
|
||||||
timeout: 10s
|
# timeout: 10s
|
||||||
retries: 3
|
# retries: 3
|
||||||
|
|
||||||
# code-generator:
|
# code-generator:
|
||||||
# build: ./services/code-generator
|
# build: ./services/code-generator
|
||||||
@ -461,34 +479,34 @@ services:
|
|||||||
migrations:
|
migrations:
|
||||||
condition: service_completed_successfully
|
condition: service_completed_successfully
|
||||||
|
|
||||||
deployment-manager:
|
# deployment-manager:
|
||||||
build: ./services/deployment-manager
|
# build: ./services/deployment-manager
|
||||||
container_name: pipeline_deployment_manager
|
# container_name: pipeline_deployment_manager
|
||||||
ports:
|
# ports:
|
||||||
- "8006:8006"
|
# - "8006:8006"
|
||||||
environment:
|
# environment:
|
||||||
- POSTGRES_HOST=postgres
|
# - POSTGRES_HOST=postgres
|
||||||
- POSTGRES_PORT=5432
|
# - POSTGRES_PORT=5432
|
||||||
- POSTGRES_DB=dev_pipeline
|
# - POSTGRES_DB=dev_pipeline
|
||||||
- POSTGRES_USER=pipeline_admin
|
# - POSTGRES_USER=pipeline_admin
|
||||||
- POSTGRES_PASSWORD=secure_pipeline_2024
|
# - POSTGRES_PASSWORD=secure_pipeline_2024
|
||||||
- MONGODB_HOST=mongodb
|
# - MONGODB_HOST=mongodb
|
||||||
- MONGODB_PORT=27017
|
# - MONGODB_PORT=27017
|
||||||
- RABBITMQ_HOST=rabbitmq
|
# - RABBITMQ_HOST=rabbitmq
|
||||||
- RABBITMQ_PORT=5672
|
# - RABBITMQ_PORT=5672
|
||||||
- RABBITMQ_USER=pipeline_admin
|
# - RABBITMQ_USER=pipeline_admin
|
||||||
- RABBITMQ_PASSWORD=rabbit_secure_2024
|
# - RABBITMQ_PASSWORD=rabbit_secure_2024
|
||||||
networks:
|
# networks:
|
||||||
- pipeline_network
|
# - pipeline_network
|
||||||
depends_on:
|
# depends_on:
|
||||||
postgres:
|
# postgres:
|
||||||
condition: service_healthy
|
# condition: service_healthy
|
||||||
rabbitmq:
|
# rabbitmq:
|
||||||
condition: service_healthy
|
# condition: service_healthy
|
||||||
mongodb:
|
# mongodb:
|
||||||
condition: service_started
|
# condition: service_started
|
||||||
migrations:
|
# migrations:
|
||||||
condition: service_completed_successfully
|
# condition: service_completed_successfully
|
||||||
|
|
||||||
user-auth:
|
user-auth:
|
||||||
build: ./services/user-auth
|
build: ./services/user-auth
|
||||||
@ -583,38 +601,38 @@ services:
|
|||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
|
||||||
# AI Mockup / Wireframe Generation Service
|
# AI Mockup / Wireframe Generation Service
|
||||||
ai-mockup-service:
|
# ai-mockup-service:
|
||||||
build: ./services/ai-mockup-service
|
# build: ./services/ai-mockup-service
|
||||||
container_name: pipeline_ai_mockup_service
|
# container_name: pipeline_ai_mockup_service
|
||||||
ports:
|
# ports:
|
||||||
- "8021:8021"
|
# - "8021:8021"
|
||||||
environment:
|
# environment:
|
||||||
- PORT=8021
|
# - PORT=8021
|
||||||
- HOST=0.0.0.0
|
# - HOST=0.0.0.0
|
||||||
- CLAUDE_API_KEY=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
|
# - CLAUDE_API_KEY=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
|
||||||
- POSTGRES_HOST=postgres
|
# - POSTGRES_HOST=postgres
|
||||||
- POSTGRES_PORT=5432
|
# - POSTGRES_PORT=5432
|
||||||
- POSTGRES_DB=dev_pipeline
|
# - POSTGRES_DB=dev_pipeline
|
||||||
- POSTGRES_USER=pipeline_admin
|
# - POSTGRES_USER=pipeline_admin
|
||||||
- POSTGRES_PASSWORD=secure_pipeline_2024
|
# - POSTGRES_PASSWORD=secure_pipeline_2024
|
||||||
- REDIS_HOST=redis
|
# - REDIS_HOST=redis
|
||||||
- REDIS_PORT=6379
|
# - REDIS_PORT=6379
|
||||||
- REDIS_PASSWORD=redis_secure_2024
|
# - REDIS_PASSWORD=redis_secure_2024
|
||||||
- JWT_ACCESS_SECRET=access-secret-key-2024-tech4biz-secure_pipeline_2024
|
# - JWT_ACCESS_SECRET=access-secret-key-2024-tech4biz-secure_pipeline_2024
|
||||||
- USER_AUTH_SERVICE_URL=http://user-auth:8011
|
# - USER_AUTH_SERVICE_URL=http://user-auth:8011
|
||||||
- FLASK_ENV=development
|
# - FLASK_ENV=development
|
||||||
networks:
|
# networks:
|
||||||
- pipeline_network
|
# - pipeline_network
|
||||||
depends_on:
|
# depends_on:
|
||||||
postgres:
|
# postgres:
|
||||||
condition: service_healthy
|
# condition: service_healthy
|
||||||
user-auth:
|
# user-auth:
|
||||||
condition: service_healthy
|
# condition: service_healthy
|
||||||
healthcheck:
|
# healthcheck:
|
||||||
test: ["CMD", "curl", "-f", "http://localhost:8021/health"]
|
# test: ["CMD", "curl", "-f", "http://localhost:8021/health"]
|
||||||
interval: 30s
|
# interval: 30s
|
||||||
timeout: 10s
|
# timeout: 10s
|
||||||
retries: 3
|
# retries: 3
|
||||||
|
|
||||||
git-integration:
|
git-integration:
|
||||||
build: ./services/git-integration
|
build: ./services/git-integration
|
||||||
@ -731,7 +749,7 @@ services:
|
|||||||
environment:
|
environment:
|
||||||
- PORT=8022
|
- PORT=8022
|
||||||
- HOST=0.0.0.0
|
- HOST=0.0.0.0
|
||||||
- ANTHROPIC_API_KEY=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
|
- ANTHROPIC_API_KEY=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
|
||||||
|
|
||||||
# Neo4j Configuration
|
# Neo4j Configuration
|
||||||
- USE_NEO4J_KG=true
|
- USE_NEO4J_KG=true
|
||||||
@ -790,17 +808,37 @@ services:
|
|||||||
environment:
|
environment:
|
||||||
- PORT=8024
|
- PORT=8024
|
||||||
- HOST=0.0.0.0
|
- HOST=0.0.0.0
|
||||||
- ANTHROPIC_API_KEY=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
|
|
||||||
|
# Claude/Anthropic Configuration
|
||||||
|
- ANTHROPIC_API_KEY=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
|
||||||
|
- MULTI_DOC_CLAUDE_MODEL=claude-3-5-haiku-latest
|
||||||
- CLAUDE_MODEL=claude-3-5-haiku-latest
|
- CLAUDE_MODEL=claude-3-5-haiku-latest
|
||||||
|
|
||||||
|
# Qwen2.5-VL API Configuration
|
||||||
|
- QWEN_API_KEY=${QWEN_API_KEY:-}
|
||||||
|
- QWEN_API_URL=${QWEN_API_URL:-https://api.example.com/v1/chat/completions}
|
||||||
|
- QWEN_MODEL=qwen2.5-vl
|
||||||
|
|
||||||
# Neo4j Configuration
|
# Neo4j Configuration
|
||||||
- NEO4J_URI=bolt://neo4j:7687
|
- NEO4J_URI=bolt://neo4j:7687
|
||||||
- NEO4J_USER=neo4j
|
- NEO4J_USER=neo4j
|
||||||
- NEO4J_PASSWORD=password
|
- NEO4J_PASSWORD=password
|
||||||
- NEO4J_DATABASE=neo4j
|
- NEO4J_DATABASE=neo4j
|
||||||
|
|
||||||
|
# Qdrant Configuration
|
||||||
|
- QDRANT_URL=http://qdrant:6333
|
||||||
|
- QDRANT_COLLECTION_NAME=kg_embeddings
|
||||||
|
|
||||||
|
# DoWhy Configuration
|
||||||
|
- DOWHY_ENABLED=true
|
||||||
|
- DOWHY_CONFIDENCE_THRESHOLD=0.05
|
||||||
|
|
||||||
|
# Embedding Configuration
|
||||||
|
- EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
|
||||||
|
- EMBEDDING_DIMENSION=384
|
||||||
|
|
||||||
# Storage Configuration
|
# Storage Configuration
|
||||||
- STORAGE_DIR=/app/storage
|
- MULTI_DOC_STORAGE_ROOT=/app/storage
|
||||||
|
|
||||||
# Database configurations (optional, for job tracking)
|
# Database configurations (optional, for job tracking)
|
||||||
- POSTGRES_HOST=pipeline_postgres
|
- POSTGRES_HOST=pipeline_postgres
|
||||||
@ -817,6 +855,8 @@ services:
|
|||||||
depends_on:
|
depends_on:
|
||||||
neo4j:
|
neo4j:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
|
qdrant:
|
||||||
|
condition: service_healthy
|
||||||
postgres:
|
postgres:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
redis:
|
redis:
|
||||||
@ -958,6 +998,8 @@ volumes:
|
|||||||
driver: local
|
driver: local
|
||||||
multi_document_storage:
|
multi_document_storage:
|
||||||
driver: local
|
driver: local
|
||||||
|
qdrant_data:
|
||||||
|
driver: local
|
||||||
|
|
||||||
# =====================================
|
# =====================================
|
||||||
# Networks
|
# Networks
|
||||||
|
|||||||
@ -7094,8 +7094,29 @@ async def main():
|
|||||||
js_files = [fa for fa in frontend_files if fa.path.lower().endswith(('.js', '.jsx', '.mjs', '.cjs'))]
|
js_files = [fa for fa in frontend_files if fa.path.lower().endswith(('.js', '.jsx', '.mjs', '.cjs'))]
|
||||||
ts_files = [fa for fa in frontend_files if fa.path.lower().endswith(('.ts', '.tsx'))]
|
ts_files = [fa for fa in frontend_files if fa.path.lower().endswith(('.ts', '.tsx'))]
|
||||||
|
|
||||||
|
# Allocate frontend persona
|
||||||
|
from persona_system import allocate_code_persona, build_code_analysis_persona_prompt
|
||||||
|
|
||||||
|
# Determine if it's UI or state management focused
|
||||||
|
has_state_files = len(state_files) > 0
|
||||||
|
sample_file = frontend_files[0] if frontend_files else None
|
||||||
|
sample_path = sample_file.path if sample_file else ""
|
||||||
|
sample_content = getattr(sample_file, 'content', '')[:1000] if sample_file else ""
|
||||||
|
|
||||||
|
# Allocate persona - prefer state management if state files exist
|
||||||
|
if has_state_files:
|
||||||
|
# Try to get state management persona
|
||||||
|
persona = allocate_code_persona("store/state.ts", sample_content, "frontend_state")
|
||||||
|
if "state" not in persona.get("role", "").lower():
|
||||||
|
# Fallback to UI persona
|
||||||
|
persona = allocate_code_persona(sample_path, sample_content, "frontend_ui")
|
||||||
|
else:
|
||||||
|
persona = allocate_code_persona(sample_path, sample_content, "frontend_ui")
|
||||||
|
|
||||||
|
assignment_context = f"CTO has assigned you to analyze the frontend codebase for this project. You are analyzing {len(frontend_files)} frontend files including components, routing, state management, and configuration."
|
||||||
|
|
||||||
front_end_prompt = f"""
|
front_end_prompt = f"""
|
||||||
You are a Senior Frontend Architect and Technical Writer with 20+ years of experience. Analyze this frontend codebase and produce a comprehensive, technically precise report. The audience includes senior engineers and stakeholders who expect evidence-based, objective findings.
|
Analyze this frontend codebase and produce a comprehensive, technically precise report. The audience includes senior engineers and stakeholders who expect evidence-based, objective findings.
|
||||||
|
|
||||||
STRICT STYLE RULES:
|
STRICT STYLE RULES:
|
||||||
- Use professional, technical language only. Do not use analogies, metaphors, storytelling, or colloquial comparisons.
|
- Use professional, technical language only. Do not use analogies, metaphors, storytelling, or colloquial comparisons.
|
||||||
@ -7211,6 +7232,9 @@ FINAL REQUIREMENTS:
|
|||||||
- Ensure total length between 2000-3000 words.
|
- Ensure total length between 2000-3000 words.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# Enhance prompt with persona
|
||||||
|
enhanced_prompt = build_code_analysis_persona_prompt(front_end_prompt, persona, assignment_context)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
print(f"🤖 [FRONTEND AI] Calling Claude API for comprehensive frontend analysis...")
|
print(f"🤖 [FRONTEND AI] Calling Claude API for comprehensive frontend analysis...")
|
||||||
print(f"🤖 [FRONTEND AI] Analyzing {len(frontend_files)} frontend files...")
|
print(f"🤖 [FRONTEND AI] Analyzing {len(frontend_files)} frontend files...")
|
||||||
@ -7220,7 +7244,7 @@ FINAL REQUIREMENTS:
|
|||||||
model=os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest"),
|
model=os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest"),
|
||||||
max_tokens=8000, # Increased from 6000 to 8000 for more detailed analysis
|
max_tokens=8000, # Increased from 6000 to 8000 for more detailed analysis
|
||||||
temperature=0.1,
|
temperature=0.1,
|
||||||
messages=[{"role": "user", "content": front_end_prompt}]
|
messages=[{"role": "user", "content": enhanced_prompt}]
|
||||||
)
|
)
|
||||||
|
|
||||||
ai_analysis = message.content[0].text.strip()
|
ai_analysis = message.content[0].text.strip()
|
||||||
@ -7230,7 +7254,7 @@ FINAL REQUIREMENTS:
|
|||||||
if not ai_analysis or len(ai_analysis) < 100:
|
if not ai_analysis or len(ai_analysis) < 100:
|
||||||
print("⚠️ [FRONTEND AI] AI analysis too short, regenerating...")
|
print("⚠️ [FRONTEND AI] AI analysis too short, regenerating...")
|
||||||
# Retry with more emphasis on detail
|
# Retry with more emphasis on detail
|
||||||
retry_prompt = front_end_prompt + "\n\nIMPORTANT: Provide a VERY DETAILED analysis. The previous response was too short. Please provide at least 2000 words of detailed explanation."
|
retry_prompt = enhanced_prompt + "\n\nIMPORTANT: Provide a VERY DETAILED analysis. The previous response was too short. Please provide at least 2000 words of detailed explanation."
|
||||||
message = self.client.messages.create(
|
message = self.client.messages.create(
|
||||||
model=os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest"),
|
model=os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest"),
|
||||||
max_tokens=8000,
|
max_tokens=8000,
|
||||||
|
|||||||
@ -524,7 +524,11 @@ class ChunkAnalyzer:
|
|||||||
def _build_chunk_analysis_prompt(self, file_path: str, chunk: ChunkInfo,
|
def _build_chunk_analysis_prompt(self, file_path: str, chunk: ChunkInfo,
|
||||||
chunk_index: int, total_chunks: int,
|
chunk_index: int, total_chunks: int,
|
||||||
context_memories: Dict[str, Any]) -> str:
|
context_memories: Dict[str, Any]) -> str:
|
||||||
"""Build comprehensive analysis prompt for a chunk."""
|
"""Build comprehensive analysis prompt for a chunk with persona."""
|
||||||
|
from persona_system import allocate_code_persona, build_code_analysis_persona_prompt
|
||||||
|
|
||||||
|
# Allocate persona based on file path and chunk content
|
||||||
|
persona = allocate_code_persona(file_path, chunk.content, chunk.chunk_type)
|
||||||
|
|
||||||
# Build context information
|
# Build context information
|
||||||
context_info = ""
|
context_info = ""
|
||||||
@ -538,8 +542,10 @@ class ChunkAnalyzer:
|
|||||||
for practice in context_memories['best_practices'][:3]:
|
for practice in context_memories['best_practices'][:3]:
|
||||||
context_info += f"- {practice['content'][:100]}...\n"
|
context_info += f"- {practice['content'][:100]}...\n"
|
||||||
|
|
||||||
|
assignment_context = f"CTO has assigned you to analyze chunk {chunk_index + 1} of {total_chunks} from file: {file_path}. This is a {chunk.chunk_type} chunk covering lines {chunk.start_line}-{chunk.end_line}."
|
||||||
|
|
||||||
prompt = f"""
|
prompt = f"""
|
||||||
You are a senior software engineer analyzing chunk {chunk_index + 1} of {total_chunks} from file: {file_path}
|
Analyzing chunk {chunk_index + 1} of {total_chunks} from file: {file_path}
|
||||||
|
|
||||||
CHUNK INFORMATION:
|
CHUNK INFORMATION:
|
||||||
- Chunk Type: {chunk.chunk_type}
|
- Chunk Type: {chunk.chunk_type}
|
||||||
@ -564,7 +570,10 @@ Provide a focused analysis of this specific chunk, considering:
|
|||||||
|
|
||||||
Focus on actionable insights for this specific code section.
|
Focus on actionable insights for this specific code section.
|
||||||
"""
|
"""
|
||||||
return prompt
|
|
||||||
|
# Enhance with persona
|
||||||
|
enhanced_prompt = build_code_analysis_persona_prompt(prompt, persona, assignment_context)
|
||||||
|
return enhanced_prompt
|
||||||
|
|
||||||
def _detect_language_from_path(self, file_path: str) -> str:
|
def _detect_language_from_path(self, file_path: str) -> str:
|
||||||
"""Detect language from file path."""
|
"""Detect language from file path."""
|
||||||
|
|||||||
755
services/ai-analysis-service/persona_system.py
Normal file
755
services/ai-analysis-service/persona_system.py
Normal file
@ -0,0 +1,755 @@
|
|||||||
|
"""
|
||||||
|
World-Class Persona System for AI Analysis
|
||||||
|
Simulates real-world team allocation with domain-specific experts from top companies.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Dict, List, Optional, Tuple
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# CODE ANALYSIS PERSONAS (for AI Analysis Service)
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
CODE_ANALYSIS_PERSONAS = {
|
||||||
|
# BACKEND DOMAINS
|
||||||
|
"backend_api": {
|
||||||
|
"role": "Senior Backend API Architect",
|
||||||
|
"companies": ["Google", "Amazon", "Stripe"],
|
||||||
|
"expertise": ["REST APIs", "GraphQL", "gRPC", "API Gateway", "Microservices"],
|
||||||
|
"experience_years": "18+",
|
||||||
|
"achievements": [
|
||||||
|
"Designed APIs at Google Cloud Platform handling 10M+ requests/day",
|
||||||
|
"Built scalable API infrastructure at Amazon AWS serving millions of customers",
|
||||||
|
"Led API architecture at Stripe processing billions in transactions"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["api", "controller", "route", "endpoint", "service", "rest", "graphql"],
|
||||||
|
"focus_areas": [
|
||||||
|
"API design patterns and best practices",
|
||||||
|
"API versioning and backward compatibility",
|
||||||
|
"Rate limiting and throttling strategies",
|
||||||
|
"API documentation quality",
|
||||||
|
"Security vulnerabilities in API endpoints"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
"backend_database": {
|
||||||
|
"role": "Senior Database Architect",
|
||||||
|
"companies": ["Amazon", "Oracle", "MongoDB"],
|
||||||
|
"expertise": ["SQL", "NoSQL", "Database Design", "Query Optimization", "Data Modeling"],
|
||||||
|
"experience_years": "20+",
|
||||||
|
"achievements": [
|
||||||
|
"Designed database systems at Amazon handling petabytes of data",
|
||||||
|
"Optimized databases at Oracle for enterprise-scale applications",
|
||||||
|
"Built distributed databases at MongoDB for global scale"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["database", "db", "model", "schema", "migration", "repository", "orm", "query"],
|
||||||
|
"focus_areas": [
|
||||||
|
"Database schema design and normalization",
|
||||||
|
"Query performance and optimization",
|
||||||
|
"Data integrity and constraints",
|
||||||
|
"Indexing strategies",
|
||||||
|
"Transaction management"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
"backend_business": {
|
||||||
|
"role": "Senior Backend Business Logic Architect",
|
||||||
|
"companies": ["Microsoft", "Salesforce", "SAP"],
|
||||||
|
"expertise": ["Business Logic", "Domain Modeling", "Design Patterns", "Service Layer"],
|
||||||
|
"experience_years": "17+",
|
||||||
|
"achievements": [
|
||||||
|
"Architected business logic systems at Microsoft for enterprise applications",
|
||||||
|
"Designed domain models at Salesforce for CRM platforms",
|
||||||
|
"Built service layers at SAP for ERP systems"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["service", "business", "logic", "domain", "entity", "dto", "handler"],
|
||||||
|
"focus_areas": [
|
||||||
|
"Code organization and structure",
|
||||||
|
"Design patterns implementation",
|
||||||
|
"Business logic maintainability",
|
||||||
|
"Domain modeling quality",
|
||||||
|
"Service layer architecture"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
# FRONTEND DOMAINS
|
||||||
|
"frontend_ui": {
|
||||||
|
"role": "Senior Frontend UI Architect",
|
||||||
|
"companies": ["Apple", "Meta", "Netflix"],
|
||||||
|
"expertise": ["React", "Vue", "Angular", "Component Design", "UI/UX"],
|
||||||
|
"experience_years": "15+",
|
||||||
|
"achievements": [
|
||||||
|
"Built user interfaces at Apple used by millions daily",
|
||||||
|
"Led React architecture at Meta (Facebook) for large-scale applications",
|
||||||
|
"Designed performance-optimized UIs at Netflix for 200M+ users"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["component", "ui", "view", "page", "jsx", "tsx", "vue", "template"],
|
||||||
|
"focus_areas": [
|
||||||
|
"Component architecture and reusability",
|
||||||
|
"User experience and accessibility",
|
||||||
|
"UI performance optimization",
|
||||||
|
"Design system consistency",
|
||||||
|
"Responsive design implementation"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
"frontend_state": {
|
||||||
|
"role": "Senior Frontend State Management Architect",
|
||||||
|
"companies": ["Meta", "Netflix", "Airbnb"],
|
||||||
|
"expertise": ["Redux", "Zustand", "Context API", "State Management", "Data Flow"],
|
||||||
|
"experience_years": "14+",
|
||||||
|
"achievements": [
|
||||||
|
"Architected state management at Meta for complex applications",
|
||||||
|
"Designed data flow patterns at Netflix for real-time updates",
|
||||||
|
"Built state systems at Airbnb for booking platforms"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["store", "state", "redux", "context", "recoil", "zustand", "mobx"],
|
||||||
|
"focus_areas": [
|
||||||
|
"State architecture and patterns",
|
||||||
|
"Data flow optimization",
|
||||||
|
"State synchronization",
|
||||||
|
"Performance in state updates",
|
||||||
|
"State management best practices"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
# DEVOPS DOMAINS
|
||||||
|
"devops_ci_cd": {
|
||||||
|
"role": "Senior DevOps CI/CD Architect",
|
||||||
|
"companies": ["Google", "Netflix", "Uber"],
|
||||||
|
"expertise": ["CI/CD", "Jenkins", "GitHub Actions", "GitLab CI", "Deployment Automation"],
|
||||||
|
"experience_years": "12+",
|
||||||
|
"achievements": [
|
||||||
|
"Built CI/CD pipelines at Google handling 50K+ deployments/day",
|
||||||
|
"Designed deployment systems at Netflix for zero-downtime releases",
|
||||||
|
"Architected automation at Uber for global scale"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["ci", "cd", "pipeline", "jenkins", "github-actions", "gitlab", "deploy"],
|
||||||
|
"focus_areas": [
|
||||||
|
"CI/CD pipeline efficiency",
|
||||||
|
"Deployment strategy and automation",
|
||||||
|
"Quality gates and testing",
|
||||||
|
"Rollback strategies",
|
||||||
|
"Build optimization"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
"devops_infrastructure": {
|
||||||
|
"role": "Senior Infrastructure Architect",
|
||||||
|
"companies": ["Amazon", "Google", "Microsoft"],
|
||||||
|
"expertise": ["Kubernetes", "Docker", "Terraform", "Cloud Infrastructure", "Scalability"],
|
||||||
|
"experience_years": "16+",
|
||||||
|
"achievements": [
|
||||||
|
"Designed infrastructure at Amazon AWS for global scale",
|
||||||
|
"Built container orchestration at Google for millions of containers",
|
||||||
|
"Architected cloud systems at Microsoft Azure with 99.99% uptime"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["docker", "kubernetes", "terraform", "infrastructure", "cloud", "aws", "gcp", "azure"],
|
||||||
|
"focus_areas": [
|
||||||
|
"Infrastructure scalability",
|
||||||
|
"System reliability and uptime",
|
||||||
|
"Cost optimization",
|
||||||
|
"Security in infrastructure",
|
||||||
|
"Monitoring and observability"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
# SECURITY DOMAINS
|
||||||
|
"security_engineer": {
|
||||||
|
"role": "Senior Security Engineer",
|
||||||
|
"companies": ["Google", "Microsoft", "Cloudflare"],
|
||||||
|
"expertise": ["Security", "Vulnerability Assessment", "Penetration Testing", "Security Architecture"],
|
||||||
|
"experience_years": "15+",
|
||||||
|
"achievements": [
|
||||||
|
"Led security initiatives at Google protecting billions of users",
|
||||||
|
"Designed security systems at Microsoft for enterprise applications",
|
||||||
|
"Built security infrastructure at Cloudflare for DDoS protection"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["security", "auth", "encryption", "jwt", "oauth", "ssl", "tls", "cors"],
|
||||||
|
"focus_areas": [
|
||||||
|
"Security vulnerabilities and threats",
|
||||||
|
"Authentication and authorization",
|
||||||
|
"Data encryption and protection",
|
||||||
|
"Security best practices",
|
||||||
|
"Compliance and regulations"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
# DATA DOMAINS
|
||||||
|
"data_engineer": {
|
||||||
|
"role": "Senior Data Engineer",
|
||||||
|
"companies": ["Google", "Netflix", "Uber"],
|
||||||
|
"expertise": ["Data Pipelines", "ETL", "Big Data", "Data Warehousing", "Spark"],
|
||||||
|
"experience_years": "13+",
|
||||||
|
"achievements": [
|
||||||
|
"Built data pipelines at Google processing petabytes daily",
|
||||||
|
"Designed ETL systems at Netflix for real-time analytics",
|
||||||
|
"Architected data infrastructure at Uber for millions of rides"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["data", "pipeline", "etl", "warehouse", "spark", "hadoop", "kafka"],
|
||||||
|
"focus_areas": [
|
||||||
|
"Data architecture and pipelines",
|
||||||
|
"ETL performance and optimization",
|
||||||
|
"Data quality and validation",
|
||||||
|
"Scalability in data processing",
|
||||||
|
"Data governance"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
"ml_engineer": {
|
||||||
|
"role": "Senior ML/AI Engineer",
|
||||||
|
"companies": ["OpenAI", "Anthropic", "Google DeepMind"],
|
||||||
|
"expertise": ["Machine Learning", "Deep Learning", "AI Systems", "Model Training"],
|
||||||
|
"experience_years": "12+",
|
||||||
|
"achievements": [
|
||||||
|
"Developed ML models at OpenAI for language understanding",
|
||||||
|
"Built AI systems at Anthropic for safety-critical applications",
|
||||||
|
"Designed training pipelines at Google DeepMind for large-scale models"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["ml", "ai", "model", "training", "neural", "tensorflow", "pytorch", "learning"],
|
||||||
|
"focus_areas": [
|
||||||
|
"ML model architecture",
|
||||||
|
"Training pipeline optimization",
|
||||||
|
"Model performance and accuracy",
|
||||||
|
"Scalability in ML systems",
|
||||||
|
"AI safety and ethics"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
# TESTING DOMAINS
|
||||||
|
"qa_automation": {
|
||||||
|
"role": "Senior QA Automation Architect",
|
||||||
|
"companies": ["Google", "Microsoft", "Amazon"],
|
||||||
|
"expertise": ["Test Automation", "Selenium", "Cypress", "Jest", "Testing Strategy"],
|
||||||
|
"experience_years": "14+",
|
||||||
|
"achievements": [
|
||||||
|
"Built test automation at Google for thousands of test cases",
|
||||||
|
"Designed testing frameworks at Microsoft for enterprise software",
|
||||||
|
"Architected QA systems at Amazon for e-commerce platforms"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["test", "spec", "jest", "cypress", "selenium", "pytest", "testing"],
|
||||||
|
"focus_areas": [
|
||||||
|
"Test coverage and quality",
|
||||||
|
"Automation strategy",
|
||||||
|
"Test maintainability",
|
||||||
|
"Performance testing",
|
||||||
|
"Testing best practices"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
"performance_engineer": {
|
||||||
|
"role": "Senior Performance Engineer",
|
||||||
|
"companies": ["Google", "Netflix", "Amazon"],
|
||||||
|
"expertise": ["Performance Optimization", "Load Testing", "Profiling", "Scalability"],
|
||||||
|
"experience_years": "16+",
|
||||||
|
"achievements": [
|
||||||
|
"Optimized systems at Google handling billions of requests",
|
||||||
|
"Designed performance solutions at Netflix for streaming at scale",
|
||||||
|
"Built performance infrastructure at Amazon for peak traffic"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["performance", "load", "stress", "benchmark", "profiling", "optimization"],
|
||||||
|
"focus_areas": [
|
||||||
|
"Performance bottlenecks",
|
||||||
|
"Optimization strategies",
|
||||||
|
"Scalability concerns",
|
||||||
|
"Resource utilization",
|
||||||
|
"Performance testing"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
# CTO (for synthesis)
|
||||||
|
"cto": {
|
||||||
|
"role": "Chief Technology Officer",
|
||||||
|
"companies": ["Google", "Microsoft", "Amazon"],
|
||||||
|
"expertise": ["Strategic Planning", "System Architecture", "Team Leadership", "Technology Strategy"],
|
||||||
|
"experience_years": "25+",
|
||||||
|
"achievements": [
|
||||||
|
"Former VP of Engineering at Google, leading teams of 500+ engineers",
|
||||||
|
"CTO at Microsoft Azure, responsible for cloud infrastructure strategy",
|
||||||
|
"Strategic advisor at Amazon Web Services for enterprise architecture"
|
||||||
|
],
|
||||||
|
"focus_areas": [
|
||||||
|
"Strategic technology insights",
|
||||||
|
"System-wide risk assessment",
|
||||||
|
"Architectural recommendations",
|
||||||
|
"Cross-domain synthesis",
|
||||||
|
"Executive-level analysis"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# DOCUMENT ANALYSIS PERSONAS (for Multi-Document Upload Service)
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
DOCUMENT_ANALYSIS_PERSONAS = {
|
||||||
|
"technical_doc_analyst": {
|
||||||
|
"role": "Senior Technical Documentation Analyst",
|
||||||
|
"companies": ["Google", "Stripe", "Microsoft"],
|
||||||
|
"expertise_domain": "technical documentation and API specifications",
|
||||||
|
"document_types": ["API docs", "technical specs", "developer guides"],
|
||||||
|
"experience_years": "15+",
|
||||||
|
"achievements": [
|
||||||
|
"Analyzed technical documentation at Google for millions of API integrations",
|
||||||
|
"Led documentation analysis at Stripe for developer experience",
|
||||||
|
"Mapped technical relationships at Microsoft for enterprise systems"
|
||||||
|
],
|
||||||
|
"focus_areas": [
|
||||||
|
"Technical dependencies and relationships",
|
||||||
|
"System integration points",
|
||||||
|
"API contract relationships",
|
||||||
|
"Technical process flows",
|
||||||
|
"Code-to-documentation mappings"
|
||||||
|
],
|
||||||
|
"visual_focus_areas": [
|
||||||
|
"API flow diagrams",
|
||||||
|
"System integration diagrams",
|
||||||
|
"Technical architecture flows"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["api", "technical", "specification", "documentation", "guide", "reference", "developer"]
|
||||||
|
},
|
||||||
|
|
||||||
|
"business_process_analyst": {
|
||||||
|
"role": "Senior Business Process Analyst",
|
||||||
|
"companies": ["McKinsey", "Deloitte", "Accenture"],
|
||||||
|
"expertise_domain": "business processes and stakeholder requirements",
|
||||||
|
"document_types": ["business requirements", "user stories", "business plans"],
|
||||||
|
"experience_years": "18+",
|
||||||
|
"achievements": [
|
||||||
|
"Analyzed business processes at McKinsey for Fortune 500 companies",
|
||||||
|
"Led process mapping at Deloitte for enterprise transformations",
|
||||||
|
"Mapped stakeholder relationships at Accenture for global projects"
|
||||||
|
],
|
||||||
|
"focus_areas": [
|
||||||
|
"Business process flows",
|
||||||
|
"Requirement dependencies",
|
||||||
|
"Stakeholder impact chains",
|
||||||
|
"Business decision consequences",
|
||||||
|
"Organizational impact analysis"
|
||||||
|
],
|
||||||
|
"visual_focus_areas": [
|
||||||
|
"Business process diagrams",
|
||||||
|
"Stakeholder impact maps",
|
||||||
|
"Decision flowcharts"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["business", "requirement", "stakeholder", "user story", "process", "workflow", "business plan"]
|
||||||
|
},
|
||||||
|
|
||||||
|
"system_architecture_analyst": {
|
||||||
|
"role": "Senior System Architecture Document Analyst",
|
||||||
|
"companies": ["Google", "Amazon", "Microsoft"],
|
||||||
|
"expertise_domain": "system architecture and design documents",
|
||||||
|
"document_types": ["architecture docs", "design documents", "system designs"],
|
||||||
|
"experience_years": "20+",
|
||||||
|
"achievements": [
|
||||||
|
"Analyzed architecture documents at Google for large-scale distributed systems",
|
||||||
|
"Mapped system relationships at Amazon for cloud infrastructure",
|
||||||
|
"Led architecture analysis at Microsoft for enterprise solutions"
|
||||||
|
],
|
||||||
|
"focus_areas": [
|
||||||
|
"Architecture relationships",
|
||||||
|
"Component dependencies",
|
||||||
|
"System interaction flows",
|
||||||
|
"Design decision impacts",
|
||||||
|
"Scalability relationships"
|
||||||
|
],
|
||||||
|
"visual_focus_areas": [
|
||||||
|
"Architecture diagrams",
|
||||||
|
"Component interaction diagrams",
|
||||||
|
"System dependency maps"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["architecture", "design", "system", "component", "diagram", "architectural"]
|
||||||
|
},
|
||||||
|
|
||||||
|
"requirements_analyst": {
|
||||||
|
"role": "Senior Requirements & Specification Analyst",
|
||||||
|
"companies": ["IBM", "Oracle", "SAP"],
|
||||||
|
"expertise_domain": "requirements and functional specifications",
|
||||||
|
"document_types": ["requirements docs", "functional specs", "feature specs"],
|
||||||
|
"experience_years": "17+",
|
||||||
|
"achievements": [
|
||||||
|
"Analyzed requirements at IBM for enterprise software implementations",
|
||||||
|
"Mapped specifications at Oracle for database systems",
|
||||||
|
"Led requirement analysis at SAP for ERP platforms"
|
||||||
|
],
|
||||||
|
"focus_areas": [
|
||||||
|
"Requirement dependencies",
|
||||||
|
"Feature relationships",
|
||||||
|
"Specification impacts",
|
||||||
|
"Change propagation",
|
||||||
|
"Implementation dependencies"
|
||||||
|
],
|
||||||
|
"visual_focus_areas": [
|
||||||
|
"Requirement traceability diagrams",
|
||||||
|
"Feature dependency maps",
|
||||||
|
"Impact analysis charts"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["requirement", "specification", "feature", "functional", "traceability", "spec"]
|
||||||
|
},
|
||||||
|
|
||||||
|
"process_flow_analyst": {
|
||||||
|
"role": "Senior Process Flow Analyst",
|
||||||
|
"companies": ["Amazon", "Netflix", "Uber"],
|
||||||
|
"expertise_domain": "operational processes and workflows",
|
||||||
|
"document_types": ["process docs", "workflows", "operational manuals"],
|
||||||
|
"experience_years": "14+",
|
||||||
|
"achievements": [
|
||||||
|
"Analyzed processes at Amazon for fulfillment operations",
|
||||||
|
"Mapped workflows at Netflix for content delivery",
|
||||||
|
"Led process analysis at Uber for ride-sharing operations"
|
||||||
|
],
|
||||||
|
"focus_areas": [
|
||||||
|
"Process step relationships",
|
||||||
|
"Workflow dependencies",
|
||||||
|
"Sequential cause-effects",
|
||||||
|
"Decision impacts",
|
||||||
|
"Operational dependencies"
|
||||||
|
],
|
||||||
|
"visual_focus_areas": [
|
||||||
|
"Process flowcharts",
|
||||||
|
"Workflow diagrams",
|
||||||
|
"Decision trees",
|
||||||
|
"Operational flow maps"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["process", "workflow", "procedure", "operational", "manual", "step", "flow"]
|
||||||
|
},
|
||||||
|
|
||||||
|
"visual_architecture_analyst": {
|
||||||
|
"role": "Senior Visual Architecture Analyst",
|
||||||
|
"companies": ["Google", "Microsoft", "Apple"],
|
||||||
|
"expertise_domain": "visual diagrams and architecture drawings",
|
||||||
|
"document_types": ["diagrams", "flowcharts", "architecture drawings"],
|
||||||
|
"experience_years": "16+",
|
||||||
|
"achievements": [
|
||||||
|
"Analyzed visual diagrams at Google for complex system mappings",
|
||||||
|
"Mapped architecture drawings at Microsoft for enterprise solutions",
|
||||||
|
"Led visual analysis at Apple for product architecture"
|
||||||
|
],
|
||||||
|
"focus_areas": [
|
||||||
|
"Visual relationship extraction",
|
||||||
|
"Diagram dependency mapping",
|
||||||
|
"Flow analysis",
|
||||||
|
"Component interactions",
|
||||||
|
"Visual pattern recognition"
|
||||||
|
],
|
||||||
|
"visual_focus_areas": [
|
||||||
|
"All types of visual diagrams",
|
||||||
|
"Architecture drawings",
|
||||||
|
"Flowcharts and process diagrams",
|
||||||
|
"Component and sequence diagrams"
|
||||||
|
],
|
||||||
|
"detection_keywords": ["diagram", "flowchart", "visual", "drawing", "chart", "map", "image"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# DOCUMENT TYPE MAPPING
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
DOCUMENT_PERSONA_MAPPING = {
|
||||||
|
# Technical Documents
|
||||||
|
"api_documentation": "technical_doc_analyst",
|
||||||
|
"technical_specification": "technical_doc_analyst",
|
||||||
|
"code_documentation": "technical_doc_analyst",
|
||||||
|
"developer_guide": "technical_doc_analyst",
|
||||||
|
|
||||||
|
# Business Documents
|
||||||
|
"business_requirements": "business_process_analyst",
|
||||||
|
"user_stories": "business_process_analyst",
|
||||||
|
"business_plan": "business_process_analyst",
|
||||||
|
"product_specification": "business_process_analyst",
|
||||||
|
"stakeholder_document": "business_process_analyst",
|
||||||
|
|
||||||
|
# Architecture Documents
|
||||||
|
"architecture_document": "system_architecture_analyst",
|
||||||
|
"system_design": "system_architecture_analyst",
|
||||||
|
"design_document": "system_architecture_analyst",
|
||||||
|
"technical_design": "system_architecture_analyst",
|
||||||
|
|
||||||
|
# Requirements Documents
|
||||||
|
"requirements_document": "requirements_analyst",
|
||||||
|
"functional_specification": "requirements_analyst",
|
||||||
|
"feature_specification": "requirements_analyst",
|
||||||
|
|
||||||
|
# Process Documents
|
||||||
|
"process_document": "process_flow_analyst",
|
||||||
|
"workflow_document": "process_flow_analyst",
|
||||||
|
"procedure_guide": "process_flow_analyst",
|
||||||
|
"operational_manual": "process_flow_analyst",
|
||||||
|
|
||||||
|
# Visual/Diagram Documents
|
||||||
|
"architecture_diagram": "visual_architecture_analyst",
|
||||||
|
"flowchart": "visual_architecture_analyst",
|
||||||
|
"sequence_diagram": "visual_architecture_analyst",
|
||||||
|
"component_diagram": "visual_architecture_analyst",
|
||||||
|
"process_diagram": "visual_architecture_analyst",
|
||||||
|
"system_diagram": "visual_architecture_analyst",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# PERSONA ALLOCATION FUNCTIONS
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def allocate_code_persona(file_path: str, content: str, chunk_type: str = "module") -> Dict:
|
||||||
|
"""
|
||||||
|
Intelligently allocates code analysis persona based on file path, content, and type.
|
||||||
|
Returns persona config with prompt context.
|
||||||
|
"""
|
||||||
|
file_lower = file_path.lower()
|
||||||
|
content_lower = content.lower()[:2000] if content else "" # Sample content
|
||||||
|
|
||||||
|
# Score each persona based on detection rules
|
||||||
|
persona_scores = {}
|
||||||
|
|
||||||
|
for persona_id, persona_config in CODE_ANALYSIS_PERSONAS.items():
|
||||||
|
if persona_id == "cto": # Skip CTO for individual analysis
|
||||||
|
continue
|
||||||
|
|
||||||
|
score = 0
|
||||||
|
detection_keywords = persona_config.get("detection_keywords", [])
|
||||||
|
|
||||||
|
# Check file path (higher weight)
|
||||||
|
for keyword in detection_keywords:
|
||||||
|
if keyword in file_lower:
|
||||||
|
score += 15
|
||||||
|
|
||||||
|
# Check content (medium weight)
|
||||||
|
for keyword in detection_keywords:
|
||||||
|
if keyword in content_lower:
|
||||||
|
score += 8
|
||||||
|
|
||||||
|
# Check chunk type
|
||||||
|
if chunk_type and chunk_type.lower() in detection_keywords:
|
||||||
|
score += 10
|
||||||
|
|
||||||
|
# Domain-specific boosts
|
||||||
|
if "test" in file_lower and "qa" in persona_id:
|
||||||
|
score += 20
|
||||||
|
if "security" in file_lower and "security" in persona_id:
|
||||||
|
score += 20
|
||||||
|
if "performance" in file_lower and "performance" in persona_id:
|
||||||
|
score += 20
|
||||||
|
|
||||||
|
if score > 0:
|
||||||
|
persona_scores[persona_id] = score
|
||||||
|
|
||||||
|
# Select top persona
|
||||||
|
if persona_scores:
|
||||||
|
selected_id = max(persona_scores, key=persona_scores.get)
|
||||||
|
return CODE_ANALYSIS_PERSONAS[selected_id]
|
||||||
|
|
||||||
|
# Default fallback to backend business logic
|
||||||
|
return CODE_ANALYSIS_PERSONAS.get("backend_business", {})
|
||||||
|
|
||||||
|
|
||||||
|
def allocate_document_persona(file_path: str, content: str, file_type: str = "text") -> Dict:
|
||||||
|
"""
|
||||||
|
Intelligently allocates document analysis persona based on file path, content, and type.
|
||||||
|
Returns persona config for document analysis.
|
||||||
|
"""
|
||||||
|
file_lower = file_path.lower()
|
||||||
|
content_lower = content.lower()[:2000] if content else ""
|
||||||
|
|
||||||
|
# Check if it's an image/diagram
|
||||||
|
if file_type == "image" or any(ext in file_lower for ext in [".png", ".jpg", ".jpeg", ".gif", ".svg", ".pdf"]):
|
||||||
|
return DOCUMENT_ANALYSIS_PERSONAS.get("visual_architecture_analyst", {})
|
||||||
|
|
||||||
|
# Score each persona based on detection rules
|
||||||
|
persona_scores = {}
|
||||||
|
|
||||||
|
for persona_id, persona_config in DOCUMENT_ANALYSIS_PERSONAS.items():
|
||||||
|
score = 0
|
||||||
|
detection_keywords = persona_config.get("detection_keywords", [])
|
||||||
|
|
||||||
|
# Check file path (higher weight)
|
||||||
|
for keyword in detection_keywords:
|
||||||
|
if keyword in file_lower:
|
||||||
|
score += 15
|
||||||
|
|
||||||
|
# Check content (medium weight)
|
||||||
|
for keyword in detection_keywords:
|
||||||
|
if keyword in content_lower:
|
||||||
|
score += 8
|
||||||
|
|
||||||
|
# Check document type mapping
|
||||||
|
for doc_type, mapped_persona in DOCUMENT_PERSONA_MAPPING.items():
|
||||||
|
if doc_type in file_lower and mapped_persona == persona_id:
|
||||||
|
score += 20
|
||||||
|
|
||||||
|
if score > 0:
|
||||||
|
persona_scores[persona_id] = score
|
||||||
|
|
||||||
|
# Select top persona
|
||||||
|
if persona_scores:
|
||||||
|
selected_id = max(persona_scores, key=persona_scores.get)
|
||||||
|
return DOCUMENT_ANALYSIS_PERSONAS[selected_id]
|
||||||
|
|
||||||
|
# Default fallback to technical doc analyst
|
||||||
|
return DOCUMENT_ANALYSIS_PERSONAS.get("technical_doc_analyst", {})
|
||||||
|
|
||||||
|
|
||||||
|
def get_cto_persona() -> Dict:
|
||||||
|
"""Returns CTO persona for synthesis and high-level analysis."""
|
||||||
|
return CODE_ANALYSIS_PERSONAS.get("cto", {})
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# PROMPT BUILDING FUNCTIONS
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def build_persona_intro(persona: Dict, assignment_context: str = "", analysis_type: str = "code") -> str:
|
||||||
|
"""
|
||||||
|
Builds persona introduction section for prompts.
|
||||||
|
Works for both code and document analysis.
|
||||||
|
"""
|
||||||
|
if not persona:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
role = persona.get("role", "Senior Engineer")
|
||||||
|
companies = persona.get("companies", [])
|
||||||
|
experience = persona.get("experience_years", "15+")
|
||||||
|
achievements = persona.get("achievements", [])
|
||||||
|
focus_areas = persona.get("focus_areas", [])
|
||||||
|
|
||||||
|
# Build company background
|
||||||
|
company_bg = ""
|
||||||
|
if companies:
|
||||||
|
company_bg = f"- Previously worked at {', '.join(companies[:2])}"
|
||||||
|
if len(companies) > 2:
|
||||||
|
company_bg += f" and {companies[2]}"
|
||||||
|
|
||||||
|
# Build achievements section
|
||||||
|
achievements_text = ""
|
||||||
|
if achievements:
|
||||||
|
achievements_text = "\n".join([f"- {achievement}" for achievement in achievements[:2]])
|
||||||
|
|
||||||
|
# Build focus areas
|
||||||
|
focus_text = ""
|
||||||
|
if focus_areas:
|
||||||
|
focus_text = "\n".join([f"- {focus}" for focus in focus_areas[:5]])
|
||||||
|
|
||||||
|
intro = f"""You are {role} with {experience} years of experience.
|
||||||
|
|
||||||
|
COMPANY BACKGROUND:
|
||||||
|
{company_bg}
|
||||||
|
|
||||||
|
KEY ACHIEVEMENTS:
|
||||||
|
{achievements_text}
|
||||||
|
|
||||||
|
YOUR ASSIGNMENT:
|
||||||
|
{assignment_context if assignment_context else 'Analyze the provided code/document for quality, issues, and recommendations.'}
|
||||||
|
|
||||||
|
YOUR FOCUS AREAS:
|
||||||
|
{focus_text}
|
||||||
|
|
||||||
|
---
|
||||||
|
"""
|
||||||
|
return intro
|
||||||
|
|
||||||
|
|
||||||
|
def build_code_analysis_persona_prompt(base_prompt: str, persona: Dict,
|
||||||
|
assignment_context: str = "") -> str:
|
||||||
|
"""
|
||||||
|
Enhances code analysis prompt with persona context.
|
||||||
|
"""
|
||||||
|
if not persona:
|
||||||
|
return base_prompt
|
||||||
|
|
||||||
|
persona_intro = build_persona_intro(persona, assignment_context, "code")
|
||||||
|
return persona_intro + base_prompt
|
||||||
|
|
||||||
|
|
||||||
|
def build_document_analysis_persona_prompt(base_prompt: str, persona: Dict,
|
||||||
|
document_type: str = "document",
|
||||||
|
assignment_context: str = "") -> str:
|
||||||
|
"""
|
||||||
|
Enhances document analysis prompt with persona context.
|
||||||
|
"""
|
||||||
|
if not persona:
|
||||||
|
return base_prompt
|
||||||
|
|
||||||
|
role = persona.get("role", "Senior Analyst")
|
||||||
|
companies = persona.get("companies", [])
|
||||||
|
expertise_domain = persona.get("expertise_domain", "document analysis")
|
||||||
|
experience = persona.get("experience_years", "15+")
|
||||||
|
achievements = persona.get("achievements", [])
|
||||||
|
focus_areas = persona.get("focus_areas", [])
|
||||||
|
|
||||||
|
company_bg = f"- Previously worked at {', '.join(companies[:2])}" if companies else ""
|
||||||
|
achievements_text = "\n".join([f"- {achievement}" for achievement in achievements[:2]]) if achievements else ""
|
||||||
|
focus_text = "\n".join([f"- {focus}" for focus in focus_areas[:5]]) if focus_areas else ""
|
||||||
|
|
||||||
|
intro = f"""You are {role}, a specialist in analyzing {expertise_domain} with {experience} years of experience.
|
||||||
|
|
||||||
|
COMPANY BACKGROUND:
|
||||||
|
{company_bg}
|
||||||
|
|
||||||
|
KEY ACHIEVEMENTS:
|
||||||
|
{achievements_text}
|
||||||
|
|
||||||
|
YOUR SPECIALIZATION:
|
||||||
|
You excel at identifying:
|
||||||
|
{focus_text}
|
||||||
|
|
||||||
|
YOUR ASSIGNMENT:
|
||||||
|
{assignment_context if assignment_context else f'Analyze this {document_type} to extract causal relationships and dependencies.'}
|
||||||
|
|
||||||
|
---
|
||||||
|
"""
|
||||||
|
return intro + base_prompt
|
||||||
|
|
||||||
|
|
||||||
|
def build_cto_synthesis_prompt(base_prompt: str, team_findings: List[Dict] = None) -> str:
|
||||||
|
"""
|
||||||
|
Builds CTO-level synthesis prompt with team allocation context.
|
||||||
|
"""
|
||||||
|
cto_persona = get_cto_persona()
|
||||||
|
|
||||||
|
if not cto_persona:
|
||||||
|
return base_prompt
|
||||||
|
|
||||||
|
role = cto_persona.get("role", "Chief Technology Officer")
|
||||||
|
companies = cto_persona.get("companies", [])
|
||||||
|
experience = cto_persona.get("experience_years", "25+")
|
||||||
|
achievements = cto_persona.get("achievements", [])
|
||||||
|
focus_areas = cto_persona.get("focus_areas", [])
|
||||||
|
|
||||||
|
company_bg = f"- Former VP of Engineering at {companies[0] if companies else 'Google'}, leading teams of 500+ engineers"
|
||||||
|
if len(companies) > 1:
|
||||||
|
company_bg += f"\n- CTO at {companies[1]}, responsible for cloud infrastructure strategy"
|
||||||
|
|
||||||
|
achievements_text = "\n".join([f"- {achievement}" for achievement in achievements[:2]]) if achievements else ""
|
||||||
|
focus_text = "\n".join([f"- {focus}" for focus in focus_areas[:5]]) if focus_areas else ""
|
||||||
|
|
||||||
|
team_allocation = ""
|
||||||
|
if team_findings:
|
||||||
|
team_allocation = "\n\nTEAM ALLOCATION:\n"
|
||||||
|
team_allocation += "You have allocated your expert team to analyze different domains:\n"
|
||||||
|
for finding in team_findings[:5]:
|
||||||
|
domain = finding.get("domain", "unknown")
|
||||||
|
team_allocation += f"- {domain}: Expert analysis completed\n"
|
||||||
|
|
||||||
|
intro = f"""You are {role} with {experience} years of experience.
|
||||||
|
|
||||||
|
COMPANY BACKGROUND:
|
||||||
|
{company_bg}
|
||||||
|
|
||||||
|
KEY ACHIEVEMENTS:
|
||||||
|
{achievements_text}
|
||||||
|
{team_allocation}
|
||||||
|
|
||||||
|
YOUR ROLE:
|
||||||
|
You have received this project and allocated your expert team to analyze different domains.
|
||||||
|
Now, synthesize all team findings into strategic recommendations.
|
||||||
|
|
||||||
|
YOUR FOCUS AREAS:
|
||||||
|
{focus_text}
|
||||||
|
|
||||||
|
---
|
||||||
|
"""
|
||||||
|
return intro + base_prompt
|
||||||
|
|
||||||
@ -2673,8 +2673,10 @@ def build_intelligent_chunk_prompt(chunk: Dict, analysis_state: Optional[Dict] =
     """
     Build comprehensive prompt for analyzing a semantically grouped chunk.
     Generates detailed module-level analysis with context awareness.
-    Now includes progressive context from previous chunks.
+    Now includes progressive context from previous chunks and world-class persona.
     """
+    from persona_system import allocate_code_persona, build_code_analysis_persona_prompt
+
     chunk_name = chunk.get('name', 'unknown')
     chunk_type = chunk.get('chunk_type', 'module')
     files_batch = chunk.get('files', [])
@ -2694,15 +2696,22 @@ def build_intelligent_chunk_prompt(chunk: Dict, analysis_state: Optional[Dict] =
         optimized_files.append((file_path, optimized_content))

+    # Allocate appropriate persona based on files in chunk
+    # Use the first file to determine persona (or combine if multiple domains)
+    primary_file_path = optimized_files[0][0] if optimized_files else ""
+    primary_content = optimized_files[0][1] if optimized_files else ""
+    persona = allocate_code_persona(primary_file_path, primary_content, chunk_type)
+
     # Build context from previous analyses (progressive learning)
     context_section = build_context_from_state(analysis_state, chunk)

+    # Build assignment context
+    assignment_context = f"CTO has assigned you to analyze the '{chunk_name}' module/chunk for this project. This is a {chunk_type} type chunk containing {len(optimized_files)} files."
+
     # Build comprehensive prompt with module context
     prompt_parts = [
         f"# COMPREHENSIVE ANALYSIS: {chunk_name.upper()}",
         f"Chunk Type: {chunk_type}",
-        "",
-        "You are a senior software architect with 30+ years of experience. Analyze this module/chunk comprehensively.",
         ""
     ]
@ -2794,7 +2803,12 @@ def build_intelligent_chunk_prompt(chunk: Dict, analysis_state: Optional[Dict] =
         "Focus on providing detailed, actionable insights that help understand the complete module context."
     ])

-    return "\n".join(prompt_parts)
+    base_prompt = "\n".join(prompt_parts)
+
+    # Enhance with persona
+    enhanced_prompt = build_code_analysis_persona_prompt(base_prompt, persona, assignment_context)
+
+    return enhanced_prompt

 def build_smart_batch_prompt(files_batch: List[Tuple[str, str]]) -> str:
     """Legacy function: Build prompt for simple batch (backward compatibility)."""
@ -4719,13 +4733,13 @@ def build_synthesis_prompt(analysis_state: Dict, all_chunk_analyses: List[Dict]
     """
     Build comprehensive prompt for cross-module synthesis analysis.
     Synthesizes all individual module analyses into system-level insights.
+    Uses CTO persona for executive-level synthesis.
     """
+    from persona_system import get_cto_persona, build_cto_synthesis_prompt
+
     prompt_parts = [
         "# CROSS-MODULE SYNTHESIS ANALYSIS",
         "",
-        "You are a senior software architect with 30+ years of experience. Your task is to synthesize",
-        "findings from multiple module-level analyses into comprehensive system-level insights.",
-        "",
         "## CONTEXT: PREVIOUSLY ANALYZED MODULES",
         ""
     ]
@ -4842,7 +4856,19 @@ def build_synthesis_prompt(analysis_state: Dict, all_chunk_analyses: List[Dict]
         "across all analyzed modules, not just repeating individual module findings."
     ])

-    return "\n".join(prompt_parts)
+    base_prompt = "\n".join(prompt_parts)
+
+    # Get team findings for CTO context
+    team_findings = []
+    if all_chunk_analyses:
+        for chunk_analysis in all_chunk_analyses:
+            module_name = chunk_analysis.get('module_name', 'unknown')
+            team_findings.append({"domain": module_name, "analysis": chunk_analysis})
+
+    # Enhance with CTO persona
+    enhanced_prompt = build_cto_synthesis_prompt(base_prompt, team_findings)
+
+    return enhanced_prompt

 def parse_synthesis_response(response_text: str) -> Dict:
     """Parse synthesis response from Claude API."""
@ -141,17 +141,19 @@ router.get('/auth/github/callback', async (req, res) => {
     setImmediate(async () => {
       try {
         console.log('[GitHub OAuth] Starting background repository attachment for:', repoContext.repoUrl);
+        console.log('[GitHub OAuth] Using newly stored token for user:', user_id);
         const GitHubIntegrationService = require('../services/github-integration.service');
         const database = require('../config/database');
         const githubService = new GitHubIntegrationService();
         const { owner, repo, branch } = githubService.parseGitHubUrl(repoContext.repoUrl);

-        // Get metadata using authenticated Octokit
-        const repositoryData = await githubService.fetchRepositoryMetadata(owner, repo);
+        // Get metadata using authenticated Octokit with the specific user's token
+        // Pass userId to ensure we use the newly stored token
+        const repositoryData = await githubService.fetchRepositoryMetadata(owner, repo, false, user_id);
         let actualBranch = repoContext.branchName || branch || repositoryData.default_branch || 'main';

-        // Attempt analysis and sync with fallback
-        const codebaseAnalysis = await githubService.analyzeCodebase(owner, repo, actualBranch, false);
+        // Attempt analysis and sync with fallback - use userId to ensure correct token
+        const codebaseAnalysis = await githubService.analyzeCodebase(owner, repo, actualBranch, false, user_id);
         const insertQuery = `
           INSERT INTO all_repositories (
             repository_url, repository_name, owner_name,
@ -170,14 +172,14 @@ router.get('/auth/github/callback', async (req, res) => {
           JSON.stringify(codebaseAnalysis),
           'syncing',
           repositoryData.visibility === 'private',
-          repoContext.userId || null,
+          user_id || repoContext.userId || null, // Use user_id from OAuth callback (most reliable)
           'github' // This is GitHub OAuth callback, so provider is always github
         ];
         const insertResult = await database.query(insertQuery, insertValues);
         const repositoryRecord = insertResult.rows[0];

-        // Clone repository
-        const downloadResult = await githubService.syncRepositoryWithFallback(owner, repo, actualBranch, repositoryRecord.id, repositoryData.visibility !== 'private');
+        // Clone repository - use userId to ensure correct token
+        const downloadResult = await githubService.syncRepositoryWithFallback(owner, repo, actualBranch, repositoryRecord.id, repositoryData.visibility !== 'private', user_id);
         const finalSyncStatus = downloadResult.success ? 'synced' : 'error';
         await database.query('UPDATE all_repositories SET sync_status = $1, updated_at = NOW() WHERE id = $2', [finalSyncStatus, repositoryRecord.id]);
@ -163,12 +163,28 @@ router.post('/:provider/attach-repository', async (req, res) => {
     const { template_id, repository_url, branch_name } = req.body;
     const userId = req.headers['x-user-id'] || req.query.user_id || req.body.user_id || (req.user && (req.user.id || req.user.userId));

+    console.log(`[VCS Attach] Extracted userId:`, userId, `from headers:`, req.headers['x-user-id'], `query:`, req.query.user_id, `body:`, req.body.user_id);
+
     // Validate input - only repository_url is required (like GitHub)
     if (!repository_url) {
       return res.status(400).json({ success: false, message: 'Repository URL is required' });
     }

-    const { owner, repo, branch } = provider.parseRepoUrl(repository_url);
+    // Clean and normalize the repository URL (trim whitespace, decode URL encoding)
+    let cleanedUrl = repository_url.trim();
+    // Decode URL-encoded characters (like %20 for spaces)
+    try {
+      cleanedUrl = decodeURIComponent(cleanedUrl);
+    } catch (e) {
+      // If decoding fails, use original URL
+      console.warn(`[VCS Attach] Failed to decode URL, using original: ${cleanedUrl}`);
+    }
+    // Trim again after decoding
+    cleanedUrl = cleanedUrl.trim();
+
+    console.log(`[VCS Attach] Original URL: ${repository_url}, Cleaned URL: ${cleanedUrl}`);
+
+    const { owner, repo, branch } = provider.parseRepoUrl(cleanedUrl);

     // Enhanced flow: Detect private repos and redirect to OAuth immediately
     const providerKey = (req.params.provider || '').toLowerCase();
@ -248,7 +264,44 @@ router.post('/:provider/attach-repository', async (req, res) => {
     // For public repos or authenticated private repos, proceed with normal flow
     const accessCheck = await provider.checkRepositoryAccess(owner, repo, userId);

+    console.log(`[VCS Attach] Access check result for ${owner}/${repo}:`, {
+      hasAccess: accessCheck.hasAccess,
+      requiresAuth: accessCheck.requiresAuth,
+      authError: accessCheck.authError,
+      error: accessCheck.error,
+      exists: accessCheck.exists,
+      github_username: accessCheck.github_username
+    });
+
     if (!accessCheck.hasAccess) {
+      // If access check failed but requires auth, trigger OAuth flow
+      if (accessCheck.requiresAuth || accessCheck.authError) {
+        const oauthService = getOAuthService(providerKey);
+        if (oauthService) {
+          console.log(`🔒 [VCS Attach] Token exists but cannot access repository (or no valid token), redirecting to OAuth: ${repository_url}`);
+          console.log(`🔒 [VCS Attach] Reason: ${accessCheck.error || 'Authentication required'}, userId: ${userId}`);
+
+          // Generate OAuth URL with repository context in state
+          const stateBase = Math.random().toString(36).substring(7);
+          const state = `${stateBase}|uid=${userId || 'unknown'}|repo=${encodeURIComponent(repository_url)}|branch=${encodeURIComponent(branch_name || 'main')}|private_repo=true`;
+
+          const authUrl = oauthService.getAuthUrl(state, userId);
+
+          console.log(`🔒 [VCS Attach] Generated OAuth URL for ${providerKey}, returning requires_auth response`);
+
+          return res.json({
+            success: false,
+            message: `${providerKey.charAt(0).toUpperCase() + providerKey.slice(1)} authentication required for private repository`,
+            requires_auth: true,
+            is_private_repo: true,
+            auth_url: authUrl,
+            state: state
+          });
+        }
+      }
+
+      // If it's not an auth issue, return 404
+      console.log(`[VCS Attach] Access check failed without auth requirement, returning 404`);
       return res.status(404).json({ success: false, message: accessCheck.error || 'Repository not accessible' });
     }
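A hedged client-side sketch of the new attach flow (Python for brevity). The `x-user-id` header, the `requires_auth`/`auth_url` response fields, and the URL trimming/decoding behaviour come from the route above; the gateway base URL and the `/api/vcs` mount prefix are assumptions to adjust for your deployment.

```python
# Sketch only: the '/api/vcs' prefix is assumed - the diff shows just
# router.post('/:provider/attach-repository'); adjust to wherever the router is mounted.
import requests

resp = requests.post(
    "http://localhost:8000/api/vcs/github/attach-repository",  # assumed mount point
    headers={"x-user-id": "42"},  # userId may also be passed via query or body
    json={
        # Trailing whitespace / %20 is now decoded and trimmed server-side.
        "repository_url": "https://github.com/acme/private-repo%20",
        "branch_name": "main",
    },
)
data = resp.json()

if not data.get("success") and data.get("requires_auth"):
    # Private repo without a usable token: redirect the user to the provider OAuth page;
    # the state carries uid, repo and branch so the callback can resume the attach.
    print("Redirect to:", data["auth_url"])
else:
    print(data)
```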
@ -21,8 +21,8 @@ class GitHubIntegrationService {
   }

   // Get authenticated Octokit instance
-  async getAuthenticatedOctokit() {
-    return await this.oauthService.getAuthenticatedOctokit();
+  async getAuthenticatedOctokit(userId = null) {
+    return await this.oauthService.getAuthenticatedOctokit(userId);
   }

   // Extract owner, repo, and branch from GitHub URL using parse-github-url library
@ -31,8 +31,15 @@ class GitHubIntegrationService {
       throw new Error('URL must be a non-empty string');
     }

-    // Normalize the URL first
+    // Normalize the URL first - trim and decode URL encoding
     let normalizedUrl = url.trim();
+    // Decode URL-encoded characters (like %20 for spaces)
+    try {
+      normalizedUrl = decodeURIComponent(normalizedUrl).trim();
+    } catch (e) {
+      // If decoding fails, just trim
+      normalizedUrl = normalizedUrl.trim();
+    }

     // Remove trailing slashes and .git extensions
     normalizedUrl = normalizedUrl.replace(/\/+$/, '').replace(/\.git$/, '');
@ -216,7 +223,7 @@ class GitHubIntegrationService {
       };
     }

-    // No token found - try unauthenticated access first to check if it's public
+    // No token found that can access this repo - try unauthenticated access to check if it's public
     try {
       const unauthenticatedOctokit = new Octokit({
         userAgent: 'CodeNuk-GitIntegration/1.0.0',
@ -234,13 +241,18 @@ class GitHubIntegrationService {
       };
     } catch (unauthenticatedError) {
       if (unauthenticatedError.status === 404) {
-        // Repository truly doesn't exist
+        // 404 from unauthenticated access could mean:
+        // 1. Repository truly doesn't exist
+        // 2. Repository is private and requires authentication
+        // Since we already tried to find a token and none could access it,
+        // and we're being called from a private repo flow, assume it requires auth
+        console.log(`🔒 [GitHub] 404 from unauthenticated access - assuming private repo requires authentication`);
         return {
-          exists: false,
+          exists: null, // Unknown - could be missing or private
           isPrivate: null,
           hasAccess: false,
-          requiresAuth: false,
+          requiresAuth: true, // Changed from false to true - trigger OAuth
-          error: 'Repository not found'
+          error: 'Repository not found or requires authentication'
         };
       } else if (unauthenticatedError.status === 401 || unauthenticatedError.status === 403) {
         // Repository exists but requires authentication (private) - generate auth URL
@ -289,13 +301,13 @@ class GitHubIntegrationService {
   }

   // Get repository information from GitHub
-  async fetchRepositoryMetadata(owner, repo, skipAuth = false) {
+  async fetchRepositoryMetadata(owner, repo, skipAuth = false, userId = null) {
     // If skipAuth is true, try with unauthenticated octokit first to check visibility
     let octokit;
     if (skipAuth) {
       octokit = this.octokit; // Use unauthenticated instance
     } else {
-      octokit = await this.getAuthenticatedOctokit();
+      octokit = await this.getAuthenticatedOctokit(userId);
     }

     const safe = async (fn, fallback) => {
@ -309,26 +321,41 @@ class GitHubIntegrationService {
     let repoData;
     try {
+      console.log(`🔍 [GitHub] fetchRepositoryMetadata: skipAuth=${skipAuth}, calling octokit.repos.get for ${owner}/${repo}`);
       const response = await octokit.repos.get({ owner, repo });
-      if (skipAuth) {
-        if (response.status === 401 || response.status === 403) {
-          throw new Error('Authentication required to access repository');
-        } else if (response.status === 404) {
-          throw new Error('Repository not found');
-        }
-      }
       repoData = response.data;
+      console.log(`✅ [GitHub] Successfully fetched repository data: ${repoData?.full_name || 'no full_name'}`);
+
+      // Validate we got real data
+      if (!repoData || !repoData.full_name) {
+        console.log(`❌ [GitHub] Invalid repository data received, throwing error`);
+        throw new Error('Invalid repository data received');
+      }
     } catch (error) {
-      console.log(`🔍 [GitHub] Error in fetchRepositoryMetadata:`, error.message, error.status);
+      // Check error status from various possible locations
+      const status = error.status || error.response?.status || error.code;
+      const errorMessage = error.message || '';
+      const is404 = status === 404 || status === '404' || errorMessage.includes('404') || errorMessage.includes('Not Found');
+      const isAuthError = status === 401 || status === 403 || status === '401' || status === '403';
+
+      console.log(`🔍 [GitHub] Error in fetchRepositoryMetadata CATCH BLOCK:`, errorMessage, `Status: ${status || 'unknown'}`, `is404: ${is404}`, `isAuthError: ${isAuthError}`, `skipAuth: ${skipAuth}`);
+      console.log(`🔍 [GitHub] Error object:`, JSON.stringify({
+        status: error.status,
+        responseStatus: error.response?.status,
+        code: error.code,
+        message: error.message,
+        name: error.name
+      }));
+
       if (skipAuth) {
-        // For GitHub, any error when skipAuth=true likely means private repo
-        if (error.status === 401 || error.status === 403 || error.status === 404) {
-          throw new Error('Authentication required to access repository');
-        }
-        // For other errors, also assume private repo
+        // For GitHub, any error when skipAuth=true means private repo or doesn't exist
+        // Always throw authentication required - let the caller decide if it's truly missing or private
+        console.log(`🔒 [GitHub] skipAuth=true, THROWING authentication required error - NOT using safe fallback`);
         throw new Error('Authentication required to access repository');
       }
-      // For other errors, use safe fallback
+
+      // For authenticated requests, use safe fallback (but only if skipAuth is false)
+      console.log(`⚠️ [GitHub] skipAuth=false, using safe fallback`);
       repoData = await safe(
         async () => {
           const response = await octokit.repos.get({ owner, repo });
@ -336,6 +363,12 @@ class GitHubIntegrationService {
         },
         {}
       );
+
+      // If safe fallback also failed, throw
+      if (!repoData || !repoData.full_name) {
+        console.log(`❌ [GitHub] Safe fallback also failed, throwing Repository not found`);
+        throw new Error('Repository not found');
+      }
     }

     const languages = await safe(
@ -364,7 +397,7 @@ class GitHubIntegrationService {
   }

   // Analyze codebase structure
-  async analyzeCodebase(owner, repo, branch, isPublicRepo = false) {
+  async analyzeCodebase(owner, repo, branch, isPublicRepo = false, userId = null) {
     try {
       // Use appropriate octokit instance based on repository type
       let octokit;
@ -374,8 +407,8 @@ class GitHubIntegrationService {
           userAgent: 'CodeNuk-GitIntegration/1.0.0',
         });
       } else {
-        // For private repos, use authenticated octokit
-        octokit = await this.getAuthenticatedOctokit();
+        // For private repos, use authenticated octokit with userId
+        octokit = await this.getAuthenticatedOctokit(userId);
       }

       // Get the commit SHA for the branch
@ -519,7 +552,7 @@ class GitHubIntegrationService {
   }

   // Git-based: clone or update local repo and re-index into DB
-  async syncRepositoryWithGit(owner, repo, branch, repositoryId, isPublicRepo = false) {
+  async syncRepositoryWithGit(owner, repo, branch, repositoryId, isPublicRepo = false, userId = null) {
     const database = require('../config/database');
     const localPath = this.gitRepoService.getLocalRepoPath(owner, repo, branch);
     let storageRecord = null;
@ -544,7 +577,7 @@ class GitHubIntegrationService {
           console.warn(`Failed to clone public repo without auth: ${error.message}`);
           // Fallback to authenticated clone if available
           try {
-            const tokenRecord = await this.oauthService.getToken();
+            const tokenRecord = userId ? await this.oauthService.getTokenForUser(userId) : await this.oauthService.getToken();
             if (tokenRecord?.access_token) {
               repoPath = await this.gitRepoService.cloneIfMissingWithAuth(
                 owner,
@ -560,7 +593,7 @@ class GitHubIntegrationService {
       } else {
         // For private repos, try authenticated clone first
         try {
-          const tokenRecord = await this.oauthService.getToken();
+          const tokenRecord = userId ? await this.oauthService.getTokenForUser(userId) : await this.oauthService.getToken();
           if (tokenRecord?.access_token) {
             repoPath = await this.gitRepoService.cloneIfMissingWithAuth(
               owner,
@ -628,7 +661,7 @@ class GitHubIntegrationService {
     try {
       // Try to ensure repo exists for the preferred branch
       try {
-        const tokenRecord = await this.oauthService.getToken().catch(() => null);
+        const tokenRecord = userId ? await this.oauthService.getTokenForUser(userId).catch(() => null) : await this.oauthService.getToken().catch(() => null);
         if (tokenRecord?.access_token) {
           repoPath = await this.gitRepoService.cloneIfMissingWithAuth(owner, repo, preferredBranch, 'github.com', tokenRecord.access_token, 'oauth2');
         } else {
@ -637,7 +670,7 @@ class GitHubIntegrationService {
       } catch (cloneErr) {
         // If the branch doesn't exist (e.g., refs/heads not found), try the alternate branch
         try {
-          const tokenRecordAlt = await this.oauthService.getToken().catch(() => null);
+          const tokenRecordAlt = userId ? await this.oauthService.getTokenForUser(userId).catch(() => null) : await this.oauthService.getToken().catch(() => null);
           repoPath = tokenRecordAlt?.access_token
             ? await this.gitRepoService.cloneIfMissingWithAuth(owner, repo, alternateBranch, 'github.com', tokenRecordAlt.access_token, 'oauth2')
             : await this.gitRepoService.cloneIfMissing(owner, repo, alternateBranch);
@ -679,7 +712,7 @@ class GitHubIntegrationService {
     try {
       // Ensure repo exists similarly to diff flow
       try {
-        const tokenRecord = await this.oauthService.getToken().catch(() => null);
+        const tokenRecord = userId ? await this.oauthService.getTokenForUser(userId).catch(() => null) : await this.oauthService.getToken().catch(() => null);
         if (tokenRecord?.access_token) {
           repoPath = await this.gitRepoService.cloneIfMissingWithAuth(owner, repo, preferredBranch, 'github.com', tokenRecord.access_token, 'oauth2');
         } else {
@ -687,7 +720,7 @@ class GitHubIntegrationService {
       }
     } catch (_) {
       try {
-        const tokenRecordAlt = await this.oauthService.getToken().catch(() => null);
+        const tokenRecordAlt = userId ? await this.oauthService.getTokenForUser(userId).catch(() => null) : await this.oauthService.getToken().catch(() => null);
         repoPath = tokenRecordAlt?.access_token
           ? await this.gitRepoService.cloneIfMissingWithAuth(owner, repo, alternateBranch, 'github.com', tokenRecordAlt.access_token, 'oauth2')
           : await this.gitRepoService.cloneIfMissing(owner, repo, alternateBranch);
@ -720,15 +753,15 @@ class GitHubIntegrationService {
   }

   // Try git-based sync first, fall back to GitHub API download on failure
-  async syncRepositoryWithFallback(owner, repo, branch, repositoryId, isPublicRepo = false) {
+  async syncRepositoryWithFallback(owner, repo, branch, repositoryId, isPublicRepo = false, userId = null) {
     // First attempt: full git clone/fetch and index
-    const gitResult = await this.syncRepositoryWithGit(owner, repo, branch, repositoryId, isPublicRepo);
+    const gitResult = await this.syncRepositoryWithGit(owner, repo, branch, repositoryId, isPublicRepo, userId);
     if (gitResult && gitResult.success) {
       return { method: 'git', ...gitResult };
     }

     // Fallback: API-based download and storage
-    const apiResult = await this.downloadRepositoryWithStorage(owner, repo, branch, repositoryId, isPublicRepo);
+    const apiResult = await this.downloadRepositoryWithStorage(owner, repo, branch, repositoryId, isPublicRepo, userId);
     if (apiResult && apiResult.success) {
       return { method: 'api', ...apiResult, git_error: gitResult?.error };
     }
@ -737,7 +770,7 @@ class GitHubIntegrationService {
   }

   // Download repository files locally and store in database
-  async downloadRepositoryWithStorage(owner, repo, branch, repositoryId, isPublicRepo = false) {
+  async downloadRepositoryWithStorage(owner, repo, branch, repositoryId, isPublicRepo = false, userId = null) {
     const targetDir = path.join(
       process.env.ATTACHED_REPOS_DIR,
       `${owner}__${repo}__${branch}`
@ -765,8 +798,8 @@ class GitHubIntegrationService {
         userAgent: 'CodeNuk-GitIntegration/1.0.0',
       });
     } else {
-      // For private repos, use authenticated octokit
-      octokit = await this.getAuthenticatedOctokit();
+      // For private repos, use authenticated octokit with userId
+      octokit = await this.getAuthenticatedOctokit(userId);
     }

     // Get the commit SHA for the branch
@ -199,8 +199,16 @@ class GitHubOAuthService {
   }

   // Create authenticated Octokit instance
-  async getAuthenticatedOctokit() {
-    const tokenRecord = await this.getToken();
+  async getAuthenticatedOctokit(userId = null) {
+    // If userId is provided, get the newest token for that user
+    // Otherwise, get the newest token overall
+    let tokenRecord;
+    if (userId) {
+      tokenRecord = await this.getTokenForUser(userId);
+      console.log(`[GitHub OAuth] Using token for user ${userId}: ${tokenRecord?.github_username || 'none'}`);
+    } else {
+      tokenRecord = await this.getToken();
+    }

     if (!tokenRecord) {
       throw new Error('No GitHub token found. Please authenticate with GitHub first.');
@ -15,7 +15,11 @@ class GithubAdapter {
     return this.impl.parseGitHubUrl(url);
   }

-  async checkRepositoryAccess(owner, repo) {
+  async checkRepositoryAccess(owner, repo, userId = null) {
+    // Use user-specific method if userId is provided
+    if (userId) {
+      return await this.impl.checkRepositoryAccessWithUser(owner, repo, userId);
+    }
     return await this.impl.checkRepositoryAccess(owner, repo);
   }
58  services/multi-document-upload-service/.dockerignore  Normal file
@ -0,0 +1,58 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
*.egg-info/
dist/
build/
*.egg

# Virtual environments
venv/
env/
ENV/
.venv

# IDE
.vscode/
.idea/
*.swp
*.swo
*~

# Documentation
*.md
!README.md

# Testing
.pytest_cache/
.coverage
htmlcov/
*.log

# Storage and temporary files
storage/
*.tmp
*.temp

# Git
.git/
.gitignore

# Docker
Dockerfile*
docker-compose*.yml
.dockerignore

# Environment files
.env
.env.local
*.env

# OS
.DS_Store
Thumbs.db
@ -1,29 +1,60 @@
-FROM python:3.11-slim
+# Build stage - install dependencies that require compilation
+FROM python:3.11-slim as builder

 ENV PYTHONDONTWRITEBYTECODE=1 \
     PYTHONUNBUFFERED=1

 WORKDIR /app

+# Install build dependencies only
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
     build-essential \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy and install Python dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir --user -r requirements.txt && \
+    pip cache purge
+
+# Download SpaCy English model
+RUN python -m spacy download en_core_web_sm
+
+# Runtime stage - minimal image with only runtime dependencies
+FROM python:3.11-slim
+
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PYTHONPATH=/app/src \
+    PATH=/root/.local/bin:$PATH \
+    MULTI_DOC_STORAGE_ROOT=/app/storage \
+    MULTI_DOC_CLAUDE_MODEL=claude-3-5-haiku-latest \
+    CLAUDE_MODEL=claude-3-5-haiku-latest \
+    PORT=8024
+
+WORKDIR /app
+
+# Install only runtime dependencies (no build tools)
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
     poppler-utils \
     tesseract-ocr \
     ffmpeg \
     libmagic1 \
-    && rm -rf /var/lib/apt/lists/*
+    curl \
+    # Required for some Python packages at runtime
+    libgomp1 \
+    libglib2.0-0 \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean

-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
+# Copy Python packages from builder stage (includes spacy model)
+COPY --from=builder /root/.local /root/.local

+# Copy application code
 COPY src ./src

-ENV PYTHONPATH=/app/src \
-    MULTI_DOC_STORAGE_ROOT=/app/storage \
-    MULTI_DOC_CLAUDE_MODEL=claude-3-5-sonnet-20241022 \
-    PORT=8024
-
 EXPOSE 8024

 CMD ["sh", "-c", "uvicorn multi_document_upload_service.main:app --host 0.0.0.0 --port ${PORT:-8024}"]
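A quick way to confirm the SpaCy model downloaded in the builder stage survives the `COPY --from=builder` step is to run a small check inside the built container. This is a sketch only, assuming the image is built from the Dockerfile above (for example via `docker-compose build multi-document-upload-service`).

```python
# check_spacy_model.py - run inside the multi-document-upload-service container.
# Raises OSError if en_core_web_sm was not copied from the builder stage.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Heavy rain causes flooding.")
print([(token.text, token.pos_) for token in doc])
```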
@ -1,144 +0,0 @@
|
|||||||
# Fix: Empty Graph in Neo4j (No Relationships Found)
|
|
||||||
|
|
||||||
## Problem
|
|
||||||
|
|
||||||
When querying Neo4j for `CAUSES` relationships, you get "(no changes, no records)" because:
|
|
||||||
|
|
||||||
1. **PDF extraction failed** - Missing dependencies (`unstructured[pdf]`)
|
|
||||||
2. **0 relations extracted** - No text was extracted, so no analysis happened
|
|
||||||
3. **0 relations written** - Nothing was written to Neo4j (correct behavior)
|
|
||||||
|
|
||||||
## Root Cause
|
|
||||||
|
|
||||||
The service completed with 0 relations because:
|
|
||||||
- PDF file extraction failed: `partition_pdf() is not available because one or more dependencies are not installed`
|
|
||||||
- No text was extracted from the PDF
|
|
||||||
- No chunks were created
|
|
||||||
- No Claude analysis happened
|
|
||||||
- 0 relations were extracted
|
|
||||||
- 0 relations were written to Neo4j
|
|
||||||
|
|
||||||
## Solution
|
|
||||||
|
|
||||||
### Step 1: Update Dependencies
|
|
||||||
|
|
||||||
The `requirements.txt` has been updated to include:
|
|
||||||
```
|
|
||||||
unstructured[pdf]>=0.15.0
|
|
||||||
unstructured[docx]>=0.15.0
|
|
||||||
unstructured[pptx]>=0.15.0
|
|
||||||
unstructured[xlsx]>=0.15.0
|
|
||||||
```
|
|
||||||
|
|
||||||
### Step 2: Rebuild the Service
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd /home/tech4biz/Desktop/prakash/codenuk/backend_new1/codenuk_backend_mine
|
|
||||||
|
|
||||||
# Rebuild the service with new dependencies
|
|
||||||
docker-compose build multi-document-upload-service
|
|
||||||
|
|
||||||
# Restart the service
|
|
||||||
docker-compose restart multi-document-upload-service
|
|
||||||
|
|
||||||
# Check logs to verify it's working
|
|
||||||
docker-compose logs -f multi-document-upload-service
|
|
||||||
```
|
|
||||||
|
|
||||||
### Step 3: Verify Dependencies
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Check if unstructured[pdf] is installed
|
|
||||||
docker-compose exec multi-document-upload-service pip list | grep unstructured
|
|
||||||
```
|
|
||||||
|
|
||||||
### Step 4: Re-upload Documents
|
|
||||||
|
|
||||||
1. Go to Project Builder in the frontend
|
|
||||||
2. Click on "Upload Documents for Knowledge Graph"
|
|
||||||
3. Upload a PDF or other document
|
|
||||||
4. Wait for processing to complete
|
|
||||||
5. Check Neo4j for relationships
|
|
||||||
|
|
||||||
### Step 5: Check Neo4j
|
|
||||||
|
|
||||||
Run these queries in Neo4j Browser:
|
|
||||||
|
|
||||||
```cypher
|
|
||||||
// Check if any nodes exist
|
|
||||||
MATCH (n)
|
|
||||||
RETURN count(n) as node_count
|
|
||||||
|
|
||||||
// Check for CAUSES relationships
|
|
||||||
MATCH (n:Concept)-[r:CAUSES]->(m:Concept)
|
|
||||||
RETURN n.name as cause, m.name as effect, r.confidence as confidence
|
|
||||||
LIMIT 50
|
|
||||||
```
|
|
||||||
|
|
||||||
## Expected Behavior After Fix
|
|
||||||
|
|
||||||
1. **PDF extraction succeeds** - Text is extracted from PDF files
|
|
||||||
2. **Text is chunked** - Document is split into manageable chunks
|
|
||||||
3. **Claude analyzes** - Causal relationships are extracted
|
|
||||||
4. **Relations are written** - Relationships are stored in Neo4j
|
|
||||||
5. **Query returns results** - Neo4j query shows relationships
|
|
||||||
|
|
||||||
## Verification Steps
|
|
||||||
|
|
||||||
1. **Check service logs**:
|
|
||||||
```bash
|
|
||||||
docker-compose logs multi-document-upload-service | grep -i "extracted\|relation\|neo4j"
|
|
||||||
```
|
|
||||||
|
|
||||||
2. **Check job status**:
|
|
||||||
```bash
|
|
||||||
curl http://localhost:8000/api/multi-docs/jobs/{job_id}
|
|
||||||
```
|
|
||||||
Should show: `"processed_files": 1` and relations count > 0
|
|
||||||
|
|
||||||
3. **Check Neo4j**:
|
|
||||||
```cypher
|
|
||||||
MATCH (n:Concept)-[r:CAUSES]->(m:Concept)
|
|
||||||
RETURN count(r) as relation_count
|
|
||||||
```
|
|
||||||
|
|
||||||
## Improvements Made
|
|
||||||
|
|
||||||
1. ✅ **Added PDF dependencies** - `unstructured[pdf]`, `unstructured[docx]`, etc.
|
|
||||||
2. ✅ **Added fallback extractors** - Uses `pdfplumber` if unstructured fails
|
|
||||||
3. ✅ **Better error handling** - Shows actual errors in job status
|
|
||||||
4. ✅ **Improved logging** - More detailed logs for debugging
|
|
||||||
5. ✅ **Better Neo4j query** - Validates data before writing
|
|
||||||
|
|
||||||
## Troubleshooting
|
|
||||||
|
|
||||||
If you still see 0 relations after rebuilding:
|
|
||||||
|
|
||||||
1. **Check extraction logs**:
|
|
||||||
```bash
|
|
||||||
docker-compose logs multi-document-upload-service | grep -i "extract"
|
|
||||||
```
|
|
||||||
|
|
||||||
2. **Check Claude analysis**:
|
|
||||||
```bash
|
|
||||||
docker-compose logs multi-document-upload-service | grep -i "claude\|analyze"
|
|
||||||
```
|
|
||||||
|
|
||||||
3. **Check Neo4j connection**:
|
|
||||||
```bash
|
|
||||||
docker-compose logs multi-document-upload-service | grep -i "neo4j\|graph"
|
|
||||||
```
|
|
||||||
|
|
||||||
4. **Verify document has causal language**:
|
|
||||||
- Not all documents contain causal relationships
|
|
||||||
- Try uploading a document with clear cause-effect statements
|
|
||||||
- Example: "Smoking causes lung cancer" or "Rain causes flooding"
|
|
||||||
|
|
||||||
## Next Steps
|
|
||||||
|
|
||||||
1. Rebuild the service with new dependencies
|
|
||||||
2. Re-upload documents
|
|
||||||
3. Check Neo4j for relationships
|
|
||||||
4. If still no results, check service logs for errors
|
|
||||||
5. Verify the document contains causal language
|
|
||||||
|
|
||||||
@ -1,176 +0,0 @@
|
|||||||
# Neo4j Diagnostic Queries
|
|
||||||
|
|
||||||
## Issue: No relationships found in Neo4j
|
|
||||||
|
|
||||||
If you're seeing "(no changes, no records)" when querying for `CAUSES` relationships, here are diagnostic queries to check what's actually in the database.
|
|
||||||
|
|
||||||
## Diagnostic Queries
|
|
||||||
|
|
||||||
### 1. Check if any nodes exist
|
|
||||||
```cypher
|
|
||||||
MATCH (n)
|
|
||||||
RETURN count(n) as node_count
|
|
||||||
LIMIT 1
|
|
||||||
```
|
|
||||||
|
|
||||||
### 2. Check if Concept nodes exist
|
|
||||||
```cypher
|
|
||||||
MATCH (n:Concept)
|
|
||||||
RETURN count(n) as concept_count,
|
|
||||||
collect(DISTINCT labels(n)) as labels,
|
|
||||||
collect(DISTINCT keys(n)) as properties
|
|
||||||
LIMIT 10
|
|
||||||
```
|
|
||||||
|
|
||||||
### 3. Check all relationship types
|
|
||||||
```cypher
|
|
||||||
CALL db.relationshipTypes() YIELD relationshipType
|
|
||||||
RETURN relationshipType
|
|
||||||
```
|
|
||||||
|
|
||||||
### 4. Check all node labels
|
|
||||||
```cypher
|
|
||||||
CALL db.labels() YIELD label
|
|
||||||
RETURN label
|
|
||||||
```
|
|
||||||
|
|
||||||
### 5. Check all relationships (any type)
|
|
||||||
```cypher
|
|
||||||
MATCH (n)-[r]->(m)
|
|
||||||
RETURN type(r) as relationship_type,
|
|
||||||
count(r) as count,
|
|
||||||
labels(n) as from_labels,
|
|
||||||
labels(m) as to_labels
|
|
||||||
LIMIT 50
|
|
||||||
```
|
|
||||||
|
|
||||||
### 6. Check for CAUSES relationships specifically
|
|
||||||
```cypher
|
|
||||||
MATCH (n)-[r:CAUSES]->(m)
|
|
||||||
RETURN n, r, m
|
|
||||||
LIMIT 50
|
|
||||||
```
|
|
||||||
|
|
||||||
### 7. Check for relationships with lowercase "causes"
|
|
||||||
```cypher
|
|
||||||
MATCH (n)-[r]->(m)
|
|
||||||
WHERE type(r) =~ '(?i)causes'
|
|
||||||
RETURN type(r) as relationship_type, n, r, m
|
|
||||||
LIMIT 50
|
|
||||||
```
|
|
||||||
|
|
||||||
### 8. Check all nodes and their relationships
|
|
||||||
```cypher
|
|
||||||
MATCH (n)
|
|
||||||
OPTIONAL MATCH (n)-[r]->(m)
|
|
||||||
RETURN n, labels(n) as node_labels,
|
|
||||||
type(r) as relationship_type,
|
|
||||||
m, labels(m) as target_labels
|
|
||||||
LIMIT 50
|
|
||||||
```
|
|
||||||
|
|
||||||
### 9. Check for nodes created by the service (by job_id property)
|
|
||||||
```cypher
|
|
||||||
MATCH (n)-[r]->(m)
|
|
||||||
WHERE r.job_id IS NOT NULL
|
|
||||||
RETURN n, r, m, r.job_id as job_id
|
|
||||||
LIMIT 50
|
|
||||||
```
|
|
||||||
|
|
||||||
### 10. Check database statistics
|
|
||||||
```cypher
|
|
||||||
MATCH (n)
|
|
||||||
RETURN count(n) as total_nodes,
|
|
||||||
size([(n)-[r]->() | r]) as total_relationships
|
|
||||||
```
|
|
||||||
|
|
||||||
## Common Issues and Solutions
|
|
||||||
|
|
||||||
### Issue 1: No nodes at all
|
|
||||||
**Symptom**: Query 1 returns 0 nodes
|
|
||||||
**Cause**: Service hasn't written anything to Neo4j, or connection failed
|
|
||||||
**Solution**:
|
|
||||||
- Check service logs: `docker-compose logs multi-document-upload-service`
|
|
||||||
- Verify Neo4j connection in service configuration
|
|
||||||
- Check if job completed with 0 relations (extraction failed)
|
|
||||||
|
|
||||||
### Issue 2: Nodes exist but no relationships
|
|
||||||
**Symptom**: Query 1 returns nodes, but Query 6 returns no relationships
|
|
||||||
**Cause**: Relationships weren't created, or different relationship type
|
|
||||||
**Solution**:
|
|
||||||
- Check Query 5 to see what relationship types actually exist
|
|
||||||
- Check service logs for graph writing errors
|
|
||||||
- Verify the job actually extracted relations (check job status)
|
|
||||||
|
|
||||||
### Issue 3: Different relationship type
|
|
||||||
**Symptom**: Query 5 shows relationships but not `CAUSES`
|
|
||||||
**Cause**: Service might be using a different relationship type
|
|
||||||
**Solution**:
|
|
||||||
- Check Query 3 to see all relationship types
|
|
||||||
- Update query to use the correct relationship type
|
|
||||||
|
|
||||||
### Issue 4: Different node labels
|
|
||||||
**Symptom**: Query 6 returns no results, but Query 2 shows different labels
|
|
||||||
**Cause**: Service might be using different node labels
|
|
||||||
**Solution**:
|
|
||||||
- Check Query 2 to see what labels exist
|
|
||||||
- Update query to match actual labels
|
|
||||||
|
|
||||||
## Expected Structure
|
|
||||||
|
|
||||||
After a successful upload, you should see:
|
|
||||||
|
|
||||||
### Nodes
|
|
||||||
- **Label**: `Concept`
|
|
||||||
- **Properties**: `name`, `lastSeen`
|
|
||||||
|
|
||||||
### Relationships
|
|
||||||
- **Type**: `CAUSES`
|
|
||||||
- **Properties**: `confidence`, `explanation`, `source_file_id`, `source_snippet`, `job_id`, `model`, `updated_at`
|
|
||||||
|
|
||||||
### Example Query
|
|
||||||
```cypher
|
|
||||||
MATCH (cause:Concept)-[r:CAUSES]->(effect:Concept)
|
|
||||||
RETURN cause.name as cause,
|
|
||||||
effect.name as effect,
|
|
||||||
r.confidence as confidence,
|
|
||||||
r.job_id as job_id,
|
|
||||||
r.source_file_id as source_file
|
|
||||||
LIMIT 50
|
|
||||||
```
|
|
||||||
|
|
||||||
## Troubleshooting Steps
|
|
||||||
|
|
||||||
1. **Check service logs**:
|
|
||||||
```bash
|
|
||||||
docker-compose logs -f multi-document-upload-service
|
|
||||||
```
|
|
||||||
|
|
||||||
2. **Check if job completed successfully**:
|
|
||||||
```bash
|
|
||||||
curl http://localhost:8000/api/multi-docs/jobs/{job_id}
|
|
||||||
```
|
|
||||||
|
|
||||||
3. **Check Neo4j connection**:
|
|
||||||
```bash
|
|
||||||
docker-compose logs neo4j | grep -i error
|
|
||||||
```
|
|
||||||
|
|
||||||
4. **Verify Neo4j is running**:
|
|
||||||
```bash
|
|
||||||
docker-compose ps neo4j
|
|
||||||
```
|
|
||||||
|
|
||||||
5. **Test Neo4j connection manually**:
|
|
||||||
```bash
|
|
||||||
docker-compose exec neo4j cypher-shell -u neo4j -p password "MATCH (n) RETURN count(n)"
|
|
||||||
```
|
|
||||||
|
|
||||||
## Next Steps
|
|
||||||
|
|
||||||
1. Run the diagnostic queries above
|
|
||||||
2. Check the service logs for errors
|
|
||||||
3. Verify the job status via API
|
|
||||||
4. Re-upload documents after fixing dependencies
|
|
||||||
5. Check if relations were actually extracted (job status should show relation count)
|
|
||||||
|
|
||||||
@ -1,85 +0,0 @@
|
|||||||
# Quick Testing Guide - Multi-Document Upload
|
|
||||||
|
|
||||||
## 🚀 Quick Start Testing
|
|
||||||
|
|
||||||
### 1. Start Services
|
|
||||||
```bash
|
|
||||||
cd /home/tech4biz/Desktop/prakash/codenuk/backend_new1/codenuk_backend_mine
|
|
||||||
docker-compose up -d multi-document-upload-service neo4j redis postgres api-gateway
|
|
||||||
```
|
|
||||||
|
|
||||||
### 2. Verify Services
|
|
||||||
```bash
|
|
||||||
# Check health
|
|
||||||
curl http://localhost:8024/health
|
|
||||||
curl http://localhost:8000/api/multi-docs/health
|
|
||||||
```
|
|
||||||
|
|
||||||
### 3. Test via Frontend
|
|
||||||
|
|
||||||
1. **Open Frontend**: `http://localhost:3001`
|
|
||||||
2. **Login** (if required)
|
|
||||||
3. **Go to Project Builder**
|
|
||||||
4. **Complete Steps 1-2** (Project Type & Features)
|
|
||||||
5. **Step 3: Multi Docs Upload** appears
|
|
||||||
6. **Upload files**:
|
|
||||||
- Click upload area
|
|
||||||
- Select multiple files (PDF, DOCX, etc.)
|
|
||||||
- Click "Start Upload"
|
|
||||||
7. **Watch Progress**:
|
|
||||||
- Progress bar updates
|
|
||||||
- Status messages appear
|
|
||||||
- Polls every 4 seconds
|
|
||||||
8. **Auto-proceeds** when completed
|
|
||||||
|
|
||||||
### 4. Verify in Neo4j
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Open Neo4j Browser: http://localhost:7474
|
|
||||||
# Login: neo4j / password
|
|
||||||
|
|
||||||
# Query causal relationships:
|
|
||||||
MATCH (n)-[r:CAUSES]->(m)
|
|
||||||
RETURN n, r, m
|
|
||||||
LIMIT 50
|
|
||||||
```
|
|
||||||
|
|
||||||
## 📝 Test Checklist
|
|
||||||
|
|
||||||
- [ ] Service starts successfully
|
|
||||||
- [ ] Health endpoint works
|
|
||||||
- [ ] Frontend component renders
|
|
||||||
- [ ] File upload works
|
|
||||||
- [ ] Progress updates correctly
|
|
||||||
- [ ] Job completes successfully
|
|
||||||
- [ ] Neo4j graph contains relationships
|
|
||||||
- [ ] Error handling works
|
|
||||||
- [ ] Skip button works
|
|
||||||
|
|
||||||
## 🔍 Debug Commands
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# View service logs
|
|
||||||
docker-compose logs -f multi-document-upload-service
|
|
||||||
|
|
||||||
# Check job status (replace {job_id})
|
|
||||||
curl http://localhost:8000/api/multi-docs/jobs/{job_id}
|
|
||||||
|
|
||||||
# Check graph summary
|
|
||||||
curl http://localhost:8000/api/multi-docs/jobs/{job_id}/graph
|
|
||||||
```
|
|
||||||
|
|
||||||
## ⚠️ Common Issues
|
|
||||||
|
|
||||||
1. **502 Bad Gateway**: Service not running → `docker-compose ps`
|
|
||||||
2. **413 Too Large**: File too big → Reduce file size
|
|
||||||
3. **No progress**: Check browser console → Check network tab
|
|
||||||
4. **No relationships**: Check Claude API key → Check service logs
|
|
||||||
|
|
||||||
## 🎯 Expected Flow
|
|
||||||
|
|
||||||
```
|
|
||||||
Upload Files → Job Created → Files Saved → Content Extracted →
|
|
||||||
Claude Analysis → Graph Built → Completed → Auto-proceed to Next Step
|
|
||||||
```
|
|
||||||
|
|
||||||
File diff suppressed because it is too large
@@ -1,152 +0,0 @@
# Rebuild Instructions - Multi-Document Upload Service

## Issue: Empty Graph in Neo4j

**Problem**: Query returns "(no changes, no records)" because the job completed with 0 relations.

**Root Cause**: PDF extraction failed due to missing dependencies (`unstructured[pdf]`).

## Fixes Applied

1. ✅ Added PDF dependencies (`unstructured[pdf]`, `unstructured[docx]`, etc.)
2. ✅ Added fallback extractors (pdfplumber, python-docx, python-pptx)
3. ✅ Improved error handling and logging
4. ✅ Fixed Neo4j query syntax
5. ✅ Better status messages

## Rebuild Steps

### Step 1: Rebuild the Service

```bash
cd /home/tech4biz/Desktop/prakash/codenuk/backend_new1/codenuk_backend_mine

# Stop the service
docker-compose stop multi-document-upload-service

# Rebuild with new dependencies
docker-compose build --no-cache multi-document-upload-service

# Start the service
docker-compose up -d multi-document-upload-service

# Check logs to verify it's starting correctly
docker-compose logs -f multi-document-upload-service
```

### Step 2: Verify Dependencies

```bash
# Check if unstructured[pdf] is installed
docker-compose exec multi-document-upload-service pip list | grep unstructured

# You should see:
# unstructured
# unstructured-pdf
# unstructured-docx
# etc.
```

### Step 3: Test the Service

```bash
# Check health endpoint
curl http://localhost:8024/health

# Should return:
# {
#   "status": "ok",
#   "claude_model": "claude-3-5-haiku-latest",
#   ...
# }
```

### Step 4: Re-upload Documents

1. Open the frontend: `http://localhost:3001/project-builder`
2. Go to Step 1: Project Type
3. Find the "Upload Documents for Knowledge Graph" section
4. Upload a PDF or other document
5. Wait for processing to complete
6. Check the status - it should show a relation count > 0

### Step 5: Verify in Neo4j

Run these queries in Neo4j Browser (`http://localhost:7474`):

```cypher
// Check if any nodes exist
MATCH (n)
RETURN count(n) as node_count

// Check for CAUSES relationships
MATCH (n:Concept)-[r:CAUSES]->(m:Concept)
RETURN n.name as cause,
       m.name as effect,
       r.confidence as confidence,
       r.job_id as job_id
LIMIT 50
```

## Expected Results

After rebuilding and re-uploading:

1. **PDF extraction succeeds** ✅
2. **Text is extracted** ✅
3. **Relations are extracted** ✅
4. **Relations are written to Neo4j** ✅
5. **Query returns results** ✅

## Troubleshooting

If you still see 0 relations:

1. **Check service logs**:
   ```bash
   docker-compose logs multi-document-upload-service | tail -50
   ```

2. **Check extraction logs**:
   ```bash
   docker-compose logs multi-document-upload-service | grep -i "extract\|pdf"
   ```

3. **Check Claude analysis**:
   ```bash
   docker-compose logs multi-document-upload-service | grep -i "claude\|analyze\|relation"
   ```

4. **Check Neo4j connection**:
   ```bash
   docker-compose logs multi-document-upload-service | grep -i "neo4j\|graph\|write"
   ```

5. **Verify the document has causal language**:
   - Not all documents contain causal relationships
   - Try uploading a document with clear cause-effect statements
   - Example: "Smoking causes lung cancer"

## Quick Test

Test with a simple text file:

1. Create a test file `test_causal.txt`:
   ```
   Smoking cigarettes causes lung cancer.
   Heavy rain causes flooding.
   Exercise improves health.
   ```

2. Upload it via the frontend
3. Check Neo4j for relationships
4. You should see 3 causal relationships

## Next Steps

1. Rebuild the service
2. Re-upload documents
3. Check Neo4j for relationships
4. If there are still no results, check the service logs
5. Verify the document contains causal language
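
As a complement to Step 5, the same check can be scripted. This is a minimal sketch using the `neo4j` Python driver; the Bolt URI and the `neo4j`/`password` credentials are assumptions taken from the local docker-compose setup described in the testing guide.

```python
# Minimal sketch: count CAUSES edges written by the service.
# Assumes a local Neo4j at bolt://localhost:7687 with neo4j/password credentials.
from neo4j import GraphDatabase

URI = "bolt://localhost:7687"   # assumption: default local Bolt port
AUTH = ("neo4j", "password")    # assumption: credentials from the testing guide


def count_causal_relations() -> int:
    """Return the number of (:Concept)-[:CAUSES]->(:Concept) relationships."""
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        records, _, _ = driver.execute_query(
            "MATCH (:Concept)-[r:CAUSES]->(:Concept) RETURN count(r) AS relation_count"
        )
        return records[0]["relation_count"]


if __name__ == "__main__":
    print(f"CAUSES relationships in Neo4j: {count_causal_relations()}")
```

A count greater than zero after re-uploading confirms the rebuild fixed the extraction path.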
@@ -1,300 +0,0 @@
# Multi-Document Upload Service - Frontend Testing Guide

## Prerequisites

1. **Backend Services Running**:
   ```bash
   cd /home/tech4biz/Desktop/prakash/codenuk/backend_new1/codenuk_backend_mine
   docker-compose up -d
   ```

2. **Verify Services are Running**:
   - API Gateway: `http://localhost:8000/health`
   - Multi-Document Upload Service: `http://localhost:8024/health`
   - Neo4j: `http://localhost:7474` (Browser interface)
   - Frontend: `http://localhost:3001` (or your frontend port)

3. **Check Service Health**:
   ```bash
   # Check API Gateway
   curl http://localhost:8000/health

   # Check Multi-Document Upload Service directly
   curl http://localhost:8024/health

   # Check via API Gateway proxy
   curl http://localhost:8000/api/multi-docs/health
   ```

## Frontend Testing Steps

### Step 1: Navigate to Project Builder

1. Open your browser and go to: `http://localhost:3001` (or your frontend URL)
2. Log in if required
3. Click on **"Project Builder"** in the navigation

### Step 2: Go to Multi Docs Upload Step

1. In the Project Builder, you should see the workflow steps:
   - **Step 1**: Project Type
   - **Step 2**: Features
   - **Step 3**: Multi Docs Upload ← **This is the new step**
   - **Step 4**: Business Context
   - **Step 5**: Generate
   - **Step 6**: Architecture

2. Complete Steps 1 and 2 (Project Type and Features selection)
3. You will automatically be taken to **Step 3: Multi Docs Upload**

### Step 3: Upload Documents

1. **Click on the upload area** or **drag and drop files**
2. **Select multiple files** (you can mix different formats):
   - PDF files (`.pdf`)
   - Word documents (`.doc`, `.docx`)
   - PowerPoint (`.ppt`, `.pptx`)
   - Excel files (`.xls`, `.xlsx`)
   - JSON files (`.json`)
   - XML files (`.xml`)
   - Markdown files (`.md`)
   - Images (`.png`, `.jpg`, `.jpeg`) - will use OCR
   - Audio files (`.mp3`, `.wav`) - will be transcribed
   - Video files (`.mp4`, `.avi`) - will be transcribed

3. **View selected files**: You should see a list of all selected files with:
   - File icon
   - File name
   - Remove button for each file

4. **Click the "Start Upload" button**

### Step 4: Monitor Upload Progress

After clicking "Start Upload", you should see:

1. **Upload Status**:
   - Button shows "Uploading..." with a spinner
   - Progress bar appears
   - Stage messages appear:
     - "Job received"
     - "Saving files"
     - "Extracting document content"
     - "Calling Claude for causal relations"
     - "Writing to Neo4j knowledge graph"
     - "Completed"

2. **Progress Indicators**:
   - Progress percentage (0-100%)
   - Status message showing the current stage
   - Processed files count vs total files count

3. **Polling**: The frontend automatically polls the job status every 4 seconds

### Step 5: Verify Results

Once the job is completed:

1. **Check Neo4j Graph**:
   - Open Neo4j Browser: `http://localhost:7474`
   - Login with:
     - Username: `neo4j`
     - Password: `password`
   - Run a Cypher query to see the graph:
     ```cypher
     MATCH (n)-[r:CAUSES]->(m)
     RETURN n, r, m
     LIMIT 50
     ```

2. **Check Job Status via API**:
   ```bash
   # Replace {job_id} with the actual job ID from the frontend
   curl http://localhost:8000/api/multi-docs/jobs/{job_id}
   ```

3. **Get Graph Summary**:
   ```bash
   curl http://localhost:8000/api/multi-docs/jobs/{job_id}/graph
   ```

## Testing Different Scenarios

### Scenario 1: Single PDF File
- Upload one PDF file
- Verify it processes correctly
- Check Neo4j for causal relationships

### Scenario 2: Multiple Mixed Format Files
- Upload 3-5 files of different formats (PDF, DOCX, JSON, image)
- Verify all files are processed
- Check that progress updates correctly

### Scenario 3: Large Files
- Upload a large PDF (10+ MB)
- Verify it handles large files correctly
- Check processing time

### Scenario 4: Error Handling
- Try uploading an unsupported file type
- Verify an error message appears
- Check that the error is displayed clearly

### Scenario 5: Skip Option
- Upload files
- Click the "Skip" button before completion
- Verify you can proceed to the next step
- The job continues processing in the background

## Browser Developer Tools

### Check Network Requests

1. **Open Developer Tools** (F12)
2. **Go to the Network tab**
3. **Filter by "multi-docs"**
4. **Monitor requests**:
   - `POST /api/multi-docs/jobs` - Upload files
   - `GET /api/multi-docs/jobs/{job_id}` - Poll job status
   - `GET /api/multi-docs/jobs/{job_id}/graph` - Get graph summary

### Check Console Logs

1. **Open the Console tab**
2. **Look for**:
   - Upload progress logs
   - Job status updates
   - Any error messages

### Check Response Data

Verify the API responses:

```javascript
// Upload response should be:
{
  "job_id": "uuid-here",
  "stage": "received",
  "total_files": 3,
  "created_at": "2024-01-01T00:00:00Z"
}

// Status response should be:
{
  "job_id": "uuid-here",
  "stage": "extracting",
  "status_message": "Extracting document content",
  "total_files": 3,
  "processed_files": 1,
  "error": null,
  "created_at": "2024-01-01T00:00:00Z",
  "updated_at": "2024-01-01T00:01:00Z",
  "files": [...]
}
```

## Troubleshooting

### Issue: Upload fails with 502 Bad Gateway
**Solution**:
- Check if multi-document-upload-service is running:
  ```bash
  docker-compose ps multi-document-upload-service
  ```
- Check service logs:
  ```bash
  docker-compose logs multi-document-upload-service
  ```

### Issue: Upload fails with 413 Request Entity Too Large
**Solution**:
- Check file sizes (max 500MB total per job)
- Reduce the number of files or file sizes
- Check API Gateway body size limits

### Issue: Status polling stops working
**Solution**:
- Check the browser console for errors
- Verify the job ID is correct
- Check if the job completed or failed
- Check the network tab for failed requests

### Issue: No causal relationships found
**Solution**:
- Check the Claude API key is configured correctly
- Check service logs for Claude API errors
- Verify the documents contain causal language
- Check the Neo4j connection

### Issue: Frontend shows "Failed" status
**Solution**:
- Check the error message in the frontend
- Check backend service logs:
  ```bash
  docker-compose logs -f multi-document-upload-service
  ```
- Verify all dependencies are running (Neo4j, Redis, Postgres)

## Expected Behavior

### Successful Flow:
1. ✅ Files upload successfully
2. ✅ Job ID is returned
3. ✅ Status polling starts automatically
4. ✅ Progress updates every 4 seconds
5. ✅ Stage changes are displayed
6. ✅ Progress bar updates
7. ✅ Job completes successfully
8. ✅ Frontend automatically proceeds to the next step
9. ✅ Neo4j contains causal relationships

### Error Flow:
1. ✅ Error message is displayed clearly
2. ✅ User can retry the upload
3. ✅ User can skip and proceed
4. ✅ Error details are logged in the console

## API Endpoints Reference

### Upload Files
```bash
POST /api/multi-docs/jobs
Content-Type: multipart/form-data

Form Data:
- files: File[] (multiple files)
- job_name: string (optional)
```

### Get Job Status
```bash
GET /api/multi-docs/jobs/{job_id}
```

### Get Graph Summary
```bash
GET /api/multi-docs/jobs/{job_id}/graph
```

### Health Check
```bash
GET /api/multi-docs/health
```

## Next Steps After Testing

1. **Verify Neo4j Graph**: Check that causal relationships are stored correctly
2. **Check Storage**: Verify files are stored in the persistent volume
3. **Monitor Performance**: Check processing times for different file types
4. **Test Error Scenarios**: Verify error handling works correctly
5. **Test Large Batches**: Upload 50+ files to test scalability

## Support

If you encounter issues:
1. Check service logs: `docker-compose logs multi-document-upload-service`
2. Check API Gateway logs: `docker-compose logs api-gateway`
3. Check Neo4j logs: `docker-compose logs neo4j`
4. Verify all environment variables are set correctly
5. Check network connectivity between services
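
The upload-and-poll flow described above can also be exercised without the frontend. This is a rough smoke-test sketch against the documented gateway endpoints; the terminal stage names ("completed"/"failed") and the `job_name` value are assumptions, not guaranteed by the API contract shown here.

```python
# Sketch: upload files to the multi-docs API and poll the job until it finishes.
# Assumes the API Gateway proxy at localhost:8000 and 4-second polling as above.
import time

import httpx

BASE = "http://localhost:8000/api/multi-docs"


def upload_and_wait(paths: list[str], poll_seconds: int = 4) -> dict:
    """Upload files, then poll the job status until a terminal stage is reached."""
    files = [("files", (p.split("/")[-1], open(p, "rb"))) for p in paths]
    resp = httpx.post(f"{BASE}/jobs", files=files, data={"job_name": "smoke-test"}, timeout=120)
    resp.raise_for_status()
    job_id = resp.json()["job_id"]

    while True:
        status = httpx.get(f"{BASE}/jobs/{job_id}", timeout=30).json()
        print(status["stage"], status.get("status_message"))
        if status["stage"] in {"completed", "failed"}:  # assumed terminal stage names
            return status
        time.sleep(poll_seconds)  # same 4s cadence the frontend uses


if __name__ == "__main__":
    print(upload_and_wait(["test_causal.txt"]))
```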
@@ -8,10 +8,6 @@ pydantic-settings>=2.2.1
 aiofiles>=23.2.1
 tenacity>=8.2.3
 python-dotenv>=1.0.1
-unstructured[pdf]>=0.15.0
-unstructured[docx]>=0.15.0
-unstructured[pptx]>=0.15.0
-unstructured[xlsx]>=0.15.0
 pdfplumber>=0.11.0
 python-docx>=1.1.0
 python-pptx>=0.6.23
@@ -30,5 +26,13 @@ beautifulsoup4>=4.12.3
 lxml>=5.2.1
 sqlalchemy>=2.0.25
 httpx>=0.27.0
-tiktoken>=0.7.0
+dowhy>=0.11.0
+qdrant-client>=1.7.0
+sentence-transformers>=2.2.0
+numpy>=1.24.0
+scipy>=1.11.0
+networkx>=3.1
+spacy>=3.7.0
+markdown>=3.5.0
+weasyprint>=60.0
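
The new `qdrant-client` and `sentence-transformers` dependencies are what the service needs to store knowledge-graph embeddings in the Qdrant container added by this commit. The sketch below shows one way they fit together, using the collection name (`kg_embeddings`), vector size (384), and embedding model defaults introduced in the config; the sample texts and point IDs are illustrative only.

```python
# Sketch: embed text snippets and index them in Qdrant for similarity search.
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, PointStruct, VectorParams
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # 384-dim vectors
client = QdrantClient(url="http://localhost:6333")

# Create (or reset) the collection the service defaults point at.
client.recreate_collection(
    collection_name="kg_embeddings",
    vectors_config=VectorParams(size=384, distance=Distance.COSINE),
)

snippets = ["Heavy rain causes flooding.", "Smoking causes lung cancer."]
vectors = model.encode(snippets)

client.upsert(
    collection_name="kg_embeddings",
    points=[
        PointStruct(id=i, vector=vec.tolist(), payload={"text": text})
        for i, (vec, text) in enumerate(zip(vectors, snippets))
    ],
)

hits = client.search(
    collection_name="kg_embeddings",
    query_vector=model.encode("What causes floods?").tolist(),
    limit=1,
)
print(hits[0].payload["text"])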
@@ -1,328 +0,0 @@
from __future__ import annotations

import base64
import json
import logging
import re
from pathlib import Path
from typing import Iterable, List

from anthropic import Anthropic, BadRequestError
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential, RetryCallState

from .models import CausalRelation

logger = logging.getLogger(__name__)


def is_billing_error(exception: Exception) -> bool:
    """Check if the exception is a billing/credit related error that shouldn't be retried."""
    if isinstance(exception, BadRequestError):
        error_message = str(exception).lower()
        billing_keywords = ["credit", "balance", "too low", "billing", "upgrade", "purchase credits"]
        return any(keyword in error_message for keyword in billing_keywords)
    return False


def should_retry_exception(retry_state: RetryCallState) -> bool:
    """Custom retry condition that excludes billing errors."""
    exception = retry_state.outcome.exception()
    if exception is None:
        return False
    # Don't retry billing errors - they won't be resolved by retrying
    if is_billing_error(exception):
        return False
    # Retry other exceptions
    return True


CLAUDE_PROMPT_TEMPLATE = """You are an expert analyst extracting causal relationships from documents.

Given the following text chunk, identify all explicit or strongly implied cause and effect pairs.
Return JSON with the schema:
[
  {
    "cause": "<short phrase>",
    "effect": "<short phrase>",
    "confidence": 0-1 float,
    "explanation": "<why this is causal>",
    "source_snippet": "<exact quote or paraphrase>"
  }
]

Only include items when the causal direction is clear.
If none are found, return an empty list [].

Text chunk:
```
<<<CHUNK_PLACEHOLDER>>>
```"""

IMAGE_PROMPT_TEMPLATE = """You are an expert analyst extracting causal relationships from images, diagrams, and visual content.

Analyze this image/diagram for causal relationships. Look for:
- Architecture flows (A → B → C)
- Dependency relationships
- Cause-effect chains in diagrams
- Process flows
- System interactions
- Data flows
- Sequential relationships
- Visual connections between components

Return JSON with the schema:
[
  {
    "cause": "<short phrase describing the cause>",
    "effect": "<short phrase describing the effect>",
    "confidence": 0-1 float,
    "explanation": "<why this is causal, referencing visual elements>",
    "source_snippet": "<description of what you see in the image that shows this relationship>"
  }
]

Only include items when the causal direction is clear from the visual structure.
If none are found, return an empty list []."""


class ClaudeCausalExtractor:
    def __init__(self, api_key: str, model: str, max_output_tokens: int = 4000):
        self.client = Anthropic(api_key=api_key)
        self.model = model
        self.max_output_tokens = max_output_tokens

    @retry(
        retry=should_retry_exception,
        wait=wait_exponential(multiplier=1, min=1, max=10),
        stop=stop_after_attempt(3),
        reraise=True,
    )
    def analyze_chunk(self, chunk: str, source_file_id: str) -> List[CausalRelation]:
        logger.debug("Analyzing chunk with Claude model %s", self.model)

        # Validate chunk is not empty and is readable text
        if not chunk or not chunk.strip():
            logger.warning("Empty or whitespace-only chunk, skipping")
            return []

        # Check if chunk contains mostly readable text (not binary data)
        # Simple heuristic: if >50% of characters are non-printable or control chars, skip it
        printable_chars = sum(1 for c in chunk if c.isprintable() or c.isspace())
        if len(chunk) > 100 and printable_chars / len(chunk) < 0.5:
            logger.warning("Chunk appears to contain binary data, skipping analysis")
            return []

        # Use string replacement with a unique placeholder to avoid KeyError with braces in content
        # This prevents Python's .format() from interpreting braces in the chunk text as format placeholders
        prompt_text = CLAUDE_PROMPT_TEMPLATE.replace("<<<CHUNK_PLACEHOLDER>>>", chunk)

        try:
            message = self.client.messages.create(
                model=self.model,
                max_tokens=self.max_output_tokens,
                temperature=0.0,
                system="You extract causal (cause→effect) relations with high precision.",
                messages=[
                    {
                        "role": "user",
                        "content": [{"type": "text", "text": prompt_text}],
                    }
                ],
            )
        except BadRequestError as e:
            # Check if it's a billing error
            if is_billing_error(e):
                error_msg = (
                    "Anthropic API credit balance is too low. "
                    "Please go to Plans & Billing to upgrade or purchase credits. "
                    f"Error: {str(e)}"
                )
                logger.error(error_msg)
                raise RuntimeError(error_msg) from e
            # Re-raise other BadRequestErrors
            raise

        content_blocks = message.content or []
        raw_text = "".join(block.text for block in content_blocks if hasattr(block, "text"))  # type: ignore[attr-defined]
        if not raw_text:
            return []

        # Try to extract JSON from markdown code blocks if present
        json_text = raw_text.strip()

        # Look for JSON in markdown code blocks (```json ... ```)
        json_match = re.search(r'```(?:json)?\s*(\[.*?\])\s*```', json_text, re.DOTALL)
        if json_match:
            json_text = json_match.group(1)
        else:
            # Look for JSON array/object at the start or end
            json_match = re.search(r'(\[.*?\]|{.*?})', json_text, re.DOTALL)
            if json_match:
                json_text = json_match.group(1)

        try:
            data = json.loads(json_text)
            if not isinstance(data, list):
                logger.warning("Claude response is not a list: %s", type(data))
                return []

            relations: List[CausalRelation] = []
            for item in data:
                if not isinstance(item, dict):
                    continue
                cause = item.get("cause", "").strip()
                effect = item.get("effect", "").strip()
                if not cause or not effect:
                    continue  # Skip invalid relations

                relations.append(
                    CausalRelation(
                        cause=cause,
                        effect=effect,
                        confidence=float(item.get("confidence", 0.0)),
                        explanation=item.get("explanation"),
                        source_file_id=source_file_id,
                        source_snippet=item.get("source_snippet"),
                        metadata={"model": self.model},
                    )
                )
            logger.info("Extracted %d relations from Claude response", len(relations))
            return relations
        except json.JSONDecodeError as e:
            logger.warning("Failed to parse Claude response as JSON: %s. Raw text: %s", e, raw_text[:200])
            return []

    def analyze(self, chunks: Iterable[str], source_file_id: str) -> List[CausalRelation]:
        relations: List[CausalRelation] = []
        for chunk in chunks:
            relations.extend(self.analyze_chunk(chunk, source_file_id=source_file_id))
        return relations

    @retry(
        retry=should_retry_exception,
        wait=wait_exponential(multiplier=1, min=1, max=10),
        stop=stop_after_attempt(3),
        reraise=True,
    )
    def analyze_image(self, image_path: Path, source_file_id: str) -> List[CausalRelation]:
        """
        Analyze an image using Claude Vision API to extract causal relationships.
        Sends image directly to Claude (no OCR).
        """
        logger.info("Analyzing image with Claude Vision: %s", image_path.name)

        try:
            # Read and encode image as base64
            with open(image_path, "rb") as image_file:
                image_data = image_file.read()

            # Determine media type
            suffix = image_path.suffix.lower()
            media_type_map = {
                ".png": "image/png",
                ".jpg": "image/jpeg",
                ".jpeg": "image/jpeg",
                ".gif": "image/gif",
                ".webp": "image/webp",
            }
            media_type = media_type_map.get(suffix, "image/png")

            # Encode to base64
            base64_image = base64.b64encode(image_data).decode("utf-8")

            # Prepare content for Claude Vision API
            content = [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": media_type,
                        "data": base64_image,
                    },
                },
                {
                    "type": "text",
                    "text": IMAGE_PROMPT_TEMPLATE,
                },
            ]

            # Call Claude Vision API
            try:
                message = self.client.messages.create(
                    model=self.model,  # Claude models support vision
                    max_tokens=self.max_output_tokens,
                    temperature=0.0,
                    system="You extract causal (cause→effect) relations from visual content with high precision.",
                    messages=[
                        {
                            "role": "user",
                            "content": content,
                        }
                    ],
                )
            except BadRequestError as e:
                # Check if it's a billing error
                if is_billing_error(e):
                    error_msg = (
                        "Anthropic API credit balance is too low. "
                        "Please go to Plans & Billing to upgrade or purchase credits. "
                        f"Error: {str(e)}"
                    )
                    logger.error(error_msg)
                    raise RuntimeError(error_msg) from e
                # Re-raise other BadRequestErrors
                raise

            # Parse response
            content_blocks = message.content or []
            raw_text = "".join(block.text for block in content_blocks if hasattr(block, "text"))  # type: ignore[attr-defined]
            if not raw_text:
                logger.warning("No text response from Claude Vision for image %s", image_path.name)
                return []

            # Extract JSON from response
            json_text = raw_text.strip()
            json_match = re.search(r'```(?:json)?\s*(\[.*?\])\s*```', json_text, re.DOTALL)
            if json_match:
                json_text = json_match.group(1)
            else:
                json_match = re.search(r'(\[.*?\]|{.*?})', json_text, re.DOTALL)
                if json_match:
                    json_text = json_match.group(1)

            try:
                data = json.loads(json_text)
                if not isinstance(data, list):
                    logger.warning("Claude Vision response is not a list: %s", type(data))
                    return []

                relations: List[CausalRelation] = []
                for item in data:
                    if not isinstance(item, dict):
                        continue
                    cause = item.get("cause", "").strip()
                    effect = item.get("effect", "").strip()
                    if not cause or not effect:
                        continue

                    relations.append(
                        CausalRelation(
                            cause=cause,
                            effect=effect,
                            confidence=float(item.get("confidence", 0.0)),
                            explanation=item.get("explanation"),
                            source_file_id=source_file_id,
                            source_snippet=item.get("source_snippet") or f"Image: {image_path.name}",
                            metadata={"model": self.model, "content_type": "image", "image_path": str(image_path)},
                        )
                    )
                logger.info("Extracted %d relations from image %s", len(relations), image_path.name)
                return relations
            except json.JSONDecodeError as e:
                logger.warning("Failed to parse Claude Vision response as JSON: %s. Raw text: %s", e, raw_text[:200])
                return []

        except Exception as exc:
            logger.exception("Failed to analyze image %s: %s", image_path, exc)
            return []
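
For reference, this is how the removed extractor was typically driven: chunk the document text, call `analyze`, and read the resulting `CausalRelation` objects. The sample chunk and file ID are illustrative, and the model name is the default documented elsewhere in this commit.

```python
# Illustrative usage of ClaudeCausalExtractor (removed by this commit).
import os

extractor = ClaudeCausalExtractor(
    api_key=os.environ["ANTHROPIC_API_KEY"],
    model="claude-3-5-haiku-latest",
    max_output_tokens=4000,
)
chunks = ["Heavy rain causes flooding in low-lying areas."]
relations = extractor.analyze(chunks, source_file_id="demo-file-1")
for rel in relations:
    print(f"{rel.cause} -> {rel.effect} (confidence={rel.confidence})")
```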
@@ -20,7 +20,7 @@ class Settings(BaseSettings):
     model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore")

     anthropic_api_key: str | None = Field(default=None, validation_alias="ANTHROPIC_API_KEY")
-    claude_model: str = Field(default=os.getenv("MULTI_DOC_CLAUDE_MODEL", "claude-3-5-sonnet-20241022"))
+    claude_model: str = Field(default=os.getenv("MULTI_DOC_CLAUDE_MODEL", os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest")))
     claude_max_input_tokens: int = Field(default=200_000)
     claude_max_output_tokens: int = Field(default=16_000)

@@ -37,6 +37,27 @@ class Settings(BaseSettings):
     job_retention_days: int = Field(default=30)

+    # Qwen2.5-VL API configuration
+    qwen_api_key: str | None = Field(default=None, validation_alias="QWEN_API_KEY")
+    qwen_api_url: str = Field(default=os.getenv("QWEN_API_URL", "https://api.example.com/v1/chat/completions"))
+    qwen_model: str = Field(default=os.getenv("QWEN_MODEL", "qwen2.5-vl"))
+
+    # DoWhy configuration
+    dowhy_enabled: bool = Field(default=True)
+    dowhy_confidence_threshold: float = Field(default=0.05)
+
+    # Embedding configuration
+    embedding_model: str = Field(default="sentence-transformers/all-MiniLM-L6-v2")
+    embedding_dimension: int = Field(default=384)
+
+    # Qdrant configuration
+    qdrant_url: str = Field(default=os.getenv("QDRANT_URL", "http://localhost:6333"))
+    qdrant_collection_name: str = Field(default="kg_embeddings")
+    qdrant_vector_size: int = Field(default=384)
+
+    # Report generation configuration
+    report_format: str = Field(default="markdown")
+
     def ensure_storage_dirs(self) -> None:
         (self.storage_root / "jobs").mkdir(parents=True, exist_ok=True)
         (self.storage_root / "uploads").mkdir(parents=True, exist_ok=True)
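
The new `dowhy_enabled` / `dowhy_confidence_threshold` settings suggest refuting extracted edges against tabular evidence. The sketch below is one plausible shape for that check; the synthetic data, column names, and the way the 0.05 threshold would gate acceptance are all assumptions rather than service code.

```python
# Rough sketch: sanity-check a cause→effect edge with DoWhy on synthetic data.
import numpy as np
import pandas as pd
from dowhy import CausalModel

rng = np.random.default_rng(0)
season = rng.integers(0, 2, size=500)                      # hypothetical confounder
rain = (season | (rng.random(500) > 0.7)).astype(int)      # treatment
flooding = ((rain == 1) & (rng.random(500) > 0.3)).astype(int)  # outcome
df = pd.DataFrame({"season": season, "rain": rain, "flooding": flooding})

model = CausalModel(data=df, treatment="rain", outcome="flooding", common_causes=["season"])
estimand = model.identify_effect(proceed_when_unidentifiable=True)
estimate = model.estimate_effect(estimand, method_name="backdoor.linear_regression")
print("estimated effect:", estimate.value)

# A refuter provides a p-value-style check; a service could keep the edge only when
# the refutation does not reject it at the configured threshold (default 0.05).
refutation = model.refute_estimate(estimand, estimate, method_name="placebo_treatment_refuter")
print(refutation)
```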
@@ -1,168 +0,0 @@
from __future__ import annotations

import logging
from pathlib import Path
from typing import List

logger = logging.getLogger(__name__)

# Try to import unstructured, but fall back to alternatives if not available
try:
    from unstructured.partition.auto import partition
    HAS_UNSTRUCTURED = True
except ImportError:
    HAS_UNSTRUCTURED = False
    logger.warning("unstructured not available, will use fallback extractors")

# Fallback extractors
try:
    import pdfplumber
    HAS_PDFPLUMBER = True
except ImportError:
    HAS_PDFPLUMBER = False

try:
    from docx import Document as DocxDocument
    HAS_DOCX = True
except ImportError:
    HAS_DOCX = False

try:
    from pptx import Presentation
    HAS_PPTX = True
except ImportError:
    HAS_PPTX = False

# Image processing libraries
try:
    from PIL import Image
    import pytesseract
    HAS_OCR = True
except ImportError:
    HAS_OCR = False
    logger.warning("OCR libraries not available, image extraction will be limited")


def extract_text(path: Path) -> str:
    """
    Extract text from a file using multiple strategies.
    Falls back through: unstructured -> format-specific -> plain text read.
    """
    suffix = path.suffix.lower()

    # Validate PDF file before processing
    if suffix == ".pdf":
        # Quick validation: check if file starts with PDF magic bytes
        try:
            with path.open("rb") as f:
                header = f.read(4)
                if header != b"%PDF":
                    raise ValueError(
                        f"File {path.name} does not appear to be a valid PDF. "
                        f"PDF files must start with '%PDF' magic bytes. "
                        f"Got: {header[:20] if len(header) > 0 else 'empty file'}"
                    )
        except Exception as exc:
            if isinstance(exc, ValueError):
                raise
            logger.warning("Could not validate PDF header: %s", exc)

    # Image files - return empty text (will be processed directly with Claude Vision)
    # We skip OCR and send images directly to Claude Vision API
    if suffix in {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}:
        logger.info("Image file detected: %s. Will be processed directly with Claude Vision (no OCR)", path.name)
        # Return empty string - images will be handled separately in pipeline
        return ""

    # Plain text files - direct read
    if suffix in {".txt", ".md", ".json", ".xml", ".html", ".csv"}:
        try:
            return path.read_text(encoding="utf-8", errors="ignore")
        except Exception as exc:
            logger.warning("Failed to read %s as text: %s", path, exc)
            raise

    # Try unstructured first (if available)
    if HAS_UNSTRUCTURED:
        try:
            elements = partition(filename=str(path))
            lines: List[str] = []
            for element in elements:
                text = getattr(element, "text", None)
                if text:
                    lines.append(text.strip())
            if lines:
                logger.info("Extracted %d lines using unstructured", len(lines))
                return "\n".join(lines)
        except Exception as exc:
            logger.warning("unstructured extraction failed for %s: %s", path, exc)
            # Continue to fallback methods

    # Fallback: PDF with pdfplumber
    if suffix == ".pdf" and HAS_PDFPLUMBER:
        try:
            with pdfplumber.open(path) as pdf:
                text_parts = []
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text_parts.append(page_text)
                if text_parts:
                    logger.info("Extracted PDF using pdfplumber")
                    return "\n".join(text_parts)
        except Exception as exc:
            logger.warning("pdfplumber extraction failed for %s: %s", path, exc)

    # Fallback: DOCX
    if suffix == ".docx" and HAS_DOCX:
        try:
            doc = DocxDocument(path)
            paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
            if paragraphs:
                logger.info("Extracted DOCX using python-docx")
                return "\n".join(paragraphs)
        except Exception as exc:
            logger.warning("python-docx extraction failed for %s: %s", path, exc)

    # Fallback: PPTX
    if suffix in {".pptx", ".ppt"} and HAS_PPTX:
        try:
            prs = Presentation(path)
            text_parts = []
            for slide in prs.slides:
                for shape in slide.shapes:
                    if hasattr(shape, "text") and shape.text:
                        text_parts.append(shape.text.strip())
            if text_parts:
                logger.info("Extracted PPTX using python-pptx")
                return "\n".join(text_parts)
        except Exception as exc:
            logger.warning("python-pptx extraction failed for %s: %s", path, exc)

    # Last resort: try to read as text anyway, but validate it's readable
    try:
        content = path.read_text(encoding="utf-8", errors="ignore")
        if content.strip():
            # Check if content is actually readable text (not binary data)
            # Simple heuristic: if >30% of characters are printable, consider it text
            printable_chars = sum(1 for c in content if c.isprintable() or c.isspace())
            total_chars = len(content)

            if total_chars > 0 and printable_chars / total_chars > 0.3:
                logger.warning("Read %s as plain text (may contain binary data)", path)
                return content
            else:
                logger.error("Content from %s appears to be binary data, cannot extract text", path)
                raise ValueError(f"File {path} appears to be binary or corrupted. Cannot extract readable text.")
    except Exception as exc:
        if isinstance(exc, ValueError):
            raise
        logger.warning("Failed to read %s as text: %s", path, exc)

    # If all else fails, raise an error
    raise ValueError(
        f"Could not extract text from {path}. "
        f"File type may not be supported, file may be corrupted, or dependencies are missing. "
        f"Supported formats: PDF, DOCX, PPTX, XLSX, TXT, MD, JSON, XML, HTML, CSV, PNG, JPG, JPEG (with OCR)"
    )
@@ -0,0 +1,320 @@
from __future__ import annotations

import logging
import re
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional

logger = logging.getLogger(__name__)

try:
    import fitz  # PyMuPDF
    HAS_PYMUPDF = True
except ImportError:
    HAS_PYMUPDF = False
    logger.warning("PyMuPDF not available")

try:
    from docx import Document as DocxDocument
    HAS_DOCX = True
except ImportError:
    HAS_DOCX = False
    logger.warning("python-docx not available")

try:
    from pptx import Presentation
    HAS_PPTX = True
except ImportError:
    HAS_PPTX = False
    logger.warning("python-pptx not available")

try:
    import pandas as pd
    HAS_PANDAS = True
except ImportError:
    HAS_PANDAS = False
    logger.warning("pandas not available")


@dataclass
class ExtractedText:
    """Structured text extraction with context."""
    text: str
    page_number: int
    metadata: dict
    context: Optional[str] = None  # Surrounding context


def extract_text_with_context(path: Path) -> List[ExtractedText]:
    """
    Extract text from PDF using PyMuPDF with page-level context.
    Returns structured text with metadata.
    """
    if not HAS_PYMUPDF:
        raise ImportError("PyMuPDF is required for text extraction")

    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    if path.suffix.lower() != ".pdf":
        # For non-PDF files, fall back to simple text reading
        try:
            text = path.read_text(encoding="utf-8", errors="ignore")
            return [ExtractedText(
                text=text,
                page_number=1,
                metadata={"file_type": path.suffix, "filename": path.name},
                context=None
            )]
        except Exception as exc:
            logger.warning("Failed to read %s as text: %s", path, exc)
            raise

    extracted_pages: List[ExtractedText] = []

    try:
        doc = fitz.open(path)

        for page_num in range(len(doc)):
            page = doc[page_num]

            # Extract text
            text = page.get_text()

            # Extract metadata
            metadata = {
                "page_number": page_num + 1,
                "page_count": len(doc),
                "filename": path.name,
                "file_type": "pdf",
                "page_rect": {
                    "width": page.rect.width,
                    "height": page.rect.height
                }
            }

            # Extract context (surrounding pages for better understanding)
            context = None
            if page_num > 0:
                prev_page = doc[page_num - 1]
                prev_text = prev_page.get_text()[:500]  # First 500 chars of the previous page
                context = f"Previous page context: {prev_text}"

            if text.strip():
                extracted_pages.append(ExtractedText(
                    text=text,
                    page_number=page_num + 1,
                    metadata=metadata,
                    context=context
                ))

        doc.close()
        logger.info("Extracted text from %d pages in %s", len(extracted_pages), path.name)
        return extracted_pages

    except Exception as exc:
        logger.exception("Failed to extract text from PDF %s: %s", path, exc)
        raise


def extract_text_from_docx(path: Path) -> str:
    """
    Extract text from DOCX file using python-docx.
    Reads paragraphs and tables as per README Step 2.2b.
    """
    if not HAS_DOCX:
        raise ImportError("python-docx is required for DOCX extraction")

    try:
        doc = DocxDocument(path)
        text_parts = []

        # Extract paragraphs
        for paragraph in doc.paragraphs:
            if paragraph.text.strip():
                text_parts.append(paragraph.text.strip())

        # Extract tables
        for table in doc.tables:
            table_text = []
            for row in table.rows:
                row_text = []
                for cell in row.cells:
                    if cell.text.strip():
                        row_text.append(cell.text.strip())
                if row_text:
                    table_text.append(" | ".join(row_text))
            if table_text:
                text_parts.append("\n".join(table_text))

        result = "\n\n".join(text_parts)
        logger.info("Extracted %d characters from DOCX %s", len(result), path.name)
        return result
    except Exception as exc:
        logger.exception("Failed to extract text from DOCX %s: %s", path, exc)
        raise


def extract_text_from_pptx(path: Path) -> str:
    """
    Extract text from PPTX file using python-pptx.
    Reads slides, titles, and notes as per README Step 2.2c.
    """
    if not HAS_PPTX:
        raise ImportError("python-pptx is required for PPTX extraction")

    try:
        prs = Presentation(path)
        text_parts = []

        for slide_num, slide in enumerate(prs.slides, 1):
            slide_text = []

            # Extract slide title
            if slide.shapes.title and slide.shapes.title.text:
                slide_text.append(f"Slide {slide_num} Title: {slide.shapes.title.text.strip()}")

            # Extract content from shapes
            for shape in slide.shapes:
                if hasattr(shape, "text") and shape.text.strip():
                    # Skip title (already extracted)
                    if not (slide.shapes.title and shape == slide.shapes.title):
                        slide_text.append(shape.text.strip())

            # Extract notes (if available)
            if hasattr(slide, "notes_slide") and slide.notes_slide:
                notes_text = ""
                for shape in slide.notes_slide.shapes:
                    if hasattr(shape, "text") and shape.text.strip():
                        notes_text += shape.text.strip() + " "
                if notes_text.strip():
                    slide_text.append(f"Notes: {notes_text.strip()}")

            if slide_text:
                text_parts.append("\n".join(slide_text))

        result = "\n\n".join(text_parts)
        logger.info("Extracted %d characters from PPTX %s (%d slides)",
                    len(result), path.name, len(prs.slides))
        return result
    except Exception as exc:
        logger.exception("Failed to extract text from PPTX %s: %s", path, exc)
        raise


def extract_text_from_spreadsheet(path: Path) -> str:
    """
    Extract text from CSV/XLSX file using pandas.
    Reads rows and columns, converts to text representation as per README Step 2.2d.
    """
    if not HAS_PANDAS:
        raise ImportError("pandas is required for spreadsheet extraction")

    try:
        suffix = path.suffix.lower()
        text_parts = []

        if suffix == ".csv":
            df = pd.read_csv(path, encoding="utf-8", encoding_errors="ignore")
        elif suffix in {".xlsx", ".xls"}:
            # Read first sheet by default
            df = pd.read_excel(path, engine="openpyxl" if suffix == ".xlsx" else None)
        else:
            raise ValueError(f"Unsupported spreadsheet format: {suffix}")

        # Convert DataFrame to text representation
        # Add column headers
        headers = " | ".join(str(col) for col in df.columns)
        text_parts.append(f"Columns: {headers}")

        # Add rows (limit to first 1000 rows to avoid huge output)
        max_rows = min(1000, len(df))
        for idx, row in df.head(max_rows).iterrows():
            row_values = " | ".join(str(val) if pd.notna(val) else "" for val in row)
            text_parts.append(f"Row {idx + 1}: {row_values}")

        if len(df) > max_rows:
            text_parts.append(f"... ({len(df) - max_rows} more rows)")

        result = "\n".join(text_parts)
        logger.info("Extracted %d characters from spreadsheet %s (%d rows)",
                    len(result), path.name, len(df))
        return result
    except Exception as exc:
        logger.exception("Failed to extract text from spreadsheet %s: %s", path, exc)
        raise


def clean_text(text: str) -> str:
    """
    Clean extracted text as per README Step 2.3.
    - Remove extra whitespace
    - Fix encoding issues
    - Preserve important structure
    """
    if not text:
        return ""

    # Fix encoding issues (remove non-printable characters except newlines and tabs)
    cleaned = "".join(char for char in text if char.isprintable() or char in "\n\t\r")

    # Remove extra whitespace (but preserve paragraph breaks)
    # Replace multiple spaces with single space
    cleaned = re.sub(r'[ \t]+', ' ', cleaned)

    # Normalize line breaks (preserve double newlines for paragraphs)
    cleaned = re.sub(r'\r\n', '\n', cleaned)  # Windows line breaks
    cleaned = re.sub(r'\r', '\n', cleaned)  # Old Mac line breaks

    # Preserve paragraph structure (double newlines)
    # But remove excessive blank lines (more than 2 consecutive)
    cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)

    # Remove leading/trailing whitespace from each line
    lines = [line.strip() for line in cleaned.split('\n')]
    cleaned = '\n'.join(lines)

    # Remove leading/trailing whitespace overall
    cleaned = cleaned.strip()

    return cleaned


def extract_all_text(path: Path) -> str:
    """
    Extract all text from a file based on type (as per README Step 2).
    Routes to appropriate extractor: PDF, DOCX, PPTX, CSV/XLSX, or plain text.
    """
    suffix = path.suffix.lower()

    # Step 2.2a: PDF
    if suffix == ".pdf" and HAS_PYMUPDF:
        extracted_pages = extract_text_with_context(path)
        text = "\n\n".join([page.text for page in extracted_pages])

    # Step 2.2b: DOCX (Word)
    elif suffix == ".docx" and HAS_DOCX:
        text = extract_text_from_docx(path)

    # Step 2.2c: PPTX (PowerPoint)
    elif suffix in {".pptx", ".ppt"} and HAS_PPTX:
        text = extract_text_from_pptx(path)

    # Step 2.2d: CSV/XLSX (Spreadsheet)
    elif suffix in {".csv", ".xlsx", ".xls"} and HAS_PANDAS:
        text = extract_text_from_spreadsheet(path)

    # Fallback: Plain text files
    else:
        try:
            text = path.read_text(encoding="utf-8", errors="ignore")
        except Exception as exc:
            logger.warning("Failed to read %s as text: %s", path, exc)
            raise

    # Step 2.3: TEXT CLEANING
    text = clean_text(text)

    return text
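
A quick illustration of the new extraction entry points defined above; the file paths are placeholders.

```python
# Illustrative usage of the new extractors; "uploads/requirements_spec.pdf" is a placeholder path.
from pathlib import Path

# Route any supported file through the type-specific extractors plus the cleaning step.
text = extract_all_text(Path("uploads/requirements_spec.pdf"))
print(text[:500])

# For PDFs, the page-level API also exposes metadata and previous-page context.
for page in extract_text_with_context(Path("uploads/requirements_spec.pdf")):
    print(page.page_number, page.metadata["page_count"], len(page.text))
```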
@@ -0,0 +1,153 @@
from __future__ import annotations

import base64
import json
import logging
from pathlib import Path
from typing import Dict, List, Optional

import httpx

from ..config import get_settings

logger = logging.getLogger(__name__)


class QwenVisionClient:
    """Client for Qwen2.5-VL API to extract relationships from diagrams and ERDs."""

    def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None, model: Optional[str] = None):
        settings = get_settings()
        self.api_key = api_key or settings.qwen_api_key
        self.api_url = api_url or settings.qwen_api_url
        self.model = model or settings.qwen_model

        if not self.api_key:
            logger.warning("Qwen API key not configured")

    def extract_relationships_from_image(self, image_path: Path, source_file_id: str) -> List[Dict]:
        """
        Extract relationships (entities, connections, flows) from an image using Qwen2.5-VL.
        Returns a list of extracted relationships.
        """
        if not self.api_key:
            logger.warning("Qwen API key not configured, skipping image analysis")
            return []

        try:
            # Read and encode image
            with open(image_path, "rb") as img_file:
                image_data = img_file.read()

            base64_image = base64.b64encode(image_data).decode("utf-8")

            # Determine media type
            suffix = image_path.suffix.lower()
            media_type_map = {
                ".png": "image/png",
                ".jpg": "image/jpeg",
                ".jpeg": "image/jpeg",
                ".gif": "image/gif",
                ".webp": "image/webp",
            }
            media_type = media_type_map.get(suffix, "image/png")

            # Prepare prompt for relationship extraction
            prompt = """Analyze this diagram/ERD/image and extract all relationships, entities, and connections.

Extract:
1. Entities (boxes, nodes, components)
2. Relationships between entities (arrows, connections, flows)
3. Data flows and dependencies
4. Process flows
5. Architecture patterns

Return JSON with this structure:
[
  {
    "entity1": "name of first entity",
    "entity2": "name of second entity",
    "relationship_type": "causes|depends_on|flows_to|contains|uses",
    "description": "description of the relationship",
    "confidence": 0.0-1.0
  }
]

Focus on cause-effect relationships, dependencies, and flows."""

            # Prepare API request
            payload = {
                "model": self.model,
                "messages": [
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:{media_type};base64,{base64_image}"
                                }
                            },
                            {
                                "type": "text",
                                "text": prompt
                            }
                        ]
                    }
                ],
                "max_tokens": 4000,
                "temperature": 0.0
            }

            headers = {
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json"
            }

            # Make API call
            with httpx.Client(timeout=60.0) as client:
                response = client.post(self.api_url, json=payload, headers=headers)
                response.raise_for_status()
                result = response.json()

            # Parse response
            content = result.get("choices", [{}])[0].get("message", {}).get("content", "")

            if not content:
                logger.warning("Empty response from Qwen API for image %s", image_path.name)
                return []

            # Extract JSON from response
            json_text = content.strip()

            # Try to find JSON in markdown code blocks
            if "```json" in json_text:
                json_text = json_text.split("```json")[1].split("```")[0].strip()
            elif "```" in json_text:
                json_text = json_text.split("```")[1].split("```")[0].strip()

            # Parse JSON
            try:
                relationships = json.loads(json_text)
                if not isinstance(relationships, list):
                    relationships = [relationships]

                # Add source metadata
                for rel in relationships:
                    rel["source_file_id"] = source_file_id
                    rel["source_image"] = str(image_path.name)
                    rel["extraction_method"] = "qwen2.5-vl"

                logger.info("Extracted %d relationships from image %s using Qwen2.5-VL",
                            len(relationships), image_path.name)
                return relationships

            except json.JSONDecodeError as e:
                logger.warning("Failed to parse Qwen response as JSON: %s. Content: %s",
                               e, content[:200])
                return []

        except Exception as exc:
            logger.exception("Failed to extract relationships from image %s: %s", image_path, exc)
            return []
|
|||||||
|
|
||||||
import logging
|
import logging
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
from pathlib import Path
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
from fastapi import BackgroundTasks, Depends, FastAPI, File, Form, HTTPException, UploadFile
|
from fastapi import BackgroundTasks, Depends, FastAPI, File, Form, HTTPException, UploadFile
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
|
from fastapi.responses import FileResponse
|
||||||
|
|
||||||
from .claude_client import ClaudeCausalExtractor
|
|
||||||
from .config import Settings, get_settings
|
from .config import Settings, get_settings
|
||||||
from .jobs import JobStore
|
from .jobs import JobStore
|
||||||
from .models import CreateJobResponse, JobGraphSummary, JobStage, JobStatusResponse
|
from .models import CreateJobResponse, JobGraphSummary, JobStage, JobStatusResponse, ProjectReport
|
||||||
from .processors.graph_writer import GraphWriter
|
from .processors.graph_writer import GraphWriter
|
||||||
from .storage import StorageManager
|
from .storage import StorageManager
|
||||||
from .workflows.pipeline import JobPipeline
|
from .workflows.pipeline import JobPipeline
|
||||||
@ -20,8 +21,8 @@ logging.basicConfig(level=logging.INFO)
|
|||||||
|
|
||||||
app = FastAPI(
|
app = FastAPI(
|
||||||
title="Multi Document Upload Service",
|
title="Multi Document Upload Service",
|
||||||
version="0.1.0",
|
version="0.2.0",
|
||||||
description="Processes multi-format documents to build causal knowledge graphs using Claude.",
|
description="Processes multi-format documents to build knowledge graphs and generate beginner-friendly onboarding reports.",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -40,7 +41,6 @@ class ServiceContainer:
|
|||||||
storage: StorageManager
|
storage: StorageManager
|
||||||
job_store: JobStore
|
job_store: JobStore
|
||||||
graph_writer: GraphWriter
|
graph_writer: GraphWriter
|
||||||
claude_extractor: ClaudeCausalExtractor
|
|
||||||
pipeline: JobPipeline
|
pipeline: JobPipeline
|
||||||
|
|
||||||
|
|
||||||
@ -51,29 +51,24 @@ def get_container() -> ServiceContainer:
|
|||||||
global _container
|
global _container
|
||||||
if _container is None:
|
if _container is None:
|
||||||
settings = get_settings()
|
settings = get_settings()
|
||||||
if not settings.anthropic_api_key:
|
# Anthropic API key is only needed for report generation, not required at startup
|
||||||
raise HTTPException(status_code=500, detail="ANTHROPIC_API_KEY is not configured")
|
# if not settings.anthropic_api_key:
|
||||||
|
# raise HTTPException(status_code=500, detail="ANTHROPIC_API_KEY is not configured")
|
||||||
|
|
||||||
storage = StorageManager(settings.storage_root)
|
storage = StorageManager(settings.storage_root)
|
||||||
job_store = JobStore(settings.storage_root)
|
job_store = JobStore(settings.storage_root)
|
||||||
graph_writer = GraphWriter(settings.neo4j_uri, settings.neo4j_user, settings.neo4j_password)
|
graph_writer = GraphWriter(settings.neo4j_uri, settings.neo4j_user, settings.neo4j_password)
|
||||||
claude_extractor = ClaudeCausalExtractor(
|
|
||||||
api_key=settings.anthropic_api_key,
|
|
||||||
model=settings.claude_model,
|
|
||||||
max_output_tokens=min(settings.claude_max_output_tokens, 4000),
|
|
||||||
)
|
|
||||||
pipeline = JobPipeline(
|
pipeline = JobPipeline(
|
||||||
job_store=job_store,
|
job_store=job_store,
|
||||||
storage=storage,
|
storage=storage,
|
||||||
graph_writer=graph_writer,
|
graph_writer=graph_writer,
|
||||||
claude_extractor=claude_extractor,
|
|
||||||
)
|
)
|
||||||
_container = ServiceContainer(
|
_container = ServiceContainer(
|
||||||
settings=settings,
|
settings=settings,
|
||||||
storage=storage,
|
storage=storage,
|
||||||
job_store=job_store,
|
job_store=job_store,
|
||||||
graph_writer=graph_writer,
|
graph_writer=graph_writer,
|
||||||
claude_extractor=claude_extractor,
|
|
||||||
pipeline=pipeline,
|
pipeline=pipeline,
|
||||||
)
|
)
|
||||||
return _container
|
return _container
|
||||||
@ -170,14 +165,86 @@ async def get_job_graph(job_id: str, container: ServiceContainer = Depends(get_d
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/jobs/{job_id}/report", response_model=ProjectReport)
|
||||||
|
async def get_job_report(job_id: str, container: ServiceContainer = Depends(get_dependencies)) -> ProjectReport:
|
||||||
|
"""Get the generated beginner-friendly onboarding report."""
|
||||||
|
job_store = container.job_store
|
||||||
|
if not job_store.exists(job_id):
|
||||||
|
raise HTTPException(status_code=404, detail="Job not found")
|
||||||
|
job = job_store.get(job_id)
|
||||||
|
if job.stage != JobStage.COMPLETED:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=409,
|
||||||
|
detail="Report not ready yet. Job is still processing."
|
||||||
|
)
|
||||||
|
if not job.report:
|
||||||
|
# Check if there was an error during report generation
|
||||||
|
error_msg = "Report not found. "
|
||||||
|
if job.error:
|
||||||
|
# Check if error is specifically about report generation
|
||||||
|
if "report generation" in job.error.lower() or "claude" in job.error.lower():
|
||||||
|
error_msg = job.error
|
||||||
|
else:
|
||||||
|
error_msg += f"Error during generation: {job.error}"
|
||||||
|
else:
|
||||||
|
error_msg += "Report generation may have failed (check logs for details)."
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=404,
|
||||||
|
detail=error_msg
|
||||||
|
)
|
||||||
|
return job.report
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/jobs/{job_id}/report/pdf")
|
||||||
|
async def get_job_report_pdf(job_id: str, container: ServiceContainer = Depends(get_dependencies)):
|
||||||
|
"""Download the PDF version of the onboarding report (as per README Step 7.9)."""
|
||||||
|
job_store = container.job_store
|
||||||
|
if not job_store.exists(job_id):
|
||||||
|
raise HTTPException(status_code=404, detail="Job not found")
|
||||||
|
job = job_store.get(job_id)
|
||||||
|
if job.stage != JobStage.COMPLETED:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=409,
|
||||||
|
detail="Report not ready yet. Job is still processing."
|
||||||
|
)
|
||||||
|
if not job.report:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=404,
|
||||||
|
detail="Report not found. Job may have completed without generating report."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get PDF path from report metadata
|
||||||
|
pdf_path_str = job.report.metadata.get("pdf_path")
|
||||||
|
if not pdf_path_str:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=404,
|
||||||
|
detail="PDF not available. Report may have been generated without PDF conversion."
|
||||||
|
)
|
||||||
|
|
||||||
|
pdf_path = Path(pdf_path_str)
|
||||||
|
if not pdf_path.exists():
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=404,
|
||||||
|
detail="PDF file not found on server."
|
||||||
|
)
|
||||||
|
|
||||||
|
return FileResponse(
|
||||||
|
path=pdf_path,
|
||||||
|
media_type="application/pdf",
|
||||||
|
filename=f"onboarding_report_{job_id}.pdf"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@app.get("/health")
|
@app.get("/health")
|
||||||
async def healthcheck(container: ServiceContainer = Depends(get_dependencies)):
|
async def healthcheck(container: ServiceContainer = Depends(get_dependencies)):
|
||||||
settings = container.settings
|
settings = container.settings
|
||||||
return {
|
return {
|
||||||
"status": "ok",
|
"status": "ok",
|
||||||
"claude_model": settings.claude_model,
|
"claude_model": settings.claude_model,
|
||||||
"max_input_tokens_per_min": settings.claude_max_input_tokens,
|
"qwen_model": settings.qwen_model,
|
||||||
"max_output_tokens_per_min": settings.claude_max_output_tokens,
|
"embedding_model": settings.embedding_model,
|
||||||
|
"qdrant_url": settings.qdrant_url,
|
||||||
|
"dowhy_enabled": settings.dowhy_enabled,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
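A minimal client sketch for the two new report endpoints, assuming the service is reachable at http://localhost:8000; the base URL, port, and job id are placeholders, not values taken from this diff:

    import requests

    BASE_URL = "http://localhost:8000"  # assumed service address
    job_id = "example-job-id"           # placeholder

    # JSON report: 409 while the job is still processing, 404 if generation failed
    resp = requests.get(f"{BASE_URL}/jobs/{job_id}/report", timeout=30)
    if resp.status_code == 200:
        report = resp.json()
        print(report["title"], "-", len(report["content"]), "characters of Markdown")

    # PDF download, available only if the pipeline stored a pdf_path in report.metadata
    pdf = requests.get(f"{BASE_URL}/jobs/{job_id}/report/pdf", timeout=60)
    if pdf.status_code == 200:
        with open(f"onboarding_report_{job_id}.pdf", "wb") as fh:
            fh.write(pdf.content)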
@ -10,9 +10,10 @@ from pydantic import BaseModel, Field

 class JobStage(str, Enum):
     RECEIVED = "received"
     SAVING_FILES = "saving_files"
-    EXTRACTING = "extracting"
+    EXTRACTING = "extracting"  # PyMuPDF + Qwen2.5-VL
-    ANALYZING = "analyzing"
+    BUILDING_GRAPH = "building_graph"  # DoWhy + Neo4j
-    BUILDING_GRAPH = "building_graph"
+    INDEXING_VECTORS = "indexing_vectors"  # Qdrant
+    GENERATING_REPORT = "generating_report"  # Claude onboarding doc
     COMPLETED = "completed"
     FAILED = "failed"

@ -34,6 +35,7 @@ class CausalRelation(BaseModel):

     explanation: Optional[str] = None
     source_file_id: Optional[str] = None
     source_snippet: Optional[str] = None
+    relationship_type: str = Field(default="CAUSES")  # DEPENDS_ON, USES, IMPLEMENTS, etc.
     metadata: Dict[str, Any] = Field(default_factory=dict)

@ -46,6 +48,7 @@ class JobRecord(BaseModel):

     total_files: int = 0
     processed_files: int = 0
     relations: List[CausalRelation] = Field(default_factory=list)
+    report: Optional[ProjectReport] = None  # Generated onboarding report
     created_at: datetime = Field(default_factory=datetime.utcnow)
     updated_at: datetime = Field(default_factory=datetime.utcnow)
     error: str | None = None

@ -82,3 +85,15 @@ class JobGraphSummary(BaseModel):

     edge_count: int
     generated_at: datetime
+
+
+class ProjectReport(BaseModel):
+    """Beginner-friendly onboarding report generated from project documents."""
+    job_id: str
+    title: str = "Project Onboarding Guide"
+    content: str  # Markdown content
+    sections: Dict[str, str] = Field(default_factory=dict)  # Section name -> content
+    key_concepts: List[str] = Field(default_factory=list)  # Important concepts covered
+    total_pages: int = 0  # Estimated pages
+    generated_at: datetime = Field(default_factory=datetime.utcnow)
+    metadata: Dict[str, Any] = Field(default_factory=dict)
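An illustrative ProjectReport instance of the kind the pipeline is expected to attach to JobRecord.report once generation succeeds; every field value below is made up for the example:

    report = ProjectReport(
        job_id="example-job-id",
        content="# Project Onboarding Guide\n\n## Overview\n...",
        sections={"Overview": "...", "Architecture": "..."},
        key_concepts=["API Gateway", "Neo4j", "Qdrant"],
        total_pages=12,
        metadata={"pdf_path": "/data/reports/example-job-id/onboarding.pdf"},  # read by /report/pdf
    )
    print(report.title, "-", len(report.key_concepts), "key concepts")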
@ -1,24 +0,0 @@
from __future__ import annotations

from typing import Iterable, List

import tiktoken


class TextChunker:
    def __init__(self, model_name: str, token_target: int = 800, overlap: int = 200):
        self.encoder = tiktoken.encoding_for_model("gpt-4o") if "claude" not in model_name else tiktoken.get_encoding("cl100k_base")
        self.token_target = token_target
        self.overlap = overlap

    def chunk(self, text: str) -> Iterable[str]:
        tokens = self.encoder.encode(text)
        step = max(self.token_target - self.overlap, 1)
        chunks: List[str] = []
        for start in range(0, len(tokens), step):
            end = min(start + self.token_target, len(tokens))
            chunk_tokens = tokens[start:end]
            chunk_text = self.encoder.decode(chunk_tokens)
            chunks.append(chunk_text)
        return chunks
@ -0,0 +1,187 @@
from __future__ import annotations

import logging
from typing import List, Optional

import pandas as pd

from ..config import get_settings
from ..models import CausalRelation

logger = logging.getLogger(__name__)

try:
    import dowhy
    from dowhy import CausalModel
    HAS_DOWHY = True
except ImportError:
    HAS_DOWHY = False
    logger.warning("DoWhy not available")


class DoWhyAnalyzer:
    """Validate causal relationships using DoWhy Structural Causal Models."""

    def __init__(self, confidence_threshold: Optional[float] = None):
        if not HAS_DOWHY:
            raise ImportError("DoWhy is required for causal analysis")

        settings = get_settings()
        self.confidence_threshold = confidence_threshold or settings.dowhy_confidence_threshold
        self.enabled = settings.dowhy_enabled

    def validate_relationships(
        self,
        relationships: List[CausalRelation],
        text_data: Optional[str] = None
    ) -> List[CausalRelation]:
        """
        Validate causal relationships using DoWhy SCM.
        Filters out relationships that don't pass validation.
        """
        if not self.enabled:
            logger.info("DoWhy validation is disabled, returning all relationships")
            return relationships

        if not relationships:
            return []

        validated: List[CausalRelation] = []

        # Group relationships by cause to build SCM
        cause_groups = {}
        for rel in relationships:
            cause = rel.cause
            if cause not in cause_groups:
                cause_groups[cause] = []
            cause_groups[cause].append(rel)

        # Validate each group
        for cause, effects in cause_groups.items():
            for rel in effects:
                try:
                    is_valid = self._validate_single_relationship(rel, relationships, text_data)
                    if is_valid:
                        # Update confidence with validation score
                        rel.confidence = min(rel.confidence + 0.1, 0.95)  # Boost validated relationships
                        rel.metadata["dowhy_validated"] = True
                        validated.append(rel)
                    else:
                        logger.debug("DoWhy validation failed for: %s -> %s", rel.cause, rel.effect)
                except Exception as exc:
                    logger.warning("DoWhy validation error for %s -> %s: %s",
                                   rel.cause, rel.effect, exc)
                    # If validation fails, keep the relationship but mark it
                    rel.metadata["dowhy_validated"] = False
                    rel.metadata["dowhy_error"] = str(exc)
                    validated.append(rel)  # Keep it but with lower confidence

        logger.info("DoWhy validated %d/%d relationships", len(validated), len(relationships))
        return validated

    def _validate_single_relationship(
        self,
        relationship: CausalRelation,
        all_relationships: List[CausalRelation],
        text_data: Optional[str] = None
    ) -> bool:
        """
        Validate a single relationship using DoWhy.
        Returns True if relationship is valid, False otherwise.
        """
        try:
            # Build a simple causal graph from relationships
            # Extract unique variables (causes and effects)
            variables = set()
            for rel in all_relationships:
                variables.add(rel.cause)
                variables.add(rel.effect)

            # Create a simple dataset for DoWhy
            # Since we don't have actual data, we'll use a heuristic approach
            # based on relationship frequency and structure

            # Check if there's a path from cause to effect in the graph
            has_path = self._check_causal_path(
                relationship.cause,
                relationship.effect,
                all_relationships
            )

            if not has_path:
                return False

            # Additional validation: check for confounders
            # If there are many relationships involving both cause and effect,
            # it's more likely to be valid
            related_count = sum(
                1 for rel in all_relationships
                if rel.cause == relationship.cause or rel.effect == relationship.effect
            )

            # If there are multiple relationships involving these concepts,
            # it's more likely to be a valid causal relationship
            if related_count >= 2:
                return True

            # For single relationships, use confidence threshold
            return relationship.confidence >= 0.6

        except Exception as exc:
            logger.warning("DoWhy validation error: %s", exc)
            return False

    def _check_causal_path(
        self,
        cause: str,
        effect: str,
        relationships: List[CausalRelation],
        max_depth: int = 3
    ) -> bool:
        """Check if there's a causal path from cause to effect."""
        if max_depth == 0:
            return False

        # Direct relationship
        for rel in relationships:
            if rel.cause == cause and rel.effect == effect:
                return True

        # Indirect relationship (transitive)
        for rel in relationships:
            if rel.cause == cause:
                # Check if rel.effect leads to the target effect
                if self._check_causal_path(rel.effect, effect, relationships, max_depth - 1):
                    return True

        return False

    def build_scm_from_relationships(
        self,
        relationships: List[CausalRelation]
    ) -> Optional[CausalModel]:
        """
        Build a Structural Causal Model from relationships.
        This is a simplified version for text-based causal inference.
        """
        if not relationships:
            return None

        try:
            # Extract all unique variables
            variables = set()
            for rel in relationships:
                variables.add(rel.cause)
                variables.add(rel.effect)

            # Create a simple adjacency matrix representation
            # This is a heuristic approach since we don't have actual data

            # For now, return None as building a full SCM requires actual data
            # The validation uses graph-based heuristics instead
            return None

        except Exception as exc:
            logger.warning("Failed to build SCM: %s", exc)
            return None
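A rough sketch of how the validator above might be called from the pipeline; the import paths and the sample relations are assumptions, not code from this diff:

    from app.models import CausalRelation                     # path assumed
    from app.processors.dowhy_analyzer import DoWhyAnalyzer   # path assumed

    relations = [
        CausalRelation(cause="API Gateway", effect="Auth Service", confidence=0.7,
                       relationship_type="DEPENDS_ON"),
        CausalRelation(cause="Auth Service", effect="User Database", confidence=0.9),
    ]

    analyzer = DoWhyAnalyzer()  # raises ImportError when DoWhy is not installed
    validated = analyzer.validate_relationships(relations)
    for rel in validated:
        print(rel.cause, "->", rel.effect, "validated:", rel.metadata.get("dowhy_validated"))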
@ -0,0 +1,85 @@
from __future__ import annotations

import logging
from typing import List

from ..config import get_settings

logger = logging.getLogger(__name__)

try:
    from sentence_transformers import SentenceTransformer
    HAS_SENTENCE_TRANSFORMERS = True
except ImportError:
    HAS_SENTENCE_TRANSFORMERS = False
    logger.warning("sentence-transformers not available")


class Embedder:
    """Generate embeddings using sentence-transformers."""

    def __init__(self, model_name: str | None = None):
        if not HAS_SENTENCE_TRANSFORMERS:
            raise ImportError("sentence-transformers is required for embeddings")

        settings = get_settings()
        self.model_name = model_name or settings.embedding_model

        logger.info("Loading embedding model: %s", self.model_name)
        try:
            self.model = SentenceTransformer(self.model_name)
            self.dimension = self.model.get_sentence_embedding_dimension()
            logger.info("Loaded embedding model with dimension: %d", self.dimension)
        except Exception as exc:
            logger.exception("Failed to load embedding model %s: %s", self.model_name, exc)
            raise

    def embed_text(self, text: str) -> List[float]:
        """Generate embedding for a single text."""
        if not text or not text.strip():
            # Return zero vector for empty text
            return [0.0] * self.dimension

        try:
            embedding = self.model.encode(text, normalize_embeddings=True)
            return embedding.tolist()
        except Exception as exc:
            logger.warning("Failed to embed text: %s", exc)
            return [0.0] * self.dimension

    def embed_batch(self, texts: List[str], batch_size: int = 32) -> List[List[float]]:
        """Generate embeddings for a batch of texts."""
        if not texts:
            return []

        try:
            embeddings = self.model.encode(
                texts,
                batch_size=batch_size,
                normalize_embeddings=True,
                show_progress_bar=False
            )
            return embeddings.tolist()
        except Exception as exc:
            logger.warning("Failed to embed batch: %s", exc)
            return [[0.0] * self.dimension] * len(texts)

    def embed_relation(self, cause: str, effect: str, explanation: str | None = None) -> List[float]:
        """Generate embedding for a cause-effect relationship."""
        # Combine cause, effect, and explanation into a single text
        parts = [cause, "causes", effect]
        if explanation:
            parts.append(explanation)

        text = " ".join(parts)
        return self.embed_text(text)

    def embed_concept(self, concept_name: str, description: str | None = None) -> List[float]:
        """Generate embedding for a concept/node."""
        if description:
            text = f"{concept_name}: {description}"
        else:
            text = concept_name

        return self.embed_text(text)
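A sketch of how this embedder could feed the Qdrant instance added to docker-compose above (port 6333); the collection name, point id, and payload shape are assumptions, and the qdrant-client calls shown are the standard ones from recent versions of that library:

    from qdrant_client import QdrantClient
    from qdrant_client.models import Distance, PointStruct, VectorParams

    embedder = Embedder()  # uses settings.embedding_model
    client = QdrantClient(url="http://localhost:6333")

    collection = "causal_relations"  # assumed collection name
    existing = {c.name for c in client.get_collections().collections}
    if collection not in existing:
        client.create_collection(
            collection_name=collection,
            vectors_config=VectorParams(size=embedder.dimension, distance=Distance.COSINE),
        )

    vector = embedder.embed_relation("API Gateway", "Auth Service", "routes login traffic")
    client.upsert(
        collection_name=collection,
        points=[PointStruct(id=1, vector=vector,
                            payload={"cause": "API Gateway", "effect": "Auth Service"})],
    )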
@ -0,0 +1,253 @@
from __future__ import annotations

import json
import logging
import re
from typing import Dict, List, Set

from anthropic import Anthropic, BadRequestError

from ..config import get_settings
from ..models import CausalRelation

logger = logging.getLogger(__name__)


class EntityResolver:
    """
    Resolve entity mentions using Claude AI as per README Stage 4.
    Identifies that different mentions refer to the same entity.
    """

    def __init__(self):
        settings = get_settings()
        self.api_key = settings.anthropic_api_key
        self.model = settings.claude_model
        self.max_output_tokens = settings.claude_max_output_tokens

        if not self.api_key:
            logger.warning("ANTHROPIC_API_KEY not set - Entity resolution will be skipped")
            self.client = None
        else:
            try:
                self.client = Anthropic(api_key=self.api_key)
                logger.info("EntityResolver initialized with Claude AI")
            except Exception as e:
                logger.warning("Failed to initialize Claude AI for entity resolution: %s", e)
                self.client = None

    def resolve_entities(self, relations: List[CausalRelation]) -> Dict[str, Dict]:
        """
        Resolve entity mentions across all documents as per README Step 4.

        Step 4.1: Collect all entities
        Step 4.2: Group by entity type
        Step 4.3: AI-powered resolution (Claude API)
        Step 4.4: Create canonical names

        Returns mapping: canonical_name -> {mentions, type, role, confidence}
        """
        if not self.client:
            logger.info("Entity resolution skipped (Claude AI not available)")
            return {}

        if not relations:
            return {}

        # Step 4.1: COLLECT ALL ENTITIES
        all_mentions: Set[str] = set()
        for rel in relations:
            all_mentions.add(rel.cause.strip())
            all_mentions.add(rel.effect.strip())

        if not all_mentions:
            return {}

        logger.info("Collecting %d entity mentions for resolution", len(all_mentions))

        # Step 4.2: GROUP BY ENTITY TYPE (simple heuristic)
        people_mentions = []
        project_mentions = []
        team_mentions = []
        other_mentions = []

        for mention in all_mentions:
            mention_lower = mention.lower()
            if any(word in mention_lower for word in ["team", "department", "group", "division"]):
                team_mentions.append(mention)
            elif any(word in mention_lower for word in ["project", "system", "application", "platform"]):
                project_mentions.append(mention)
            elif len(mention.split()) <= 3 and not any(char.isdigit() for char in mention):
                # Likely a person name (short, no numbers)
                people_mentions.append(mention)
            else:
                other_mentions.append(mention)

        # Step 4.3: AI-POWERED RESOLUTION (Claude API)
        resolved_entities = {}

        # Resolve people
        if people_mentions:
            people_resolved = self._resolve_with_claude(people_mentions, "Person")
            resolved_entities.update(people_resolved)

        # Resolve projects
        if project_mentions:
            projects_resolved = self._resolve_with_claude(project_mentions, "Project")
            resolved_entities.update(projects_resolved)

        # Resolve teams
        if team_mentions:
            teams_resolved = self._resolve_with_claude(team_mentions, "Team")
            resolved_entities.update(teams_resolved)

        # Resolve others
        if other_mentions:
            others_resolved = self._resolve_with_claude(other_mentions, "Entity")
            resolved_entities.update(others_resolved)

        logger.info("Resolved %d entities from %d mentions", len(resolved_entities), len(all_mentions))

        return resolved_entities

    def _resolve_with_claude(self, mentions: List[str], entity_type: str) -> Dict[str, Dict]:
        """Use Claude AI to resolve entity mentions."""
        if not self.client or not mentions:
            return {}

        try:
            system_prompt = """You are an expert at entity resolution. Your task is to identify which mentions refer to the same real-world entity.

Analyze the given list of entity mentions and group them by the actual entity they refer to.

Return a JSON object where:
- Key: Canonical name (best/most complete name)
- Value: Object with:
  - "mentions": List of all mentions that refer to this entity
  - "type": Entity type (Person, Project, Team, etc.)
  - "role": Role or description (if applicable)
  - "confidence": Confidence score (0.0 to 1.0)

Example:
{
  "John Smith": {
    "mentions": ["John", "J. Smith", "John Smith", "Smith"],
    "type": "Person",
    "role": "Project Lead",
    "confidence": 0.95
  },
  "Project Alpha": {
    "mentions": ["Project Alpha", "Alpha", "The Alpha Project"],
    "type": "Project",
    "role": null,
    "confidence": 0.90
  }
}

Be thorough and group all related mentions together."""

            user_prompt = f"""Analyze these {entity_type} entity mentions and resolve which ones refer to the same entity:

{json.dumps(mentions, indent=2)}

Return a JSON object mapping canonical names to their resolved mentions."""

            message = self.client.messages.create(
                model=self.model,
                max_tokens=self.max_output_tokens,
                temperature=0.2,  # Lower temperature for more consistent resolution
                system=system_prompt,
                messages=[{"role": "user", "content": user_prompt}]
            )

            response_text = "".join(
                block.text for block in message.content
                if hasattr(block, "text")
            )

            if not response_text:
                logger.warning("Empty response from Claude for entity resolution")
                return {}

            # Parse JSON response
            try:
                json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
                if json_match:
                    json_text = json_match.group(0)
                else:
                    json_text = response_text

                resolved = json.loads(json_text)

                # Validate and structure the response
                result = {}
                for canonical_name, entity_data in resolved.items():
                    if isinstance(entity_data, dict):
                        result[canonical_name] = {
                            "mentions": entity_data.get("mentions", [canonical_name]),
                            "type": entity_data.get("type", entity_type),
                            "role": entity_data.get("role"),
                            "confidence": float(entity_data.get("confidence", 0.85))
                        }
                    else:
                        # Fallback if structure is different
                        result[canonical_name] = {
                            "mentions": [canonical_name] if isinstance(entity_data, str) else entity_data,
                            "type": entity_type,
                            "role": None,
                            "confidence": 0.8
                        }

                return result

            except json.JSONDecodeError as e:
                logger.warning("Failed to parse Claude response as JSON: %s. Response: %s",
                               e, response_text[:500])
                return {}

        except BadRequestError as e:
            logger.warning("Claude API error during entity resolution: %s", e)
            return {}
        except Exception as e:
            logger.warning("Entity resolution failed: %s", e)
            return {}

    def apply_resolution_to_relations(
        self,
        relations: List[CausalRelation],
        resolved_entities: Dict[str, Dict]
    ) -> List[CausalRelation]:
        """
        Apply entity resolution to relationships.
        Replace mentions with canonical names.
        """
        if not resolved_entities:
            return relations

        # Create reverse mapping: mention -> canonical_name
        mention_to_canonical: Dict[str, str] = {}
        for canonical_name, entity_data in resolved_entities.items():
            mentions = entity_data.get("mentions", [])
            for mention in mentions:
                mention_to_canonical[mention.lower()] = canonical_name

        # Update relations with canonical names
        updated_relations = []
        for rel in relations:
            # Resolve cause
            cause_lower = rel.cause.strip().lower()
            if cause_lower in mention_to_canonical:
                rel.cause = mention_to_canonical[cause_lower]

            # Resolve effect
            effect_lower = rel.effect.strip().lower()
            if effect_lower in mention_to_canonical:
                rel.effect = mention_to_canonical[effect_lower]

            # Store resolution info in metadata
            rel.metadata["entity_resolved"] = True
            updated_relations.append(rel)

        logger.info("Applied entity resolution to %d relationships", len(updated_relations))
        return updated_relations
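A short sketch of the resolver in use; the sample relations are invented for illustration, and CausalRelation is assumed to be importable from the service's models module:

    resolver = EntityResolver()  # resolve_entities() returns {} when no API key is configured

    relations = [
        CausalRelation(cause="J. Smith", effect="Project Alpha", confidence=0.8),
        CausalRelation(cause="John Smith", effect="Alpha", confidence=0.85),
    ]

    resolved = resolver.resolve_entities(relations)   # canonical name -> {mentions, type, role, confidence}
    relations = resolver.apply_resolution_to_relations(relations, resolved)
    for rel in relations:
        print(rel.cause, "->", rel.effect)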
@ -1,23 +1,163 @@

 from __future__ import annotations

+import json
 import logging
-from typing import Iterable
+import re
+from typing import Dict, Iterable, List, Optional

+from anthropic import Anthropic, BadRequestError
 from neo4j import GraphDatabase, Transaction

+from ..config import get_settings
 from ..models import CausalRelation

 logger = logging.getLogger(__name__)

-MERGE_QUERY = """
-MERGE (cause:Concept {name: $cause})
-ON CREATE SET cause.created_at = timestamp(), cause.lastSeen = timestamp()
-ON MATCH SET cause.lastSeen = timestamp()
-MERGE (effect:Concept {name: $effect})
-ON CREATE SET effect.created_at = timestamp(), effect.lastSeen = timestamp()
-ON MATCH SET effect.lastSeen = timestamp()
-MERGE (cause)-[r:CAUSES]->(effect)
+# Query to create Document node
+CREATE_DOCUMENT_QUERY = """
+MERGE (doc:Document {filename: $filename})
+ON CREATE SET doc.uploaded_at = timestamp(),
+              doc.file_path = $file_path,
+              doc.job_id = $job_id,
+              doc.created_at = timestamp()
+ON MATCH SET doc.lastSeen = timestamp()
+"""
+
+# Query to create Entity nodes and relationship with dynamic type
+CREATE_ENTITY_RELATIONSHIP_QUERY = """
+MERGE (source:Entity:Concept {name: $source})
+ON CREATE SET source.created_at = timestamp(),
+              source.lastSeen = timestamp(),
+              source.type = COALESCE($source_type, 'Entity')
+ON MATCH SET source.lastSeen = timestamp()
+
+MERGE (target:Entity:Concept {name: $target})
+ON CREATE SET target.created_at = timestamp(),
+              target.lastSeen = timestamp(),
+              target.type = COALESCE($target_type, 'Entity')
+ON MATCH SET target.lastSeen = timestamp()
+
+WITH source, target
+CALL apoc.merge.relationship(
+    source,
+    $rel_type,
+    {confidence: $confidence,
+     explanation: $explanation,
+     source_file_id: $source_file_id,
+     source_snippet: $source_snippet,
+     job_id: $job_id,
+     model: $model,
+     created_at: timestamp(),
+     updated_at: timestamp()},
+    {confidence: $confidence,
+     explanation: $explanation,
+     source_file_id: $source_file_id,
+     source_snippet: $source_snippet,
+     job_id: $job_id,
+     model: $model,
+     updated_at: timestamp()},
+    target
+) YIELD rel
+RETURN rel
+"""
+
+
+class GraphWriter:
+    def __init__(self, uri: str, user: str, password: str):
+        self._driver = GraphDatabase.driver(uri, auth=(user, password))
+
+    def close(self) -> None:
+        self._driver.close()
+
+    def write_documents(self, job_id: str, files: Iterable) -> None:
+        """Create Document nodes for uploaded files."""
+        files_list = list(files)
+        if not files_list:
+            return
+
+        logger.info("Creating %d document nodes for job %s", len(files_list), job_id)
+
+        with self._driver.session() as session:
+            def _write_docs(tx: Transaction) -> None:
+                for file_record in files_list:
+                    try:
+                        tx.run(
+                            CREATE_DOCUMENT_QUERY,
+                            filename=file_record.filename,
+                            file_path=file_record.stored_path,
+                            job_id=job_id
+                        )
+                        logger.debug("Created document node: %s", file_record.filename)
+                    except Exception as exc:
+                        logger.warning("Failed to create document node for %s: %s", file_record.filename, exc)
+
+            session.execute_write(_write_docs)
+            logger.info("Created document nodes for job %s", job_id)
+
+    def write_relations(self, job_id: str, relations: Iterable[CausalRelation], files: Iterable = None) -> None:
+        """Write entities and relationships to Neo4j with multiple relationship types."""
+        relations_list = list(relations)
+        if not relations_list:
+            logger.warning("No relations to write for job %s", job_id)
+            return
+
+        # Create document nodes if files provided
+        if files:
+            self.write_documents(job_id, files)
+
+        logger.info("Writing %d relations to Neo4j for job %s", len(relations_list), job_id)
+
+        with self._driver.session() as session:
+            def _write(tx: Transaction) -> None:
+                count = 0
+                for relation in relations_list:
+                    if not relation.cause or not relation.effect:
+                        logger.warning("Skipping relation with empty cause or effect: %s -> %s", relation.cause, relation.effect)
+                        continue
+
+                    # Get relationship type (default to CAUSES for backward compatibility)
+                    rel_type = getattr(relation, 'relationship_type', None) or "CAUSES"
+
+                    # Sanitize relationship type (only allow alphanumeric and underscores)
+                    rel_type = re.sub(r'[^A-Z0-9_]', '', rel_type.upper())
+                    if not rel_type:
+                        rel_type = "CAUSES"
+
+                    # Infer entity types from names (simple heuristic)
+                    source_type = self._infer_entity_type(relation.cause)
+                    target_type = self._infer_entity_type(relation.effect)
+
+                    try:
+                        # Create source entity
+                        tx.run("""
+                            MERGE (source:Entity:Concept {name: $source})
+                            ON CREATE SET source.created_at = timestamp(),
+                                          source.lastSeen = timestamp(),
+                                          source.type = $source_type
+                            ON MATCH SET source.lastSeen = timestamp()
+                            """,
+                            source=relation.cause.strip(),
+                            source_type=source_type
+                        )
+
+                        # Create target entity
+                        tx.run("""
+                            MERGE (target:Entity:Concept {name: $target})
+                            ON CREATE SET target.created_at = timestamp(),
+                                          target.lastSeen = timestamp(),
+                                          target.type = $target_type
+                            ON MATCH SET target.lastSeen = timestamp()
+                            """,
+                            target=relation.effect.strip(),
+                            target_type=target_type
+                        )
+
+                        # Create relationship with dynamic type (sanitized)
+                        query = f"""
+                            MATCH (source:Entity {{name: $source}})
+                            MATCH (target:Entity {{name: $target}})
+                            MERGE (source)-[r:{rel_type}]->(target)
 ON CREATE SET r.confidence = $confidence,
     r.explanation = $explanation,
     r.source_file_id = $source_file_id,

@ -35,34 +175,10 @@ ON MATCH SET r.confidence = $confidence,

     r.updated_at = timestamp()
 """
-
-
-class GraphWriter:
-    def __init__(self, uri: str, user: str, password: str):
-        self._driver = GraphDatabase.driver(uri, auth=(user, password))
-
-    def close(self) -> None:
-        self._driver.close()
-
-    def write_relations(self, job_id: str, relations: Iterable[CausalRelation]) -> None:
-        relations_list = list(relations)
-        if not relations_list:
-            logger.warning("No relations to write for job %s", job_id)
-            return
-
-        logger.info("Writing %d relations to Neo4j for job %s", len(relations_list), job_id)
-
-        with self._driver.session() as session:
-            def _write(tx: Transaction) -> None:
-                count = 0
-                for relation in relations_list:
-                    if not relation.cause or not relation.effect:
-                        logger.warning("Skipping relation with empty cause or effect: %s -> %s", relation.cause, relation.effect)
-                        continue
-                    try:
                         result = tx.run(
-                            MERGE_QUERY,
+                            query,
-                            cause=relation.cause.strip(),
+                            source=relation.cause.strip(),
-                            effect=relation.effect.strip(),
+                            target=relation.effect.strip(),
                             confidence=float(relation.confidence) if relation.confidence else 0.0,
                             explanation=relation.explanation or "",
                             source_file_id=relation.source_file_id or "",

@ -70,12 +186,145 @@ class GraphWriter:

                             job_id=job_id,
                             model=relation.metadata.get("model") or "",
                         )
+
+                        # Link entities to documents if source_file_id is a filename
+                        if relation.source_file_id and relation.source_file_id != "combined_text":
+                            link_query = f"""
+                                MATCH (entity:Entity {{name: $entity_name}})
+                                MATCH (doc:Document {{filename: $filename}})
+                                MERGE (entity)-[:EXTRACTED_FROM]->(doc)
+                            """
+                            try:
+                                tx.run(link_query, entity_name=relation.cause.strip(), filename=relation.source_file_id)
+                                tx.run(link_query, entity_name=relation.effect.strip(), filename=relation.source_file_id)
+                            except Exception:
+                                pass  # Ignore if document doesn't exist
+
                         count += 1
-                        logger.debug("Wrote relation: %s -> %s (confidence: %s)", relation.cause, relation.effect, relation.confidence)
+                        logger.debug("Wrote relation: %s -[%s]-> %s (confidence: %s)",
+                                     relation.cause, rel_type, relation.effect, relation.confidence)
                     except Exception as exc:
                         logger.exception("Failed to write relation %s -> %s: %s", relation.cause, relation.effect, exc)
                 logger.info("Successfully wrote %d/%d relations to Neo4j", count, len(relations_list))
+
             session.execute_write(_write)
-            logger.info("Persisted causal relations for job %s", job_id)
+            logger.info("Persisted relations for job %s", job_id)
+
+    def _infer_entity_type(self, entity_name: str) -> str:
+        """Infer entity type from name (simple heuristic)."""
+        name_lower = entity_name.lower()
+
+        # Technology patterns
+        if any(tech in name_lower for tech in ['react', 'node', 'python', 'java', 'postgres', 'mysql', 'redis', 'mongodb', 'docker', 'kubernetes']):
+            return "Technology"
+
+        # Service patterns
+        if any(word in name_lower for word in ['service', 'api', 'gateway', 'auth', 'payment', 'notification']):
+            return "Service"
+
+        # Component patterns
+        if any(word in name_lower for word in ['component', 'module', 'system', 'application', 'platform']):
+            return "Component"
+
+        # Process patterns
+        if any(word in name_lower for word in ['flow', 'process', 'workflow', 'pipeline', 'procedure']):
+            return "Process"
+
+        # Default
+        return "Entity"
+
+    def query_causal_chains(
+        self,
+        job_id: str,
+        min_length: int = 2,
+        max_length: int = 4,
+        min_confidence: float = 0.8,
+        limit: int = 20
+    ) -> List[Dict]:
+        """
+        Query Neo4j for causal chains as per README Step 7.3.
+        Returns sequences of connected events.
+        """
+        # Query for causal chains - match any relationship type
+        query = f"""
+            MATCH path = (start:Entity)-[r*{min_length}..{max_length}]->(end:Entity)
+            WHERE ALL(rel in relationships(path) WHERE rel.job_id = $job_id AND rel.confidence >= $min_confidence)
+            WITH path,
+                 [node in nodes(path) | node.name] as chain,
+                 [rel in relationships(path) | rel.confidence] as confidences,
+                 [rel in relationships(path) | type(rel)] as rel_types,
+                 [rel in relationships(path) | rel.explanation] as explanations
+            RETURN chain, confidences, rel_types, explanations
+            ORDER BY reduce(conf = 0.0, c in confidences | conf + c) DESC
+            LIMIT $limit
+        """
+
+        try:
+            with self._driver.session() as session:
+                result = session.run(
+                    query,
+                    job_id=job_id,
+                    min_confidence=min_confidence,
+                    limit=limit
+                )
+
+                chains = []
+                for record in result:
+                    chain = record["chain"]
+                    confidences = record["confidences"]
+                    rel_types = record["rel_types"]
+                    explanations = record["explanations"]
+
+                    # Calculate average confidence
+                    avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
+
+                    chains.append({
+                        "chain": chain,
+                        "confidences": confidences,
+                        "rel_types": rel_types,
+                        "explanations": explanations,
+                        "avg_confidence": avg_confidence,
+                        "length": len(chain) - 1
+                    })
+
+                logger.info("Found %d causal chains for job %s", len(chains), job_id)
+                return chains
+        except Exception as exc:
+            logger.exception("Failed to query causal chains: %s", exc)
+            return []
+
+    def query_key_entities(self, job_id: str, limit: int = 20) -> List[Dict]:
+        """
+        Query Neo4j for key entities (most involved) as per README Step 7.3.
+        """
+        query = """
+            MATCH (e:Entity)-[r]->(target)
+            WHERE r.job_id = $job_id
+            WITH e, count(r) as relation_count, collect(DISTINCT type(r)) as rel_types
+            RETURN e.name as name,
+                   e.type as type,
+                   relation_count,
+                   rel_types
+            ORDER BY relation_count DESC
+            LIMIT $limit
+        """
+
+        try:
+            with self._driver.session() as session:
+                result = session.run(query, job_id=job_id, limit=limit)
+
+                entities = []
+                for record in result:
+                    entities.append({
+                        "name": record["name"],
+                        "type": record.get("type", "Entity"),
+                        "relation_count": record["relation_count"],
+                        "relation_types": record["rel_types"]
+                    })
+
+                logger.info("Found %d key entities for job %s", len(entities), job_id)
+                return entities
+        except Exception as exc:
+            logger.exception("Failed to query key entities: %s", exc)
+            return []
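An illustrative read path for the new query helpers; the Bolt URI, credentials, and job id below are placeholders:

    writer = GraphWriter("bolt://localhost:7687", "neo4j", "password")
    try:
        for item in writer.query_causal_chains("example-job-id", min_confidence=0.8, limit=5):
            print(" -> ".join(item["chain"]), f"(avg confidence {item['avg_confidence']:.2f})")

        for entity in writer.query_key_entities("example-job-id", limit=10):
            print(entity["name"], entity["type"], entity["relation_count"], entity["relation_types"])
    finally:
        writer.close()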
|||||||
@ -0,0 +1,625 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
|
||||||
|
from anthropic import Anthropic, BadRequestError
|
||||||
|
|
||||||
|
from ..config import get_settings
|
||||||
|
from ..models import CausalRelation
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Try to import SpaCy
|
||||||
|
try:
|
||||||
|
import spacy
|
||||||
|
from spacy.lang.en import English
|
||||||
|
HAS_SPACY = True
|
||||||
|
except ImportError:
|
||||||
|
HAS_SPACY = False
|
||||||
|
logger.warning("spacy not available - NLP detection will be skipped")
|
||||||
|
|
||||||
|
|
||||||
|
class RelationshipExtractor:
|
||||||
|
"""Extract potential cause-effect relationships from text using NLP (SpaCy) + Claude AI."""
|
||||||
|
|
||||||
|
# Causal keywords for NLP detection (Step 3.1)
|
||||||
|
CAUSAL_KEYWORDS = [
|
||||||
|
"because", "due to", "as a result", "led to", "caused", "therefore",
|
||||||
|
"consequently", "hence", "thus", "so", "since", "owing to",
|
||||||
|
"resulted in", "brought about", "gave rise to", "triggered",
|
||||||
|
"provoked", "induced", "generated", "produced", "created"
|
||||||
|
]
|
||||||
|
|
||||||
|
# Common cause-effect patterns (expanded for architecture/technical documents)
|
||||||
|
CAUSE_EFFECT_PATTERNS = [
|
||||||
|
# Direct causal patterns
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+causes?\s+(\w+(?:\s+\w+){0,15})", "causes"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+leads?\s+to\s+(\w+(?:\s+\w+){0,15})", "leads_to"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+results?\s+in\s+(\w+(?:\s+\w+){0,15})", "results_in"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+triggers?\s+(\w+(?:\s+\w+){0,15})", "triggers"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+produces?\s+(\w+(?:\s+\w+){0,15})", "produces"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+enables?\s+(\w+(?:\s+\w+){0,15})", "enables"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+allows?\s+(\w+(?:\s+\w+){0,15})", "allows"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+facilitates?\s+(\w+(?:\s+\w+){0,15})", "facilitates"),
|
||||||
|
|
||||||
|
# Dependency patterns
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+depends?\s+on\s+(\w+(?:\s+\w+){0,15})", "depends_on"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+requires?\s+(\w+(?:\s+\w+){0,15})", "requires"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+needs?\s+(\w+(?:\s+\w+){0,15})", "needs"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+relies?\s+on\s+(\w+(?:\s+\w+){0,15})", "relies_on"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+uses?\s+(\w+(?:\s+\w+){0,15})", "uses"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+utilizes?\s+(\w+(?:\s+\w+){0,15})", "utilizes"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+leverages?\s+(\w+(?:\s+\w+){0,15})", "leverages"),
|
||||||
|
|
||||||
|
# Architectural/System patterns
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+connects?\s+to\s+(\w+(?:\s+\w+){0,15})", "connects_to"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+communicates?\s+with\s+(\w+(?:\s+\w+){0,15})", "communicates_with"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+interacts?\s+with\s+(\w+(?:\s+\w+){0,15})", "interacts_with"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+integrates?\s+with\s+(\w+(?:\s+\w+){0,15})", "integrates_with"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+provides?\s+(\w+(?:\s+\w+){0,15})", "provides"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+supports?\s+(\w+(?:\s+\w+){0,15})", "supports"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+handles?\s+(\w+(?:\s+\w+){0,15})", "handles"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+manages?\s+(\w+(?:\s+\w+){0,15})", "manages"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+controls?\s+(\w+(?:\s+\w+){0,15})", "controls"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+processes?\s+(\w+(?:\s+\w+){0,15})", "processes"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+generates?\s+(\w+(?:\s+\w+){0,15})", "generates"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+creates?\s+(\w+(?:\s+\w+){0,15})", "creates"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+implements?\s+(\w+(?:\s+\w+){0,15})", "implements"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+delivers?\s+(\w+(?:\s+\w+){0,15})", "delivers"),
|
||||||
|
|
||||||
|
# Flow patterns
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+flows?\s+to\s+(\w+(?:\s+\w+){0,15})", "flows_to"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+sends?\s+to\s+(\w+(?:\s+\w+){0,15})", "sends_to"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+transmits?\s+to\s+(\w+(?:\s+\w+){0,15})", "transmits_to"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+receives?\s+from\s+(\w+(?:\s+\w+){0,15})", "receives_from"),
|
||||||
|
|
||||||
|
# Conditional patterns
|
||||||
|
(r"if\s+(\w+(?:\s+\w+){0,15}),\s+then\s+(\w+(?:\s+\w+){0,15})", "if_then"),
|
||||||
|
(r"when\s+(\w+(?:\s+\w+){0,15}),\s+(\w+(?:\s+\w+){0,15})\s+occurs?", "when_then"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+implies?\s+(\w+(?:\s+\w+){0,15})", "implies"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+ensures?\s+(\w+(?:\s+\w+){0,15})", "ensures"),
|
||||||
|
|
||||||
|
# Sequential patterns
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+follows?\s+(\w+(?:\s+\w+){0,15})", "follows"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+comes?\s+after\s+(\w+(?:\s+\w+){0,15})", "comes_after"),
|
||||||
|
(r"first\s+(\w+(?:\s+\w+){0,15}),\s+then\s+(\w+(?:\s+\w+){0,15})", "first_then"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+precedes?\s+(\w+(?:\s+\w+){0,15})", "precedes"),
|
||||||
|
|
||||||
|
# Containment patterns
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+contains?\s+(\w+(?:\s+\w+){0,15})", "contains"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+includes?\s+(\w+(?:\s+\w+){0,15})", "includes"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+consists?\s+of\s+(\w+(?:\s+\w+){0,15})", "consists_of"),
|
||||||
|
|
||||||
|
# Influence patterns
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+affects?\s+(\w+(?:\s+\w+){0,15})", "affects"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+impacts?\s+(\w+(?:\s+\w+){0,15})", "impacts"),
|
||||||
|
(r"(\w+(?:\s+\w+){0,15})\s+influences?\s+(\w+(?:\s+\w+){0,15})", "influences"),
|
||||||
|
]
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
"""Initialize NLP and Claude AI components."""
|
||||||
|
settings = get_settings()
|
||||||
|
|
||||||
|
# Initialize SpaCy NLP model (Step 3.1)
|
||||||
|
self.nlp = None
|
||||||
|
if HAS_SPACY:
|
||||||
|
try:
|
||||||
|
# Try to load English model, fallback to blank if not available
|
||||||
|
try:
|
||||||
|
self.nlp = spacy.load("en_core_web_sm")
|
||||||
|
except OSError:
|
||||||
|
logger.warning("en_core_web_sm model not found, using blank English model")
|
||||||
|
self.nlp = English()
|
||||||
|
self.nlp.add_pipe("sentencizer")
|
||||||
|
logger.info("SpaCy NLP model loaded")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Failed to load SpaCy model: %s", e)
|
||||||
|
self.nlp = None
|
||||||
|
|
||||||
|
# Initialize Claude AI client (Step 3.2)
|
||||||
|
self.claude_client = None
|
||||||
|
self.claude_model = settings.claude_model
|
||||||
|
self.claude_max_input_tokens = settings.claude_max_input_tokens
|
||||||
|
self.claude_max_output_tokens = settings.claude_max_output_tokens
|
||||||
|
|
||||||
|
if settings.anthropic_api_key:
|
||||||
|
try:
|
||||||
|
self.claude_client = Anthropic(api_key=settings.anthropic_api_key)
|
||||||
|
logger.info("Claude AI client initialized")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Failed to initialize Claude AI client: %s", e)
|
||||||
|
else:
|
||||||
|
logger.warning("ANTHROPIC_API_KEY not set - Claude AI extraction will be skipped")
|
||||||
|
|
||||||
|
def extract_from_text(self, text: str, source_file_id: str) -> List[CausalRelation]:
|
||||||
|
"""
|
||||||
|
Extract cause-effect relationships using NLP (SpaCy) + Claude AI.
|
||||||
|
Implements Step 3.1 (NLP Detection) and Step 3.2 (Claude AI Extraction).
|
||||||
|
"""
|
||||||
|
if not text or not text.strip():
|
||||||
|
return []
|
||||||
|
|
||||||
|
all_relationships: List[CausalRelation] = []
|
||||||
|
|
||||||
|
# Step 3.1: BASIC NLP DETECTION (SpaCy)
|
||||||
|
nlp_relationships = self._extract_with_nlp(text, source_file_id)
|
||||||
|
all_relationships.extend(nlp_relationships)
|
||||||
|
logger.info("NLP (SpaCy) extracted %d candidate relationships (low confidence)",
|
||||||
|
len(nlp_relationships))
|
||||||
|
|
||||||
|
# Step 3.2: AI-POWERED EXTRACTION (Claude API)
|
||||||
|
if self.claude_client:
|
||||||
|
claude_relationships = self._extract_with_claude(text, source_file_id)
|
||||||
|
all_relationships.extend(claude_relationships)
|
||||||
|
logger.info("Claude AI extracted %d relationships (high confidence)",
|
||||||
|
len(claude_relationships))
|
||||||
|
else:
|
||||||
|
logger.info("Claude AI extraction skipped (API key not configured)")
|
||||||
|
|
||||||
|
# Also run pattern matching as fallback
|
||||||
|
pattern_relationships = self._extract_with_patterns(text, source_file_id)
|
||||||
|
all_relationships.extend(pattern_relationships)
|
||||||
|
logger.info("Pattern matching extracted %d relationships", len(pattern_relationships))
|
||||||
|
|
||||||
|
# Deduplicate relationships
|
||||||
|
seen = set()
|
||||||
|
unique_relationships = []
|
||||||
|
for rel in all_relationships:
|
||||||
|
key = (rel.cause.lower().strip(), rel.effect.lower().strip())
|
||||||
|
if key not in seen:
|
||||||
|
seen.add(key)
|
||||||
|
unique_relationships.append(rel)
|
||||||
|
|
||||||
|
logger.info("Total unique relationships extracted: %d (from %d total)",
|
||||||
|
len(unique_relationships), len(all_relationships))
|
||||||
|
return unique_relationships
|
||||||
|
|
||||||
|
def _extract_with_nlp(self, text: str, source_file_id: str) -> List[CausalRelation]:
|
||||||
|
"""
|
||||||
|
Step 3.1: Basic NLP Detection using SpaCy.
|
||||||
|
Look for causal keywords and find sentences containing these patterns.
|
||||||
|
Returns potential causal relationships (low confidence).
|
||||||
|
"""
|
||||||
|
if not self.nlp:
|
||||||
|
return []
|
||||||
|
|
||||||
|
relationships: List[CausalRelation] = []
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Process text with SpaCy
|
||||||
|
doc = self.nlp(text)
|
||||||
|
|
||||||
|
# Find sentences containing causal keywords
|
||||||
|
for sent in doc.sents:
|
||||||
|
sent_text = sent.text.strip()
|
||||||
|
if len(sent_text) < 10:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Check if sentence contains causal keywords
|
||||||
|
sent_lower = sent_text.lower()
|
||||||
|
has_causal_keyword = any(keyword in sent_lower for keyword in self.CAUSAL_KEYWORDS)
|
||||||
|
|
||||||
|
if has_causal_keyword:
|
||||||
|
# Try to extract cause-effect using dependency parsing
|
||||||
|
cause = None
|
||||||
|
effect = None
|
||||||
|
|
||||||
|
# Look for causal conjunctions
|
||||||
|
for token in sent:
|
||||||
|
if token.text.lower() in ["because", "due", "since", "as"]:
|
||||||
|
# Find the clause after the causal conjunction
|
||||||
|
if token.dep_ in ["mark", "prep"]:
|
||||||
|
# Try to extract cause and effect
|
||||||
|
cause_span = None
|
||||||
|
effect_span = None
|
||||||
|
|
||||||
|
# Simple heuristic: text before "because/due to" is effect, after is cause
|
||||||
|
if "because" in sent_lower or "since" in sent_lower:
|
||||||
|
parts = re.split(r'\b(because|since)\b', sent_text, flags=re.IGNORECASE)
|
||||||
|
if len(parts) >= 3:
|
||||||
|
effect = parts[0].strip()
|
||||||
|
cause = parts[2].strip()
|
||||||
|
elif "due to" in sent_lower:
|
||||||
|
parts = re.split(r'\bdue to\b', sent_text, flags=re.IGNORECASE)
|
||||||
|
if len(parts) >= 2:
|
||||||
|
effect = parts[0].strip()
|
||||||
|
cause = parts[1].strip()
|
||||||
|
|
||||||
|
if cause and effect:
|
||||||
|
# Clean up cause and effect
|
||||||
|
cause = re.sub(r'^[,\s]+|[,\s]+$', '', cause)
|
||||||
|
effect = re.sub(r'^[,\s]+|[,\s]+$', '', effect)
|
||||||
|
|
||||||
|
if len(cause) >= 3 and len(effect) >= 3:
|
||||||
|
relationships.append(CausalRelation(
|
||||||
|
cause=cause,
|
||||||
|
effect=effect,
|
||||||
|
confidence=0.5, # Low confidence for NLP
|
||||||
|
explanation=f"Extracted using NLP (SpaCy) - found causal keyword",
|
||||||
|
source_file_id=source_file_id,
|
||||||
|
source_snippet=sent_text[:200],
|
||||||
|
relationship_type="CAUSES",
|
||||||
|
metadata={
|
||||||
|
"extraction_method": "spacy_nlp",
|
||||||
|
"sentence": sent_text
|
||||||
|
}
|
||||||
|
))
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("NLP extraction failed: %s", e)
|
||||||
|
|
||||||
|
return relationships
|
||||||
|
|
||||||
|
    def _extract_with_claude(self, text: str, source_file_id: str) -> List[CausalRelation]:
        """
        Step 3.2: AI-Powered Extraction using Claude API.
        Send full document text to Claude AI and ask it to find ALL causal relationships.
        Returns high-quality causal relationships (high confidence).
        """
        if not self.claude_client:
            return []

        relationships: List[CausalRelation] = []

        try:
            # Prepare prompt for Claude
            system_prompt = """You are an expert at analyzing text and extracting cause-effect relationships.
Your task is to identify ALL causal relationships in the given text, including both explicit and implicit ones.

For each causal relationship, extract:
- Cause: What triggered or led to this?
- Effect: What was the result or outcome?
- Context: Additional background information
- Entities: Who or what is involved (people, teams, projects, systems)
- Confidence: How certain are you? (0.0 to 1.0)
- Source sentence: The sentence or passage where this relationship was found
- Date: When did this happen (if mentioned)

Return the results as a JSON array of objects with this structure:
[
  {
    "cause": "string",
    "effect": "string",
    "context": "string (optional)",
    "entities": ["string"],
    "confidence": 0.0-1.0,
    "source_sentence": "string",
    "date": "string (optional)"
  }
]

Focus on:
- Explicit relationships ("because X, therefore Y")
- Implicit relationships (strongly implied cause-effect)
- Technical and architectural dependencies
- Business decisions and their impacts
- Process flows and sequences"""

            # Truncate text to fit within token limits (rough estimate: 1 token ≈ 4 characters)
            max_chars = (self.claude_max_input_tokens - 1000) * 4
            truncated_text = text[:max_chars] if len(text) > max_chars else text

            user_prompt = f"""Analyze the following text and extract ALL causal relationships.

Text:
{truncated_text}

Return a JSON array of causal relationships. Be thorough and find both explicit and implicit relationships."""

            # Call Claude API
            message = self.claude_client.messages.create(
                model=self.claude_model,
                max_tokens=self.claude_max_output_tokens,
                temperature=0.3,  # Lower temperature for more focused extraction
                system=system_prompt,
                messages=[
                    {
                        "role": "user",
                        "content": user_prompt
                    }
                ]
            )

            # Extract response text
            content_blocks = message.content or []
            response_text = "".join(
                block.text for block in content_blocks
                if hasattr(block, "text")
            )

            if not response_text:
                logger.warning("Empty response from Claude AI")
                return []

            # Parse JSON response
            try:
                # Try to extract JSON from response (might have markdown code blocks)
                json_match = re.search(r'\[.*\]', response_text, re.DOTALL)
                if json_match:
                    json_text = json_match.group(0)
                else:
                    json_text = response_text

                claude_results = json.loads(json_text)

                # Convert Claude results to CausalRelation objects
                for result in claude_results:
                    cause = result.get("cause", "").strip()
                    effect = result.get("effect", "").strip()
                    context = result.get("context", "")
                    entities = result.get("entities", [])
                    confidence = float(result.get("confidence", 0.85))
                    source_sentence = result.get("source_sentence", "")
                    date = result.get("date", "")

                    if not cause or not effect:
                        continue

                    # Map to Neo4j relationship type (default to CAUSES)
                    relationship_type = "CAUSES"

                    explanation = context or "Extracted by Claude AI"
                    if entities:
                        explanation += f" (Entities: {', '.join(entities)})"

                    relationships.append(CausalRelation(
                        cause=cause,
                        effect=effect,
                        confidence=min(confidence, 0.95),  # Cap at 0.95
                        explanation=explanation,
                        source_file_id=source_file_id,
                        source_snippet=source_sentence[:200] if source_sentence else "",
                        relationship_type=relationship_type,
                        metadata={
                            "extraction_method": "claude_ai",
                            "context": context,
                            "entities": entities,
                            "date": date,
                            "source_sentence": source_sentence
                        }
                    ))

                logger.info("Claude AI successfully extracted %d relationships", len(relationships))

            except json.JSONDecodeError as e:
                logger.warning("Failed to parse Claude AI response as JSON: %s. Response: %s",
                               e, response_text[:500])
            except Exception as e:
                logger.warning("Error processing Claude AI response: %s", e)

        except BadRequestError as e:
            logger.warning("Claude API error: %s", e)
        except Exception as e:
            logger.warning("Claude AI extraction failed: %s", e)

        return relationships

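    # Illustrative sketch (not part of this commit): why the greedy r'\[.*\]'
    # search with re.DOTALL is enough to pull the JSON array out of a response
    # that Claude wraps in a markdown code fence:
    #
    #   >>> import json, re
    #   >>> response_text = '```json\n[{"cause": "disk full", "effect": "backup failed", "confidence": 0.9}]\n```'
    #   >>> json_text = re.search(r'\[.*\]', response_text, re.DOTALL).group(0)
    #   >>> json.loads(json_text)[0]["effect"]
    #   'backup failed'
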
    def _extract_with_patterns(self, text: str, source_file_id: str) -> List[CausalRelation]:
        """
        Fallback: Pattern-based extraction (original method).
        Returns candidate relationships for DoWhy validation.
        """
        if not text or not text.strip():
            return []

        relationships: List[CausalRelation] = []
        seen = set()  # Avoid duplicates

        # Normalize text
        text = re.sub(r'\s+', ' ', text)
        sentences = re.split(r'[.!?]\s+', text)

        for sentence in sentences:
            sentence = sentence.strip()
            if len(sentence) < 10:  # Skip very short sentences
                continue

            for pattern, rel_type in self.CAUSE_EFFECT_PATTERNS:
                matches = re.finditer(pattern, sentence, re.IGNORECASE)

                for match in matches:
                    cause = match.group(1).strip()
                    effect = match.group(2).strip()

                    # Filter out very short or very long phrases (increased limit for technical terms)
                    if len(cause) < 3 or len(cause) > 150:
                        continue
                    if len(effect) < 3 or len(effect) > 150:
                        continue

                    # Skip common false positives
                    if cause.lower() in ["this", "that", "it", "they", "we"]:
                        continue
                    if effect.lower() in ["this", "that", "it", "they", "we"]:
                        continue

                    # Create unique key
                    key = (cause.lower(), effect.lower())
                    if key in seen:
                        continue
                    seen.add(key)

                    # Calculate confidence based on pattern type
                    confidence = self._calculate_confidence(rel_type, sentence)

                    # Map pattern type to Neo4j relationship type (uppercase with underscores)
                    neo4j_rel_type = self._map_to_neo4j_relationship_type(rel_type)

                    relationships.append(CausalRelation(
                        cause=cause,
                        effect=effect,
                        confidence=confidence,
                        explanation=f"Extracted from text using pattern: {rel_type}",
                        source_file_id=source_file_id,
                        source_snippet=sentence[:200],  # First 200 chars
                        relationship_type=neo4j_rel_type,
                        metadata={
                            "extraction_method": "pattern_matching",
                            "pattern_type": rel_type,
                            "sentence": sentence
                        }
                    ))

        logger.info("Extracted %d candidate relationships from text (source: %s)",
                    len(relationships), source_file_id)
        return relationships

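    # Illustrative sketch (not part of this commit): one match/capture round of
    # the pattern loop above. The actual CAUSE_EFFECT_PATTERNS list is defined
    # elsewhere in this class; the regex below is a hypothetical stand-in with
    # the same two-group (cause, effect) shape:
    #
    #   >>> import re
    #   >>> pattern = r'(.+?)\s+leads to\s+(.+)'          # hypothetical pattern
    #   >>> sentence = "A cache miss leads to a slow response"
    #   >>> m = next(re.finditer(pattern, sentence, re.IGNORECASE))
    #   >>> m.group(1).strip(), m.group(2).strip()
    #   ('A cache miss', 'a slow response')
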
    def _calculate_confidence(self, rel_type: str, sentence: str) -> float:
        """Calculate confidence score based on pattern type and sentence quality."""
        base_confidence = {
            "causes": 0.8, "leads_to": 0.75, "results_in": 0.75, "triggers": 0.7,
            "produces": 0.7, "depends_on": 0.65, "requires": 0.65, "needs": 0.6,
            "if_then": 0.8, "when_then": 0.75, "implies": 0.7, "follows": 0.6,
            "comes_after": 0.6, "first_then": 0.7, "enables": 0.7, "allows": 0.65,
            "facilitates": 0.65, "relies_on": 0.65, "uses": 0.6, "utilizes": 0.6,
            "leverages": 0.6, "connects_to": 0.7, "communicates_with": 0.7,
            "interacts_with": 0.7, "integrates_with": 0.7, "provides": 0.7,
            "supports": 0.7, "handles": 0.65, "manages": 0.65, "controls": 0.65,
            "processes": 0.65, "generates": 0.7, "creates": 0.7, "implements": 0.7,
            "delivers": 0.7, "flows_to": 0.7, "sends_to": 0.7, "transmits_to": 0.7,
            "receives_from": 0.7, "ensures": 0.75, "precedes": 0.6, "contains": 0.6,
            "includes": 0.6, "consists_of": 0.6, "affects": 0.65, "impacts": 0.65,
            "influences": 0.65,
        }.get(rel_type, 0.5)

        # Adjust based on sentence length (longer sentences might be more descriptive)
        if len(sentence) > 50:
            base_confidence += 0.05

        return min(base_confidence, 0.95)

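    # Worked example (illustrative, not part of this commit): a "causes" match in
    # a sentence longer than 50 characters scores 0.8 + 0.05 = 0.85, while an
    # unknown pattern type in a short sentence falls back to the 0.5 default.
    # ("extractor" below is a hypothetical RelationshipExtractor instance.)
    #
    #   >>> round(extractor._calculate_confidence("causes", "x" * 60), 2)
    #   0.85
    #   >>> extractor._calculate_confidence("unknown_type", "short sentence")
    #   0.5
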
    def _map_to_neo4j_relationship_type(self, pattern_type: str) -> str:
        """Map pattern type to Neo4j relationship type (uppercase with underscores)."""
        # Map lowercase pattern types to Neo4j relationship types
        mapping = {
            "causes": "CAUSES", "leads_to": "LEADS_TO", "results_in": "RESULTS_IN",
            "triggers": "TRIGGERS", "produces": "PRODUCES", "depends_on": "DEPENDS_ON",
            "requires": "REQUIRES", "needs": "NEEDS", "relies_on": "RELIES_ON",
            "uses": "USES", "utilizes": "UTILIZES", "leverages": "LEVERAGES",
            "connects_to": "CONNECTS_TO", "communicates_with": "COMMUNICATES_WITH",
            "interacts_with": "INTERACTS_WITH", "integrates_with": "INTEGRATES_WITH",
            "provides": "PROVIDES", "supports": "SUPPORTS", "handles": "HANDLES",
            "manages": "MANAGES", "controls": "CONTROLS", "processes": "PROCESSES",
            "generates": "GENERATES", "creates": "CREATES", "implements": "IMPLEMENTS",
            "delivers": "DELIVERS", "flows_to": "FLOWS_TO", "sends_to": "SENDS_TO",
            "transmits_to": "TRANSMITS_TO", "receives_from": "RECEIVES_FROM",
            "if_then": "IF_THEN", "when_then": "WHEN_THEN", "implies": "IMPLIES",
            "ensures": "ENSURES", "follows": "FOLLOWS", "comes_after": "COMES_AFTER",
            "first_then": "FIRST_THEN", "precedes": "PRECEDES", "contains": "CONTAINS",
            "includes": "INCLUDES", "consists_of": "CONSISTS_OF", "affects": "AFFECTS",
            "impacts": "IMPACTS", "influences": "INFLUENCES", "enables": "ENABLES",
            "allows": "ALLOWS", "facilitates": "FACILITATES",
        }
        return mapping.get(pattern_type, "CAUSES")  # Default to CAUSES if not found

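    # Illustrative sketch (not part of this commit): the mapping simply uppercases
    # known pattern types and falls back to CAUSES for anything unrecognized.
    # ("extractor" is a hypothetical RelationshipExtractor instance.)
    #
    #   >>> extractor._map_to_neo4j_relationship_type("depends_on")
    #   'DEPENDS_ON'
    #   >>> extractor._map_to_neo4j_relationship_type("is_related_to")
    #   'CAUSES'
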
    def extract_from_qwen_results(self, qwen_results: List[Dict], source_file_id: str) -> List[CausalRelation]:
        """Convert Qwen2.5-VL extraction results to CausalRelation objects."""
        relationships: List[CausalRelation] = []

        for result in qwen_results:
            entity1 = result.get("entity1", "").strip()
            entity2 = result.get("entity2", "").strip()
            rel_type = result.get("relationship_type", "").strip()
            description = result.get("description", "").strip()
            confidence = float(result.get("confidence", 0.7))

            if not entity1 or not entity2:
                continue

            # Map relationship type to cause-effect
            # For most types, entity1 is cause, entity2 is effect
            cause = entity1
            effect = entity2

            # Some relationship types might need reversal
            if rel_type in ["depends_on", "requires", "needs"]:
                # If A depends on B, then B is the cause, A is the effect
                cause, effect = effect, cause

            # Map Qwen relationship type to Neo4j format
            neo4j_rel_type = self._map_to_neo4j_relationship_type(rel_type.lower().replace("-", "_"))

            relationships.append(CausalRelation(
                cause=cause,
                effect=effect,
                confidence=confidence,
                explanation=description or f"Extracted from diagram: {rel_type}",
                source_file_id=source_file_id,
                source_snippet=description,
                relationship_type=neo4j_rel_type,
                metadata={
                    "extraction_method": "qwen2.5-vl",
                    "relationship_type": rel_type,
                    "original_entity1": entity1,
                    "original_entity2": entity2
                }
            ))

        return relationships

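    # Illustrative sketch (not part of this commit): a single Qwen2.5-VL result
    # and how extract_from_qwen_results() reverses "depends_on" so the dependency
    # target becomes the cause. ("extractor" is a hypothetical instance.)
    #
    #   >>> qwen_results = [{"entity1": "API Gateway", "entity2": "Auth Service",
    #   ...                  "relationship_type": "depends_on",
    #   ...                  "description": "Gateway calls Auth for tokens",
    #   ...                  "confidence": 0.8}]
    #   >>> rels = extractor.extract_from_qwen_results(qwen_results, "diagram-1")
    #   >>> rels[0].cause, rels[0].effect, rels[0].relationship_type
    #   ('Auth Service', 'API Gateway', 'DEPENDS_ON')
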
@ -0,0 +1,570 @@
from __future__ import annotations

import json
import logging
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Set

from anthropic import Anthropic, BadRequestError

from ..config import get_settings
from ..models import CausalRelation, ProjectReport

logger = logging.getLogger(__name__)

# Try to import PDF generation libraries
try:
    import markdown
    from markdown.extensions import codehilite, fenced_code, tables
    HAS_MARKDOWN = True
except ImportError:
    HAS_MARKDOWN = False
    logger.warning("markdown library not available - PDF conversion will be limited")

try:
    from weasyprint import HTML, CSS
    from weasyprint.text.fonts import FontConfiguration
    HAS_WEASYPRINT = True
except ImportError:
    HAS_WEASYPRINT = False
    logger.warning("weasyprint not available - PDF conversion will be skipped")


class ReportGenerator:
    """Generate beginner-friendly onboarding reports from knowledge graph."""

    def __init__(self, api_key: str | None = None, model: str | None = None):
        settings = get_settings()
        self.api_key = api_key or settings.anthropic_api_key
        self.model = model or settings.claude_model
        self.max_output_tokens = settings.claude_max_output_tokens

        if not self.api_key:
            raise ValueError("Anthropic API key is required for report generation")

        self.client = Anthropic(api_key=self.api_key)

def generate_onboarding_report(
|
||||||
|
self,
|
||||||
|
job_id: str,
|
||||||
|
relations: List[CausalRelation],
|
||||||
|
vector_store,
|
||||||
|
embedder,
|
||||||
|
graph_writer=None,
|
||||||
|
kg_summary: Dict | None = None
|
||||||
|
) -> ProjectReport:
|
||||||
|
"""
|
||||||
|
Generate a beginner-friendly onboarding report from the knowledge graph.
|
||||||
|
"""
|
||||||
|
logger.info("Generating onboarding report for job %s", job_id)
|
||||||
|
|
||||||
|
# Step 1: Analyze KG structure
|
||||||
|
key_concepts = self._analyze_kg_structure(relations)
|
||||||
|
|
||||||
|
# Step 2: Semantic search for different topics
|
||||||
|
overview_content = self._search_topic(
|
||||||
|
"project overview main purpose goals objectives",
|
||||||
|
vector_store, embedder, job_id, top_k=10
|
||||||
|
)
|
||||||
|
|
||||||
|
concepts_content = self._search_topic(
|
||||||
|
"core concepts definitions key terms important ideas",
|
||||||
|
vector_store, embedder, job_id, top_k=15
|
||||||
|
)
|
||||||
|
|
||||||
|
processes_content = self._search_topic(
|
||||||
|
"how system works processes flows procedures steps",
|
||||||
|
vector_store, embedder, job_id, top_k=15
|
||||||
|
)
|
||||||
|
|
||||||
|
relationships_content = self._search_topic(
|
||||||
|
"cause effect dependencies relationships connections",
|
||||||
|
vector_store, embedder, job_id, top_k=20
|
||||||
|
)
|
||||||
|
|
||||||
|
components_content = self._search_topic(
|
||||||
|
"components modules systems parts architecture",
|
||||||
|
vector_store, embedder, job_id, top_k=15
|
||||||
|
)
|
||||||
|
|
||||||
|
# Step 3: Query Neo4j for causal chains (as per README Step 7.3)
|
||||||
|
causal_chains = []
|
||||||
|
key_entities = []
|
||||||
|
if graph_writer:
|
||||||
|
try:
|
||||||
|
# Query 1: Get critical causal chains
|
||||||
|
causal_chains = graph_writer.query_causal_chains(
|
||||||
|
job_id=job_id,
|
||||||
|
min_length=2,
|
||||||
|
max_length=4,
|
||||||
|
min_confidence=0.8,
|
||||||
|
limit=20
|
||||||
|
)
|
||||||
|
logger.info("Retrieved %d causal chains from Neo4j", len(causal_chains))
|
||||||
|
|
||||||
|
# Query 2: Get key entities
|
||||||
|
key_entities = graph_writer.query_key_entities(job_id=job_id, limit=20)
|
||||||
|
logger.info("Retrieved %d key entities from Neo4j", len(key_entities))
|
||||||
|
except Exception as neo4j_exc:
|
||||||
|
logger.warning("Failed to query Neo4j: %s", neo4j_exc)
|
||||||
|
|
||||||
|
# Step 4: Organize content hierarchically
|
||||||
|
organized_content = self._organize_content(
|
||||||
|
key_concepts,
|
||||||
|
overview_content,
|
||||||
|
concepts_content,
|
||||||
|
processes_content,
|
||||||
|
relationships_content,
|
||||||
|
components_content,
|
||||||
|
causal_chains,
|
||||||
|
key_entities
|
||||||
|
)
|
||||||
|
|
||||||
|
# Step 5: Generate report with Claude
|
||||||
|
report_content = self._claude_generate_report(
|
||||||
|
job_id=job_id,
|
||||||
|
relations=relations,
|
||||||
|
organized_content=organized_content,
|
||||||
|
kg_summary=kg_summary or {}
|
||||||
|
)
|
||||||
|
|
||||||
|
# Step 6: Parse sections
|
||||||
|
sections = self._parse_sections(report_content)
|
||||||
|
|
||||||
|
# Step 7: Convert to PDF (as per README Step 7.8)
|
||||||
|
pdf_path = None
|
||||||
|
if HAS_WEASYPRINT and HAS_MARKDOWN:
|
||||||
|
try:
|
||||||
|
pdf_path = self._convert_to_pdf(report_content, job_id)
|
||||||
|
logger.info("Generated PDF report: %s", pdf_path)
|
||||||
|
except Exception as pdf_exc:
|
||||||
|
logger.warning("PDF conversion failed: %s", pdf_exc)
|
||||||
|
|
||||||
|
# Estimate pages (rough: ~500 words per page)
|
||||||
|
word_count = len(report_content.split())
|
||||||
|
estimated_pages = max(1, word_count // 500)
|
||||||
|
|
||||||
|
return ProjectReport(
|
||||||
|
job_id=job_id,
|
||||||
|
title="Project Onboarding Guide",
|
||||||
|
content=report_content,
|
||||||
|
sections=sections,
|
||||||
|
key_concepts=list(key_concepts)[:20], # Top 20 concepts
|
||||||
|
total_pages=estimated_pages,
|
||||||
|
generated_at=datetime.utcnow(),
|
||||||
|
metadata={
|
||||||
|
"total_relations": len(relations),
|
||||||
|
"total_concepts": len(key_concepts),
|
||||||
|
"causal_chains_count": len(causal_chains),
|
||||||
|
"key_entities_count": len(key_entities),
|
||||||
|
"model": self.model,
|
||||||
|
"pdf_path": str(pdf_path) if pdf_path else None
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
def _analyze_kg_structure(self, relations: List[CausalRelation]) -> Set[str]:
|
||||||
|
"""Identify key concepts from the knowledge graph."""
|
||||||
|
concepts = set()
|
||||||
|
|
||||||
|
for rel in relations:
|
||||||
|
concepts.add(rel.cause)
|
||||||
|
concepts.add(rel.effect)
|
||||||
|
|
||||||
|
# Identify high-degree nodes (concepts involved in many relationships)
|
||||||
|
cause_counts: Dict[str, int] = {}
|
||||||
|
effect_counts: Dict[str, int] = {}
|
||||||
|
|
||||||
|
for rel in relations:
|
||||||
|
cause_counts[rel.cause] = cause_counts.get(rel.cause, 0) + 1
|
||||||
|
effect_counts[rel.effect] = effect_counts.get(rel.effect, 0) + 1
|
||||||
|
|
||||||
|
# Key concepts are those with high degree (appear in many relationships)
|
||||||
|
all_counts = {**cause_counts, **effect_counts}
|
||||||
|
threshold = max(1, len(relations) // 10) # Top 10% most connected
|
||||||
|
|
||||||
|
key_concepts = {
|
||||||
|
concept for concept, count in all_counts.items()
|
||||||
|
if count >= threshold
|
||||||
|
}
|
||||||
|
|
||||||
|
# If threshold is too high, use top N concepts
|
||||||
|
if len(key_concepts) < 5:
|
||||||
|
sorted_concepts = sorted(all_counts.items(), key=lambda x: x[1], reverse=True)
|
||||||
|
key_concepts = {concept for concept, _ in sorted_concepts[:20]}
|
||||||
|
|
||||||
|
logger.info("Identified %d key concepts from %d relationships",
|
||||||
|
len(key_concepts), len(relations))
|
||||||
|
return key_concepts
|
||||||
|
|
||||||
|
def _search_topic(
|
||||||
|
self,
|
||||||
|
query: str,
|
||||||
|
vector_store,
|
||||||
|
embedder,
|
||||||
|
job_id: str,
|
||||||
|
top_k: int = 10
|
||||||
|
) -> List[Dict]:
|
||||||
|
"""Search for content related to a topic."""
|
||||||
|
try:
|
||||||
|
results = vector_store.search_by_text(
|
||||||
|
query_text=query,
|
||||||
|
embedder=embedder,
|
||||||
|
job_id=job_id,
|
||||||
|
top_k=top_k
|
||||||
|
)
|
||||||
|
return results
|
||||||
|
except Exception as exc:
|
||||||
|
logger.warning("Search failed for topic '%s': %s", query, exc)
|
||||||
|
return []
|
||||||
|
|
||||||
|
def _organize_content(
|
||||||
|
self,
|
||||||
|
key_concepts: Set[str],
|
||||||
|
overview_content: List[Dict],
|
||||||
|
concepts_content: List[Dict],
|
||||||
|
processes_content: List[Dict],
|
||||||
|
relationships_content: List[Dict],
|
||||||
|
components_content: List[Dict],
|
||||||
|
causal_chains: List[Dict] = None,
|
||||||
|
key_entities: List[Dict] = None
|
||||||
|
) -> Dict:
|
||||||
|
"""Organize retrieved content into a structured format."""
|
||||||
|
return {
|
||||||
|
"key_concepts": list(key_concepts),
|
||||||
|
"overview": [r.get("payload", {}) for r in overview_content],
|
||||||
|
"concepts": [r.get("payload", {}) for r in concepts_content],
|
||||||
|
"processes": [r.get("payload", {}) for r in processes_content],
|
||||||
|
"relationships": [r.get("payload", {}) for r in relationships_content],
|
||||||
|
"components": [r.get("payload", {}) for r in components_content],
|
||||||
|
"causal_chains": causal_chains or [],
|
||||||
|
"key_entities": key_entities or [],
|
||||||
|
}
|
||||||
|
|
||||||
|
def _claude_generate_report(
|
||||||
|
self,
|
||||||
|
job_id: str,
|
||||||
|
relations: List[CausalRelation],
|
||||||
|
organized_content: Dict,
|
||||||
|
kg_summary: Dict
|
||||||
|
) -> str:
|
||||||
|
"""Generate report using Claude AI."""
|
||||||
|
|
||||||
|
# Build KG summary text
|
||||||
|
kg_summary_text = self._build_kg_summary(relations, organized_content)
|
||||||
|
|
||||||
|
# Build system prompt
|
||||||
|
system_prompt = """You are an expert technical writer specializing in creating beginner-friendly onboarding documentation for new team members.
|
||||||
|
|
||||||
|
Your goal is to explain complex project information in simple, clear language that anyone can understand, even without technical background.
|
||||||
|
|
||||||
|
Guidelines:
|
||||||
|
- Use simple, clear language - avoid jargon or explain it when necessary
|
||||||
|
- Use examples and analogies to make concepts relatable
|
||||||
|
- Structure information logically (basics first, then advanced)
|
||||||
|
- Make it engaging and easy to follow
|
||||||
|
- Cover all important aspects comprehensively
|
||||||
|
- Write in a friendly, welcoming tone
|
||||||
|
- Use headings, bullet points, and clear sections
|
||||||
|
- Explain "why" not just "what"
|
||||||
|
|
||||||
|
Generate a comprehensive onboarding document that helps a new team member understand the entire project."""
|
||||||
|
|
||||||
|
# Format causal chains from Neo4j
|
||||||
|
causal_chains_text = self._format_causal_chains(organized_content.get('causal_chains', []))
|
||||||
|
key_entities_text = self._format_key_entities(organized_content.get('key_entities', []))
|
||||||
|
|
||||||
|
# Build user prompt
|
||||||
|
user_prompt = f"""Generate a comprehensive, beginner-friendly onboarding document for this project.
|
||||||
|
|
||||||
|
KNOWLEDGE GRAPH SUMMARY:
|
||||||
|
{kg_summary_text}
|
||||||
|
|
||||||
|
IMPORTANT RELATIONSHIPS:
|
||||||
|
{self._format_relationships(relations[:50])} # Top 50 relationships
|
||||||
|
|
||||||
|
CAUSAL CHAINS (from Knowledge Graph):
|
||||||
|
{causal_chains_text}
|
||||||
|
|
||||||
|
KEY ENTITIES (from Knowledge Graph):
|
||||||
|
{key_entities_text}
|
||||||
|
|
||||||
|
KEY CONCEPTS:
|
||||||
|
{', '.join(organized_content.get('key_concepts', [])[:30])}
|
||||||
|
|
||||||
|
REQUIRED SECTIONS:
|
||||||
|
1. Project Overview
|
||||||
|
- What is this project about?
|
||||||
|
- Main purpose and goals
|
||||||
|
- Key stakeholders or users
|
||||||
|
|
||||||
|
2. Core Concepts (Explained Simply)
|
||||||
|
- Explain each important concept in simple terms
|
||||||
|
- Why each concept matters
|
||||||
|
- How concepts relate to each other
|
||||||
|
|
||||||
|
3. How Things Work Together
|
||||||
|
- System flow (simple explanation)
|
||||||
|
- Key processes and workflows
|
||||||
|
- Dependencies explained simply
|
||||||
|
|
||||||
|
4. Important Relationships
|
||||||
|
- Cause → Effect relationships (explained in plain language)
|
||||||
|
- "When X happens, Y occurs because..."
|
||||||
|
- Visual flow if possible (describe it)
|
||||||
|
|
||||||
|
5. Key Components
|
||||||
|
- Main modules/systems/components
|
||||||
|
- What each does (beginner-friendly)
|
||||||
|
- How they interact
|
||||||
|
|
||||||
|
6. Getting Started
|
||||||
|
- Where to start learning
|
||||||
|
- What to understand first
|
||||||
|
- Recommended learning path
|
||||||
|
|
||||||
|
7. Common Questions
|
||||||
|
- FAQ based on the knowledge graph
|
||||||
|
- Answers in simple terms
|
||||||
|
|
||||||
|
Generate the complete onboarding document in Markdown format. Make it comprehensive, beginner-friendly, and easy to follow."""
|
||||||
|
|
||||||
|
try:
|
||||||
|
message = self.client.messages.create(
|
||||||
|
model=self.model,
|
||||||
|
max_tokens=self.max_output_tokens,
|
||||||
|
temperature=0.3, # Slightly creative but focused
|
||||||
|
system=system_prompt,
|
||||||
|
messages=[
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": user_prompt
|
||||||
|
}
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
content_blocks = message.content or []
|
||||||
|
report_text = "".join(
|
||||||
|
block.text for block in content_blocks
|
||||||
|
if hasattr(block, "text")
|
||||||
|
)
|
||||||
|
|
||||||
|
if not report_text:
|
||||||
|
logger.warning("Empty report generated")
|
||||||
|
return "# Project Onboarding Guide\n\nNo content available."
|
||||||
|
|
||||||
|
logger.info("Generated onboarding report (%d characters)", len(report_text))
|
||||||
|
return report_text
|
||||||
|
|
||||||
|
except BadRequestError as e:
|
||||||
|
# Handle API credit/authentication errors gracefully
|
||||||
|
error_msg = str(e)
|
||||||
|
if "credit balance" in error_msg.lower() or "too low" in error_msg.lower():
|
||||||
|
logger.error("Claude API credit balance too low. Cannot generate report.")
|
||||||
|
raise ValueError("Claude API credit balance is too low. Please add credits to your Anthropic account to generate reports.")
|
||||||
|
elif "invalid_request_error" in error_msg.lower():
|
||||||
|
logger.error("Claude API invalid request: %s", error_msg)
|
||||||
|
raise ValueError(f"Claude API request failed: {error_msg}")
|
||||||
|
else:
|
||||||
|
raise
|
||||||
|
except Exception as e:
|
||||||
|
logger.exception("Failed to generate report: %s", e)
|
||||||
|
raise
|
||||||
|
|
||||||
|
def _build_kg_summary(
|
||||||
|
self,
|
||||||
|
relations: List[CausalRelation],
|
||||||
|
organized_content: Dict
|
||||||
|
) -> str:
|
||||||
|
"""Build a text summary of the knowledge graph."""
|
||||||
|
summary_parts = [
|
||||||
|
f"Total Relationships: {len(relations)}",
|
||||||
|
f"Total Concepts: {len(organized_content.get('key_concepts', []))}",
|
||||||
|
"",
|
||||||
|
"Top Relationships:",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Show top relationships by confidence
|
||||||
|
top_relations = sorted(relations, key=lambda r: r.confidence, reverse=True)[:20]
|
||||||
|
for i, rel in enumerate(top_relations, 1):
|
||||||
|
summary_parts.append(
|
||||||
|
f"{i}. {rel.cause} → {rel.effect} "
|
||||||
|
f"(confidence: {rel.confidence:.2f})"
|
||||||
|
)
|
||||||
|
|
||||||
|
return "\n".join(summary_parts)
|
||||||
|
|
||||||
|
def _format_relationships(self, relations: List[CausalRelation]) -> str:
|
||||||
|
"""Format relationships for the prompt."""
|
||||||
|
if not relations:
|
||||||
|
return "No relationships found."
|
||||||
|
|
||||||
|
lines = []
|
||||||
|
for rel in relations[:50]: # Limit to 50
|
||||||
|
line = f"- {rel.cause} → {rel.effect}"
|
||||||
|
if rel.explanation:
|
||||||
|
line += f" ({rel.explanation[:100]})"
|
||||||
|
lines.append(line)
|
||||||
|
|
||||||
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
def _parse_sections(self, content: str) -> Dict[str, str]:
|
||||||
|
"""Parse markdown content into sections."""
|
||||||
|
sections = {}
|
||||||
|
current_section = None
|
||||||
|
current_content = []
|
||||||
|
|
||||||
|
lines = content.split('\n')
|
||||||
|
|
||||||
|
for line in lines:
|
||||||
|
# Check if it's a heading (starts with #)
|
||||||
|
if line.strip().startswith('#'):
|
||||||
|
# Save previous section
|
||||||
|
if current_section:
|
||||||
|
sections[current_section] = '\n'.join(current_content).strip()
|
||||||
|
|
||||||
|
# Start new section
|
||||||
|
current_section = line.strip().lstrip('#').strip()
|
||||||
|
current_content = [line]
|
||||||
|
else:
|
||||||
|
if current_section:
|
||||||
|
current_content.append(line)
|
||||||
|
else:
|
||||||
|
# Content before first heading
|
||||||
|
if 'introduction' not in sections:
|
||||||
|
sections['introduction'] = line
|
||||||
|
else:
|
||||||
|
sections['introduction'] += '\n' + line
|
||||||
|
|
||||||
|
# Save last section
|
||||||
|
if current_section:
|
||||||
|
sections[current_section] = '\n'.join(current_content).strip()
|
||||||
|
|
||||||
|
return sections
|
||||||
|
|
||||||
|
def _format_causal_chains(self, causal_chains: List[Dict]) -> str:
|
||||||
|
"""Format causal chains from Neo4j for the prompt."""
|
||||||
|
if not causal_chains:
|
||||||
|
return "No causal chains found in knowledge graph."
|
||||||
|
|
||||||
|
lines = []
|
||||||
|
for i, chain_data in enumerate(causal_chains[:20], 1): # Top 20 chains
|
||||||
|
chain = chain_data.get("chain", [])
|
||||||
|
avg_confidence = chain_data.get("avg_confidence", 0.0)
|
||||||
|
|
||||||
|
if len(chain) >= 2:
|
||||||
|
chain_text = " → ".join(chain)
|
||||||
|
lines.append(f"{i}. {chain_text} (confidence: {avg_confidence:.2f})")
|
||||||
|
|
||||||
|
return "\n".join(lines) if lines else "No causal chains found."
|
||||||
|
|
||||||
|
def _format_key_entities(self, key_entities: List[Dict]) -> str:
|
||||||
|
"""Format key entities from Neo4j for the prompt."""
|
||||||
|
if not key_entities:
|
||||||
|
return "No key entities found in knowledge graph."
|
||||||
|
|
||||||
|
lines = []
|
||||||
|
for entity in key_entities[:20]: # Top 20 entities
|
||||||
|
name = entity.get("name", "")
|
||||||
|
entity_type = entity.get("type", "Entity")
|
||||||
|
relation_count = entity.get("relation_count", 0)
|
||||||
|
lines.append(f"- {name} ({entity_type}): involved in {relation_count} relationships")
|
||||||
|
|
||||||
|
return "\n".join(lines) if lines else "No key entities found."
|
||||||
|
|
||||||
|
def _convert_to_pdf(self, markdown_content: str, job_id: str) -> Optional[Path]:
|
||||||
|
"""
|
||||||
|
Convert Markdown report to PDF as per README Step 7.8.
|
||||||
|
Uses markdown + weasyprint for PDF generation.
|
||||||
|
"""
|
||||||
|
if not HAS_MARKDOWN or not HAS_WEASYPRINT:
|
||||||
|
return None
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Convert Markdown to HTML
|
||||||
|
html_content = markdown.markdown(
|
||||||
|
markdown_content,
|
||||||
|
extensions=['codehilite', 'fenced_code', 'tables']
|
||||||
|
)
|
||||||
|
|
||||||
|
# Add CSS styling
|
||||||
|
css_style = """
|
||||||
|
@page {
|
||||||
|
size: A4;
|
||||||
|
margin: 2cm;
|
||||||
|
}
|
||||||
|
body {
|
||||||
|
font-family: 'Georgia', serif;
|
||||||
|
line-height: 1.6;
|
||||||
|
color: #333;
|
||||||
|
}
|
||||||
|
h1, h2, h3, h4 {
|
||||||
|
color: #2c3e50;
|
||||||
|
margin-top: 1.5em;
|
||||||
|
margin-bottom: 0.5em;
|
||||||
|
}
|
||||||
|
h1 { font-size: 2em; border-bottom: 2px solid #3498db; padding-bottom: 0.3em; }
|
||||||
|
h2 { font-size: 1.5em; border-bottom: 1px solid #95a5a6; padding-bottom: 0.2em; }
|
||||||
|
h3 { font-size: 1.2em; }
|
||||||
|
code {
|
||||||
|
background-color: #f4f4f4;
|
||||||
|
padding: 2px 4px;
|
||||||
|
border-radius: 3px;
|
||||||
|
font-family: 'Courier New', monospace;
|
||||||
|
}
|
||||||
|
pre {
|
||||||
|
background-color: #f4f4f4;
|
||||||
|
padding: 1em;
|
||||||
|
border-radius: 5px;
|
||||||
|
overflow-x: auto;
|
||||||
|
}
|
||||||
|
table {
|
||||||
|
border-collapse: collapse;
|
||||||
|
width: 100%;
|
||||||
|
margin: 1em 0;
|
||||||
|
}
|
||||||
|
th, td {
|
||||||
|
border: 1px solid #ddd;
|
||||||
|
padding: 8px;
|
||||||
|
text-align: left;
|
||||||
|
}
|
||||||
|
th {
|
||||||
|
background-color: #3498db;
|
||||||
|
color: white;
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Create full HTML document
|
||||||
|
full_html = f"""
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<title>Project Onboarding Guide</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
{html_content}
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Generate PDF
|
||||||
|
settings = get_settings()
|
||||||
|
storage_root = Path(settings.storage_root)
|
||||||
|
reports_dir = storage_root / "reports"
|
||||||
|
reports_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
pdf_path = reports_dir / f"report_{job_id}.pdf"
|
||||||
|
|
||||||
|
HTML(string=full_html).write_pdf(
|
||||||
|
pdf_path,
|
||||||
|
stylesheets=[CSS(string=css_style)]
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.info("PDF report generated: %s", pdf_path)
|
||||||
|
return pdf_path
|
||||||
|
|
||||||
|
except Exception as exc:
|
||||||
|
logger.exception("Failed to convert Markdown to PDF: %s", exc)
|
||||||
|
return None
|
||||||
|
|
||||||
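# Illustrative sketch (not part of this commit): the markdown + WeasyPrint
# conversion used by ReportGenerator._convert_to_pdf above, reduced to its core.
# Both libraries are optional imports in this module, so this only applies when
# HAS_MARKDOWN and HAS_WEASYPRINT are True; the output path is hypothetical.
#
#   >>> import markdown
#   >>> from weasyprint import HTML
#   >>> html_body = markdown.markdown("# Guide\n\nHello **world**", extensions=["tables"])
#   >>> HTML(string=f"<html><body>{html_body}</body></html>").write_pdf("/tmp/guide.pdf")
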
@ -0,0 +1,269 @@
from __future__ import annotations

import logging
from typing import Dict, List, Optional
from uuid import uuid4

from ..config import get_settings
from ..models import CausalRelation

logger = logging.getLogger(__name__)

try:
    from qdrant_client import QdrantClient
    from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue
    HAS_QDRANT = True
except ImportError:
    HAS_QDRANT = False
    logger.warning("qdrant-client not available")


class VectorStore:
    """Qdrant vector database client for storing KG embeddings."""

    def __init__(
        self,
        url: str | None = None,
        collection_name: str | None = None,
        vector_size: int | None = None
    ):
        if not HAS_QDRANT:
            raise ImportError("qdrant-client is required for vector storage")

        settings = get_settings()
        self.url = url or settings.qdrant_url
        self.collection_name = collection_name or settings.qdrant_collection_name
        self.vector_size = vector_size or settings.qdrant_vector_size

        logger.info("Connecting to Qdrant at %s", self.url)
        try:
            self.client = QdrantClient(url=self.url)
            logger.info("Connected to Qdrant")
        except Exception as exc:
            logger.exception("Failed to connect to Qdrant: %s", exc)
            raise

        # Ensure collection exists
        self._ensure_collection()

    def _ensure_collection(self) -> None:
        """Create collection if it doesn't exist."""
        try:
            collections = self.client.get_collections()
            collection_names = [col.name for col in collections.collections]

            if self.collection_name not in collection_names:
                logger.info("Creating Qdrant collection: %s", self.collection_name)
                try:
                    self.client.create_collection(
                        collection_name=self.collection_name,
                        vectors_config=VectorParams(
                            size=self.vector_size,
                            distance=Distance.COSINE
                        )
                    )
                    logger.info("Created collection: %s", self.collection_name)
                except Exception as create_exc:
                    # Collection might have been created by another instance
                    if "already exists" in str(create_exc).lower() or "409" in str(create_exc):
                        logger.info("Collection %s already exists (created by another instance)", self.collection_name)
                    else:
                        raise
            else:
                logger.debug("Collection %s already exists", self.collection_name)
        except Exception as exc:
            logger.exception("Failed to ensure collection: %s", exc)
            raise

    def store_relation(
        self,
        relation: CausalRelation,
        embedding: List[float],
        job_id: str
    ) -> str:
        """Store a relationship embedding in Qdrant."""
        point_id = str(uuid4())

        payload = {
            "job_id": job_id,
            "cause": relation.cause,
            "effect": relation.effect,
            "confidence": relation.confidence,
            "source_file_id": relation.source_file_id or "",
            "source_snippet": relation.source_snippet or "",
            "explanation": relation.explanation or "",
        }

        point = PointStruct(
            id=point_id,
            vector=embedding,
            payload=payload
        )

        try:
            self.client.upsert(
                collection_name=self.collection_name,
                points=[point]
            )
            logger.debug("Stored relation embedding: %s -> %s", relation.cause, relation.effect)
            return point_id
        except Exception as exc:
            logger.warning("Failed to store relation: %s", exc)
            return ""

    def store_concept(
        self,
        concept_name: str,
        embedding: List[float],
        job_id: str,
        description: str | None = None
    ) -> str:
        """Store a concept/node embedding in Qdrant."""
        point_id = str(uuid4())

        payload = {
            "job_id": job_id,
            "concept_name": concept_name,
            "description": description or "",
            "type": "concept"
        }

        point = PointStruct(
            id=point_id,
            vector=embedding,
            payload=payload
        )

        try:
            self.client.upsert(
                collection_name=self.collection_name,
                points=[point]
            )
            logger.debug("Stored concept embedding: %s", concept_name)
            return point_id
        except Exception as exc:
            logger.warning("Failed to store concept: %s", exc)
            return ""

    def search(
        self,
        query_embedding: List[float],
        job_id: str | None = None,
        top_k: int = 10,
        score_threshold: float = 0.5
    ) -> List[Dict]:
        """Search for similar vectors in Qdrant."""
        try:
            # Build filter if job_id is provided
            query_filter = None
            if job_id:
                query_filter = Filter(
                    must=[
                        FieldCondition(
                            key="job_id",
                            match=MatchValue(value=job_id)
                        )
                    ]
                )

            # Use the collections API for search
            # Check if client has search method (newer versions) or use query_points (older)
            if hasattr(self.client, 'search'):
                results = self.client.search(
                    collection_name=self.collection_name,
                    query_vector=query_embedding,
                    query_filter=query_filter,
                    limit=top_k,
                    score_threshold=score_threshold
                )
            elif hasattr(self.client, 'query_points'):
                # Fallback for older API
                results = self.client.query_points(
                    collection_name=self.collection_name,
                    query=query_embedding,
                    query_filter=query_filter,
                    top=top_k,
                    score_threshold=score_threshold
                )
            else:
                # Try using the collection directly
                collection = self.client.get_collection(self.collection_name)
                if hasattr(collection, 'search'):
                    results = collection.search(
                        query_vector=query_embedding,
                        query_filter=query_filter,
                        limit=top_k,
                        score_threshold=score_threshold
                    )
                else:
                    logger.error("QdrantClient does not have search or query_points method")
                    return []

            # Convert to list of dicts
            search_results = []
            for result in results:
                search_results.append({
                    "id": str(result.id),
                    "score": result.score,
                    "payload": result.payload
                })

            return search_results

        except Exception as exc:
            logger.warning("Vector search failed: %s", exc)
            import traceback
            logger.debug("Search error traceback: %s", traceback.format_exc())
            return []

    def search_by_text(
        self,
        query_text: str,
        embedder,
        job_id: str | None = None,
        top_k: int = 10
    ) -> List[Dict]:
        """Search using text query (embeds it first)."""
        query_embedding = embedder.embed_text(query_text)
        return self.search(query_embedding, job_id=job_id, top_k=top_k)

    def delete_job_vectors(self, job_id: str) -> int:
        """Delete all vectors for a specific job."""
        try:
            # Qdrant doesn't have a direct delete by filter here, so we:
            # 1. Scroll for all points with this job_id
            # 2. Delete them by ID
            # For very large jobs you might want to page through the scroll API in batches.
            query_filter = Filter(
                must=[
                    FieldCondition(
                        key="job_id",
                        match=MatchValue(value=job_id)
                    )
                ]
            )

            # Scroll to get all points
            points, _ = self.client.scroll(
                collection_name=self.collection_name,
                scroll_filter=query_filter,
                limit=10000  # Adjust based on expected size
            )

            if points:
                point_ids = [str(point.id) for point in points]
                self.client.delete(
                    collection_name=self.collection_name,
                    points_selector=point_ids
                )
                logger.info("Deleted %d vectors for job %s", len(point_ids), job_id)
                return len(point_ids)

            return 0

        except Exception as exc:
            logger.warning("Failed to delete job vectors: %s", exc)
            return 0

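# Illustrative usage sketch (not part of this commit): indexing one validated
# relation and querying it back. "Embedder" and "relation" below are hypothetical
# stand-ins for the pipeline's embedder instance and a previously extracted
# CausalRelation; any embedder returning vectors of settings.qdrant_vector_size
# would fit the same calls.
#
#   >>> store = VectorStore()                  # reads qdrant_url from settings
#   >>> embedder = Embedder()                  # hypothetical embedder instance
#   >>> vec = embedder.embed_text(f"{relation.cause} -> {relation.effect}")
#   >>> point_id = store.store_relation(relation, vec, job_id="job-42")
#   >>> hits = store.search_by_text("why did the backup fail?", embedder, job_id="job-42", top_k=3)
#   >>> [h["payload"]["effect"] for h in hits]   # payloads carry cause/effect/snippet
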
@ -4,14 +4,19 @@ import logging
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, List
|
from typing import Iterable, List
|
||||||
|
|
||||||
from ..claude_client import ClaudeCausalExtractor
|
|
||||||
from ..config import get_settings
|
from ..config import get_settings
|
||||||
from ..extractors.auto import extract_text
|
|
||||||
from ..extractors.image_extractor import extract_images_from_file
|
from ..extractors.image_extractor import extract_images_from_file
|
||||||
|
from ..extractors.pymupdf_extractor import extract_all_text, extract_text_with_context
|
||||||
|
from ..extractors.qwen_vision import QwenVisionClient
|
||||||
from ..jobs import JobStore
|
from ..jobs import JobStore
|
||||||
from ..models import CausalRelation, JobStage
|
from ..models import CausalRelation, JobStage
|
||||||
from ..processors.chunker import TextChunker
|
from ..processors.dowhy_analyzer import DoWhyAnalyzer
|
||||||
|
from ..processors.embedder import Embedder
|
||||||
|
from ..processors.entity_resolver import EntityResolver
|
||||||
from ..processors.graph_writer import GraphWriter
|
from ..processors.graph_writer import GraphWriter
|
||||||
|
from ..processors.relationship_extractor import RelationshipExtractor
|
||||||
|
from ..processors.report_generator import ReportGenerator
|
||||||
|
from ..processors.vector_store import VectorStore
|
||||||
from ..storage import StorageManager
|
from ..storage import StorageManager
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@ -23,31 +28,60 @@ class JobPipeline:
|
|||||||
job_store: JobStore,
|
job_store: JobStore,
|
||||||
storage: StorageManager,
|
storage: StorageManager,
|
||||||
graph_writer: GraphWriter,
|
graph_writer: GraphWriter,
|
||||||
claude_extractor: ClaudeCausalExtractor,
|
|
||||||
):
|
):
|
||||||
self.job_store = job_store
|
self.job_store = job_store
|
||||||
self.storage = storage
|
self.storage = storage
|
||||||
self.graph_writer = graph_writer
|
self.graph_writer = graph_writer
|
||||||
self.claude_extractor = claude_extractor
|
|
||||||
settings = get_settings()
|
settings = get_settings()
|
||||||
self.chunker = TextChunker(
|
|
||||||
model_name=settings.claude_model,
|
# Initialize extractors
|
||||||
token_target=settings.chunk_token_target,
|
self.qwen_client = QwenVisionClient() # Only for images/diagrams
|
||||||
overlap=settings.chunk_token_overlap,
|
self.relationship_extractor = RelationshipExtractor() # NLP (SpaCy) + Claude AI for text (as per README)
|
||||||
)
|
self.entity_resolver = EntityResolver() # Claude AI entity resolution (as per README Stage 4)
|
||||||
|
|
||||||
|
# Initialize processors
|
||||||
|
try:
|
||||||
|
self.dowhy_analyzer = DoWhyAnalyzer() if settings.dowhy_enabled else None
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("DoWhy not available: %s", e)
|
||||||
|
self.dowhy_analyzer = None
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.embedder = Embedder()
|
||||||
|
self.vector_store = VectorStore()
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Vector store not available: %s", e)
|
||||||
|
self.embedder = None
|
||||||
|
self.vector_store = None
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.report_generator = ReportGenerator()
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Report generator not available: %s", e)
|
||||||
|
self.report_generator = None
|
||||||
|
|
||||||
def process_job(self, job_id: str, saved_files: Iterable[str]) -> None:
|
def process_job(self, job_id: str, saved_files: Iterable[str]) -> None:
|
||||||
job = self.job_store.get(job_id)
|
job = self.job_store.get(job_id)
|
||||||
logger.info("Processing job %s with %d files", job_id, job.total_files)
|
logger.info("Processing job %s with %d files", job_id, job.total_files)
|
||||||
|
|
||||||
relations: List[CausalRelation] = []
|
all_text_content: List[str] = []
|
||||||
|
all_relations: List[CausalRelation] = []
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.job_store.update(job_id, stage=JobStage.EXTRACTING, status_message="Extracting content")
|
# ============================================================
|
||||||
|
# STEP 1: CONTENT EXTRACTION (PyMuPDF + Qwen2.5-VL)
|
||||||
|
# ============================================================
|
||||||
|
self.job_store.update(
|
||||||
|
job_id,
|
||||||
|
stage=JobStage.EXTRACTING,
|
||||||
|
status_message="Extracting content from documents"
|
||||||
|
)
|
||||||
|
|
||||||
for count, file_path in enumerate(saved_files, start=1):
|
for count, file_path in enumerate(saved_files, start=1):
|
||||||
file_path_obj = Path(file_path)
|
file_path_obj = Path(file_path)
|
||||||
file_record = next((f for f in job.files if f.stored_path == file_path), None)
|
file_record = next((f for f in job.files if f.stored_path == file_path), None)
|
||||||
logger.info("Processing %s", file_path_obj.name)
|
logger.info("Processing %s (%d/%d)", file_path_obj.name, count, job.total_files)
|
||||||
source_file_id = file_record.id if file_record else file_path_obj.name
|
source_file_id = file_record.id if file_record else file_path_obj.name
|
||||||
suffix = file_path_obj.suffix.lower()
|
suffix = file_path_obj.suffix.lower()
|
||||||
|
|
||||||
@ -55,27 +89,36 @@ class JobPipeline:
|
|||||||
is_direct_image = suffix in {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}
|
is_direct_image = suffix in {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Extract text from document (if not a direct image)
|
# Step 2.1: IDENTIFY FILE TYPE and route to appropriate extractor
|
||||||
|
# Step 2.2: Extract text based on file type (as per README)
|
||||||
text = ""
|
text = ""
|
||||||
if not is_direct_image:
|
if not is_direct_image:
|
||||||
try:
|
try:
|
||||||
text = extract_text(file_path_obj)
|
# extract_all_text() handles routing:
|
||||||
|
# - PDF → PyMuPDF (Step 2.2a)
|
||||||
|
# - DOCX → python-docx (Step 2.2b)
|
||||||
|
# - PPTX → python-pptx (Step 2.2c)
|
||||||
|
# - CSV/XLSX → pandas (Step 2.2d)
|
||||||
|
# - Text files → direct read
|
||||||
|
# Also performs Step 2.3: Text cleaning
|
||||||
|
text = extract_all_text(file_path_obj)
|
||||||
|
|
||||||
# Process text if available
|
|
||||||
if text and text.strip():
|
if text and text.strip():
|
||||||
# Validate text is readable
|
# Validate text is readable (basic check)
|
||||||
printable_chars = sum(1 for c in text if c.isprintable() or c.isspace())
|
printable_chars = sum(1 for c in text if c.isprintable() or c.isspace())
|
||||||
total_chars = len(text)
|
total_chars = len(text)
|
||||||
if total_chars > 100 and printable_chars / total_chars < 0.3:
|
if total_chars > 100 and printable_chars / total_chars < 0.3:
|
||||||
logger.warning("Text from %s appears to be binary, skipping text processing", file_path_obj.name)
|
logger.warning("Text from %s appears to be binary, skipping", file_path_obj.name)
|
||||||
text = ""
|
text = ""
|
||||||
else:
|
else:
|
||||||
|
# Step 2.4: STORE EXTRACTED TEXT
|
||||||
|
all_text_content.append(text)
|
||||||
extracted_path = self.storage.stage_extracted_content(job_id, file_path_obj.name, text)
|
extracted_path = self.storage.stage_extracted_content(job_id, file_path_obj.name, text)
|
||||||
if file_record:
|
if file_record:
|
||||||
file_record.extracted_path = str(extracted_path)
|
file_record.extracted_path = str(extracted_path)
|
||||||
logger.info("Successfully extracted %d characters from %s", len(text), file_path_obj.name)
|
logger.info("Extracted %d characters from %s", len(text), file_path_obj.name)
|
||||||
except Exception as text_exc:
|
except Exception as text_exc:
|
||||||
logger.warning("Text extraction failed for %s: %s. Will continue with image extraction if available.", file_path_obj.name, text_exc)
|
logger.warning("Text extraction failed for %s: %s", file_path_obj.name, text_exc)
|
||||||
text = ""
|
text = ""
|
||||||
|
|
||||||
# Extract images from documents (PDF, DOCX, PPTX)
|
# Extract images from documents (PDF, DOCX, PPTX)
|
||||||
@ -93,7 +136,25 @@ class JobPipeline:
|
|||||||
extracted_images = [file_path_obj]
|
extracted_images = [file_path_obj]
|
||||||
logger.info("Direct image upload detected: %s", file_path_obj.name)
|
logger.info("Direct image upload detected: %s", file_path_obj.name)
|
||||||
|
|
||||||
except Exception as exc: # noqa: BLE001
|
# Process images with Qwen2.5-VL
|
||||||
|
if extracted_images:
|
||||||
|
for image_path in extracted_images:
|
||||||
|
try:
|
||||||
|
qwen_results = self.qwen_client.extract_relationships_from_image(
|
||||||
|
image_path, source_file_id
|
||||||
|
)
|
||||||
|
if qwen_results:
|
||||||
|
# Convert Qwen results to CausalRelation objects
|
||||||
|
qwen_relations = self.relationship_extractor.extract_from_qwen_results(
|
||||||
|
qwen_results, source_file_id
|
||||||
|
)
|
||||||
|
all_relations.extend(qwen_relations)
|
||||||
|
logger.info("Extracted %d relations from image %s using Qwen2.5-VL",
|
||||||
|
len(qwen_relations), image_path.name)
|
||||||
|
except Exception as img_exc:
|
||||||
|
logger.warning("Failed to analyze image %s with Qwen: %s", image_path, img_exc)
|
||||||
|
|
||||||
|
except Exception as exc:
|
||||||
logger.exception("Extraction failed for %s", file_path_obj)
|
logger.exception("Extraction failed for %s", file_path_obj)
|
||||||
if file_record:
|
if file_record:
|
||||||
file_record.error = str(exc)
|
file_record.error = str(exc)
|
||||||
@ -103,62 +164,188 @@ class JobPipeline:
                 job_id,
                 files=job.files,
                 processed_files=count,
-                status_message=f"Analyzing causal relations ({count}/{job.total_files})",
-                stage=JobStage.ANALYZING,
+                status_message=f"Extracting content ({count}/{job.total_files})",
             )

-            # Process text content
-            if text and text.strip():
-                chunks = self.chunker.chunk(text)
-                text_relations = self.claude_extractor.analyze(chunks, source_file_id=source_file_id)
-                relations.extend(text_relations)
-                logger.info("Extracted %d relations from text in %s", len(text_relations), file_path_obj.name)
-
-            # Process images (extracted from documents or direct uploads)
-            if extracted_images:
-                for image_path in extracted_images:
-                    try:
-                        image_relations = self.claude_extractor.analyze_image(image_path, source_file_id=source_file_id)
-                        relations.extend(image_relations)
-                        logger.info("Extracted %d relations from image %s", len(image_relations), image_path.name)
-                    except Exception as img_exc:
-                        logger.warning("Failed to analyze image %s: %s", image_path, img_exc)
-                        # Continue with other images
-            elif not text or not text.strip():
-                # No text and no images - file might be empty or unsupported
-                logger.warning("File %s has no extractable text or images", file_path_obj.name)
-                if file_record:
-                    file_record.error = "No extractable content found (no text or images)"
-
-            # Write relations to Neo4j if any were found
-            if relations:
-                self.job_store.update(job_id, status_message="Writing to knowledge graph", stage=JobStage.BUILDING_GRAPH)
+            # ============================================================
+            # STEP 2: RELATIONSHIP EXTRACTION (NLP + Claude AI as per README)
+            # ============================================================
+            logger.info("Extracting relationships from text content using NLP (SpaCy) + Claude AI")
+            combined_text = "\n\n".join(all_text_content)
+
+            if combined_text.strip():
+                # Extract relationships using NLP (Step 3.1) + Claude AI (Step 3.2)
+                # This implements the flow described in README.md
+                text_relations = self.relationship_extractor.extract_from_text(
+                    combined_text,
+                    source_file_id="combined_text"
+                )
+                all_relations.extend(text_relations)
+                logger.info("NLP + Claude AI extracted %d relationships from text", len(text_relations))
+
+            # ============================================================
+            # STEP 3: ENTITY RESOLUTION (Claude AI as per README Stage 4)
+            # ============================================================
+            if all_relations and self.entity_resolver.client:
+                logger.info("Resolving entities using Claude AI")
+                resolved_entities = self.entity_resolver.resolve_entities(all_relations)
+                if resolved_entities:
+                    # Apply resolution to relationships
+                    all_relations = self.entity_resolver.apply_resolution_to_relations(
+                        all_relations, resolved_entities
+                    )
+                    logger.info("Entity resolution completed: %d canonical entities", len(resolved_entities))
+                else:
+                    logger.info("Entity resolution returned no results")
+            else:
+                if not self.entity_resolver.client:
+                    logger.info("Entity resolution skipped (Claude AI not available)")
+
+            # ============================================================
+            # STEP 4: DOWHY VALIDATION
+            # ============================================================
+            if self.dowhy_analyzer and all_relations:
+                self.job_store.update(
+                    job_id,
+                    status_message="Validating relationships with DoWhy",
+                    stage=JobStage.BUILDING_GRAPH
+                )
+                logger.info("Validating %d relationships with DoWhy", len(all_relations))
+                validated_relations = self.dowhy_analyzer.validate_relationships(
+                    all_relations,
+                    text_data=combined_text
+                )
+                all_relations = validated_relations
+                logger.info("DoWhy validated %d relationships", len(all_relations))
+            else:
+                if not self.dowhy_analyzer:
+                    logger.info("DoWhy validation skipped (not available)")
+                self.job_store.update(
+                    job_id,
+                    status_message="Building knowledge graph",
+                    stage=JobStage.BUILDING_GRAPH
+                )
+
+            # ============================================================
+            # STEP 5: WRITE TO NEO4J (Documents, Entities, Relationships)
+            # ============================================================
+            if all_relations:
                 try:
-                    self.graph_writer.write_relations(job_id, relations)
-                    logger.info("Wrote %d relations to Neo4j for job %s", len(relations), job_id)
-                    status_message = f"Completed with {len(relations)} causal relationship(s) written to Neo4j"
+                    # Write documents, entities, and relationships with types
+                    self.graph_writer.write_relations(job_id, all_relations, files=job.files)
+                    logger.info("Wrote %d relations to Neo4j for job %s", len(all_relations), job_id)
                 except Exception as graph_exc:
-                    logger.exception("Failed to write relations to Neo4j for job %s: %s", job_id, graph_exc)
-                    status_message = f"Completed with {len(relations)} relations extracted, but failed to write to Neo4j: {graph_exc}"
-            else:
-                logger.warning("Job %s completed with 0 relations - no causal relationships found", job_id)
-                # Check if any files failed to extract
-                failed_files = [f for f in job.files if f.error]
-                if failed_files:
-                    status_message = f"Completed but {len(failed_files)} file(s) failed to extract. No relations found."
-                else:
-                    status_message = "Completed but no causal relationships were found in the documents."
-
-            # Final update
+                    logger.exception("Failed to write relations to Neo4j: %s", graph_exc)
+                    raise
+
+            # ============================================================
+            # STEP 6: VECTOR DATABASE INDEXING (Qdrant)
+            # ============================================================
+            if self.vector_store and self.embedder and all_relations:
+                self.job_store.update(
+                    job_id,
+                    status_message="Indexing knowledge graph in vector database",
+                    stage=JobStage.INDEXING_VECTORS
+                )
+                logger.info("Indexing %d relationships in Qdrant", len(all_relations))
+
+                indexed_count = 0
+                for relation in all_relations:
+                    try:
+                        # Generate embedding for the relationship
+                        embedding = self.embedder.embed_relation(
+                            relation.cause,
+                            relation.effect,
+                            relation.explanation
+                        )
+
+                        # Store in Qdrant
+                        self.vector_store.store_relation(relation, embedding, job_id)
+                        indexed_count += 1
+                    except Exception as e:
+                        logger.warning("Failed to index relation %s -> %s: %s",
+                                       relation.cause, relation.effect, e)
+
+                # Also index concepts (nodes)
+                concepts = set()
+                for rel in all_relations:
+                    concepts.add(rel.cause)
+                    concepts.add(rel.effect)
+
+                for concept in concepts:
+                    try:
+                        embedding = self.embedder.embed_concept(concept)
+                        self.vector_store.store_concept(concept, embedding, job_id)
+                    except Exception as e:
+                        logger.warning("Failed to index concept %s: %s", concept, e)
+
+                logger.info("Indexed %d relationships and %d concepts in Qdrant",
+                            indexed_count, len(concepts))
+
+            # ============================================================
+            # STEP 7: GENERATE ONBOARDING REPORT
+            # ============================================================
+            if self.report_generator and self.vector_store and self.embedder:
+                self.job_store.update(
+                    job_id,
+                    status_message="Generating beginner-friendly onboarding report",
+                    stage=JobStage.GENERATING_REPORT
+                )
+                logger.info("Generating onboarding report for job %s", job_id)
+
+                try:
+                    kg_summary = {
+                        "total_relations": len(all_relations),
+                        "total_files": job.total_files,
+                        "processed_files": job.processed_files
+                    }
+
+                    report = self.report_generator.generate_onboarding_report(
+                        job_id=job_id,
+                        relations=all_relations,
+                        vector_store=self.vector_store,
+                        embedder=self.embedder,
+                        graph_writer=self.graph_writer,  # Pass graph_writer for Neo4j queries
+                        kg_summary=kg_summary
+                    )
+
+                    logger.info("Generated onboarding report: %d sections, %d pages",
+                                len(report.sections), report.total_pages)
+
+                except Exception as report_exc:
+                    logger.exception("Failed to generate report: %s", report_exc)
+                    report = None
+                    # Store report generation error in job metadata
+                    report_error_msg = str(report_exc)
+                    if "credit balance" in report_error_msg.lower() or "too low" in report_error_msg.lower():
+                        report_error_msg = "Report generation failed: Claude API credit balance is too low. Please add credits to your Anthropic account."
+                    self.job_store.update(
+                        job_id,
+                        error=f"Report generation failed: {report_error_msg}"
+                    )
+            else:
+                logger.warning("Report generation skipped (components not available)")
+                report = None
+
+            # ============================================================
+            # FINAL UPDATE
+            # ============================================================
+            status_message = f"Completed successfully"
+            if all_relations:
+                status_message += f" with {len(all_relations)} relationships"
+            if report:
+                status_message += f" and generated onboarding report"
+
             self.job_store.update(
                 job_id,
                 stage=JobStage.COMPLETED,
                 status_message=status_message,
-                relations=relations,
+                relations=all_relations,
+                report=report,
                 processed_files=job.total_files,
             )
-            logger.info("Job %s completed with %d relations", job_id, len(relations))
-        except Exception as exc:  # noqa: BLE001
+            logger.info("Job %s completed successfully", job_id)
+
+        except Exception as exc:
             logger.exception("Job %s failed: %s", job_id, exc)
             self.job_store.mark_error(job_id, f"Pipeline failed: {exc}")