Added Qdrant DB in multi-document service

This commit is contained in:
Pradeep 2025-12-01 09:04:09 +05:30
parent 603e9b4b20
commit 72fea0dee8
35 changed files with 5398 additions and 1765 deletions

View File

@ -196,27 +196,45 @@ services:
# retries: 5
# start_period: 60s
chromadb:
image: chromadb/chroma:latest
container_name: pipeline_chromadb
# chromadb:
# image: chromadb/chroma:latest
# container_name: pipeline_chromadb
# ports:
# - "8010:8000"
# environment:
# - CHROMA_SERVER_HOST=0.0.0.0
# - CHROMA_SERVER_HTTP_PORT=8000
# - IS_PERSISTENT=TRUE
# - PERSIST_DIRECTORY=/chroma/chroma
# - ANONYMIZED_TELEMETRY=TRUE
# volumes:
# - chromadb_data:/chroma/chroma
# networks:
# - pipeline_network
# healthcheck:
# test: ["CMD-SHELL", "timeout 5 bash -c '</dev/tcp/127.0.0.1/8000' || exit 1"]
# interval: 15s
# timeout: 10s
# retries: 3
# start_period: 30s
qdrant:
image: qdrant/qdrant:latest
container_name: pipeline_qdrant
ports:
- "8010:8000"
environment:
- CHROMA_SERVER_HOST=0.0.0.0
- CHROMA_SERVER_HTTP_PORT=8000
- IS_PERSISTENT=TRUE
- PERSIST_DIRECTORY=/chroma/chroma
- ANONYMIZED_TELEMETRY=TRUE
- "6333:6333"
- "6334:6334"
volumes:
- chromadb_data:/chroma/chroma
- qdrant_data:/qdrant/storage
networks:
- pipeline_network
healthcheck:
test: ["CMD-SHELL", "timeout 5 bash -c '</dev/tcp/127.0.0.1/8000' || exit 1"]
interval: 15s
test: ["CMD-SHELL", "timeout 2 bash -c '</dev/tcp/127.0.0.1/6333' || exit 1"]
interval: 30s
timeout: 10s
retries: 3
retries: 5
start_period: 30s
restart: unless-stopped
@ -294,97 +312,97 @@ services:
start_period: 40s
restart: unless-stopped
requirement-processor:
build: ./services/requirement-processor
container_name: pipeline_requirement_processor
ports:
- "8001:8001"
environment:
- POSTGRES_HOST=postgres
- POSTGRES_PORT=5432
- POSTGRES_DB=dev_pipeline
- POSTGRES_USER=pipeline_admin
- POSTGRES_PASSWORD=secure_pipeline_2024
- DATABASE_URL=postgresql://pipeline_admin:secure_pipeline_2024@postgres:5432/dev_pipeline
- REDIS_HOST=redis
- REDIS_PORT=6379
- REDIS_PASSWORD=redis_secure_2024
- MONGODB_HOST=mongodb
- MONGODB_PORT=27017
- NEO4J_URI=bolt://neo4j:7687
- NEO4J_USER=neo4j
- NEO4J_PASSWORD=password
- CHROMA_HOST=chromadb
- CHROMA_PORT=8000
- REDIS_URL=redis://:redis_secure_2024@redis:6379
networks:
- pipeline_network
depends_on:
postgres:
condition: service_healthy
redis:
condition: service_healthy
mongodb:
condition: service_started
migrations:
condition: service_completed_successfully
# requirement-processor:
# build: ./services/requirement-processor
# container_name: pipeline_requirement_processor
# ports:
# - "8001:8001"
# environment:
# - POSTGRES_HOST=postgres
# - POSTGRES_PORT=5432
# - POSTGRES_DB=dev_pipeline
# - POSTGRES_USER=pipeline_admin
# - POSTGRES_PASSWORD=secure_pipeline_2024
# - DATABASE_URL=postgresql://pipeline_admin:secure_pipeline_2024@postgres:5432/dev_pipeline
# - REDIS_HOST=redis
# - REDIS_PORT=6379
# - REDIS_PASSWORD=redis_secure_2024
# - MONGODB_HOST=mongodb
# - MONGODB_PORT=27017
# - NEO4J_URI=bolt://neo4j:7687
# - NEO4J_USER=neo4j
# - NEO4J_PASSWORD=password
# - CHROMA_HOST=chromadb
# - CHROMA_PORT=8000
# - REDIS_URL=redis://:redis_secure_2024@redis:6379
# networks:
# - pipeline_network
# depends_on:
# postgres:
# condition: service_healthy
# redis:
# condition: service_healthy
# mongodb:
# condition: service_started
# migrations:
# condition: service_completed_successfully
tech-stack-selector:
build: ./services/tech-stack-selector
container_name: pipeline_tech_stack_selector
ports:
- "8002:8002"
environment:
- POSTGRES_HOST=postgres
- POSTGRES_PORT=5432
- POSTGRES_DB=dev_pipeline
- POSTGRES_USER=pipeline_admin
- POSTGRES_PASSWORD=secure_pipeline_2024
- REDIS_HOST=redis
- REDIS_PORT=6379
- REDIS_PASSWORD=redis_secure_2024
- CLAUDE_API_KEY=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
networks:
- pipeline_network
depends_on:
postgres:
condition: service_healthy
redis:
condition: service_healthy
migrations:
condition: service_completed_successfully
# tech-stack-selector:
# build: ./services/tech-stack-selector
# container_name: pipeline_tech_stack_selector
# ports:
# - "8002:8002"
# environment:
# - POSTGRES_HOST=postgres
# - POSTGRES_PORT=5432
# - POSTGRES_DB=dev_pipeline
# - POSTGRES_USER=pipeline_admin
# - POSTGRES_PASSWORD=secure_pipeline_2024
# - REDIS_HOST=redis
# - REDIS_PORT=6379
# - REDIS_PASSWORD=redis_secure_2024
# - CLAUDE_API_KEY=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
# networks:
# - pipeline_network
# depends_on:
# postgres:
# condition: service_healthy
# redis:
# condition: service_healthy
# migrations:
# condition: service_completed_successfully
architecture-designer:
build: ./services/architecture-designer
container_name: pipeline_architecture_designer
ports:
- "8003:8003"
environment:
- PORT=8003
- HOST=0.0.0.0
- CLAUDE_API_KEY=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
- ANTHROPIC_API_KEY=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
- POSTGRES_HOST=postgres
- POSTGRES_PORT=5432
- POSTGRES_DB=dev_pipeline
- POSTGRES_USER=pipeline_admin
- POSTGRES_PASSWORD=secure_pipeline_2024
- MONGODB_HOST=mongodb
- MONGODB_PORT=27017
networks:
- pipeline_network
depends_on:
postgres:
condition: service_healthy
mongodb:
condition: service_started
migrations:
condition: service_completed_successfully
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8003/health"]
interval: 30s
timeout: 10s
retries: 3
# architecture-designer:
# build: ./services/architecture-designer
# container_name: pipeline_architecture_designer
# ports:
# - "8003:8003"
# environment:
# - PORT=8003
# - HOST=0.0.0.0
# - CLAUDE_API_KEY=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
# - ANTHROPIC_API_KEY=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
# - POSTGRES_HOST=postgres
# - POSTGRES_PORT=5432
# - POSTGRES_DB=dev_pipeline
# - POSTGRES_USER=pipeline_admin
# - POSTGRES_PASSWORD=secure_pipeline_2024
# - MONGODB_HOST=mongodb
# - MONGODB_PORT=27017
# networks:
# - pipeline_network
# depends_on:
# postgres:
# condition: service_healthy
# mongodb:
# condition: service_started
# migrations:
# condition: service_completed_successfully
# healthcheck:
# test: ["CMD", "curl", "-f", "http://localhost:8003/health"]
# interval: 30s
# timeout: 10s
# retries: 3
# code-generator:
# build: ./services/code-generator
@ -461,34 +479,34 @@ services:
migrations:
condition: service_completed_successfully
deployment-manager:
build: ./services/deployment-manager
container_name: pipeline_deployment_manager
ports:
- "8006:8006"
environment:
- POSTGRES_HOST=postgres
- POSTGRES_PORT=5432
- POSTGRES_DB=dev_pipeline
- POSTGRES_USER=pipeline_admin
- POSTGRES_PASSWORD=secure_pipeline_2024
- MONGODB_HOST=mongodb
- MONGODB_PORT=27017
- RABBITMQ_HOST=rabbitmq
- RABBITMQ_PORT=5672
- RABBITMQ_USER=pipeline_admin
- RABBITMQ_PASSWORD=rabbit_secure_2024
networks:
- pipeline_network
depends_on:
postgres:
condition: service_healthy
rabbitmq:
condition: service_healthy
mongodb:
condition: service_started
migrations:
condition: service_completed_successfully
# deployment-manager:
# build: ./services/deployment-manager
# container_name: pipeline_deployment_manager
# ports:
# - "8006:8006"
# environment:
# - POSTGRES_HOST=postgres
# - POSTGRES_PORT=5432
# - POSTGRES_DB=dev_pipeline
# - POSTGRES_USER=pipeline_admin
# - POSTGRES_PASSWORD=secure_pipeline_2024
# - MONGODB_HOST=mongodb
# - MONGODB_PORT=27017
# - RABBITMQ_HOST=rabbitmq
# - RABBITMQ_PORT=5672
# - RABBITMQ_USER=pipeline_admin
# - RABBITMQ_PASSWORD=rabbit_secure_2024
# networks:
# - pipeline_network
# depends_on:
# postgres:
# condition: service_healthy
# rabbitmq:
# condition: service_healthy
# mongodb:
# condition: service_started
# migrations:
# condition: service_completed_successfully
user-auth:
build: ./services/user-auth
@ -583,38 +601,38 @@ services:
restart: unless-stopped
# AI Mockup / Wireframe Generation Service
ai-mockup-service:
build: ./services/ai-mockup-service
container_name: pipeline_ai_mockup_service
ports:
- "8021:8021"
environment:
- PORT=8021
- HOST=0.0.0.0
- CLAUDE_API_KEY=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
- POSTGRES_HOST=postgres
- POSTGRES_PORT=5432
- POSTGRES_DB=dev_pipeline
- POSTGRES_USER=pipeline_admin
- POSTGRES_PASSWORD=secure_pipeline_2024
- REDIS_HOST=redis
- REDIS_PORT=6379
- REDIS_PASSWORD=redis_secure_2024
- JWT_ACCESS_SECRET=access-secret-key-2024-tech4biz-secure_pipeline_2024
- USER_AUTH_SERVICE_URL=http://user-auth:8011
- FLASK_ENV=development
networks:
- pipeline_network
depends_on:
postgres:
condition: service_healthy
user-auth:
condition: service_healthy
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8021/health"]
interval: 30s
timeout: 10s
retries: 3
# ai-mockup-service:
# build: ./services/ai-mockup-service
# container_name: pipeline_ai_mockup_service
# ports:
# - "8021:8021"
# environment:
# - PORT=8021
# - HOST=0.0.0.0
# - CLAUDE_API_KEY=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
# - POSTGRES_HOST=postgres
# - POSTGRES_PORT=5432
# - POSTGRES_DB=dev_pipeline
# - POSTGRES_USER=pipeline_admin
# - POSTGRES_PASSWORD=secure_pipeline_2024
# - REDIS_HOST=redis
# - REDIS_PORT=6379
# - REDIS_PASSWORD=redis_secure_2024
# - JWT_ACCESS_SECRET=access-secret-key-2024-tech4biz-secure_pipeline_2024
# - USER_AUTH_SERVICE_URL=http://user-auth:8011
# - FLASK_ENV=development
# networks:
# - pipeline_network
# depends_on:
# postgres:
# condition: service_healthy
# user-auth:
# condition: service_healthy
# healthcheck:
# test: ["CMD", "curl", "-f", "http://localhost:8021/health"]
# interval: 30s
# timeout: 10s
# retries: 3
git-integration:
build: ./services/git-integration
@ -731,7 +749,7 @@ services:
environment:
- PORT=8022
- HOST=0.0.0.0
- ANTHROPIC_API_KEY=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
- ANTHROPIC_API_KEY=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
# Neo4j Configuration
- USE_NEO4J_KG=true
@ -790,17 +808,37 @@ services:
environment:
- PORT=8024
- HOST=0.0.0.0
- ANTHROPIC_API_KEY=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
# Claude/Anthropic Configuration
- ANTHROPIC_API_KEY=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
- MULTI_DOC_CLAUDE_MODEL=claude-3-5-haiku-latest
- CLAUDE_MODEL=claude-3-5-haiku-latest
# Qwen2.5-VL API Configuration
- QWEN_API_KEY=${QWEN_API_KEY:-}
- QWEN_API_URL=${QWEN_API_URL:-https://api.example.com/v1/chat/completions}
- QWEN_MODEL=qwen2.5-vl
# Neo4j Configuration
- NEO4J_URI=bolt://neo4j:7687
- NEO4J_USER=neo4j
- NEO4J_PASSWORD=password
- NEO4J_DATABASE=neo4j
# Qdrant Configuration
- QDRANT_URL=http://qdrant:6333
- QDRANT_COLLECTION_NAME=kg_embeddings
# DoWhy Configuration
- DOWHY_ENABLED=true
- DOWHY_CONFIDENCE_THRESHOLD=0.05
# Embedding Configuration
- EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
- EMBEDDING_DIMENSION=384
# Storage Configuration
- STORAGE_DIR=/app/storage
- MULTI_DOC_STORAGE_ROOT=/app/storage
# Database configurations (optional, for job tracking)
- POSTGRES_HOST=pipeline_postgres
@ -817,6 +855,8 @@ services:
depends_on:
neo4j:
condition: service_healthy
qdrant:
condition: service_healthy
postgres:
condition: service_healthy
redis:
@ -958,6 +998,8 @@ volumes:
driver: local
multi_document_storage:
driver: local
qdrant_data:
driver: local
# =====================================
# Networks
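
Note: the compose change above only wires the new settings (QDRANT_URL, QDRANT_COLLECTION_NAME, EMBEDDING_MODEL, EMBEDDING_DIMENSION) into the multi-document service; the client code that consumes them is not part of this hunk. A minimal sketch of how those variables could be used, assuming the official qdrant-client and sentence-transformers packages; the helper names ensure_collection and index_chunk are illustrative, not taken from the service:

import os
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, PointStruct, VectorParams
from sentence_transformers import SentenceTransformer

QDRANT_URL = os.getenv("QDRANT_URL", "http://qdrant:6333")
COLLECTION = os.getenv("QDRANT_COLLECTION_NAME", "kg_embeddings")
DIMENSION = int(os.getenv("EMBEDDING_DIMENSION", "384"))

client = QdrantClient(url=QDRANT_URL)
encoder = SentenceTransformer(os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2"))

def ensure_collection() -> None:
    # Create the collection once; 384-dim cosine vectors match all-MiniLM-L6-v2.
    existing = {c.name for c in client.get_collections().collections}
    if COLLECTION not in existing:
        client.create_collection(
            collection_name=COLLECTION,
            vectors_config=VectorParams(size=DIMENSION, distance=Distance.COSINE),
        )

def index_chunk(point_id: int, text: str, payload: dict) -> None:
    # Embed the text and upsert it together with its metadata payload.
    vector = encoder.encode(text).tolist()
    client.upsert(
        collection_name=COLLECTION,
        points=[PointStruct(id=point_id, vector=vector, payload={"text": text, **payload})],
    )

ensure_collection()
index_chunk(1, "Example requirement text", {"doc_id": "demo"})  # illustrative data

The collection dimension must stay in sync with the embedding model, which is why both values travel as environment variables in the compose file.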

View File

@ -7094,8 +7094,29 @@ async def main():
js_files = [fa for fa in frontend_files if fa.path.lower().endswith(('.js', '.jsx', '.mjs', '.cjs'))]
ts_files = [fa for fa in frontend_files if fa.path.lower().endswith(('.ts', '.tsx'))]
# Allocate frontend persona
from persona_system import allocate_code_persona, build_code_analysis_persona_prompt
# Determine if it's UI or state management focused
has_state_files = len(state_files) > 0
sample_file = frontend_files[0] if frontend_files else None
sample_path = sample_file.path if sample_file else ""
sample_content = getattr(sample_file, 'content', '')[:1000] if sample_file else ""
# Allocate persona - prefer state management if state files exist
if has_state_files:
# Try to get state management persona
persona = allocate_code_persona("store/state.ts", sample_content, "frontend_state")
if "state" not in persona.get("role", "").lower():
# Fallback to UI persona
persona = allocate_code_persona(sample_path, sample_content, "frontend_ui")
else:
persona = allocate_code_persona(sample_path, sample_content, "frontend_ui")
assignment_context = f"CTO has assigned you to analyze the frontend codebase for this project. You are analyzing {len(frontend_files)} frontend files including components, routing, state management, and configuration."
front_end_prompt = f"""
You are a Senior Frontend Architect and Technical Writer with 20+ years of experience. Analyze this frontend codebase and produce a comprehensive, technically precise report. The audience includes senior engineers and stakeholders who expect evidence-based, objective findings.
Analyze this frontend codebase and produce a comprehensive, technically precise report. The audience includes senior engineers and stakeholders who expect evidence-based, objective findings.
STRICT STYLE RULES:
- Use professional, technical language only. Do not use analogies, metaphors, storytelling, or colloquial comparisons.
@ -7211,6 +7232,9 @@ FINAL REQUIREMENTS:
- Ensure total length between 2000-3000 words.
"""
# Enhance prompt with persona
enhanced_prompt = build_code_analysis_persona_prompt(front_end_prompt, persona, assignment_context)
try:
print(f"🤖 [FRONTEND AI] Calling Claude API for comprehensive frontend analysis...")
print(f"🤖 [FRONTEND AI] Analyzing {len(frontend_files)} frontend files...")
@ -7220,7 +7244,7 @@ FINAL REQUIREMENTS:
model=os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest"),
max_tokens=8000, # Increased from 6000 to 8000 for more detailed analysis
temperature=0.1,
messages=[{"role": "user", "content": front_end_prompt}]
messages=[{"role": "user", "content": enhanced_prompt}]
)
ai_analysis = message.content[0].text.strip()
@ -7230,7 +7254,7 @@ FINAL REQUIREMENTS:
if not ai_analysis or len(ai_analysis) < 100:
print("⚠️ [FRONTEND AI] AI analysis too short, regenerating...")
# Retry with more emphasis on detail
retry_prompt = front_end_prompt + "\n\nIMPORTANT: Provide a VERY DETAILED analysis. The previous response was too short. Please provide at least 2000 words of detailed explanation."
retry_prompt = enhanced_prompt + "\n\nIMPORTANT: Provide a VERY DETAILED analysis. The previous response was too short. Please provide at least 2000 words of detailed explanation."
message = self.client.messages.create(
model=os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest"),
max_tokens=8000,

View File

@ -524,7 +524,11 @@ class ChunkAnalyzer:
def _build_chunk_analysis_prompt(self, file_path: str, chunk: ChunkInfo,
chunk_index: int, total_chunks: int,
context_memories: Dict[str, Any]) -> str:
"""Build comprehensive analysis prompt for a chunk."""
"""Build comprehensive analysis prompt for a chunk with persona."""
from persona_system import allocate_code_persona, build_code_analysis_persona_prompt
# Allocate persona based on file path and chunk content
persona = allocate_code_persona(file_path, chunk.content, chunk.chunk_type)
# Build context information
context_info = ""
@ -538,8 +542,10 @@ class ChunkAnalyzer:
for practice in context_memories['best_practices'][:3]:
context_info += f"- {practice['content'][:100]}...\n"
assignment_context = f"CTO has assigned you to analyze chunk {chunk_index + 1} of {total_chunks} from file: {file_path}. This is a {chunk.chunk_type} chunk covering lines {chunk.start_line}-{chunk.end_line}."
prompt = f"""
You are a senior software engineer analyzing chunk {chunk_index + 1} of {total_chunks} from file: {file_path}
Analyzing chunk {chunk_index + 1} of {total_chunks} from file: {file_path}
CHUNK INFORMATION:
- Chunk Type: {chunk.chunk_type}
@ -564,7 +570,10 @@ Provide a focused analysis of this specific chunk, considering:
Focus on actionable insights for this specific code section.
"""
return prompt
# Enhance with persona
enhanced_prompt = build_code_analysis_persona_prompt(prompt, persona, assignment_context)
return enhanced_prompt
def _detect_language_from_path(self, file_path: str) -> str:
"""Detect language from file path."""

View File

@ -0,0 +1,755 @@
"""
World-Class Persona System for AI Analysis
Simulates real-world team allocation with domain-specific experts from top companies.
"""
from typing import Dict, List, Optional, Tuple
import re
# ============================================================================
# CODE ANALYSIS PERSONAS (for AI Analysis Service)
# ============================================================================
CODE_ANALYSIS_PERSONAS = {
# BACKEND DOMAINS
"backend_api": {
"role": "Senior Backend API Architect",
"companies": ["Google", "Amazon", "Stripe"],
"expertise": ["REST APIs", "GraphQL", "gRPC", "API Gateway", "Microservices"],
"experience_years": "18+",
"achievements": [
"Designed APIs at Google Cloud Platform handling 10M+ requests/day",
"Built scalable API infrastructure at Amazon AWS serving millions of customers",
"Led API architecture at Stripe processing billions in transactions"
],
"detection_keywords": ["api", "controller", "route", "endpoint", "service", "rest", "graphql"],
"focus_areas": [
"API design patterns and best practices",
"API versioning and backward compatibility",
"Rate limiting and throttling strategies",
"API documentation quality",
"Security vulnerabilities in API endpoints"
]
},
"backend_database": {
"role": "Senior Database Architect",
"companies": ["Amazon", "Oracle", "MongoDB"],
"expertise": ["SQL", "NoSQL", "Database Design", "Query Optimization", "Data Modeling"],
"experience_years": "20+",
"achievements": [
"Designed database systems at Amazon handling petabytes of data",
"Optimized databases at Oracle for enterprise-scale applications",
"Built distributed databases at MongoDB for global scale"
],
"detection_keywords": ["database", "db", "model", "schema", "migration", "repository", "orm", "query"],
"focus_areas": [
"Database schema design and normalization",
"Query performance and optimization",
"Data integrity and constraints",
"Indexing strategies",
"Transaction management"
]
},
"backend_business": {
"role": "Senior Backend Business Logic Architect",
"companies": ["Microsoft", "Salesforce", "SAP"],
"expertise": ["Business Logic", "Domain Modeling", "Design Patterns", "Service Layer"],
"experience_years": "17+",
"achievements": [
"Architected business logic systems at Microsoft for enterprise applications",
"Designed domain models at Salesforce for CRM platforms",
"Built service layers at SAP for ERP systems"
],
"detection_keywords": ["service", "business", "logic", "domain", "entity", "dto", "handler"],
"focus_areas": [
"Code organization and structure",
"Design patterns implementation",
"Business logic maintainability",
"Domain modeling quality",
"Service layer architecture"
]
},
# FRONTEND DOMAINS
"frontend_ui": {
"role": "Senior Frontend UI Architect",
"companies": ["Apple", "Meta", "Netflix"],
"expertise": ["React", "Vue", "Angular", "Component Design", "UI/UX"],
"experience_years": "15+",
"achievements": [
"Built user interfaces at Apple used by millions daily",
"Led React architecture at Meta (Facebook) for large-scale applications",
"Designed performance-optimized UIs at Netflix for 200M+ users"
],
"detection_keywords": ["component", "ui", "view", "page", "jsx", "tsx", "vue", "template"],
"focus_areas": [
"Component architecture and reusability",
"User experience and accessibility",
"UI performance optimization",
"Design system consistency",
"Responsive design implementation"
]
},
"frontend_state": {
"role": "Senior Frontend State Management Architect",
"companies": ["Meta", "Netflix", "Airbnb"],
"expertise": ["Redux", "Zustand", "Context API", "State Management", "Data Flow"],
"experience_years": "14+",
"achievements": [
"Architected state management at Meta for complex applications",
"Designed data flow patterns at Netflix for real-time updates",
"Built state systems at Airbnb for booking platforms"
],
"detection_keywords": ["store", "state", "redux", "context", "recoil", "zustand", "mobx"],
"focus_areas": [
"State architecture and patterns",
"Data flow optimization",
"State synchronization",
"Performance in state updates",
"State management best practices"
]
},
# DEVOPS DOMAINS
"devops_ci_cd": {
"role": "Senior DevOps CI/CD Architect",
"companies": ["Google", "Netflix", "Uber"],
"expertise": ["CI/CD", "Jenkins", "GitHub Actions", "GitLab CI", "Deployment Automation"],
"experience_years": "12+",
"achievements": [
"Built CI/CD pipelines at Google handling 50K+ deployments/day",
"Designed deployment systems at Netflix for zero-downtime releases",
"Architected automation at Uber for global scale"
],
"detection_keywords": ["ci", "cd", "pipeline", "jenkins", "github-actions", "gitlab", "deploy"],
"focus_areas": [
"CI/CD pipeline efficiency",
"Deployment strategy and automation",
"Quality gates and testing",
"Rollback strategies",
"Build optimization"
]
},
"devops_infrastructure": {
"role": "Senior Infrastructure Architect",
"companies": ["Amazon", "Google", "Microsoft"],
"expertise": ["Kubernetes", "Docker", "Terraform", "Cloud Infrastructure", "Scalability"],
"experience_years": "16+",
"achievements": [
"Designed infrastructure at Amazon AWS for global scale",
"Built container orchestration at Google for millions of containers",
"Architected cloud systems at Microsoft Azure with 99.99% uptime"
],
"detection_keywords": ["docker", "kubernetes", "terraform", "infrastructure", "cloud", "aws", "gcp", "azure"],
"focus_areas": [
"Infrastructure scalability",
"System reliability and uptime",
"Cost optimization",
"Security in infrastructure",
"Monitoring and observability"
]
},
# SECURITY DOMAINS
"security_engineer": {
"role": "Senior Security Engineer",
"companies": ["Google", "Microsoft", "Cloudflare"],
"expertise": ["Security", "Vulnerability Assessment", "Penetration Testing", "Security Architecture"],
"experience_years": "15+",
"achievements": [
"Led security initiatives at Google protecting billions of users",
"Designed security systems at Microsoft for enterprise applications",
"Built security infrastructure at Cloudflare for DDoS protection"
],
"detection_keywords": ["security", "auth", "encryption", "jwt", "oauth", "ssl", "tls", "cors"],
"focus_areas": [
"Security vulnerabilities and threats",
"Authentication and authorization",
"Data encryption and protection",
"Security best practices",
"Compliance and regulations"
]
},
# DATA DOMAINS
"data_engineer": {
"role": "Senior Data Engineer",
"companies": ["Google", "Netflix", "Uber"],
"expertise": ["Data Pipelines", "ETL", "Big Data", "Data Warehousing", "Spark"],
"experience_years": "13+",
"achievements": [
"Built data pipelines at Google processing petabytes daily",
"Designed ETL systems at Netflix for real-time analytics",
"Architected data infrastructure at Uber for millions of rides"
],
"detection_keywords": ["data", "pipeline", "etl", "warehouse", "spark", "hadoop", "kafka"],
"focus_areas": [
"Data architecture and pipelines",
"ETL performance and optimization",
"Data quality and validation",
"Scalability in data processing",
"Data governance"
]
},
"ml_engineer": {
"role": "Senior ML/AI Engineer",
"companies": ["OpenAI", "Anthropic", "Google DeepMind"],
"expertise": ["Machine Learning", "Deep Learning", "AI Systems", "Model Training"],
"experience_years": "12+",
"achievements": [
"Developed ML models at OpenAI for language understanding",
"Built AI systems at Anthropic for safety-critical applications",
"Designed training pipelines at Google DeepMind for large-scale models"
],
"detection_keywords": ["ml", "ai", "model", "training", "neural", "tensorflow", "pytorch", "learning"],
"focus_areas": [
"ML model architecture",
"Training pipeline optimization",
"Model performance and accuracy",
"Scalability in ML systems",
"AI safety and ethics"
]
},
# TESTING DOMAINS
"qa_automation": {
"role": "Senior QA Automation Architect",
"companies": ["Google", "Microsoft", "Amazon"],
"expertise": ["Test Automation", "Selenium", "Cypress", "Jest", "Testing Strategy"],
"experience_years": "14+",
"achievements": [
"Built test automation at Google for thousands of test cases",
"Designed testing frameworks at Microsoft for enterprise software",
"Architected QA systems at Amazon for e-commerce platforms"
],
"detection_keywords": ["test", "spec", "jest", "cypress", "selenium", "pytest", "testing"],
"focus_areas": [
"Test coverage and quality",
"Automation strategy",
"Test maintainability",
"Performance testing",
"Testing best practices"
]
},
"performance_engineer": {
"role": "Senior Performance Engineer",
"companies": ["Google", "Netflix", "Amazon"],
"expertise": ["Performance Optimization", "Load Testing", "Profiling", "Scalability"],
"experience_years": "16+",
"achievements": [
"Optimized systems at Google handling billions of requests",
"Designed performance solutions at Netflix for streaming at scale",
"Built performance infrastructure at Amazon for peak traffic"
],
"detection_keywords": ["performance", "load", "stress", "benchmark", "profiling", "optimization"],
"focus_areas": [
"Performance bottlenecks",
"Optimization strategies",
"Scalability concerns",
"Resource utilization",
"Performance testing"
]
},
# CTO (for synthesis)
"cto": {
"role": "Chief Technology Officer",
"companies": ["Google", "Microsoft", "Amazon"],
"expertise": ["Strategic Planning", "System Architecture", "Team Leadership", "Technology Strategy"],
"experience_years": "25+",
"achievements": [
"Former VP of Engineering at Google, leading teams of 500+ engineers",
"CTO at Microsoft Azure, responsible for cloud infrastructure strategy",
"Strategic advisor at Amazon Web Services for enterprise architecture"
],
"focus_areas": [
"Strategic technology insights",
"System-wide risk assessment",
"Architectural recommendations",
"Cross-domain synthesis",
"Executive-level analysis"
]
}
}
# ============================================================================
# DOCUMENT ANALYSIS PERSONAS (for Multi-Document Upload Service)
# ============================================================================
DOCUMENT_ANALYSIS_PERSONAS = {
"technical_doc_analyst": {
"role": "Senior Technical Documentation Analyst",
"companies": ["Google", "Stripe", "Microsoft"],
"expertise_domain": "technical documentation and API specifications",
"document_types": ["API docs", "technical specs", "developer guides"],
"experience_years": "15+",
"achievements": [
"Analyzed technical documentation at Google for millions of API integrations",
"Led documentation analysis at Stripe for developer experience",
"Mapped technical relationships at Microsoft for enterprise systems"
],
"focus_areas": [
"Technical dependencies and relationships",
"System integration points",
"API contract relationships",
"Technical process flows",
"Code-to-documentation mappings"
],
"visual_focus_areas": [
"API flow diagrams",
"System integration diagrams",
"Technical architecture flows"
],
"detection_keywords": ["api", "technical", "specification", "documentation", "guide", "reference", "developer"]
},
"business_process_analyst": {
"role": "Senior Business Process Analyst",
"companies": ["McKinsey", "Deloitte", "Accenture"],
"expertise_domain": "business processes and stakeholder requirements",
"document_types": ["business requirements", "user stories", "business plans"],
"experience_years": "18+",
"achievements": [
"Analyzed business processes at McKinsey for Fortune 500 companies",
"Led process mapping at Deloitte for enterprise transformations",
"Mapped stakeholder relationships at Accenture for global projects"
],
"focus_areas": [
"Business process flows",
"Requirement dependencies",
"Stakeholder impact chains",
"Business decision consequences",
"Organizational impact analysis"
],
"visual_focus_areas": [
"Business process diagrams",
"Stakeholder impact maps",
"Decision flowcharts"
],
"detection_keywords": ["business", "requirement", "stakeholder", "user story", "process", "workflow", "business plan"]
},
"system_architecture_analyst": {
"role": "Senior System Architecture Document Analyst",
"companies": ["Google", "Amazon", "Microsoft"],
"expertise_domain": "system architecture and design documents",
"document_types": ["architecture docs", "design documents", "system designs"],
"experience_years": "20+",
"achievements": [
"Analyzed architecture documents at Google for large-scale distributed systems",
"Mapped system relationships at Amazon for cloud infrastructure",
"Led architecture analysis at Microsoft for enterprise solutions"
],
"focus_areas": [
"Architecture relationships",
"Component dependencies",
"System interaction flows",
"Design decision impacts",
"Scalability relationships"
],
"visual_focus_areas": [
"Architecture diagrams",
"Component interaction diagrams",
"System dependency maps"
],
"detection_keywords": ["architecture", "design", "system", "component", "diagram", "architectural"]
},
"requirements_analyst": {
"role": "Senior Requirements & Specification Analyst",
"companies": ["IBM", "Oracle", "SAP"],
"expertise_domain": "requirements and functional specifications",
"document_types": ["requirements docs", "functional specs", "feature specs"],
"experience_years": "17+",
"achievements": [
"Analyzed requirements at IBM for enterprise software implementations",
"Mapped specifications at Oracle for database systems",
"Led requirement analysis at SAP for ERP platforms"
],
"focus_areas": [
"Requirement dependencies",
"Feature relationships",
"Specification impacts",
"Change propagation",
"Implementation dependencies"
],
"visual_focus_areas": [
"Requirement traceability diagrams",
"Feature dependency maps",
"Impact analysis charts"
],
"detection_keywords": ["requirement", "specification", "feature", "functional", "traceability", "spec"]
},
"process_flow_analyst": {
"role": "Senior Process Flow Analyst",
"companies": ["Amazon", "Netflix", "Uber"],
"expertise_domain": "operational processes and workflows",
"document_types": ["process docs", "workflows", "operational manuals"],
"experience_years": "14+",
"achievements": [
"Analyzed processes at Amazon for fulfillment operations",
"Mapped workflows at Netflix for content delivery",
"Led process analysis at Uber for ride-sharing operations"
],
"focus_areas": [
"Process step relationships",
"Workflow dependencies",
"Sequential cause-effects",
"Decision impacts",
"Operational dependencies"
],
"visual_focus_areas": [
"Process flowcharts",
"Workflow diagrams",
"Decision trees",
"Operational flow maps"
],
"detection_keywords": ["process", "workflow", "procedure", "operational", "manual", "step", "flow"]
},
"visual_architecture_analyst": {
"role": "Senior Visual Architecture Analyst",
"companies": ["Google", "Microsoft", "Apple"],
"expertise_domain": "visual diagrams and architecture drawings",
"document_types": ["diagrams", "flowcharts", "architecture drawings"],
"experience_years": "16+",
"achievements": [
"Analyzed visual diagrams at Google for complex system mappings",
"Mapped architecture drawings at Microsoft for enterprise solutions",
"Led visual analysis at Apple for product architecture"
],
"focus_areas": [
"Visual relationship extraction",
"Diagram dependency mapping",
"Flow analysis",
"Component interactions",
"Visual pattern recognition"
],
"visual_focus_areas": [
"All types of visual diagrams",
"Architecture drawings",
"Flowcharts and process diagrams",
"Component and sequence diagrams"
],
"detection_keywords": ["diagram", "flowchart", "visual", "drawing", "chart", "map", "image"]
}
}
# ============================================================================
# DOCUMENT TYPE MAPPING
# ============================================================================
DOCUMENT_PERSONA_MAPPING = {
# Technical Documents
"api_documentation": "technical_doc_analyst",
"technical_specification": "technical_doc_analyst",
"code_documentation": "technical_doc_analyst",
"developer_guide": "technical_doc_analyst",
# Business Documents
"business_requirements": "business_process_analyst",
"user_stories": "business_process_analyst",
"business_plan": "business_process_analyst",
"product_specification": "business_process_analyst",
"stakeholder_document": "business_process_analyst",
# Architecture Documents
"architecture_document": "system_architecture_analyst",
"system_design": "system_architecture_analyst",
"design_document": "system_architecture_analyst",
"technical_design": "system_architecture_analyst",
# Requirements Documents
"requirements_document": "requirements_analyst",
"functional_specification": "requirements_analyst",
"feature_specification": "requirements_analyst",
# Process Documents
"process_document": "process_flow_analyst",
"workflow_document": "process_flow_analyst",
"procedure_guide": "process_flow_analyst",
"operational_manual": "process_flow_analyst",
# Visual/Diagram Documents
"architecture_diagram": "visual_architecture_analyst",
"flowchart": "visual_architecture_analyst",
"sequence_diagram": "visual_architecture_analyst",
"component_diagram": "visual_architecture_analyst",
"process_diagram": "visual_architecture_analyst",
"system_diagram": "visual_architecture_analyst",
}
# ============================================================================
# PERSONA ALLOCATION FUNCTIONS
# ============================================================================
def allocate_code_persona(file_path: str, content: str, chunk_type: str = "module") -> Dict:
"""
Intelligently allocates code analysis persona based on file path, content, and type.
Returns persona config with prompt context.
"""
file_lower = file_path.lower()
content_lower = content.lower()[:2000] if content else "" # Sample content
# Score each persona based on detection rules
persona_scores = {}
for persona_id, persona_config in CODE_ANALYSIS_PERSONAS.items():
if persona_id == "cto": # Skip CTO for individual analysis
continue
score = 0
detection_keywords = persona_config.get("detection_keywords", [])
# Check file path (higher weight)
for keyword in detection_keywords:
if keyword in file_lower:
score += 15
# Check content (medium weight)
for keyword in detection_keywords:
if keyword in content_lower:
score += 8
# Check chunk type
if chunk_type and chunk_type.lower() in detection_keywords:
score += 10
# Domain-specific boosts
if "test" in file_lower and "qa" in persona_id:
score += 20
if "security" in file_lower and "security" in persona_id:
score += 20
if "performance" in file_lower and "performance" in persona_id:
score += 20
if score > 0:
persona_scores[persona_id] = score
# Select top persona
if persona_scores:
selected_id = max(persona_scores, key=persona_scores.get)
return CODE_ANALYSIS_PERSONAS[selected_id]
# Default fallback to backend business logic
return CODE_ANALYSIS_PERSONAS.get("backend_business", {})
def allocate_document_persona(file_path: str, content: str, file_type: str = "text") -> Dict:
"""
Intelligently allocates document analysis persona based on file path, content, and type.
Returns persona config for document analysis.
"""
file_lower = file_path.lower()
content_lower = content.lower()[:2000] if content else ""
# Check if it's an image/diagram
if file_type == "image" or any(ext in file_lower for ext in [".png", ".jpg", ".jpeg", ".gif", ".svg", ".pdf"]):
return DOCUMENT_ANALYSIS_PERSONAS.get("visual_architecture_analyst", {})
# Score each persona based on detection rules
persona_scores = {}
for persona_id, persona_config in DOCUMENT_ANALYSIS_PERSONAS.items():
score = 0
detection_keywords = persona_config.get("detection_keywords", [])
# Check file path (higher weight)
for keyword in detection_keywords:
if keyword in file_lower:
score += 15
# Check content (medium weight)
for keyword in detection_keywords:
if keyword in content_lower:
score += 8
# Check document type mapping
for doc_type, mapped_persona in DOCUMENT_PERSONA_MAPPING.items():
if doc_type in file_lower and mapped_persona == persona_id:
score += 20
if score > 0:
persona_scores[persona_id] = score
# Select top persona
if persona_scores:
selected_id = max(persona_scores, key=persona_scores.get)
return DOCUMENT_ANALYSIS_PERSONAS[selected_id]
# Default fallback to technical doc analyst
return DOCUMENT_ANALYSIS_PERSONAS.get("technical_doc_analyst", {})
def get_cto_persona() -> Dict:
"""Returns CTO persona for synthesis and high-level analysis."""
return CODE_ANALYSIS_PERSONAS.get("cto", {})
# ============================================================================
# PROMPT BUILDING FUNCTIONS
# ============================================================================
def build_persona_intro(persona: Dict, assignment_context: str = "", analysis_type: str = "code") -> str:
"""
Builds persona introduction section for prompts.
Works for both code and document analysis.
"""
if not persona:
return ""
role = persona.get("role", "Senior Engineer")
companies = persona.get("companies", [])
experience = persona.get("experience_years", "15+")
achievements = persona.get("achievements", [])
focus_areas = persona.get("focus_areas", [])
# Build company background
company_bg = ""
if companies:
company_bg = f"- Previously worked at {', '.join(companies[:2])}"
if len(companies) > 2:
company_bg += f" and {companies[2]}"
# Build achievements section
achievements_text = ""
if achievements:
achievements_text = "\n".join([f"- {achievement}" for achievement in achievements[:2]])
# Build focus areas
focus_text = ""
if focus_areas:
focus_text = "\n".join([f"- {focus}" for focus in focus_areas[:5]])
intro = f"""You are {role} with {experience} years of experience.
COMPANY BACKGROUND:
{company_bg}
KEY ACHIEVEMENTS:
{achievements_text}
YOUR ASSIGNMENT:
{assignment_context if assignment_context else 'Analyze the provided code/document for quality, issues, and recommendations.'}
YOUR FOCUS AREAS:
{focus_text}
---
"""
return intro
def build_code_analysis_persona_prompt(base_prompt: str, persona: Dict,
assignment_context: str = "") -> str:
"""
Enhances code analysis prompt with persona context.
"""
if not persona:
return base_prompt
persona_intro = build_persona_intro(persona, assignment_context, "code")
return persona_intro + base_prompt
def build_document_analysis_persona_prompt(base_prompt: str, persona: Dict,
document_type: str = "document",
assignment_context: str = "") -> str:
"""
Enhances document analysis prompt with persona context.
"""
if not persona:
return base_prompt
role = persona.get("role", "Senior Analyst")
companies = persona.get("companies", [])
expertise_domain = persona.get("expertise_domain", "document analysis")
experience = persona.get("experience_years", "15+")
achievements = persona.get("achievements", [])
focus_areas = persona.get("focus_areas", [])
company_bg = f"- Previously worked at {', '.join(companies[:2])}" if companies else ""
achievements_text = "\n".join([f"- {achievement}" for achievement in achievements[:2]]) if achievements else ""
focus_text = "\n".join([f"- {focus}" for focus in focus_areas[:5]]) if focus_areas else ""
intro = f"""You are {role}, a specialist in analyzing {expertise_domain} with {experience} years of experience.
COMPANY BACKGROUND:
{company_bg}
KEY ACHIEVEMENTS:
{achievements_text}
YOUR SPECIALIZATION:
You excel at identifying:
{focus_text}
YOUR ASSIGNMENT:
{assignment_context if assignment_context else f'Analyze this {document_type} to extract causal relationships and dependencies.'}
---
"""
return intro + base_prompt
def build_cto_synthesis_prompt(base_prompt: str, team_findings: List[Dict] = None) -> str:
"""
Builds CTO-level synthesis prompt with team allocation context.
"""
cto_persona = get_cto_persona()
if not cto_persona:
return base_prompt
role = cto_persona.get("role", "Chief Technology Officer")
companies = cto_persona.get("companies", [])
experience = cto_persona.get("experience_years", "25+")
achievements = cto_persona.get("achievements", [])
focus_areas = cto_persona.get("focus_areas", [])
company_bg = f"- Former VP of Engineering at {companies[0] if companies else 'Google'}, leading teams of 500+ engineers"
if len(companies) > 1:
company_bg += f"\n- CTO at {companies[1]}, responsible for cloud infrastructure strategy"
achievements_text = "\n".join([f"- {achievement}" for achievement in achievements[:2]]) if achievements else ""
focus_text = "\n".join([f"- {focus}" for focus in focus_areas[:5]]) if focus_areas else ""
team_allocation = ""
if team_findings:
team_allocation = "\n\nTEAM ALLOCATION:\n"
team_allocation += "You have allocated your expert team to analyze different domains:\n"
for finding in team_findings[:5]:
domain = finding.get("domain", "unknown")
team_allocation += f"- {domain}: Expert analysis completed\n"
intro = f"""You are {role} with {experience} years of experience.
COMPANY BACKGROUND:
{company_bg}
KEY ACHIEVEMENTS:
{achievements_text}
{team_allocation}
YOUR ROLE:
You have received this project and allocated your expert team to analyze different domains.
Now, synthesize all team findings into strategic recommendations.
YOUR FOCUS AREAS:
{focus_text}
---
"""
return intro + base_prompt
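
persona_system.py above is new in this commit. A short usage sketch of its two code-analysis entry points, matching the signatures shown above; the file path and content are hypothetical, and the comment reflects the scoring weights in allocate_code_persona (15 per path keyword, 8 per content keyword):

from persona_system import allocate_code_persona, build_code_analysis_persona_prompt

# Path keywords carry the most weight, so an API route/controller file
# should resolve to the backend_api persona; content keywords add smaller boosts.
persona = allocate_code_persona(
    file_path="src/api/routes/user_controller.py",  # hypothetical path
    content="from fastapi import APIRouter\nrouter = APIRouter()",
    chunk_type="module",
)

base_prompt = "Analyze this chunk for design issues and security risks."
enhanced = build_code_analysis_persona_prompt(
    base_prompt,
    persona,
    assignment_context="CTO has assigned you to analyze the API routing chunk.",
)
print(persona.get("role"), len(enhanced) > len(base_prompt))

The enhanced prompt is the persona introduction prepended to the base prompt, which is exactly how the chunk analyzer and frontend analysis hunks above consume it.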

View File

@ -2673,8 +2673,10 @@ def build_intelligent_chunk_prompt(chunk: Dict, analysis_state: Optional[Dict] =
"""
Build comprehensive prompt for analyzing a semantically grouped chunk.
Generates detailed module-level analysis with context awareness.
Now includes progressive context from previous chunks.
Now includes progressive context from previous chunks and world-class persona.
"""
from persona_system import allocate_code_persona, build_code_analysis_persona_prompt
chunk_name = chunk.get('name', 'unknown')
chunk_type = chunk.get('chunk_type', 'module')
files_batch = chunk.get('files', [])
@ -2694,15 +2696,22 @@ def build_intelligent_chunk_prompt(chunk: Dict, analysis_state: Optional[Dict] =
optimized_files.append((file_path, optimized_content))
# Allocate appropriate persona based on files in chunk
# Use the first file to determine persona (or combine if multiple domains)
primary_file_path = optimized_files[0][0] if optimized_files else ""
primary_content = optimized_files[0][1] if optimized_files else ""
persona = allocate_code_persona(primary_file_path, primary_content, chunk_type)
# Build context from previous analyses (progressive learning)
context_section = build_context_from_state(analysis_state, chunk)
# Build assignment context
assignment_context = f"CTO has assigned you to analyze the '{chunk_name}' module/chunk for this project. This is a {chunk_type} type chunk containing {len(optimized_files)} files."
# Build comprehensive prompt with module context
prompt_parts = [
f"# COMPREHENSIVE ANALYSIS: {chunk_name.upper()}",
f"Chunk Type: {chunk_type}",
"",
"You are a senior software architect with 30+ years of experience. Analyze this module/chunk comprehensively.",
""
]
@ -2794,7 +2803,12 @@ def build_intelligent_chunk_prompt(chunk: Dict, analysis_state: Optional[Dict] =
"Focus on providing detailed, actionable insights that help understand the complete module context."
])
return "\n".join(prompt_parts)
base_prompt = "\n".join(prompt_parts)
# Enhance with persona
enhanced_prompt = build_code_analysis_persona_prompt(base_prompt, persona, assignment_context)
return enhanced_prompt
def build_smart_batch_prompt(files_batch: List[Tuple[str, str]]) -> str:
"""Legacy function: Build prompt for simple batch (backward compatibility)."""
@ -4719,13 +4733,13 @@ def build_synthesis_prompt(analysis_state: Dict, all_chunk_analyses: List[Dict]
"""
Build comprehensive prompt for cross-module synthesis analysis.
Synthesizes all individual module analyses into system-level insights.
Uses CTO persona for executive-level synthesis.
"""
from persona_system import get_cto_persona, build_cto_synthesis_prompt
prompt_parts = [
"# CROSS-MODULE SYNTHESIS ANALYSIS",
"",
"You are a senior software architect with 30+ years of experience. Your task is to synthesize",
"findings from multiple module-level analyses into comprehensive system-level insights.",
"",
"## CONTEXT: PREVIOUSLY ANALYZED MODULES",
""
]
@ -4842,7 +4856,19 @@ def build_synthesis_prompt(analysis_state: Dict, all_chunk_analyses: List[Dict]
"across all analyzed modules, not just repeating individual module findings."
])
return "\n".join(prompt_parts)
base_prompt = "\n".join(prompt_parts)
# Get team findings for CTO context
team_findings = []
if all_chunk_analyses:
for chunk_analysis in all_chunk_analyses:
module_name = chunk_analysis.get('module_name', 'unknown')
team_findings.append({"domain": module_name, "analysis": chunk_analysis})
# Enhance with CTO persona
enhanced_prompt = build_cto_synthesis_prompt(base_prompt, team_findings)
return enhanced_prompt
def parse_synthesis_response(response_text: str) -> Dict:
"""Parse synthesis response from Claude API."""

View File

@ -141,17 +141,19 @@ router.get('/auth/github/callback', async (req, res) => {
setImmediate(async () => {
try {
console.log('[GitHub OAuth] Starting background repository attachment for:', repoContext.repoUrl);
console.log('[GitHub OAuth] Using newly stored token for user:', user_id);
const GitHubIntegrationService = require('../services/github-integration.service');
const database = require('../config/database');
const githubService = new GitHubIntegrationService();
const { owner, repo, branch } = githubService.parseGitHubUrl(repoContext.repoUrl);
// Get metadata using authenticated Octokit
const repositoryData = await githubService.fetchRepositoryMetadata(owner, repo);
// Get metadata using authenticated Octokit with the specific user's token
// Pass userId to ensure we use the newly stored token
const repositoryData = await githubService.fetchRepositoryMetadata(owner, repo, false, user_id);
let actualBranch = repoContext.branchName || branch || repositoryData.default_branch || 'main';
// Attempt analysis and sync with fallback
const codebaseAnalysis = await githubService.analyzeCodebase(owner, repo, actualBranch, false);
// Attempt analysis and sync with fallback - use userId to ensure correct token
const codebaseAnalysis = await githubService.analyzeCodebase(owner, repo, actualBranch, false, user_id);
const insertQuery = `
INSERT INTO all_repositories (
repository_url, repository_name, owner_name,
@ -170,14 +172,14 @@ router.get('/auth/github/callback', async (req, res) => {
JSON.stringify(codebaseAnalysis),
'syncing',
repositoryData.visibility === 'private',
repoContext.userId || null,
user_id || repoContext.userId || null, // Use user_id from OAuth callback (most reliable)
'github' // This is GitHub OAuth callback, so provider is always github
];
const insertResult = await database.query(insertQuery, insertValues);
const repositoryRecord = insertResult.rows[0];
// Clone repository
const downloadResult = await githubService.syncRepositoryWithFallback(owner, repo, actualBranch, repositoryRecord.id, repositoryData.visibility !== 'private');
// Clone repository - use userId to ensure correct token
const downloadResult = await githubService.syncRepositoryWithFallback(owner, repo, actualBranch, repositoryRecord.id, repositoryData.visibility !== 'private', user_id);
const finalSyncStatus = downloadResult.success ? 'synced' : 'error';
await database.query('UPDATE all_repositories SET sync_status = $1, updated_at = NOW() WHERE id = $2', [finalSyncStatus, repositoryRecord.id]);

View File

@ -163,12 +163,28 @@ router.post('/:provider/attach-repository', async (req, res) => {
const { template_id, repository_url, branch_name } = req.body;
const userId = req.headers['x-user-id'] || req.query.user_id || req.body.user_id || (req.user && (req.user.id || req.user.userId));
console.log(`[VCS Attach] Extracted userId:`, userId, `from headers:`, req.headers['x-user-id'], `query:`, req.query.user_id, `body:`, req.body.user_id);
// Validate input - only repository_url is required (like GitHub)
if (!repository_url) {
return res.status(400).json({ success: false, message: 'Repository URL is required' });
}
const { owner, repo, branch } = provider.parseRepoUrl(repository_url);
// Clean and normalize the repository URL (trim whitespace, decode URL encoding)
let cleanedUrl = repository_url.trim();
// Decode URL-encoded characters (like %20 for spaces)
try {
cleanedUrl = decodeURIComponent(cleanedUrl);
} catch (e) {
// If decoding fails, use original URL
console.warn(`[VCS Attach] Failed to decode URL, using original: ${cleanedUrl}`);
}
// Trim again after decoding
cleanedUrl = cleanedUrl.trim();
console.log(`[VCS Attach] Original URL: ${repository_url}, Cleaned URL: ${cleanedUrl}`);
const { owner, repo, branch } = provider.parseRepoUrl(cleanedUrl);
// Enhanced flow: Detect private repos and redirect to OAuth immediately
const providerKey = (req.params.provider || '').toLowerCase();
@ -248,7 +264,44 @@ router.post('/:provider/attach-repository', async (req, res) => {
// For public repos or authenticated private repos, proceed with normal flow
const accessCheck = await provider.checkRepositoryAccess(owner, repo, userId);
console.log(`[VCS Attach] Access check result for ${owner}/${repo}:`, {
hasAccess: accessCheck.hasAccess,
requiresAuth: accessCheck.requiresAuth,
authError: accessCheck.authError,
error: accessCheck.error,
exists: accessCheck.exists,
github_username: accessCheck.github_username
});
if (!accessCheck.hasAccess) {
// If access check failed but requires auth, trigger OAuth flow
if (accessCheck.requiresAuth || accessCheck.authError) {
const oauthService = getOAuthService(providerKey);
if (oauthService) {
console.log(`🔒 [VCS Attach] Token exists but cannot access repository (or no valid token), redirecting to OAuth: ${repository_url}`);
console.log(`🔒 [VCS Attach] Reason: ${accessCheck.error || 'Authentication required'}, userId: ${userId}`);
// Generate OAuth URL with repository context in state
const stateBase = Math.random().toString(36).substring(7);
const state = `${stateBase}|uid=${userId || 'unknown'}|repo=${encodeURIComponent(repository_url)}|branch=${encodeURIComponent(branch_name || 'main')}|private_repo=true`;
const authUrl = oauthService.getAuthUrl(state, userId);
console.log(`🔒 [VCS Attach] Generated OAuth URL for ${providerKey}, returning requires_auth response`);
return res.json({
success: false,
message: `${providerKey.charAt(0).toUpperCase() + providerKey.slice(1)} authentication required for private repository`,
requires_auth: true,
is_private_repo: true,
auth_url: authUrl,
state: state
});
}
}
// If it's not an auth issue, return 404
console.log(`[VCS Attach] Access check failed without auth requirement, returning 404`);
return res.status(404).json({ success: false, message: accessCheck.error || 'Repository not accessible' });
}

View File

@ -21,8 +21,8 @@ class GitHubIntegrationService {
}
// Get authenticated Octokit instance
async getAuthenticatedOctokit() {
return await this.oauthService.getAuthenticatedOctokit();
async getAuthenticatedOctokit(userId = null) {
return await this.oauthService.getAuthenticatedOctokit(userId);
}
// Extract owner, repo, and branch from GitHub URL using parse-github-url library
@ -31,8 +31,15 @@ class GitHubIntegrationService {
throw new Error('URL must be a non-empty string');
}
// Normalize the URL first
// Normalize the URL first - trim and decode URL encoding
let normalizedUrl = url.trim();
// Decode URL-encoded characters (like %20 for spaces)
try {
normalizedUrl = decodeURIComponent(normalizedUrl).trim();
} catch (e) {
// If decoding fails, just trim
normalizedUrl = normalizedUrl.trim();
}
// Remove trailing slashes and .git extensions
normalizedUrl = normalizedUrl.replace(/\/+$/, '').replace(/\.git$/, '');
@ -216,7 +223,7 @@ class GitHubIntegrationService {
};
}
// No token found - try unauthenticated access first to check if it's public
// No token found that can access this repo - try unauthenticated access to check if it's public
try {
const unauthenticatedOctokit = new Octokit({
userAgent: 'CodeNuk-GitIntegration/1.0.0',
@ -234,13 +241,18 @@ class GitHubIntegrationService {
};
} catch (unauthenticatedError) {
if (unauthenticatedError.status === 404) {
// Repository truly doesn't exist
// 404 from unauthenticated access could mean:
// 1. Repository truly doesn't exist
// 2. Repository is private and requires authentication
// Since we already tried to find a token and none could access it,
// and we're being called from a private repo flow, assume it requires auth
console.log(`🔒 [GitHub] 404 from unauthenticated access - assuming private repo requires authentication`);
return {
exists: false,
exists: null, // Unknown - could be missing or private
isPrivate: null,
hasAccess: false,
requiresAuth: false,
error: 'Repository not found'
requiresAuth: true, // Changed from false to true - trigger OAuth
error: 'Repository not found or requires authentication'
};
} else if (unauthenticatedError.status === 401 || unauthenticatedError.status === 403) {
// Repository exists but requires authentication (private) - generate auth URL
@ -289,13 +301,13 @@ class GitHubIntegrationService {
}
// Get repository information from GitHub
async fetchRepositoryMetadata(owner, repo, skipAuth = false) {
async fetchRepositoryMetadata(owner, repo, skipAuth = false, userId = null) {
// If skipAuth is true, try with unauthenticated octokit first to check visibility
let octokit;
if (skipAuth) {
octokit = this.octokit; // Use unauthenticated instance
} else {
octokit = await this.getAuthenticatedOctokit();
octokit = await this.getAuthenticatedOctokit(userId);
}
const safe = async (fn, fallback) => {
@ -309,26 +321,41 @@ class GitHubIntegrationService {
let repoData;
try {
console.log(`🔍 [GitHub] fetchRepositoryMetadata: skipAuth=${skipAuth}, calling octokit.repos.get for ${owner}/${repo}`);
const response = await octokit.repos.get({ owner, repo });
if (skipAuth) {
if (response.status === 401 || response.status === 403) {
throw new Error('Authentication required to access repository');
} else if (response.status === 404) {
throw new Error('Repository not found');
}
}
repoData = response.data;
console.log(`✅ [GitHub] Successfully fetched repository data: ${repoData?.full_name || 'no full_name'}`);
// Validate we got real data
if (!repoData || !repoData.full_name) {
console.log(`❌ [GitHub] Invalid repository data received, throwing error`);
throw new Error('Invalid repository data received');
}
} catch (error) {
console.log(`🔍 [GitHub] Error in fetchRepositoryMetadata:`, error.message, error.status);
// Check error status from various possible locations
const status = error.status || error.response?.status || error.code;
const errorMessage = error.message || '';
const is404 = status === 404 || status === '404' || errorMessage.includes('404') || errorMessage.includes('Not Found');
const isAuthError = status === 401 || status === 403 || status === '401' || status === '403';
console.log(`🔍 [GitHub] Error in fetchRepositoryMetadata CATCH BLOCK:`, errorMessage, `Status: ${status || 'unknown'}`, `is404: ${is404}`, `isAuthError: ${isAuthError}`, `skipAuth: ${skipAuth}`);
console.log(`🔍 [GitHub] Error object:`, JSON.stringify({
status: error.status,
responseStatus: error.response?.status,
code: error.code,
message: error.message,
name: error.name
}));
if (skipAuth) {
// For GitHub, any error when skipAuth=true likely means private repo
if (error.status === 401 || error.status === 403 || error.status === 404) {
// For GitHub, any error when skipAuth=true means private repo or doesn't exist
// Always throw authentication required - let the caller decide if it's truly missing or private
console.log(`🔒 [GitHub] skipAuth=true, THROWING authentication required error - NOT using safe fallback`);
throw new Error('Authentication required to access repository');
}
// For other errors, also assume private repo
throw new Error('Authentication required to access repository');
}
// For other errors, use safe fallback
// For authenticated requests, use safe fallback (but only if skipAuth is false)
console.log(`⚠️ [GitHub] skipAuth=false, using safe fallback`);
repoData = await safe(
async () => {
const response = await octokit.repos.get({ owner, repo });
@ -336,6 +363,12 @@ class GitHubIntegrationService {
},
{}
);
// If safe fallback also failed, throw
if (!repoData || !repoData.full_name) {
console.log(`❌ [GitHub] Safe fallback also failed, throwing Repository not found`);
throw new Error('Repository not found');
}
}
const languages = await safe(
@ -364,7 +397,7 @@ class GitHubIntegrationService {
}
// Analyze codebase structure
async analyzeCodebase(owner, repo, branch, isPublicRepo = false) {
async analyzeCodebase(owner, repo, branch, isPublicRepo = false, userId = null) {
try {
// Use appropriate octokit instance based on repository type
let octokit;
@ -374,8 +407,8 @@ class GitHubIntegrationService {
userAgent: 'CodeNuk-GitIntegration/1.0.0',
});
} else {
// For private repos, use authenticated octokit
octokit = await this.getAuthenticatedOctokit();
// For private repos, use authenticated octokit with userId
octokit = await this.getAuthenticatedOctokit(userId);
}
// Get the commit SHA for the branch
@ -519,7 +552,7 @@ class GitHubIntegrationService {
}
// Git-based: clone or update local repo and re-index into DB
async syncRepositoryWithGit(owner, repo, branch, repositoryId, isPublicRepo = false) {
async syncRepositoryWithGit(owner, repo, branch, repositoryId, isPublicRepo = false, userId = null) {
const database = require('../config/database');
const localPath = this.gitRepoService.getLocalRepoPath(owner, repo, branch);
let storageRecord = null;
@ -544,7 +577,7 @@ class GitHubIntegrationService {
console.warn(`Failed to clone public repo without auth: ${error.message}`);
// Fallback to authenticated clone if available
try {
const tokenRecord = await this.oauthService.getToken();
const tokenRecord = userId ? await this.oauthService.getTokenForUser(userId) : await this.oauthService.getToken();
if (tokenRecord?.access_token) {
repoPath = await this.gitRepoService.cloneIfMissingWithAuth(
owner,
@ -560,7 +593,7 @@ class GitHubIntegrationService {
} else {
// For private repos, try authenticated clone first
try {
const tokenRecord = await this.oauthService.getToken();
const tokenRecord = userId ? await this.oauthService.getTokenForUser(userId) : await this.oauthService.getToken();
if (tokenRecord?.access_token) {
repoPath = await this.gitRepoService.cloneIfMissingWithAuth(
owner,
@ -628,7 +661,7 @@ class GitHubIntegrationService {
try {
// Try to ensure repo exists for the preferred branch
try {
const tokenRecord = await this.oauthService.getToken().catch(() => null);
const tokenRecord = userId ? await this.oauthService.getTokenForUser(userId).catch(() => null) : await this.oauthService.getToken().catch(() => null);
if (tokenRecord?.access_token) {
repoPath = await this.gitRepoService.cloneIfMissingWithAuth(owner, repo, preferredBranch, 'github.com', tokenRecord.access_token, 'oauth2');
} else {
@ -637,7 +670,7 @@ class GitHubIntegrationService {
} catch (cloneErr) {
// If the branch doesn't exist (e.g., refs/heads not found), try the alternate branch
try {
const tokenRecordAlt = await this.oauthService.getToken().catch(() => null);
const tokenRecordAlt = userId ? await this.oauthService.getTokenForUser(userId).catch(() => null) : await this.oauthService.getToken().catch(() => null);
repoPath = tokenRecordAlt?.access_token
? await this.gitRepoService.cloneIfMissingWithAuth(owner, repo, alternateBranch, 'github.com', tokenRecordAlt.access_token, 'oauth2')
: await this.gitRepoService.cloneIfMissing(owner, repo, alternateBranch);
@ -679,7 +712,7 @@ class GitHubIntegrationService {
try {
// Ensure repo exists similarly to diff flow
try {
const tokenRecord = await this.oauthService.getToken().catch(() => null);
const tokenRecord = userId ? await this.oauthService.getTokenForUser(userId).catch(() => null) : await this.oauthService.getToken().catch(() => null);
if (tokenRecord?.access_token) {
repoPath = await this.gitRepoService.cloneIfMissingWithAuth(owner, repo, preferredBranch, 'github.com', tokenRecord.access_token, 'oauth2');
} else {
@ -687,7 +720,7 @@ class GitHubIntegrationService {
}
} catch (_) {
try {
const tokenRecordAlt = await this.oauthService.getToken().catch(() => null);
const tokenRecordAlt = userId ? await this.oauthService.getTokenForUser(userId).catch(() => null) : await this.oauthService.getToken().catch(() => null);
repoPath = tokenRecordAlt?.access_token
? await this.gitRepoService.cloneIfMissingWithAuth(owner, repo, alternateBranch, 'github.com', tokenRecordAlt.access_token, 'oauth2')
: await this.gitRepoService.cloneIfMissing(owner, repo, alternateBranch);
@ -720,15 +753,15 @@ class GitHubIntegrationService {
}
// Try git-based sync first, fall back to GitHub API download on failure
async syncRepositoryWithFallback(owner, repo, branch, repositoryId, isPublicRepo = false) {
async syncRepositoryWithFallback(owner, repo, branch, repositoryId, isPublicRepo = false, userId = null) {
// First attempt: full git clone/fetch and index
const gitResult = await this.syncRepositoryWithGit(owner, repo, branch, repositoryId, isPublicRepo);
const gitResult = await this.syncRepositoryWithGit(owner, repo, branch, repositoryId, isPublicRepo, userId);
if (gitResult && gitResult.success) {
return { method: 'git', ...gitResult };
}
// Fallback: API-based download and storage
const apiResult = await this.downloadRepositoryWithStorage(owner, repo, branch, repositoryId, isPublicRepo);
const apiResult = await this.downloadRepositoryWithStorage(owner, repo, branch, repositoryId, isPublicRepo, userId);
if (apiResult && apiResult.success) {
return { method: 'api', ...apiResult, git_error: gitResult?.error };
}
@ -737,7 +770,7 @@ class GitHubIntegrationService {
}
// Download repository files locally and store in database
async downloadRepositoryWithStorage(owner, repo, branch, repositoryId, isPublicRepo = false) {
async downloadRepositoryWithStorage(owner, repo, branch, repositoryId, isPublicRepo = false, userId = null) {
const targetDir = path.join(
process.env.ATTACHED_REPOS_DIR,
`${owner}__${repo}__${branch}`
@ -765,8 +798,8 @@ class GitHubIntegrationService {
userAgent: 'CodeNuk-GitIntegration/1.0.0',
});
} else {
// For private repos, use authenticated octokit
octokit = await this.getAuthenticatedOctokit();
// For private repos, use authenticated octokit with userId
octokit = await this.getAuthenticatedOctokit(userId);
}
// Get the commit SHA for the branch

View File

@ -199,8 +199,16 @@ class GitHubOAuthService {
}
// Create authenticated Octokit instance
async getAuthenticatedOctokit() {
const tokenRecord = await this.getToken();
async getAuthenticatedOctokit(userId = null) {
// If userId is provided, get the newest token for that user
// Otherwise, get the newest token overall
let tokenRecord;
if (userId) {
tokenRecord = await this.getTokenForUser(userId);
console.log(`[GitHub OAuth] Using token for user ${userId}: ${tokenRecord?.github_username || 'none'}`);
} else {
tokenRecord = await this.getToken();
}
if (!tokenRecord) {
throw new Error('No GitHub token found. Please authenticate with GitHub first.');

View File

@ -15,7 +15,11 @@ class GithubAdapter {
return this.impl.parseGitHubUrl(url);
}
async checkRepositoryAccess(owner, repo) {
async checkRepositoryAccess(owner, repo, userId = null) {
// Use user-specific method if userId is provided
if (userId) {
return await this.impl.checkRepositoryAccessWithUser(owner, repo, userId);
}
return await this.impl.checkRepositoryAccess(owner, repo);
}

View File

@ -0,0 +1,58 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
*.egg-info/
dist/
build/
*.egg
# Virtual environments
venv/
env/
ENV/
.venv
# IDE
.vscode/
.idea/
*.swp
*.swo
*~
# Documentation
*.md
!README.md
# Testing
.pytest_cache/
.coverage
htmlcov/
*.log
# Storage and temporary files
storage/
*.tmp
*.temp
# Git
.git/
.gitignore
# Docker
Dockerfile*
docker-compose*.yml
.dockerignore
# Environment files
.env
.env.local
*.env
# OS
.DS_Store
Thumbs.db

View File

@ -1,29 +1,60 @@
FROM python:3.11-slim
# Build stage - install dependencies that require compilation
FROM python:3.11-slim as builder
ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1
WORKDIR /app
# Install build dependencies only
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential \
curl \
&& rm -rf /var/lib/apt/lists/*
# Copy and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir --user -r requirements.txt && \
pip cache purge
# Download SpaCy English model
RUN python -m spacy download en_core_web_sm
# Runtime stage - minimal image with only runtime dependencies
FROM python:3.11-slim
ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \
PYTHONPATH=/app/src \
PATH=/root/.local/bin:$PATH \
MULTI_DOC_STORAGE_ROOT=/app/storage \
MULTI_DOC_CLAUDE_MODEL=claude-3-5-haiku-latest \
CLAUDE_MODEL=claude-3-5-haiku-latest \
PORT=8024
WORKDIR /app
# Install only runtime dependencies (no build tools)
RUN apt-get update && \
apt-get install -y --no-install-recommends \
poppler-utils \
tesseract-ocr \
ffmpeg \
libmagic1 \
&& rm -rf /var/lib/apt/lists/*
curl \
# Required for some Python packages at runtime
libgomp1 \
libglib2.0-0 \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy Python packages from builder stage (includes spacy model)
COPY --from=builder /root/.local /root/.local
# Copy application code
COPY src ./src
ENV PYTHONPATH=/app/src \
MULTI_DOC_STORAGE_ROOT=/app/storage \
MULTI_DOC_CLAUDE_MODEL=claude-3-5-sonnet-20241022 \
PORT=8024
EXPOSE 8024
CMD ["sh", "-c", "uvicorn multi_document_upload_service.main:app --host 0.0.0.0 --port ${PORT:-8024}"]

View File

@ -1,144 +0,0 @@
# Fix: Empty Graph in Neo4j (No Relationships Found)
## Problem
When querying Neo4j for `CAUSES` relationships, you get "(no changes, no records)" because:
1. **PDF extraction failed** - Missing dependencies (`unstructured[pdf]`)
2. **0 relations extracted** - No text was extracted, so no analysis happened
3. **0 relations written** - Nothing was written to Neo4j (correct behavior)
## Root Cause
The service completed with 0 relations because:
- PDF file extraction failed: `partition_pdf() is not available because one or more dependencies are not installed`
- No text was extracted from the PDF
- No chunks were created
- No Claude analysis happened
- 0 relations were extracted
- 0 relations were written to Neo4j
## Solution
### Step 1: Update Dependencies
The `requirements.txt` has been updated to include:
```
unstructured[pdf]>=0.15.0
unstructured[docx]>=0.15.0
unstructured[pptx]>=0.15.0
unstructured[xlsx]>=0.15.0
```
### Step 2: Rebuild the Service
```bash
cd /home/tech4biz/Desktop/prakash/codenuk/backend_new1/codenuk_backend_mine
# Rebuild the service with new dependencies
docker-compose build multi-document-upload-service
# Restart the service
docker-compose restart multi-document-upload-service
# Check logs to verify it's working
docker-compose logs -f multi-document-upload-service
```
### Step 3: Verify Dependencies
```bash
# Check if unstructured[pdf] is installed
docker-compose exec multi-document-upload-service pip list | grep unstructured
```
### Step 4: Re-upload Documents
1. Go to Project Builder in the frontend
2. Click on "Upload Documents for Knowledge Graph"
3. Upload a PDF or other document
4. Wait for processing to complete
5. Check Neo4j for relationships
### Step 5: Check Neo4j
Run these queries in Neo4j Browser:
```cypher
// Check if any nodes exist
MATCH (n)
RETURN count(n) as node_count
// Check for CAUSES relationships
MATCH (n:Concept)-[r:CAUSES]->(m:Concept)
RETURN n.name as cause, m.name as effect, r.confidence as confidence
LIMIT 50
```
## Expected Behavior After Fix
1. **PDF extraction succeeds** - Text is extracted from PDF files
2. **Text is chunked** - Document is split into manageable chunks
3. **Claude analyzes** - Causal relationships are extracted
4. **Relations are written** - Relationships are stored in Neo4j
5. **Query returns results** - Neo4j query shows relationships
## Verification Steps
1. **Check service logs**:
```bash
docker-compose logs multi-document-upload-service | grep -i "extracted\|relation\|neo4j"
```
2. **Check job status**:
```bash
curl http://localhost:8000/api/multi-docs/jobs/{job_id}
```
Should show: `"processed_files": 1` and relations count > 0
3. **Check Neo4j**:
```cypher
MATCH (n:Concept)-[r:CAUSES]->(m:Concept)
RETURN count(r) as relation_count
```
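If you want to script this check, here is a minimal sketch using the official `neo4j` Python driver (`pip install neo4j`, v5+). The Bolt endpoint and credentials below are assumptions based on the defaults used elsewhere in this guide (`localhost:7687`, `neo4j` / `password`); adjust them to match your compose setup.
```python
from neo4j import GraphDatabase

# Assumed Bolt endpoint and credentials; adjust to your docker-compose setup.
URI = "bolt://localhost:7687"
AUTH = ("neo4j", "password")

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    records, _, _ = driver.execute_query(
        "MATCH (:Concept)-[r:CAUSES]->(:Concept) RETURN count(r) AS relation_count"
    )
    print("CAUSES relationships:", records[0]["relation_count"])
```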
## Improvements Made
1. ✅ **Added PDF dependencies** - `unstructured[pdf]`, `unstructured[docx]`, etc.
2. ✅ **Added fallback extractors** - Uses `pdfplumber` if unstructured fails
3. ✅ **Better error handling** - Shows actual errors in job status
4. ✅ **Improved logging** - More detailed logs for debugging
5. ✅ **Better Neo4j query** - Validates data before writing
## Troubleshooting
If you still see 0 relations after rebuilding:
1. **Check extraction logs**:
```bash
docker-compose logs multi-document-upload-service | grep -i "extract"
```
2. **Check Claude analysis**:
```bash
docker-compose logs multi-document-upload-service | grep -i "claude\|analyze"
```
3. **Check Neo4j connection**:
```bash
docker-compose logs multi-document-upload-service | grep -i "neo4j\|graph"
```
4. **Verify document has causal language**:
- Not all documents contain causal relationships
- Try uploading a document with clear cause-effect statements
- Example: "Smoking causes lung cancer" or "Rain causes flooding"
## Next Steps
1. Rebuild the service with new dependencies
2. Re-upload documents
3. Check Neo4j for relationships
4. If still no results, check service logs for errors
5. Verify the document contains causal language

View File

@ -1,176 +0,0 @@
# Neo4j Diagnostic Queries
## Issue: No relationships found in Neo4j
If you're seeing "(no changes, no records)" when querying for `CAUSES` relationships, here are diagnostic queries to check what's actually in the database.
## Diagnostic Queries
### 1. Check if any nodes exist
```cypher
MATCH (n)
RETURN count(n) as node_count
LIMIT 1
```
### 2. Check if Concept nodes exist
```cypher
MATCH (n:Concept)
RETURN count(n) as concept_count,
collect(DISTINCT labels(n)) as labels,
collect(DISTINCT keys(n)) as properties
LIMIT 10
```
### 3. Check all relationship types
```cypher
CALL db.relationshipTypes() YIELD relationshipType
RETURN relationshipType
```
### 4. Check all node labels
```cypher
CALL db.labels() YIELD label
RETURN label
```
### 5. Check all relationships (any type)
```cypher
MATCH (n)-[r]->(m)
RETURN type(r) as relationship_type,
count(r) as count,
labels(n) as from_labels,
labels(m) as to_labels
LIMIT 50
```
### 6. Check for CAUSES relationships specifically
```cypher
MATCH (n)-[r:CAUSES]->(m)
RETURN n, r, m
LIMIT 50
```
### 7. Check for relationships with lowercase "causes"
```cypher
MATCH (n)-[r]->(m)
WHERE type(r) =~ '(?i)causes'
RETURN type(r) as relationship_type, n, r, m
LIMIT 50
```
### 8. Check all nodes and their relationships
```cypher
MATCH (n)
OPTIONAL MATCH (n)-[r]->(m)
RETURN n, labels(n) as node_labels,
type(r) as relationship_type,
m, labels(m) as target_labels
LIMIT 50
```
### 9. Check for nodes created by the service (by job_id property)
```cypher
MATCH (n)-[r]->(m)
WHERE r.job_id IS NOT NULL
RETURN n, r, m, r.job_id as job_id
LIMIT 50
```
### 10. Check database statistics
```cypher
MATCH (n)
OPTIONAL MATCH (n)-[r]->()
RETURN count(DISTINCT n) as total_nodes,
count(r) as total_relationships
```
## Common Issues and Solutions
### Issue 1: No nodes at all
**Symptom**: Query 1 returns 0 nodes
**Cause**: Service hasn't written anything to Neo4j, or connection failed
**Solution**:
- Check service logs: `docker-compose logs multi-document-upload-service`
- Verify Neo4j connection in service configuration
- Check if job completed with 0 relations (extraction failed)
### Issue 2: Nodes exist but no relationships
**Symptom**: Query 1 returns nodes, but Query 6 returns no relationships
**Cause**: Relationships weren't created, or different relationship type
**Solution**:
- Check Query 5 to see what relationship types actually exist
- Check service logs for graph writing errors
- Verify the job actually extracted relations (check job status)
### Issue 3: Different relationship type
**Symptom**: Query 5 shows relationships but not `CAUSES`
**Cause**: Service might be using a different relationship type
**Solution**:
- Check Query 3 to see all relationship types
- Update query to use the correct relationship type
### Issue 4: Different node labels
**Symptom**: Query 6 returns no results, but Query 2 shows different labels
**Cause**: Service might be using different node labels
**Solution**:
- Check Query 2 to see what labels exist
- Update query to match actual labels
## Expected Structure
After a successful upload, you should see:
### Nodes
- **Label**: `Concept`
- **Properties**: `name`, `lastSeen`
### Relationships
- **Type**: `CAUSES`
- **Properties**: `confidence`, `explanation`, `source_file_id`, `source_snippet`, `job_id`, `model`, `updated_at`
### Example Query
```cypher
MATCH (cause:Concept)-[r:CAUSES]->(effect:Concept)
RETURN cause.name as cause,
effect.name as effect,
r.confidence as confidence,
r.job_id as job_id,
r.source_file_id as source_file
LIMIT 50
```
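To run the key diagnostic queries from a script instead of the Neo4j Browser, here is a minimal sketch with the `neo4j` Python driver (`pip install neo4j`, v5+); the connection details are assumptions, adjust them to your setup:
```python
from neo4j import GraphDatabase

URI = "bolt://localhost:7687"   # assumed Bolt endpoint
AUTH = ("neo4j", "password")    # assumed credentials

DIAGNOSTICS = {
    "node labels": "CALL db.labels() YIELD label RETURN collect(label) AS value",
    "relationship types": "CALL db.relationshipTypes() YIELD relationshipType RETURN collect(relationshipType) AS value",
    "total nodes": "MATCH (n) RETURN count(n) AS value",
    "CAUSES relationships": "MATCH ()-[r:CAUSES]->() RETURN count(r) AS value",
}

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    for name, query in DIAGNOSTICS.items():
        records, _, _ = driver.execute_query(query)
        print(f"{name}: {records[0]['value']}")
```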
## Troubleshooting Steps
1. **Check service logs**:
```bash
docker-compose logs -f multi-document-upload-service
```
2. **Check if job completed successfully**:
```bash
curl http://localhost:8000/api/multi-docs/jobs/{job_id}
```
3. **Check Neo4j connection**:
```bash
docker-compose logs neo4j | grep -i error
```
4. **Verify Neo4j is running**:
```bash
docker-compose ps neo4j
```
5. **Test Neo4j connection manually**:
```bash
docker-compose exec neo4j cypher-shell -u neo4j -p password "MATCH (n) RETURN count(n)"
```
## Next Steps
1. Run the diagnostic queries above
2. Check the service logs for errors
3. Verify the job status via API
4. Re-upload documents after fixing dependencies
5. Check if relations were actually extracted (job status should show relation count)

View File

@ -1,85 +0,0 @@
# Quick Testing Guide - Multi-Document Upload
## 🚀 Quick Start Testing
### 1. Start Services
```bash
cd /home/tech4biz/Desktop/prakash/codenuk/backend_new1/codenuk_backend_mine
docker-compose up -d multi-document-upload-service neo4j redis postgres api-gateway
```
### 2. Verify Services
```bash
# Check health
curl http://localhost:8024/health
curl http://localhost:8000/api/multi-docs/health
```
### 3. Test via Frontend
1. **Open Frontend**: `http://localhost:3001`
2. **Login** (if required)
3. **Go to Project Builder**
4. **Complete Steps 1-2** (Project Type & Features)
5. **Step 3: Multi Docs Upload** appears
6. **Upload files**:
- Click upload area
- Select multiple files (PDF, DOCX, etc.)
- Click "Start Upload"
7. **Watch Progress**:
- Progress bar updates
- Status messages appear
- Polls every 4 seconds
8. **Auto-proceeds** when completed
### 4. Verify in Neo4j
Open Neo4j Browser at `http://localhost:7474` and log in with `neo4j` / `password`, then query causal relationships:
```cypher
MATCH (n)-[r:CAUSES]->(m)
RETURN n, r, m
LIMIT 50
```
## 📝 Test Checklist
- [ ] Service starts successfully
- [ ] Health endpoint works
- [ ] Frontend component renders
- [ ] File upload works
- [ ] Progress updates correctly
- [ ] Job completes successfully
- [ ] Neo4j graph contains relationships
- [ ] Error handling works
- [ ] Skip button works
## 🔍 Debug Commands
```bash
# View service logs
docker-compose logs -f multi-document-upload-service
# Check job status (replace {job_id})
curl http://localhost:8000/api/multi-docs/jobs/{job_id}
# Check graph summary
curl http://localhost:8000/api/multi-docs/jobs/{job_id}/graph
```
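To mirror the frontend's 4-second polling from a script, here is a minimal sketch with `requests` (`pip install requests`); the terminal stage names `completed` / `failed` are assumptions based on the stages listed in this guide:
```python
import time
import requests

GATEWAY = "http://localhost:8000"  # API Gateway base URL

def poll_job(job_id: str, interval: float = 4.0) -> dict:
    """Poll the multi-docs job status until it reaches a terminal stage."""
    while True:
        resp = requests.get(f"{GATEWAY}/api/multi-docs/jobs/{job_id}", timeout=30)
        resp.raise_for_status()
        status = resp.json()
        print(f"{status['stage']}: {status.get('status_message')} "
              f"({status.get('processed_files', 0)}/{status.get('total_files', 0)} files)")
        if status["stage"] in {"completed", "failed"}:  # assumed terminal stage names
            return status
        time.sleep(interval)
```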
## ⚠️ Common Issues
1. **502 Bad Gateway**: Service not running → `docker-compose ps`
2. **413 Too Large**: File too big → Reduce file size
3. **No progress**: Check browser console → Check network tab
4. **No relationships**: Check Claude API key → Check service logs
## 🎯 Expected Flow
```
Upload Files → Job Created → Files Saved → Content Extracted →
Claude Analysis → Graph Built → Completed → Auto-proceed to Next Step
```

File diff suppressed because it is too large

View File

@ -1,152 +0,0 @@
# Rebuild Instructions - Multi-Document Upload Service
## Issue: Empty Graph in Neo4j
**Problem**: Query returns "(no changes, no records)" because the job completed with 0 relations.
**Root Cause**: PDF extraction failed due to missing dependencies (`unstructured[pdf]`).
## Fixes Applied
1. ✅ Added PDF dependencies (`unstructured[pdf]`, `unstructured[docx]`, etc.)
2. ✅ Added fallback extractors (pdfplumber, python-docx, python-pptx)
3. ✅ Improved error handling and logging
4. ✅ Fixed Neo4j query syntax
5. ✅ Better status messages
## Rebuild Steps
### Step 1: Rebuild the Service
```bash
cd /home/tech4biz/Desktop/prakash/codenuk/backend_new1/codenuk_backend_mine
# Stop the service
docker-compose stop multi-document-upload-service
# Rebuild with new dependencies
docker-compose build --no-cache multi-document-upload-service
# Start the service
docker-compose up -d multi-document-upload-service
# Check logs to verify it's starting correctly
docker-compose logs -f multi-document-upload-service
```
### Step 2: Verify Dependencies
```bash
# Check if unstructured[pdf] is installed
docker-compose exec multi-document-upload-service pip list | grep unstructured
# You should see:
# unstructured
# unstructured-pdf
# unstructured-docx
# etc.
```
### Step 3: Test the Service
```bash
# Check health endpoint
curl http://localhost:8024/health
# Should return:
# {
# "status": "ok",
# "claude_model": "claude-3-5-haiku-latest",
# ...
# }
```
### Step 4: Re-upload Documents
1. Open frontend: `http://localhost:3001/project-builder`
2. Go to Step 1: Project Type
3. Find "Upload Documents for Knowledge Graph" section
4. Upload a PDF or other document
5. Wait for processing to complete
6. Check status - should show relation count > 0
### Step 5: Verify in Neo4j
Run these queries in Neo4j Browser (`http://localhost:7474`):
```cypher
// Check if any nodes exist
MATCH (n)
RETURN count(n) as node_count
// Check for CAUSES relationships
MATCH (n:Concept)-[r:CAUSES]->(m:Concept)
RETURN n.name as cause,
m.name as effect,
r.confidence as confidence,
r.job_id as job_id
LIMIT 50
```
## Expected Results
After rebuilding and re-uploading:
1. **PDF extraction succeeds**
2. **Text is extracted**
3. **Relations are extracted**
4. **Relations are written to Neo4j**
5. **Query returns results**
## Troubleshooting
If you still see 0 relations:
1. **Check service logs**:
```bash
docker-compose logs multi-document-upload-service | tail -50
```
2. **Check extraction logs**:
```bash
docker-compose logs multi-document-upload-service | grep -i "extract\|pdf"
```
3. **Check Claude analysis**:
```bash
docker-compose logs multi-document-upload-service | grep -i "claude\|analyze\|relation"
```
4. **Check Neo4j connection**:
```bash
docker-compose logs multi-document-upload-service | grep -i "neo4j\|graph\|write"
```
5. **Verify document has causal language**:
- Not all documents contain causal relationships
- Try uploading a document with clear cause-effect statements
- Example: "Smoking causes lung cancer"
## Quick Test
Test with a simple text file:
1. Create a test file `test_causal.txt`:
```
Smoking cigarettes causes lung cancer.
Heavy rain causes flooding.
Exercise improves health.
```
2. Upload it via the frontend
3. Check Neo4j for relationships
4. You should see up to 3 causal relationships (one per statement)
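If you prefer to skip the frontend, you can create the job straight through the API Gateway; here is a minimal sketch with `requests` using the documented multipart endpoint (`POST /api/multi-docs/jobs`, `files` plus optional `job_name`):
```python
import requests

GATEWAY = "http://localhost:8000"  # API Gateway base URL

def upload_documents(paths: list[str], job_name: str = "quick-test") -> str:
    """Create a multi-docs job for the given files and return its job_id."""
    files = [("files", (p.split("/")[-1], open(p, "rb"))) for p in paths]
    try:
        resp = requests.post(
            f"{GATEWAY}/api/multi-docs/jobs",
            files=files,
            data={"job_name": job_name},
            timeout=120,
        )
        resp.raise_for_status()
        return resp.json()["job_id"]
    finally:
        for _, (_, fh) in files:
            fh.close()

print("job_id:", upload_documents(["test_causal.txt"]))
```
Poll `GET /api/multi-docs/jobs/{job_id}` afterwards and then check Neo4j as above.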
## Next Steps
1. Rebuild the service
2. Re-upload documents
3. Check Neo4j for relationships
4. If still no results, check service logs
5. Verify the document contains causal language

View File

@ -1,300 +0,0 @@
# Multi-Document Upload Service - Frontend Testing Guide
## Prerequisites
1. **Backend Services Running**:
```bash
cd /home/tech4biz/Desktop/prakash/codenuk/backend_new1/codenuk_backend_mine
docker-compose up -d
```
2. **Verify Services are Running**:
- API Gateway: `http://localhost:8000/health`
- Multi-Document Upload Service: `http://localhost:8024/health`
- Neo4j: `http://localhost:7474` (Browser interface)
- Frontend: `http://localhost:3001` (or your frontend port)
3. **Check Service Health**:
```bash
# Check API Gateway
curl http://localhost:8000/health
# Check Multi-Document Upload Service directly
curl http://localhost:8024/health
# Check via API Gateway proxy
curl http://localhost:8000/api/multi-docs/health
```
## Frontend Testing Steps
### Step 1: Navigate to Project Builder
1. Open your browser and go to: `http://localhost:3001` (or your frontend URL)
2. Log in if required
3. Click on **"Project Builder"** in the navigation
### Step 2: Go to Multi Docs Upload Step
1. In the Project Builder, you should see the workflow steps:
- **Step 1**: Project Type
- **Step 2**: Features
- **Step 3**: Multi Docs Upload ← **This is the new step**
- **Step 4**: Business Context
- **Step 5**: Generate
- **Step 6**: Architecture
2. Complete Steps 1 and 2 (Project Type and Features selection)
3. You will automatically be taken to **Step 3: Multi Docs Upload**
### Step 3: Upload Documents
1. **Click on the upload area** or **drag and drop files**
2. **Select multiple files** (you can mix different formats):
- PDF files (`.pdf`)
- Word documents (`.doc`, `.docx`)
- PowerPoint (`.ppt`, `.pptx`)
- Excel files (`.xls`, `.xlsx`)
- JSON files (`.json`)
- XML files (`.xml`)
- Markdown files (`.md`)
- Images (`.png`, `.jpg`, `.jpeg`) - will use OCR
- Audio files (`.mp3`, `.wav`) - will be transcribed
- Video files (`.mp4`, `.avi`) - will be transcribed
3. **View selected files**: You should see a list of all selected files with:
- File icon
- File name
- Remove button for each file
4. **Click "Start Upload"** button
### Step 4: Monitor Upload Progress
After clicking "Start Upload", you should see:
1. **Upload Status**:
- Button shows "Uploading..." with spinner
- Progress bar appears
- Stage messages appear:
- "Job received"
- "Saving files"
- "Extracting document content"
- "Calling Claude for causal relations"
- "Writing to Neo4j knowledge graph"
- "Completed"
2. **Progress Indicators**:
- Progress percentage (0-100%)
- Status message showing current stage
- Processed files count vs total files count
3. **Polling**: The frontend automatically polls the job status every 4 seconds
### Step 5: Verify Results
Once the job is completed:
1. **Check Neo4j Graph**:
- Open Neo4j Browser: `http://localhost:7474`
- Login with:
- Username: `neo4j`
- Password: `password`
- Run Cypher query to see the graph:
```cypher
MATCH (n)-[r:CAUSES]->(m)
RETURN n, r, m
LIMIT 50
```
2. **Check Job Status via API**:
```bash
# Replace {job_id} with the actual job ID from the frontend
curl http://localhost:8000/api/multi-docs/jobs/{job_id}
```
3. **Get Graph Summary**:
```bash
curl http://localhost:8000/api/multi-docs/jobs/{job_id}/graph
```
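Checks 2 and 3 can also be scripted; here is a minimal sketch with `requests` (the shape of the graph summary payload is not documented here, so it is simply printed as-is):
```python
import requests

GATEWAY = "http://localhost:8000"            # API Gateway base URL
job_id = "<job-id-from-the-upload-response>"

status = requests.get(f"{GATEWAY}/api/multi-docs/jobs/{job_id}", timeout=30).json()
print("stage:", status["stage"], "| processed:",
      status.get("processed_files"), "of", status.get("total_files"))

graph = requests.get(f"{GATEWAY}/api/multi-docs/jobs/{job_id}/graph", timeout=30).json()
print("graph summary:", graph)
```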
## Testing Different Scenarios
### Scenario 1: Single PDF File
- Upload one PDF file
- Verify it processes correctly
- Check Neo4j for causal relationships
### Scenario 2: Multiple Mixed Format Files
- Upload 3-5 files of different formats (PDF, DOCX, JSON, image)
- Verify all files are processed
- Check that progress updates correctly
### Scenario 3: Large Files
- Upload a large PDF (10+ MB)
- Verify it handles large files correctly
- Check processing time
### Scenario 4: Error Handling
- Try uploading an unsupported file type
- Verify error message appears
- Check that the error is displayed clearly
### Scenario 5: Skip Option
- Upload files
- Click "Skip" button before completion
- Verify you can proceed to the next step
- Job continues processing in the background
## Browser Developer Tools
### Check Network Requests
1. **Open Developer Tools** (F12)
2. **Go to Network tab**
3. **Filter by "multi-docs"**
4. **Monitor requests**:
- `POST /api/multi-docs/jobs` - Upload files
- `GET /api/multi-docs/jobs/{job_id}` - Poll job status
- `GET /api/multi-docs/jobs/{job_id}/graph` - Get graph summary
### Check Console Logs
1. **Open Console tab**
2. **Look for**:
- Upload progress logs
- Job status updates
- Any error messages
### Check Response Data
Verify the API responses:
```javascript
// Upload response should be:
{
"job_id": "uuid-here",
"stage": "received",
"total_files": 3,
"created_at": "2024-01-01T00:00:00Z"
}
// Status response should be:
{
"job_id": "uuid-here",
"stage": "extracting",
"status_message": "Extracting document content",
"total_files": 3,
"processed_files": 1,
"error": null,
"created_at": "2024-01-01T00:00:00Z",
"updated_at": "2024-01-01T00:01:00Z",
"files": [...]
}
```
## Troubleshooting
### Issue: Upload fails with 502 Bad Gateway
**Solution**:
- Check if multi-document-upload-service is running:
```bash
docker-compose ps multi-document-upload-service
```
- Check service logs:
```bash
docker-compose logs multi-document-upload-service
```
### Issue: Upload fails with 413 Request Entity Too Large
**Solution**:
- Check file sizes (max 500MB total per job)
- Reduce number of files or file sizes
- Check API Gateway body size limits
### Issue: Status polling stops working
**Solution**:
- Check browser console for errors
- Verify job ID is correct
- Check if job completed or failed
- Check network tab for failed requests
### Issue: No causal relationships found
**Solution**:
- Check Claude API key is configured correctly
- Check service logs for Claude API errors
- Verify documents contain causal language
- Check Neo4j connection
### Issue: Frontend shows "Failed" status
**Solution**:
- Check the error message in the frontend
- Check backend service logs:
```bash
docker-compose logs -f multi-document-upload-service
```
- Verify all dependencies are running (Neo4j, Redis, Postgres)
## Expected Behavior
### Successful Flow:
1. ✅ Files upload successfully
2. ✅ Job ID is returned
3. ✅ Status polling starts automatically
4. ✅ Progress updates every 4 seconds
5. ✅ Stage changes are displayed
6. ✅ Progress bar updates
7. ✅ Job completes successfully
8. ✅ Frontend automatically proceeds to next step
9. ✅ Neo4j contains causal relationships
### Error Flow:
1. ✅ Error message is displayed clearly
2. ✅ User can retry upload
3. ✅ User can skip and proceed
4. ✅ Error details are logged in console
## API Endpoints Reference
### Upload Files
```bash
POST /api/multi-docs/jobs
Content-Type: multipart/form-data
Form Data:
- files: File[] (multiple files)
- job_name: string (optional)
```
### Get Job Status
```bash
GET /api/multi-docs/jobs/{job_id}
```
### Get Graph Summary
```bash
GET /api/multi-docs/jobs/{job_id}/graph
```
### Health Check
```bash
GET /api/multi-docs/health
```
## Next Steps After Testing
1. **Verify Neo4j Graph**: Check that causal relationships are stored correctly
2. **Check Storage**: Verify files are stored in the persistent volume
3. **Monitor Performance**: Check processing times for different file types
4. **Test Error Scenarios**: Verify error handling works correctly
5. **Test Large Batches**: Upload 50+ files to test scalability
## Support
If you encounter issues:
1. Check service logs: `docker-compose logs multi-document-upload-service`
2. Check API Gateway logs: `docker-compose logs api-gateway`
3. Check Neo4j logs: `docker-compose logs neo4j`
4. Verify all environment variables are set correctly
5. Check network connectivity between services

View File

@ -8,10 +8,6 @@ pydantic-settings>=2.2.1
aiofiles>=23.2.1
tenacity>=8.2.3
python-dotenv>=1.0.1
unstructured[pdf]>=0.15.0
unstructured[docx]>=0.15.0
unstructured[pptx]>=0.15.0
unstructured[xlsx]>=0.15.0
pdfplumber>=0.11.0
python-docx>=1.1.0
python-pptx>=0.6.23
@ -30,5 +26,13 @@ beautifulsoup4>=4.12.3
lxml>=5.2.1
sqlalchemy>=2.0.25
httpx>=0.27.0
tiktoken>=0.7.0
dowhy>=0.11.0
qdrant-client>=1.7.0
sentence-transformers>=2.2.0
numpy>=1.24.0
scipy>=1.11.0
networkx>=3.1
spacy>=3.7.0
markdown>=3.5.0
weasyprint>=60.0

View File

@ -1,328 +0,0 @@
from __future__ import annotations
import base64
import json
import logging
import re
from pathlib import Path
from typing import Iterable, List
from anthropic import Anthropic, BadRequestError
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential, RetryCallState
from .models import CausalRelation
logger = logging.getLogger(__name__)
def is_billing_error(exception: Exception) -> bool:
"""Check if the exception is a billing/credit related error that shouldn't be retried."""
if isinstance(exception, BadRequestError):
error_message = str(exception).lower()
billing_keywords = ["credit", "balance", "too low", "billing", "upgrade", "purchase credits"]
return any(keyword in error_message for keyword in billing_keywords)
return False
def should_retry_exception(retry_state: RetryCallState) -> bool:
"""Custom retry condition that excludes billing errors."""
exception = retry_state.outcome.exception()
if exception is None:
return False
# Don't retry billing errors - they won't be resolved by retrying
if is_billing_error(exception):
return False
# Retry other exceptions
return True
CLAUDE_PROMPT_TEMPLATE = """You are an expert analyst extracting causal relationships from documents.
Given the following text chunk, identify all explicit or strongly implied cause and effect pairs.
Return JSON with the schema:
[
{
"cause": "<short phrase>",
"effect": "<short phrase>",
"confidence": 0-1 float,
"explanation": "<why this is causal>",
"source_snippet": "<exact quote or paraphrase>"
}
]
Only include items when the causal direction is clear.
If none are found, return an empty list [].
Text chunk:
```
<<<CHUNK_PLACEHOLDER>>>
```"""
IMAGE_PROMPT_TEMPLATE = """You are an expert analyst extracting causal relationships from images, diagrams, and visual content.
Analyze this image/diagram for causal relationships. Look for:
- Architecture flows (A → B → C)
- Dependency relationships
- Cause-effect chains in diagrams
- Process flows
- System interactions
- Data flows
- Sequential relationships
- Visual connections between components
Return JSON with the schema:
[
{
"cause": "<short phrase describing the cause>",
"effect": "<short phrase describing the effect>",
"confidence": 0-1 float,
"explanation": "<why this is causal, referencing visual elements>",
"source_snippet": "<description of what you see in the image that shows this relationship>"
}
]
Only include items when the causal direction is clear from the visual structure.
If none are found, return an empty list []."""
class ClaudeCausalExtractor:
def __init__(self, api_key: str, model: str, max_output_tokens: int = 4000):
self.client = Anthropic(api_key=api_key)
self.model = model
self.max_output_tokens = max_output_tokens
@retry(
retry=should_retry_exception,
wait=wait_exponential(multiplier=1, min=1, max=10),
stop=stop_after_attempt(3),
reraise=True,
)
def analyze_chunk(self, chunk: str, source_file_id: str) -> List[CausalRelation]:
logger.debug("Analyzing chunk with Claude model %s", self.model)
# Validate chunk is not empty and is readable text
if not chunk or not chunk.strip():
logger.warning("Empty or whitespace-only chunk, skipping")
return []
# Check if chunk contains mostly readable text (not binary data)
# Simple heuristic: if >50% of characters are non-printable or control chars, skip it
printable_chars = sum(1 for c in chunk if c.isprintable() or c.isspace())
if len(chunk) > 100 and printable_chars / len(chunk) < 0.5:
logger.warning("Chunk appears to contain binary data, skipping analysis")
return []
# Use string replacement with a unique placeholder to avoid KeyError with braces in content
# This prevents Python's .format() from interpreting braces in the chunk text as format placeholders
prompt_text = CLAUDE_PROMPT_TEMPLATE.replace("<<<CHUNK_PLACEHOLDER>>>", chunk)
try:
message = self.client.messages.create(
model=self.model,
max_tokens=self.max_output_tokens,
temperature=0.0,
system="You extract causal (cause→effect) relations with high precision.",
messages=[
{
"role": "user",
"content": [{"type": "text", "text": prompt_text}],
}
],
)
except BadRequestError as e:
# Check if it's a billing error
if is_billing_error(e):
error_msg = (
"Anthropic API credit balance is too low. "
"Please go to Plans & Billing to upgrade or purchase credits. "
f"Error: {str(e)}"
)
logger.error(error_msg)
raise RuntimeError(error_msg) from e
# Re-raise other BadRequestErrors
raise
content_blocks = message.content or []
raw_text = "".join(block.text for block in content_blocks if hasattr(block, "text")) # type: ignore[attr-defined]
if not raw_text:
return []
# Try to extract JSON from markdown code blocks if present
json_text = raw_text.strip()
# Look for JSON in markdown code blocks (```json ... ```)
json_match = re.search(r'```(?:json)?\s*(\[.*?\])\s*```', json_text, re.DOTALL)
if json_match:
json_text = json_match.group(1)
else:
# Look for JSON array/object at the start or end
json_match = re.search(r'(\[.*?\]|{.*?})', json_text, re.DOTALL)
if json_match:
json_text = json_match.group(1)
try:
data = json.loads(json_text)
if not isinstance(data, list):
logger.warning("Claude response is not a list: %s", type(data))
return []
relations: List[CausalRelation] = []
for item in data:
if not isinstance(item, dict):
continue
cause = item.get("cause", "").strip()
effect = item.get("effect", "").strip()
if not cause or not effect:
continue # Skip invalid relations
relations.append(
CausalRelation(
cause=cause,
effect=effect,
confidence=float(item.get("confidence", 0.0)),
explanation=item.get("explanation"),
source_file_id=source_file_id,
source_snippet=item.get("source_snippet"),
metadata={"model": self.model},
)
)
logger.info("Extracted %d relations from Claude response", len(relations))
return relations
except json.JSONDecodeError as e:
logger.warning("Failed to parse Claude response as JSON: %s. Raw text: %s", e, raw_text[:200])
return []
def analyze(self, chunks: Iterable[str], source_file_id: str) -> List[CausalRelation]:
relations: List[CausalRelation] = []
for chunk in chunks:
relations.extend(self.analyze_chunk(chunk, source_file_id=source_file_id))
return relations
@retry(
retry=should_retry_exception,
wait=wait_exponential(multiplier=1, min=1, max=10),
stop=stop_after_attempt(3),
reraise=True,
)
def analyze_image(self, image_path: Path, source_file_id: str) -> List[CausalRelation]:
"""
Analyze an image using Claude Vision API to extract causal relationships.
Sends image directly to Claude (no OCR).
"""
logger.info("Analyzing image with Claude Vision: %s", image_path.name)
try:
# Read and encode image as base64
with open(image_path, "rb") as image_file:
image_data = image_file.read()
# Determine media type
suffix = image_path.suffix.lower()
media_type_map = {
".png": "image/png",
".jpg": "image/jpeg",
".jpeg": "image/jpeg",
".gif": "image/gif",
".webp": "image/webp",
}
media_type = media_type_map.get(suffix, "image/png")
# Encode to base64
base64_image = base64.b64encode(image_data).decode("utf-8")
# Prepare content for Claude Vision API
content = [
{
"type": "image",
"source": {
"type": "base64",
"media_type": media_type,
"data": base64_image,
},
},
{
"type": "text",
"text": IMAGE_PROMPT_TEMPLATE,
},
]
# Call Claude Vision API
try:
message = self.client.messages.create(
model=self.model, # Claude models support vision
max_tokens=self.max_output_tokens,
temperature=0.0,
system="You extract causal (cause→effect) relations from visual content with high precision.",
messages=[
{
"role": "user",
"content": content,
}
],
)
except BadRequestError as e:
# Check if it's a billing error
if is_billing_error(e):
error_msg = (
"Anthropic API credit balance is too low. "
"Please go to Plans & Billing to upgrade or purchase credits. "
f"Error: {str(e)}"
)
logger.error(error_msg)
raise RuntimeError(error_msg) from e
# Re-raise other BadRequestErrors
raise
# Parse response
content_blocks = message.content or []
raw_text = "".join(block.text for block in content_blocks if hasattr(block, "text")) # type: ignore[attr-defined]
if not raw_text:
logger.warning("No text response from Claude Vision for image %s", image_path.name)
return []
# Extract JSON from response
json_text = raw_text.strip()
json_match = re.search(r'```(?:json)?\s*(\[.*?\])\s*```', json_text, re.DOTALL)
if json_match:
json_text = json_match.group(1)
else:
json_match = re.search(r'(\[.*?\]|{.*?})', json_text, re.DOTALL)
if json_match:
json_text = json_match.group(1)
try:
data = json.loads(json_text)
if not isinstance(data, list):
logger.warning("Claude Vision response is not a list: %s", type(data))
return []
relations: List[CausalRelation] = []
for item in data:
if not isinstance(item, dict):
continue
cause = item.get("cause", "").strip()
effect = item.get("effect", "").strip()
if not cause or not effect:
continue
relations.append(
CausalRelation(
cause=cause,
effect=effect,
confidence=float(item.get("confidence", 0.0)),
explanation=item.get("explanation"),
source_file_id=source_file_id,
source_snippet=item.get("source_snippet") or f"Image: {image_path.name}",
metadata={"model": self.model, "content_type": "image", "image_path": str(image_path)},
)
)
logger.info("Extracted %d relations from image %s", len(relations), image_path.name)
return relations
except json.JSONDecodeError as e:
logger.warning("Failed to parse Claude Vision response as JSON: %s. Raw text: %s", e, raw_text[:200])
return []
except Exception as exc:
logger.exception("Failed to analyze image %s: %s", image_path, exc)
return []

View File

@ -20,7 +20,7 @@ class Settings(BaseSettings):
model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore")
anthropic_api_key: str | None = Field(default=None, validation_alias="ANTHROPIC_API_KEY")
claude_model: str = Field(default=os.getenv("MULTI_DOC_CLAUDE_MODEL", "claude-3-5-sonnet-20241022"))
claude_model: str = Field(default=os.getenv("MULTI_DOC_CLAUDE_MODEL", os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest")))
claude_max_input_tokens: int = Field(default=200_000)
claude_max_output_tokens: int = Field(default=16_000)
@ -37,6 +37,27 @@ class Settings(BaseSettings):
job_retention_days: int = Field(default=30)
# Qwen2.5-VL API configuration
qwen_api_key: str | None = Field(default=None, validation_alias="QWEN_API_KEY")
qwen_api_url: str = Field(default=os.getenv("QWEN_API_URL", "https://api.example.com/v1/chat/completions"))
qwen_model: str = Field(default=os.getenv("QWEN_MODEL", "qwen2.5-vl"))
# DoWhy configuration
dowhy_enabled: bool = Field(default=True)
dowhy_confidence_threshold: float = Field(default=0.05)
# Embedding configuration
embedding_model: str = Field(default="sentence-transformers/all-MiniLM-L6-v2")
embedding_dimension: int = Field(default=384)
# Qdrant configuration
qdrant_url: str = Field(default=os.getenv("QDRANT_URL", "http://localhost:6333"))
qdrant_collection_name: str = Field(default="kg_embeddings")
qdrant_vector_size: int = Field(default=384)
# Report generation configuration
report_format: str = Field(default="markdown")
def ensure_storage_dirs(self) -> None:
(self.storage_root / "jobs").mkdir(parents=True, exist_ok=True)
(self.storage_root / "uploads").mkdir(parents=True, exist_ok=True)

View File

@ -1,168 +0,0 @@
from __future__ import annotations
import logging
from pathlib import Path
from typing import List
logger = logging.getLogger(__name__)
# Try to import unstructured, but fall back to alternatives if not available
try:
from unstructured.partition.auto import partition
HAS_UNSTRUCTURED = True
except ImportError:
HAS_UNSTRUCTURED = False
logger.warning("unstructured not available, will use fallback extractors")
# Fallback extractors
try:
import pdfplumber
HAS_PDFPLUMBER = True
except ImportError:
HAS_PDFPLUMBER = False
try:
from docx import Document as DocxDocument
HAS_DOCX = True
except ImportError:
HAS_DOCX = False
try:
from pptx import Presentation
HAS_PPTX = True
except ImportError:
HAS_PPTX = False
# Image processing libraries
try:
from PIL import Image
import pytesseract
HAS_OCR = True
except ImportError:
HAS_OCR = False
logger.warning("OCR libraries not available, image extraction will be limited")
def extract_text(path: Path) -> str:
"""
Extract text from a file using multiple strategies.
Falls back through: unstructured -> format-specific -> plain text read.
"""
suffix = path.suffix.lower()
# Validate PDF file before processing
if suffix == ".pdf":
# Quick validation: check if file starts with PDF magic bytes
try:
with path.open("rb") as f:
header = f.read(4)
if header != b"%PDF":
raise ValueError(
f"File {path.name} does not appear to be a valid PDF. "
f"PDF files must start with '%PDF' magic bytes. "
f"Got: {header[:20] if len(header) > 0 else 'empty file'}"
)
except Exception as exc:
if isinstance(exc, ValueError):
raise
logger.warning("Could not validate PDF header: %s", exc)
# Image files - return empty text (will be processed directly with Claude Vision)
# We skip OCR and send images directly to Claude Vision API
if suffix in {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}:
logger.info("Image file detected: %s. Will be processed directly with Claude Vision (no OCR)", path.name)
# Return empty string - images will be handled separately in pipeline
return ""
# Plain text files - direct read
if suffix in {".txt", ".md", ".json", ".xml", ".html", ".csv"}:
try:
return path.read_text(encoding="utf-8", errors="ignore")
except Exception as exc:
logger.warning("Failed to read %s as text: %s", path, exc)
raise
# Try unstructured first (if available)
if HAS_UNSTRUCTURED:
try:
elements = partition(filename=str(path))
lines: List[str] = []
for element in elements:
text = getattr(element, "text", None)
if text:
lines.append(text.strip())
if lines:
logger.info("Extracted %d lines using unstructured", len(lines))
return "\n".join(lines)
except Exception as exc:
logger.warning("unstructured extraction failed for %s: %s", path, exc)
# Continue to fallback methods
# Fallback: PDF with pdfplumber
if suffix == ".pdf" and HAS_PDFPLUMBER:
try:
with pdfplumber.open(path) as pdf:
text_parts = []
for page in pdf.pages:
page_text = page.extract_text()
if page_text:
text_parts.append(page_text)
if text_parts:
logger.info("Extracted PDF using pdfplumber")
return "\n".join(text_parts)
except Exception as exc:
logger.warning("pdfplumber extraction failed for %s: %s", path, exc)
# Fallback: DOCX
if suffix == ".docx" and HAS_DOCX:
try:
doc = DocxDocument(path)
paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
if paragraphs:
logger.info("Extracted DOCX using python-docx")
return "\n".join(paragraphs)
except Exception as exc:
logger.warning("python-docx extraction failed for %s: %s", path, exc)
# Fallback: PPTX
if suffix in {".pptx", ".ppt"} and HAS_PPTX:
try:
prs = Presentation(path)
text_parts = []
for slide in prs.slides:
for shape in slide.shapes:
if hasattr(shape, "text") and shape.text:
text_parts.append(shape.text.strip())
if text_parts:
logger.info("Extracted PPTX using python-pptx")
return "\n".join(text_parts)
except Exception as exc:
logger.warning("python-pptx extraction failed for %s: %s", path, exc)
# Last resort: try to read as text anyway, but validate it's readable
try:
content = path.read_text(encoding="utf-8", errors="ignore")
if content.strip():
# Check if content is actually readable text (not binary data)
# Simple heuristic: if >30% of characters are printable, consider it text
printable_chars = sum(1 for c in content if c.isprintable() or c.isspace())
total_chars = len(content)
if total_chars > 0 and printable_chars / total_chars > 0.3:
logger.warning("Read %s as plain text (may contain binary data)", path)
return content
else:
logger.error("Content from %s appears to be binary data, cannot extract text", path)
raise ValueError(f"File {path} appears to be binary or corrupted. Cannot extract readable text.")
except Exception as exc:
if isinstance(exc, ValueError):
raise
logger.warning("Failed to read %s as text: %s", path, exc)
# If all else fails, raise an error
raise ValueError(
f"Could not extract text from {path}. "
f"File type may not be supported, file may be corrupted, or dependencies are missing. "
f"Supported formats: PDF, DOCX, PPTX, XLSX, TXT, MD, JSON, XML, HTML, CSV, PNG, JPG, JPEG (with OCR)"
)

View File

@ -0,0 +1,320 @@
from __future__ import annotations
import logging
import re
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional
logger = logging.getLogger(__name__)
try:
import fitz # PyMuPDF
HAS_PYMUPDF = True
except ImportError:
HAS_PYMUPDF = False
logger.warning("PyMuPDF not available")
try:
from docx import Document as DocxDocument
HAS_DOCX = True
except ImportError:
HAS_DOCX = False
logger.warning("python-docx not available")
try:
from pptx import Presentation
HAS_PPTX = True
except ImportError:
HAS_PPTX = False
logger.warning("python-pptx not available")
try:
import pandas as pd
HAS_PANDAS = True
except ImportError:
HAS_PANDAS = False
logger.warning("pandas not available")
@dataclass
class ExtractedText:
"""Structured text extraction with context."""
text: str
page_number: int
metadata: dict
context: Optional[str] = None # Surrounding context
def extract_text_with_context(path: Path) -> List[ExtractedText]:
"""
Extract text from PDF using PyMuPDF with page-level context.
Returns structured text with metadata.
"""
if not HAS_PYMUPDF:
raise ImportError("PyMuPDF is required for text extraction")
if not path.exists():
raise FileNotFoundError(f"File not found: {path}")
if path.suffix.lower() != ".pdf":
# For non-PDF files, fall back to simple text reading
try:
text = path.read_text(encoding="utf-8", errors="ignore")
return [ExtractedText(
text=text,
page_number=1,
metadata={"file_type": path.suffix, "filename": path.name},
context=None
)]
except Exception as exc:
logger.warning("Failed to read %s as text: %s", path, exc)
raise
extracted_pages: List[ExtractedText] = []
try:
doc = fitz.open(path)
for page_num in range(len(doc)):
page = doc[page_num]
# Extract text
text = page.get_text()
# Extract metadata
metadata = {
"page_number": page_num + 1,
"page_count": len(doc),
"filename": path.name,
"file_type": "pdf",
"page_rect": {
"width": page.rect.width,
"height": page.rect.height
}
}
# Extract context (surrounding pages for better understanding)
context = None
if page_num > 0:
prev_page = doc[page_num - 1]
prev_text = prev_page.get_text()[:500] # Last 500 chars of previous page
context = f"Previous page context: {prev_text}"
if text.strip():
extracted_pages.append(ExtractedText(
text=text,
page_number=page_num + 1,
metadata=metadata,
context=context
))
doc.close()
logger.info("Extracted text from %d pages in %s", len(extracted_pages), path.name)
return extracted_pages
except Exception as exc:
logger.exception("Failed to extract text from PDF %s: %s", path, exc)
raise
def extract_text_from_docx(path: Path) -> str:
"""
Extract text from DOCX file using python-docx.
Reads paragraphs and tables as per README Step 2.2b.
"""
if not HAS_DOCX:
raise ImportError("python-docx is required for DOCX extraction")
try:
doc = DocxDocument(path)
text_parts = []
# Extract paragraphs
for paragraph in doc.paragraphs:
if paragraph.text.strip():
text_parts.append(paragraph.text.strip())
# Extract tables
for table in doc.tables:
table_text = []
for row in table.rows:
row_text = []
for cell in row.cells:
if cell.text.strip():
row_text.append(cell.text.strip())
if row_text:
table_text.append(" | ".join(row_text))
if table_text:
text_parts.append("\n".join(table_text))
result = "\n\n".join(text_parts)
logger.info("Extracted %d characters from DOCX %s", len(result), path.name)
return result
except Exception as exc:
logger.exception("Failed to extract text from DOCX %s: %s", path, exc)
raise
def extract_text_from_pptx(path: Path) -> str:
"""
Extract text from PPTX file using python-pptx.
Reads slides, titles, and notes as per README Step 2.2c.
"""
if not HAS_PPTX:
raise ImportError("python-pptx is required for PPTX extraction")
try:
prs = Presentation(path)
text_parts = []
for slide_num, slide in enumerate(prs.slides, 1):
slide_text = []
# Extract slide title
if slide.shapes.title and slide.shapes.title.text:
slide_text.append(f"Slide {slide_num} Title: {slide.shapes.title.text.strip()}")
# Extract content from shapes
for shape in slide.shapes:
if hasattr(shape, "text") and shape.text.strip():
# Skip title (already extracted)
if not (slide.shapes.title and shape == slide.shapes.title):
slide_text.append(shape.text.strip())
# Extract notes (if available)
if hasattr(slide, "notes_slide") and slide.notes_slide:
notes_text = ""
for shape in slide.notes_slide.shapes:
if hasattr(shape, "text") and shape.text.strip():
notes_text += shape.text.strip() + " "
if notes_text.strip():
slide_text.append(f"Notes: {notes_text.strip()}")
if slide_text:
text_parts.append("\n".join(slide_text))
result = "\n\n".join(text_parts)
logger.info("Extracted %d characters from PPTX %s (%d slides)",
len(result), path.name, len(prs.slides))
return result
except Exception as exc:
logger.exception("Failed to extract text from PPTX %s: %s", path, exc)
raise
def extract_text_from_spreadsheet(path: Path) -> str:
"""
Extract text from CSV/XLSX file using pandas.
Reads rows and columns, converts to text representation as per README Step 2.2d.
"""
if not HAS_PANDAS:
raise ImportError("pandas is required for spreadsheet extraction")
try:
suffix = path.suffix.lower()
text_parts = []
if suffix == ".csv":
df = pd.read_csv(path, encoding="utf-8", encoding_errors="ignore")
elif suffix in {".xlsx", ".xls"}:
# Read first sheet by default
df = pd.read_excel(path, engine="openpyxl" if suffix == ".xlsx" else None)
else:
raise ValueError(f"Unsupported spreadsheet format: {suffix}")
# Convert DataFrame to text representation
# Add column headers
headers = " | ".join(str(col) for col in df.columns)
text_parts.append(f"Columns: {headers}")
# Add rows (limit to first 1000 rows to avoid huge output)
max_rows = min(1000, len(df))
for idx, row in df.head(max_rows).iterrows():
row_values = " | ".join(str(val) if pd.notna(val) else "" for val in row)
text_parts.append(f"Row {idx + 1}: {row_values}")
if len(df) > max_rows:
text_parts.append(f"... ({len(df) - max_rows} more rows)")
result = "\n".join(text_parts)
logger.info("Extracted %d characters from spreadsheet %s (%d rows)",
len(result), path.name, len(df))
return result
except Exception as exc:
logger.exception("Failed to extract text from spreadsheet %s: %s", path, exc)
raise
def clean_text(text: str) -> str:
"""
Clean extracted text as per README Step 2.3.
- Remove extra whitespace
- Fix encoding issues
- Preserve important structure
"""
if not text:
return ""
# Fix encoding issues (remove non-printable characters except newlines and tabs)
cleaned = "".join(char for char in text if char.isprintable() or char in "\n\t\r")
# Remove extra whitespace (but preserve paragraph breaks)
# Replace multiple spaces with single space
cleaned = re.sub(r'[ \t]+', ' ', cleaned)
# Normalize line breaks (preserve double newlines for paragraphs)
cleaned = re.sub(r'\r\n', '\n', cleaned) # Windows line breaks
cleaned = re.sub(r'\r', '\n', cleaned) # Old Mac line breaks
# Preserve paragraph structure (double newlines)
# But remove excessive blank lines (more than 2 consecutive)
cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
# Remove leading/trailing whitespace from each line
lines = [line.strip() for line in cleaned.split('\n')]
cleaned = '\n'.join(lines)
# Remove leading/trailing whitespace overall
cleaned = cleaned.strip()
return cleaned
def extract_all_text(path: Path) -> str:
"""
Extract all text from a file based on type (as per README Step 2).
Routes to appropriate extractor: PDF, DOCX, PPTX, CSV/XLSX, or plain text.
"""
suffix = path.suffix.lower()
# Step 2.2a: PDF
if suffix == ".pdf" and HAS_PYMUPDF:
extracted_pages = extract_text_with_context(path)
text = "\n\n".join([page.text for page in extracted_pages])
# Step 2.2b: DOCX (Word)
elif suffix == ".docx" and HAS_DOCX:
text = extract_text_from_docx(path)
# Step 2.2c: PPTX (PowerPoint)
elif suffix in {".pptx", ".ppt"} and HAS_PPTX:
text = extract_text_from_pptx(path)
# Step 2.2d: CSV/XLSX (Spreadsheet)
elif suffix in {".csv", ".xlsx", ".xls"} and HAS_PANDAS:
text = extract_text_from_spreadsheet(path)
# Fallback: Plain text files
else:
try:
text = path.read_text(encoding="utf-8", errors="ignore")
except Exception as exc:
logger.warning("Failed to read %s as text: %s", path, exc)
raise
# Step 2.3: TEXT CLEANING
text = clean_text(text)
return text
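A quick usage sketch for the routing function above (file names are placeholders and the import path is an assumption; adjust it to wherever this module lives in the package):
```python
from pathlib import Path

# Import path is an assumption; point it at the module that defines extract_all_text.
from multi_document_upload_service.processors.text_extractor import extract_all_text

for name in ["spec.pdf", "notes.docx", "deck.pptx", "metrics.csv"]:  # placeholder files
    text = extract_all_text(Path(name))
    print(f"{name}: {len(text)} cleaned characters")
```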

View File

@ -0,0 +1,153 @@
from __future__ import annotations
import base64
import json
import logging
from pathlib import Path
from typing import Dict, List, Optional
import httpx
from ..config import get_settings
logger = logging.getLogger(__name__)
class QwenVisionClient:
"""Client for Qwen2.5-VL API to extract relationships from diagrams and ERDs."""
def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None, model: Optional[str] = None):
settings = get_settings()
self.api_key = api_key or settings.qwen_api_key
self.api_url = api_url or settings.qwen_api_url
self.model = model or settings.qwen_model
if not self.api_key:
logger.warning("Qwen API key not configured")
def extract_relationships_from_image(self, image_path: Path, source_file_id: str) -> List[Dict]:
"""
Extract relationships (entities, connections, flows) from an image using Qwen2.5-VL.
Returns list of extracted relationships.
"""
if not self.api_key:
logger.warning("Qwen API key not configured, skipping image analysis")
return []
try:
# Read and encode image
with open(image_path, "rb") as img_file:
image_data = img_file.read()
base64_image = base64.b64encode(image_data).decode("utf-8")
# Determine media type
suffix = image_path.suffix.lower()
media_type_map = {
".png": "image/png",
".jpg": "image/jpeg",
".jpeg": "image/jpeg",
".gif": "image/gif",
".webp": "image/webp",
}
media_type = media_type_map.get(suffix, "image/png")
# Prepare prompt for relationship extraction
prompt = """Analyze this diagram/ERD/image and extract all relationships, entities, and connections.
Extract:
1. Entities (boxes, nodes, components)
2. Relationships between entities (arrows, connections, flows)
3. Data flows and dependencies
4. Process flows
5. Architecture patterns
Return JSON with this structure:
[
{
"entity1": "name of first entity",
"entity2": "name of second entity",
"relationship_type": "causes|depends_on|flows_to|contains|uses",
"description": "description of the relationship",
"confidence": 0.0-1.0
}
]
Focus on cause-effect relationships, dependencies, and flows."""
# Prepare API request
payload = {
"model": self.model,
"messages": [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": f"data:{media_type};base64,{base64_image}"
}
},
{
"type": "text",
"text": prompt
}
]
}
],
"max_tokens": 4000,
"temperature": 0.0
}
headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json"
}
# Make API call
with httpx.Client(timeout=60.0) as client:
response = client.post(self.api_url, json=payload, headers=headers)
response.raise_for_status()
result = response.json()
# Parse response
content = result.get("choices", [{}])[0].get("message", {}).get("content", "")
if not content:
logger.warning("Empty response from Qwen API for image %s", image_path.name)
return []
# Extract JSON from response
json_text = content.strip()
# Try to find JSON in markdown code blocks
if "```json" in json_text:
json_text = json_text.split("```json")[1].split("```")[0].strip()
elif "```" in json_text:
json_text = json_text.split("```")[1].split("```")[0].strip()
# Parse JSON
try:
relationships = json.loads(json_text)
if not isinstance(relationships, list):
relationships = [relationships]
# Add source metadata
for rel in relationships:
rel["source_file_id"] = source_file_id
rel["source_image"] = str(image_path.name)
rel["extraction_method"] = "qwen2.5-vl"
logger.info("Extracted %d relationships from image %s using Qwen2.5-VL",
len(relationships), image_path.name)
return relationships
except json.JSONDecodeError as e:
logger.warning("Failed to parse Qwen response as JSON: %s. Content: %s",
e, content[:200])
return []
except Exception as exc:
logger.exception("Failed to extract relationships from image %s: %s", image_path, exc)
return []
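A minimal usage sketch for the client above (assumes qwen_api_key, qwen_api_url and qwen_model are configured; the diagram path is hypothetical):

from pathlib import Path

client = QwenVisionClient()
relations = client.extract_relationships_from_image(
    Path("/tmp/uploads/erd.png"),               # hypothetical diagram
    source_file_id="erd.png",
)
for rel in relations:
    print(rel.get("entity1"), "->", rel.get("entity2"), rel.get("relationship_type"))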

View File

@ -2,15 +2,16 @@ from __future__ import annotations
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional
from fastapi import BackgroundTasks, Depends, FastAPI, File, Form, HTTPException, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse
from .claude_client import ClaudeCausalExtractor
from .config import Settings, get_settings
from .jobs import JobStore
from .models import CreateJobResponse, JobGraphSummary, JobStage, JobStatusResponse
from .models import CreateJobResponse, JobGraphSummary, JobStage, JobStatusResponse, ProjectReport
from .processors.graph_writer import GraphWriter
from .storage import StorageManager
from .workflows.pipeline import JobPipeline
@ -20,8 +21,8 @@ logging.basicConfig(level=logging.INFO)
app = FastAPI(
title="Multi Document Upload Service",
version="0.1.0",
description="Processes multi-format documents to build causal knowledge graphs using Claude.",
version="0.2.0",
description="Processes multi-format documents to build knowledge graphs and generate beginner-friendly onboarding reports.",
)
@ -40,7 +41,6 @@ class ServiceContainer:
storage: StorageManager
job_store: JobStore
graph_writer: GraphWriter
claude_extractor: ClaudeCausalExtractor
pipeline: JobPipeline
@ -51,29 +51,24 @@ def get_container() -> ServiceContainer:
global _container
if _container is None:
settings = get_settings()
if not settings.anthropic_api_key:
raise HTTPException(status_code=500, detail="ANTHROPIC_API_KEY is not configured")
# Anthropic API key is only needed for report generation, not required at startup
# if not settings.anthropic_api_key:
# raise HTTPException(status_code=500, detail="ANTHROPIC_API_KEY is not configured")
storage = StorageManager(settings.storage_root)
job_store = JobStore(settings.storage_root)
graph_writer = GraphWriter(settings.neo4j_uri, settings.neo4j_user, settings.neo4j_password)
claude_extractor = ClaudeCausalExtractor(
api_key=settings.anthropic_api_key,
model=settings.claude_model,
max_output_tokens=min(settings.claude_max_output_tokens, 4000),
)
pipeline = JobPipeline(
job_store=job_store,
storage=storage,
graph_writer=graph_writer,
claude_extractor=claude_extractor,
)
_container = ServiceContainer(
settings=settings,
storage=storage,
job_store=job_store,
graph_writer=graph_writer,
claude_extractor=claude_extractor,
pipeline=pipeline,
)
return _container
@ -170,14 +165,86 @@ async def get_job_graph(job_id: str, container: ServiceContainer = Depends(get_d
)
@app.get("/jobs/{job_id}/report", response_model=ProjectReport)
async def get_job_report(job_id: str, container: ServiceContainer = Depends(get_dependencies)) -> ProjectReport:
"""Get the generated beginner-friendly onboarding report."""
job_store = container.job_store
if not job_store.exists(job_id):
raise HTTPException(status_code=404, detail="Job not found")
job = job_store.get(job_id)
if job.stage != JobStage.COMPLETED:
raise HTTPException(
status_code=409,
detail="Report not ready yet. Job is still processing."
)
if not job.report:
# Check if there was an error during report generation
error_msg = "Report not found. "
if job.error:
# Check if error is specifically about report generation
if "report generation" in job.error.lower() or "claude" in job.error.lower():
error_msg = job.error
else:
error_msg += f"Error during generation: {job.error}"
else:
error_msg += "Report generation may have failed (check logs for details)."
raise HTTPException(
status_code=404,
detail=error_msg
)
return job.report
@app.get("/jobs/{job_id}/report/pdf")
async def get_job_report_pdf(job_id: str, container: ServiceContainer = Depends(get_dependencies)):
"""Download the PDF version of the onboarding report (as per README Step 7.9)."""
job_store = container.job_store
if not job_store.exists(job_id):
raise HTTPException(status_code=404, detail="Job not found")
job = job_store.get(job_id)
if job.stage != JobStage.COMPLETED:
raise HTTPException(
status_code=409,
detail="Report not ready yet. Job is still processing."
)
if not job.report:
raise HTTPException(
status_code=404,
detail="Report not found. Job may have completed without generating report."
)
# Get PDF path from report metadata
pdf_path_str = job.report.metadata.get("pdf_path")
if not pdf_path_str:
raise HTTPException(
status_code=404,
detail="PDF not available. Report may have been generated without PDF conversion."
)
pdf_path = Path(pdf_path_str)
if not pdf_path.exists():
raise HTTPException(
status_code=404,
detail="PDF file not found on server."
)
return FileResponse(
path=pdf_path,
media_type="application/pdf",
filename=f"onboarding_report_{job_id}.pdf"
)
@app.get("/health")
async def healthcheck(container: ServiceContainer = Depends(get_dependencies)):
settings = container.settings
return {
"status": "ok",
"claude_model": settings.claude_model,
"max_input_tokens_per_min": settings.claude_max_input_tokens,
"max_output_tokens_per_min": settings.claude_max_output_tokens,
"qwen_model": settings.qwen_model,
"embedding_model": settings.embedding_model,
"qdrant_url": settings.qdrant_url,
"dowhy_enabled": settings.dowhy_enabled,
}
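A minimal client-side sketch for the new report endpoints (host and port are assumptions; the job id comes from the upload response):

import httpx

BASE = "http://localhost:8000"                  # assumed host/port
job_id = "your-job-id"                          # returned when the job was created

resp = httpx.get(f"{BASE}/jobs/{job_id}/report")
if resp.status_code == 200:
    report = resp.json()                        # ProjectReport as JSON
    print(report["title"], "-", report["total_pages"], "estimated pages")
    pdf = httpx.get(f"{BASE}/jobs/{job_id}/report/pdf")
    if pdf.status_code == 200:                  # 404 if PDF conversion was skipped
        open(f"onboarding_report_{job_id}.pdf", "wb").write(pdf.content)
elif resp.status_code == 409:
    print("Job is still processing")
else:
    print("Report unavailable:", resp.json().get("detail"))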

View File

@ -10,9 +10,10 @@ from pydantic import BaseModel, Field
class JobStage(str, Enum):
RECEIVED = "received"
SAVING_FILES = "saving_files"
EXTRACTING = "extracting"
ANALYZING = "analyzing"
BUILDING_GRAPH = "building_graph"
EXTRACTING = "extracting" # PyMuPDF + Qwen2.5-VL
BUILDING_GRAPH = "building_graph" # DoWhy + Neo4j
INDEXING_VECTORS = "indexing_vectors" # Qdrant
GENERATING_REPORT = "generating_report" # Claude onboarding doc
COMPLETED = "completed"
FAILED = "failed"
@ -34,6 +35,7 @@ class CausalRelation(BaseModel):
explanation: Optional[str] = None
source_file_id: Optional[str] = None
source_snippet: Optional[str] = None
relationship_type: str = Field(default="CAUSES") # DEPENDS_ON, USES, IMPLEMENTS, etc.
metadata: Dict[str, Any] = Field(default_factory=dict)
@ -46,6 +48,7 @@ class JobRecord(BaseModel):
total_files: int = 0
processed_files: int = 0
relations: List[CausalRelation] = Field(default_factory=list)
report: Optional[ProjectReport] = None # Generated onboarding report
created_at: datetime = Field(default_factory=datetime.utcnow)
updated_at: datetime = Field(default_factory=datetime.utcnow)
error: str | None = None
@ -82,3 +85,15 @@ class JobGraphSummary(BaseModel):
edge_count: int
generated_at: datetime
class ProjectReport(BaseModel):
"""Beginner-friendly onboarding report generated from project documents."""
job_id: str
title: str = "Project Onboarding Guide"
content: str # Markdown content
sections: Dict[str, str] = Field(default_factory=dict) # Section name -> content
key_concepts: List[str] = Field(default_factory=list) # Important concepts covered
total_pages: int = 0 # Estimated pages
generated_at: datetime = Field(default_factory=datetime.utcnow)
metadata: Dict[str, Any] = Field(default_factory=dict)
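A minimal sketch of the new ProjectReport model in isolation (field values are illustrative only; model_dump_json is Pydantic v2, use .json() on v1):

report = ProjectReport(
    job_id="job-123",
    content="# Project Onboarding Guide\n\n## Overview\n...",
    sections={"Overview": "High-level summary of the project."},
    key_concepts=["API Gateway", "Payment Service"],
    total_pages=3,
)
print(report.model_dump_json(indent=2))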

View File

@ -1,24 +0,0 @@
from __future__ import annotations
from typing import Iterable, List
import tiktoken
class TextChunker:
def __init__(self, model_name: str, token_target: int = 800, overlap: int = 200):
self.encoder = tiktoken.encoding_for_model("gpt-4o") if "claude" not in model_name else tiktoken.get_encoding("cl100k_base")
self.token_target = token_target
self.overlap = overlap
def chunk(self, text: str) -> Iterable[str]:
tokens = self.encoder.encode(text)
step = max(self.token_target - self.overlap, 1)
chunks: List[str] = []
for start in range(0, len(tokens), step):
end = min(start + self.token_target, len(tokens))
chunk_tokens = tokens[start:end]
chunk_text = self.encoder.decode(chunk_tokens)
chunks.append(chunk_text)
return chunks

View File

@ -0,0 +1,187 @@
from __future__ import annotations
import logging
from typing import List, Optional
import pandas as pd
from ..config import get_settings
from ..models import CausalRelation
logger = logging.getLogger(__name__)
try:
import dowhy
from dowhy import CausalModel
HAS_DOWHY = True
except ImportError:
HAS_DOWHY = False
logger.warning("DoWhy not available")
class DoWhyAnalyzer:
"""Validate causal relationships using DoWhy Structural Causal Models."""
def __init__(self, confidence_threshold: Optional[float] = None):
if not HAS_DOWHY:
raise ImportError("DoWhy is required for causal analysis")
settings = get_settings()
self.confidence_threshold = confidence_threshold or settings.dowhy_confidence_threshold
self.enabled = settings.dowhy_enabled
def validate_relationships(
self,
relationships: List[CausalRelation],
text_data: Optional[str] = None
) -> List[CausalRelation]:
"""
Validate causal relationships using DoWhy SCM.
Filters out relationships that don't pass validation.
"""
if not self.enabled:
logger.info("DoWhy validation is disabled, returning all relationships")
return relationships
if not relationships:
return []
validated: List[CausalRelation] = []
# Group relationships by cause to build SCM
cause_groups = {}
for rel in relationships:
cause = rel.cause
if cause not in cause_groups:
cause_groups[cause] = []
cause_groups[cause].append(rel)
# Validate each group
for cause, effects in cause_groups.items():
for rel in effects:
try:
is_valid = self._validate_single_relationship(rel, relationships, text_data)
if is_valid:
# Update confidence with validation score
rel.confidence = min(rel.confidence + 0.1, 0.95) # Boost validated relationships
rel.metadata["dowhy_validated"] = True
validated.append(rel)
else:
logger.debug("DoWhy validation failed for: %s -> %s", rel.cause, rel.effect)
except Exception as exc:
logger.warning("DoWhy validation error for %s -> %s: %s",
rel.cause, rel.effect, exc)
# If validation fails, keep the relationship but mark it
rel.metadata["dowhy_validated"] = False
rel.metadata["dowhy_error"] = str(exc)
validated.append(rel)  # Keep it, but flag it as unvalidated in metadata
logger.info("DoWhy validated %d/%d relationships", len(validated), len(relationships))
return validated
def _validate_single_relationship(
self,
relationship: CausalRelation,
all_relationships: List[CausalRelation],
text_data: Optional[str] = None
) -> bool:
"""
Validate a single relationship using DoWhy.
Returns True if relationship is valid, False otherwise.
"""
try:
# Build a simple causal graph from relationships
# Extract unique variables (causes and effects)
variables = set()
for rel in all_relationships:
variables.add(rel.cause)
variables.add(rel.effect)
# Create a simple dataset for DoWhy
# Since we don't have actual data, we'll use a heuristic approach
# based on relationship frequency and structure
# Check if there's a path from cause to effect in the graph
has_path = self._check_causal_path(
relationship.cause,
relationship.effect,
all_relationships
)
if not has_path:
return False
# Additional validation: check for confounders
# If there are many relationships involving both cause and effect,
# it's more likely to be valid
related_count = sum(
1 for rel in all_relationships
if rel.cause == relationship.cause or rel.effect == relationship.effect
)
# If there are multiple relationships involving these concepts,
# it's more likely to be a valid causal relationship
if related_count >= 2:
return True
# For single relationships, use confidence threshold
return relationship.confidence >= 0.6
except Exception as exc:
logger.warning("DoWhy validation error: %s", exc)
return False
def _check_causal_path(
self,
cause: str,
effect: str,
relationships: List[CausalRelation],
max_depth: int = 3
) -> bool:
"""Check if there's a causal path from cause to effect."""
if max_depth == 0:
return False
# Direct relationship
for rel in relationships:
if rel.cause == cause and rel.effect == effect:
return True
# Indirect relationship (transitive)
for rel in relationships:
if rel.cause == cause:
# Check if rel.effect leads to the target effect
if self._check_causal_path(rel.effect, effect, relationships, max_depth - 1):
return True
return False
def build_scm_from_relationships(
self,
relationships: List[CausalRelation]
) -> Optional[CausalModel]:
"""
Build a Structural Causal Model from relationships.
This is a simplified version for text-based causal inference.
"""
if not relationships:
return None
try:
# Extract all unique variables
variables = set()
for rel in relationships:
variables.add(rel.cause)
variables.add(rel.effect)
# Create a simple adjacency matrix representation
# This is a heuristic approach since we don't have actual data
# For now, return None as building a full SCM requires actual data
# The validation uses graph-based heuristics instead
return None
except Exception as exc:
logger.warning("Failed to build SCM: %s", exc)
return None
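A minimal sketch of the validation pass above (requires the dowhy package and dowhy_enabled in settings; CausalRelation fields follow the model shown earlier):

relations = [
    CausalRelation(cause="Payment Service outage", effect="Order failures", confidence=0.7),
    CausalRelation(cause="Order failures", effect="Customer complaints", confidence=0.65),
]
analyzer = DoWhyAnalyzer()
validated = analyzer.validate_relationships(relations)
for rel in validated:
    print(rel.cause, "->", rel.effect, "validated:", rel.metadata.get("dowhy_validated"))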

View File

@ -0,0 +1,85 @@
from __future__ import annotations
import logging
from typing import List
from ..config import get_settings
logger = logging.getLogger(__name__)
try:
from sentence_transformers import SentenceTransformer
HAS_SENTENCE_TRANSFORMERS = True
except ImportError:
HAS_SENTENCE_TRANSFORMERS = False
logger.warning("sentence-transformers not available")
class Embedder:
"""Generate embeddings using sentence-transformers."""
def __init__(self, model_name: str | None = None):
if not HAS_SENTENCE_TRANSFORMERS:
raise ImportError("sentence-transformers is required for embeddings")
settings = get_settings()
self.model_name = model_name or settings.embedding_model
logger.info("Loading embedding model: %s", self.model_name)
try:
self.model = SentenceTransformer(self.model_name)
self.dimension = self.model.get_sentence_embedding_dimension()
logger.info("Loaded embedding model with dimension: %d", self.dimension)
except Exception as exc:
logger.exception("Failed to load embedding model %s: %s", self.model_name, exc)
raise
def embed_text(self, text: str) -> List[float]:
"""Generate embedding for a single text."""
if not text or not text.strip():
# Return zero vector for empty text
return [0.0] * self.dimension
try:
embedding = self.model.encode(text, normalize_embeddings=True)
return embedding.tolist()
except Exception as exc:
logger.warning("Failed to embed text: %s", exc)
return [0.0] * self.dimension
def embed_batch(self, texts: List[str], batch_size: int = 32) -> List[List[float]]:
"""Generate embeddings for a batch of texts."""
if not texts:
return []
try:
embeddings = self.model.encode(
texts,
batch_size=batch_size,
normalize_embeddings=True,
show_progress_bar=False
)
return embeddings.tolist()
except Exception as exc:
logger.warning("Failed to embed batch: %s", exc)
return [[0.0] * self.dimension for _ in texts]  # independent zero vectors for each input
def embed_relation(self, cause: str, effect: str, explanation: str | None = None) -> List[float]:
"""Generate embedding for a cause-effect relationship."""
# Combine cause, effect, and explanation into a single text
parts = [cause, "causes", effect]
if explanation:
parts.append(explanation)
text = " ".join(parts)
return self.embed_text(text)
def embed_concept(self, concept_name: str, description: str | None = None) -> List[float]:
"""Generate embedding for a concept/node."""
if description:
text = f"{concept_name}: {description}"
else:
text = concept_name
return self.embed_text(text)
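A minimal sketch of the embedding helpers above (the configured sentence-transformers model is downloaded on first use):

embedder = Embedder()                           # uses settings.embedding_model
vec = embedder.embed_relation("API Gateway", "Auth Service", "routes login requests")
print(len(vec), "dimensions")
batch = embedder.embed_batch(["payment flow", "notification pipeline"])
print(len(batch), "vectors")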

View File

@ -0,0 +1,253 @@
from __future__ import annotations
import json
import logging
import re
from typing import Dict, List, Set
from anthropic import Anthropic, BadRequestError
from ..config import get_settings
from ..models import CausalRelation
logger = logging.getLogger(__name__)
class EntityResolver:
"""
Resolve entity mentions using Claude AI as per README Stage 4.
Identifies that different mentions refer to the same entity.
"""
def __init__(self):
settings = get_settings()
self.api_key = settings.anthropic_api_key
self.model = settings.claude_model
self.max_output_tokens = settings.claude_max_output_tokens
if not self.api_key:
logger.warning("ANTHROPIC_API_KEY not set - Entity resolution will be skipped")
self.client = None
else:
try:
self.client = Anthropic(api_key=self.api_key)
logger.info("EntityResolver initialized with Claude AI")
except Exception as e:
logger.warning("Failed to initialize Claude AI for entity resolution: %s", e)
self.client = None
def resolve_entities(self, relations: List[CausalRelation]) -> Dict[str, Dict]:
"""
Resolve entity mentions across all documents as per README Step 4.
Step 4.1: Collect all entities
Step 4.2: Group by entity type
Step 4.3: AI-powered resolution (Claude API)
Step 4.4: Create canonical names
Returns mapping: canonical_name -> {mentions, type, role, confidence}
"""
if not self.client:
logger.info("Entity resolution skipped (Claude AI not available)")
return {}
if not relations:
return {}
# Step 4.1: COLLECT ALL ENTITIES
all_mentions: Set[str] = set()
for rel in relations:
all_mentions.add(rel.cause.strip())
all_mentions.add(rel.effect.strip())
if not all_mentions:
return {}
logger.info("Collecting %d entity mentions for resolution", len(all_mentions))
# Step 4.2: GROUP BY ENTITY TYPE (simple heuristic)
people_mentions = []
project_mentions = []
team_mentions = []
other_mentions = []
for mention in all_mentions:
mention_lower = mention.lower()
if any(word in mention_lower for word in ["team", "department", "group", "division"]):
team_mentions.append(mention)
elif any(word in mention_lower for word in ["project", "system", "application", "platform"]):
project_mentions.append(mention)
elif len(mention.split()) <= 3 and not any(char.isdigit() for char in mention):
# Likely a person name (short, no numbers)
people_mentions.append(mention)
else:
other_mentions.append(mention)
# Step 4.3: AI-POWERED RESOLUTION (Claude API)
resolved_entities = {}
# Resolve people
if people_mentions:
people_resolved = self._resolve_with_claude(people_mentions, "Person")
resolved_entities.update(people_resolved)
# Resolve projects
if project_mentions:
projects_resolved = self._resolve_with_claude(project_mentions, "Project")
resolved_entities.update(projects_resolved)
# Resolve teams
if team_mentions:
teams_resolved = self._resolve_with_claude(team_mentions, "Team")
resolved_entities.update(teams_resolved)
# Resolve others
if other_mentions:
others_resolved = self._resolve_with_claude(other_mentions, "Entity")
resolved_entities.update(others_resolved)
logger.info("Resolved %d entities from %d mentions", len(resolved_entities), len(all_mentions))
return resolved_entities
def _resolve_with_claude(self, mentions: List[str], entity_type: str) -> Dict[str, Dict]:
"""Use Claude AI to resolve entity mentions."""
if not self.client or not mentions:
return {}
try:
system_prompt = """You are an expert at entity resolution. Your task is to identify which mentions refer to the same real-world entity.
Analyze the given list of entity mentions and group them by the actual entity they refer to.
Return a JSON object where:
- Key: Canonical name (best/most complete name)
- Value: Object with:
- "mentions": List of all mentions that refer to this entity
- "type": Entity type (Person, Project, Team, etc.)
- "role": Role or description (if applicable)
- "confidence": Confidence score (0.0 to 1.0)
Example:
{
"John Smith": {
"mentions": ["John", "J. Smith", "John Smith", "Smith"],
"type": "Person",
"role": "Project Lead",
"confidence": 0.95
},
"Project Alpha": {
"mentions": ["Project Alpha", "Alpha", "The Alpha Project"],
"type": "Project",
"role": null,
"confidence": 0.90
}
}
Be thorough and group all related mentions together."""
user_prompt = f"""Analyze these {entity_type} entity mentions and resolve which ones refer to the same entity:
{json.dumps(mentions, indent=2)}
Return a JSON object mapping canonical names to their resolved mentions."""
message = self.client.messages.create(
model=self.model,
max_tokens=self.max_output_tokens,
temperature=0.2, # Lower temperature for more consistent resolution
system=system_prompt,
messages=[{"role": "user", "content": user_prompt}]
)
response_text = "".join(
block.text for block in message.content
if hasattr(block, "text")
)
if not response_text:
logger.warning("Empty response from Claude for entity resolution")
return {}
# Parse JSON response
try:
json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
if json_match:
json_text = json_match.group(0)
else:
json_text = response_text
resolved = json.loads(json_text)
# Validate and structure the response
result = {}
for canonical_name, entity_data in resolved.items():
if isinstance(entity_data, dict):
result[canonical_name] = {
"mentions": entity_data.get("mentions", [canonical_name]),
"type": entity_data.get("type", entity_type),
"role": entity_data.get("role"),
"confidence": float(entity_data.get("confidence", 0.85))
}
else:
# Fallback if structure is different
result[canonical_name] = {
"mentions": [canonical_name] if isinstance(entity_data, str) else entity_data,
"type": entity_type,
"role": None,
"confidence": 0.8
}
return result
except json.JSONDecodeError as e:
logger.warning("Failed to parse Claude response as JSON: %s. Response: %s",
e, response_text[:500])
return {}
except BadRequestError as e:
logger.warning("Claude API error during entity resolution: %s", e)
return {}
except Exception as e:
logger.warning("Entity resolution failed: %s", e)
return {}
def apply_resolution_to_relations(
self,
relations: List[CausalRelation],
resolved_entities: Dict[str, Dict]
) -> List[CausalRelation]:
"""
Apply entity resolution to relationships.
Replace mentions with canonical names.
"""
if not resolved_entities:
return relations
# Create reverse mapping: mention -> canonical_name
mention_to_canonical: Dict[str, str] = {}
for canonical_name, entity_data in resolved_entities.items():
mentions = entity_data.get("mentions", [])
for mention in mentions:
mention_to_canonical[mention.lower()] = canonical_name
# Update relations with canonical names
updated_relations = []
for rel in relations:
# Resolve cause
cause_lower = rel.cause.strip().lower()
if cause_lower in mention_to_canonical:
rel.cause = mention_to_canonical[cause_lower]
# Resolve effect
effect_lower = rel.effect.strip().lower()
if effect_lower in mention_to_canonical:
rel.effect = mention_to_canonical[effect_lower]
# Store resolution info in metadata
rel.metadata["entity_resolved"] = True
updated_relations.append(rel)
logger.info("Applied entity resolution to %d relationships", len(updated_relations))
return updated_relations
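A minimal sketch of the resolution flow above (requires ANTHROPIC_API_KEY; without it resolve_entities returns an empty mapping and the relations pass through unchanged; the sample relation is illustrative):

resolver = EntityResolver()
relations = [CausalRelation(cause="J. Smith", effect="Project Alpha delay", confidence=0.8)]
resolved = resolver.resolve_entities(relations)
relations = resolver.apply_resolution_to_relations(relations, resolved)
for rel in relations:
    print(rel.cause, "->", rel.effect)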

View File

@ -1,38 +1,65 @@
from __future__ import annotations
import json
import logging
from typing import Iterable
import re
from typing import Dict, Iterable, List, Optional
from anthropic import Anthropic, BadRequestError
from neo4j import GraphDatabase, Transaction
from ..config import get_settings
from ..models import CausalRelation
logger = logging.getLogger(__name__)
MERGE_QUERY = """
MERGE (cause:Concept {name: $cause})
ON CREATE SET cause.created_at = timestamp(), cause.lastSeen = timestamp()
ON MATCH SET cause.lastSeen = timestamp()
MERGE (effect:Concept {name: $effect})
ON CREATE SET effect.created_at = timestamp(), effect.lastSeen = timestamp()
ON MATCH SET effect.lastSeen = timestamp()
MERGE (cause)-[r:CAUSES]->(effect)
ON CREATE SET r.confidence = $confidence,
r.explanation = $explanation,
r.source_file_id = $source_file_id,
r.source_snippet = $source_snippet,
r.job_id = $job_id,
r.model = $model,
r.created_at = timestamp(),
r.updated_at = timestamp()
ON MATCH SET r.confidence = $confidence,
r.explanation = $explanation,
r.source_file_id = $source_file_id,
r.source_snippet = $source_snippet,
r.job_id = $job_id,
r.model = $model,
r.updated_at = timestamp()
# Query to create Document node
CREATE_DOCUMENT_QUERY = """
MERGE (doc:Document {filename: $filename})
ON CREATE SET doc.uploaded_at = timestamp(),
doc.file_path = $file_path,
doc.job_id = $job_id,
doc.created_at = timestamp()
ON MATCH SET doc.lastSeen = timestamp()
"""
# Query to create Entity nodes and relationship with dynamic type
CREATE_ENTITY_RELATIONSHIP_QUERY = """
MERGE (source:Entity:Concept {name: $source})
ON CREATE SET source.created_at = timestamp(),
source.lastSeen = timestamp(),
source.type = COALESCE($source_type, 'Entity')
ON MATCH SET source.lastSeen = timestamp()
MERGE (target:Entity:Concept {name: $target})
ON CREATE SET target.created_at = timestamp(),
target.lastSeen = timestamp(),
target.type = COALESCE($target_type, 'Entity')
ON MATCH SET target.lastSeen = timestamp()
WITH source, target
CALL apoc.merge.relationship(
source,
$rel_type,
{confidence: $confidence,
explanation: $explanation,
source_file_id: $source_file_id,
source_snippet: $source_snippet,
job_id: $job_id,
model: $model,
created_at: timestamp(),
updated_at: timestamp()},
{confidence: $confidence,
explanation: $explanation,
source_file_id: $source_file_id,
source_snippet: $source_snippet,
job_id: $job_id,
model: $model,
updated_at: timestamp()},
target
) YIELD rel
RETURN rel
"""
@ -43,12 +70,42 @@ class GraphWriter:
def close(self) -> None:
self._driver.close()
def write_relations(self, job_id: str, relations: Iterable[CausalRelation]) -> None:
def write_documents(self, job_id: str, files: Iterable) -> None:
"""Create Document nodes for uploaded files."""
files_list = list(files)
if not files_list:
return
logger.info("Creating %d document nodes for job %s", len(files_list), job_id)
with self._driver.session() as session:
def _write_docs(tx: Transaction) -> None:
for file_record in files_list:
try:
tx.run(
CREATE_DOCUMENT_QUERY,
filename=file_record.filename,
file_path=file_record.stored_path,
job_id=job_id
)
logger.debug("Created document node: %s", file_record.filename)
except Exception as exc:
logger.warning("Failed to create document node for %s: %s", file_record.filename, exc)
session.execute_write(_write_docs)
logger.info("Created document nodes for job %s", job_id)
def write_relations(self, job_id: str, relations: Iterable[CausalRelation], files: Iterable = None) -> None:
"""Write entities and relationships to Neo4j with multiple relationship types."""
relations_list = list(relations)
if not relations_list:
logger.warning("No relations to write for job %s", job_id)
return
# Create document nodes if files provided
if files:
self.write_documents(job_id, files)
logger.info("Writing %d relations to Neo4j for job %s", len(relations_list), job_id)
with self._driver.session() as session:
@ -58,11 +115,70 @@ class GraphWriter:
if not relation.cause or not relation.effect:
logger.warning("Skipping relation with empty cause or effect: %s -> %s", relation.cause, relation.effect)
continue
# Get relationship type (default to CAUSES for backward compatibility)
rel_type = getattr(relation, 'relationship_type', None) or "CAUSES"
# Sanitize relationship type (only allow alphanumeric and underscores)
rel_type = re.sub(r'[^A-Z0-9_]', '', rel_type.upper())
if not rel_type:
rel_type = "CAUSES"
# Infer entity types from names (simple heuristic)
source_type = self._infer_entity_type(relation.cause)
target_type = self._infer_entity_type(relation.effect)
try:
# Create source entity
tx.run("""
MERGE (source:Entity:Concept {name: $source})
ON CREATE SET source.created_at = timestamp(),
source.lastSeen = timestamp(),
source.type = $source_type
ON MATCH SET source.lastSeen = timestamp()
""",
source=relation.cause.strip(),
source_type=source_type
)
# Create target entity
tx.run("""
MERGE (target:Entity:Concept {name: $target})
ON CREATE SET target.created_at = timestamp(),
target.lastSeen = timestamp(),
target.type = $target_type
ON MATCH SET target.lastSeen = timestamp()
""",
target=relation.effect.strip(),
target_type=target_type
)
# Create relationship with dynamic type (sanitized)
query = f"""
MATCH (source:Entity {{name: $source}})
MATCH (target:Entity {{name: $target}})
MERGE (source)-[r:{rel_type}]->(target)
ON CREATE SET r.confidence = $confidence,
r.explanation = $explanation,
r.source_file_id = $source_file_id,
r.source_snippet = $source_snippet,
r.job_id = $job_id,
r.model = $model,
r.created_at = timestamp(),
r.updated_at = timestamp()
ON MATCH SET r.confidence = $confidence,
r.explanation = $explanation,
r.source_file_id = $source_file_id,
r.source_snippet = $source_snippet,
r.job_id = $job_id,
r.model = $model,
r.updated_at = timestamp()
"""
result = tx.run(
MERGE_QUERY,
cause=relation.cause.strip(),
effect=relation.effect.strip(),
query,
source=relation.cause.strip(),
target=relation.effect.strip(),
confidence=float(relation.confidence) if relation.confidence else 0.0,
explanation=relation.explanation or "",
source_file_id=relation.source_file_id or "",
@ -70,12 +186,145 @@ class GraphWriter:
job_id=job_id,
model=relation.metadata.get("model") or "",
)
# Link entities to documents if source_file_id is a filename
if relation.source_file_id and relation.source_file_id != "combined_text":
link_query = f"""
MATCH (entity:Entity {{name: $entity_name}})
MATCH (doc:Document {{filename: $filename}})
MERGE (entity)-[:EXTRACTED_FROM]->(doc)
"""
try:
tx.run(link_query, entity_name=relation.cause.strip(), filename=relation.source_file_id)
tx.run(link_query, entity_name=relation.effect.strip(), filename=relation.source_file_id)
except Exception:
pass  # Ignore if the document node doesn't exist
count += 1
logger.debug("Wrote relation: %s -> %s (confidence: %s)", relation.cause, relation.effect, relation.confidence)
logger.debug("Wrote relation: %s -[%s]-> %s (confidence: %s)",
relation.cause, rel_type, relation.effect, relation.confidence)
except Exception as exc:
logger.exception("Failed to write relation %s -> %s: %s", relation.cause, relation.effect, exc)
logger.info("Successfully wrote %d/%d relations to Neo4j", count, len(relations_list))
session.execute_write(_write)
logger.info("Persisted causal relations for job %s", job_id)
logger.info("Persisted relations for job %s", job_id)
def _infer_entity_type(self, entity_name: str) -> str:
"""Infer entity type from name (simple heuristic)."""
name_lower = entity_name.lower()
# Technology patterns
if any(tech in name_lower for tech in ['react', 'node', 'python', 'java', 'postgres', 'mysql', 'redis', 'mongodb', 'docker', 'kubernetes']):
return "Technology"
# Service patterns
if any(word in name_lower for word in ['service', 'api', 'gateway', 'auth', 'payment', 'notification']):
return "Service"
# Component patterns
if any(word in name_lower for word in ['component', 'module', 'system', 'application', 'platform']):
return "Component"
# Process patterns
if any(word in name_lower for word in ['flow', 'process', 'workflow', 'pipeline', 'procedure']):
return "Process"
# Default
return "Entity"
def query_causal_chains(
self,
job_id: str,
min_length: int = 2,
max_length: int = 4,
min_confidence: float = 0.8,
limit: int = 20
) -> List[Dict]:
"""
Query Neo4j for causal chains as per README Step 7.3.
Returns sequences of connected events.
"""
# Query for causal chains - match any relationship type
query = f"""
MATCH path = (start:Entity)-[r*{min_length}..{max_length}]->(end:Entity)
WHERE ALL(rel in relationships(path) WHERE rel.job_id = $job_id AND rel.confidence >= $min_confidence)
WITH path,
[node in nodes(path) | node.name] as chain,
[rel in relationships(path) | rel.confidence] as confidences,
[rel in relationships(path) | type(rel)] as rel_types,
[rel in relationships(path) | rel.explanation] as explanations
RETURN chain, confidences, rel_types, explanations
ORDER BY reduce(conf = 0.0, c in confidences | conf + c) DESC
LIMIT $limit
"""
try:
with self._driver.session() as session:
result = session.run(
query,
job_id=job_id,
min_confidence=min_confidence,
limit=limit
)
chains = []
for record in result:
chain = record["chain"]
confidences = record["confidences"]
rel_types = record["rel_types"]
explanations = record["explanations"]
# Calculate average confidence
avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
chains.append({
"chain": chain,
"confidences": confidences,
"rel_types": rel_types,
"explanations": explanations,
"avg_confidence": avg_confidence,
"length": len(chain) - 1
})
logger.info("Found %d causal chains for job %s", len(chains), job_id)
return chains
except Exception as exc:
logger.exception("Failed to query causal chains: %s", exc)
return []
def query_key_entities(self, job_id: str, limit: int = 20) -> List[Dict]:
"""
Query Neo4j for key entities (those involved in the most relationships) as per README Step 7.3.
"""
query = """
MATCH (e:Entity)-[r]->(target)
WHERE r.job_id = $job_id
WITH e, count(r) as relation_count, collect(DISTINCT type(r)) as rel_types
RETURN e.name as name,
e.type as type,
relation_count,
rel_types
ORDER BY relation_count DESC
LIMIT $limit
"""
try:
with self._driver.session() as session:
result = session.run(query, job_id=job_id, limit=limit)
entities = []
for record in result:
entities.append({
"name": record["name"],
"type": record.get("type", "Entity"),
"relation_count": record["relation_count"],
"relation_types": record["rel_types"]
})
logger.info("Found %d key entities for job %s", len(entities), job_id)
return entities
except Exception as exc:
logger.exception("Failed to query key entities: %s", exc)
return []
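A minimal sketch tying the writer and the new query helpers together (Neo4j credentials are illustrative; relations is the List[CausalRelation] produced by extraction):

writer = GraphWriter("bolt://localhost:7687", "neo4j", "password")
writer.write_relations("job-123", relations)
chains = writer.query_causal_chains("job-123", min_confidence=0.8)
entities = writer.query_key_entities("job-123", limit=10)
writer.close()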

View File

@ -0,0 +1,625 @@
from __future__ import annotations
import json
import logging
import re
from typing import Dict, List, Optional
from anthropic import Anthropic, BadRequestError
from ..config import get_settings
from ..models import CausalRelation
logger = logging.getLogger(__name__)
# Try to import SpaCy
try:
import spacy
from spacy.lang.en import English
HAS_SPACY = True
except ImportError:
HAS_SPACY = False
logger.warning("spacy not available - NLP detection will be skipped")
class RelationshipExtractor:
"""Extract potential cause-effect relationships from text using NLP (SpaCy) + Claude AI."""
# Causal keywords for NLP detection (Step 3.1)
CAUSAL_KEYWORDS = [
"because", "due to", "as a result", "led to", "caused", "therefore",
"consequently", "hence", "thus", "so", "since", "owing to",
"resulted in", "brought about", "gave rise to", "triggered",
"provoked", "induced", "generated", "produced", "created"
]
# Common cause-effect patterns (expanded for architecture/technical documents)
CAUSE_EFFECT_PATTERNS = [
# Direct causal patterns
(r"(\w+(?:\s+\w+){0,15})\s+causes?\s+(\w+(?:\s+\w+){0,15})", "causes"),
(r"(\w+(?:\s+\w+){0,15})\s+leads?\s+to\s+(\w+(?:\s+\w+){0,15})", "leads_to"),
(r"(\w+(?:\s+\w+){0,15})\s+results?\s+in\s+(\w+(?:\s+\w+){0,15})", "results_in"),
(r"(\w+(?:\s+\w+){0,15})\s+triggers?\s+(\w+(?:\s+\w+){0,15})", "triggers"),
(r"(\w+(?:\s+\w+){0,15})\s+produces?\s+(\w+(?:\s+\w+){0,15})", "produces"),
(r"(\w+(?:\s+\w+){0,15})\s+enables?\s+(\w+(?:\s+\w+){0,15})", "enables"),
(r"(\w+(?:\s+\w+){0,15})\s+allows?\s+(\w+(?:\s+\w+){0,15})", "allows"),
(r"(\w+(?:\s+\w+){0,15})\s+facilitates?\s+(\w+(?:\s+\w+){0,15})", "facilitates"),
# Dependency patterns
(r"(\w+(?:\s+\w+){0,15})\s+depends?\s+on\s+(\w+(?:\s+\w+){0,15})", "depends_on"),
(r"(\w+(?:\s+\w+){0,15})\s+requires?\s+(\w+(?:\s+\w+){0,15})", "requires"),
(r"(\w+(?:\s+\w+){0,15})\s+needs?\s+(\w+(?:\s+\w+){0,15})", "needs"),
(r"(\w+(?:\s+\w+){0,15})\s+relies?\s+on\s+(\w+(?:\s+\w+){0,15})", "relies_on"),
(r"(\w+(?:\s+\w+){0,15})\s+uses?\s+(\w+(?:\s+\w+){0,15})", "uses"),
(r"(\w+(?:\s+\w+){0,15})\s+utilizes?\s+(\w+(?:\s+\w+){0,15})", "utilizes"),
(r"(\w+(?:\s+\w+){0,15})\s+leverages?\s+(\w+(?:\s+\w+){0,15})", "leverages"),
# Architectural/System patterns
(r"(\w+(?:\s+\w+){0,15})\s+connects?\s+to\s+(\w+(?:\s+\w+){0,15})", "connects_to"),
(r"(\w+(?:\s+\w+){0,15})\s+communicates?\s+with\s+(\w+(?:\s+\w+){0,15})", "communicates_with"),
(r"(\w+(?:\s+\w+){0,15})\s+interacts?\s+with\s+(\w+(?:\s+\w+){0,15})", "interacts_with"),
(r"(\w+(?:\s+\w+){0,15})\s+integrates?\s+with\s+(\w+(?:\s+\w+){0,15})", "integrates_with"),
(r"(\w+(?:\s+\w+){0,15})\s+provides?\s+(\w+(?:\s+\w+){0,15})", "provides"),
(r"(\w+(?:\s+\w+){0,15})\s+supports?\s+(\w+(?:\s+\w+){0,15})", "supports"),
(r"(\w+(?:\s+\w+){0,15})\s+handles?\s+(\w+(?:\s+\w+){0,15})", "handles"),
(r"(\w+(?:\s+\w+){0,15})\s+manages?\s+(\w+(?:\s+\w+){0,15})", "manages"),
(r"(\w+(?:\s+\w+){0,15})\s+controls?\s+(\w+(?:\s+\w+){0,15})", "controls"),
(r"(\w+(?:\s+\w+){0,15})\s+processes?\s+(\w+(?:\s+\w+){0,15})", "processes"),
(r"(\w+(?:\s+\w+){0,15})\s+generates?\s+(\w+(?:\s+\w+){0,15})", "generates"),
(r"(\w+(?:\s+\w+){0,15})\s+creates?\s+(\w+(?:\s+\w+){0,15})", "creates"),
(r"(\w+(?:\s+\w+){0,15})\s+implements?\s+(\w+(?:\s+\w+){0,15})", "implements"),
(r"(\w+(?:\s+\w+){0,15})\s+delivers?\s+(\w+(?:\s+\w+){0,15})", "delivers"),
# Flow patterns
(r"(\w+(?:\s+\w+){0,15})\s+flows?\s+to\s+(\w+(?:\s+\w+){0,15})", "flows_to"),
(r"(\w+(?:\s+\w+){0,15})\s+sends?\s+to\s+(\w+(?:\s+\w+){0,15})", "sends_to"),
(r"(\w+(?:\s+\w+){0,15})\s+transmits?\s+to\s+(\w+(?:\s+\w+){0,15})", "transmits_to"),
(r"(\w+(?:\s+\w+){0,15})\s+receives?\s+from\s+(\w+(?:\s+\w+){0,15})", "receives_from"),
# Conditional patterns
(r"if\s+(\w+(?:\s+\w+){0,15}),\s+then\s+(\w+(?:\s+\w+){0,15})", "if_then"),
(r"when\s+(\w+(?:\s+\w+){0,15}),\s+(\w+(?:\s+\w+){0,15})\s+occurs?", "when_then"),
(r"(\w+(?:\s+\w+){0,15})\s+implies?\s+(\w+(?:\s+\w+){0,15})", "implies"),
(r"(\w+(?:\s+\w+){0,15})\s+ensures?\s+(\w+(?:\s+\w+){0,15})", "ensures"),
# Sequential patterns
(r"(\w+(?:\s+\w+){0,15})\s+follows?\s+(\w+(?:\s+\w+){0,15})", "follows"),
(r"(\w+(?:\s+\w+){0,15})\s+comes?\s+after\s+(\w+(?:\s+\w+){0,15})", "comes_after"),
(r"first\s+(\w+(?:\s+\w+){0,15}),\s+then\s+(\w+(?:\s+\w+){0,15})", "first_then"),
(r"(\w+(?:\s+\w+){0,15})\s+precedes?\s+(\w+(?:\s+\w+){0,15})", "precedes"),
# Containment patterns
(r"(\w+(?:\s+\w+){0,15})\s+contains?\s+(\w+(?:\s+\w+){0,15})", "contains"),
(r"(\w+(?:\s+\w+){0,15})\s+includes?\s+(\w+(?:\s+\w+){0,15})", "includes"),
(r"(\w+(?:\s+\w+){0,15})\s+consists?\s+of\s+(\w+(?:\s+\w+){0,15})", "consists_of"),
# Influence patterns
(r"(\w+(?:\s+\w+){0,15})\s+affects?\s+(\w+(?:\s+\w+){0,15})", "affects"),
(r"(\w+(?:\s+\w+){0,15})\s+impacts?\s+(\w+(?:\s+\w+){0,15})", "impacts"),
(r"(\w+(?:\s+\w+){0,15})\s+influences?\s+(\w+(?:\s+\w+){0,15})", "influences"),
]
def __init__(self):
"""Initialize NLP and Claude AI components."""
settings = get_settings()
# Initialize SpaCy NLP model (Step 3.1)
self.nlp = None
if HAS_SPACY:
try:
# Try to load English model, fallback to blank if not available
try:
self.nlp = spacy.load("en_core_web_sm")
except OSError:
logger.warning("en_core_web_sm model not found, using blank English model")
self.nlp = English()
self.nlp.add_pipe("sentencizer")
logger.info("SpaCy NLP model loaded")
except Exception as e:
logger.warning("Failed to load SpaCy model: %s", e)
self.nlp = None
# Initialize Claude AI client (Step 3.2)
self.claude_client = None
self.claude_model = settings.claude_model
self.claude_max_input_tokens = settings.claude_max_input_tokens
self.claude_max_output_tokens = settings.claude_max_output_tokens
if settings.anthropic_api_key:
try:
self.claude_client = Anthropic(api_key=settings.anthropic_api_key)
logger.info("Claude AI client initialized")
except Exception as e:
logger.warning("Failed to initialize Claude AI client: %s", e)
else:
logger.warning("ANTHROPIC_API_KEY not set - Claude AI extraction will be skipped")
def extract_from_text(self, text: str, source_file_id: str) -> List[CausalRelation]:
"""
Extract cause-effect relationships using NLP (SpaCy) + Claude AI.
Implements Step 3.1 (NLP Detection) and Step 3.2 (Claude AI Extraction).
"""
if not text or not text.strip():
return []
all_relationships: List[CausalRelation] = []
# Step 3.1: BASIC NLP DETECTION (SpaCy)
nlp_relationships = self._extract_with_nlp(text, source_file_id)
all_relationships.extend(nlp_relationships)
logger.info("NLP (SpaCy) extracted %d candidate relationships (low confidence)",
len(nlp_relationships))
# Step 3.2: AI-POWERED EXTRACTION (Claude API)
if self.claude_client:
claude_relationships = self._extract_with_claude(text, source_file_id)
all_relationships.extend(claude_relationships)
logger.info("Claude AI extracted %d relationships (high confidence)",
len(claude_relationships))
else:
logger.info("Claude AI extraction skipped (API key not configured)")
# Also run pattern matching as fallback
pattern_relationships = self._extract_with_patterns(text, source_file_id)
all_relationships.extend(pattern_relationships)
logger.info("Pattern matching extracted %d relationships", len(pattern_relationships))
# Deduplicate relationships
seen = set()
unique_relationships = []
for rel in all_relationships:
key = (rel.cause.lower().strip(), rel.effect.lower().strip())
if key not in seen:
seen.add(key)
unique_relationships.append(rel)
logger.info("Total unique relationships extracted: %d (from %d total)",
len(unique_relationships), len(all_relationships))
return unique_relationships
def _extract_with_nlp(self, text: str, source_file_id: str) -> List[CausalRelation]:
"""
Step 3.1: Basic NLP Detection using SpaCy.
Look for causal keywords and find sentences containing these patterns.
Returns potential causal relationships (low confidence).
"""
if not self.nlp:
return []
relationships: List[CausalRelation] = []
try:
# Process text with SpaCy
doc = self.nlp(text)
# Find sentences containing causal keywords
for sent in doc.sents:
sent_text = sent.text.strip()
if len(sent_text) < 10:
continue
# Check if sentence contains causal keywords
sent_lower = sent_text.lower()
has_causal_keyword = any(keyword in sent_lower for keyword in self.CAUSAL_KEYWORDS)
if has_causal_keyword:
# Try to extract cause-effect using dependency parsing
cause = None
effect = None
# Look for causal conjunctions
for token in sent:
if token.text.lower() in ["because", "due", "since", "as"]:
# Find the clause after the causal conjunction
if token.dep_ in ["mark", "prep"]:
# Try to extract cause and effect
cause_span = None
effect_span = None
# Simple heuristic: text before "because/due to" is effect, after is cause
if "because" in sent_lower or "since" in sent_lower:
parts = re.split(r'\b(because|since)\b', sent_text, flags=re.IGNORECASE)
if len(parts) >= 3:
effect = parts[0].strip()
cause = parts[2].strip()
elif "due to" in sent_lower:
parts = re.split(r'\bdue to\b', sent_text, flags=re.IGNORECASE)
if len(parts) >= 2:
effect = parts[0].strip()
cause = parts[1].strip()
if cause and effect:
# Clean up cause and effect
cause = re.sub(r'^[,\s]+|[,\s]+$', '', cause)
effect = re.sub(r'^[,\s]+|[,\s]+$', '', effect)
if len(cause) >= 3 and len(effect) >= 3:
relationships.append(CausalRelation(
cause=cause,
effect=effect,
confidence=0.5, # Low confidence for NLP
explanation=f"Extracted using NLP (SpaCy) - found causal keyword",
source_file_id=source_file_id,
source_snippet=sent_text[:200],
relationship_type="CAUSES",
metadata={
"extraction_method": "spacy_nlp",
"sentence": sent_text
}
))
except Exception as e:
logger.warning("NLP extraction failed: %s", e)
return relationships
def _extract_with_claude(self, text: str, source_file_id: str) -> List[CausalRelation]:
"""
Step 3.2: AI-Powered Extraction using Claude API.
Send full document text to Claude AI and ask it to find ALL causal relationships.
Returns high-quality causal relationships (high confidence).
"""
if not self.claude_client:
return []
relationships: List[CausalRelation] = []
try:
# Prepare prompt for Claude
system_prompt = """You are an expert at analyzing text and extracting cause-effect relationships.
Your task is to identify ALL causal relationships in the given text, including both explicit and implicit ones.
For each causal relationship, extract:
- Cause: What triggered or led to this?
- Effect: What was the result or outcome?
- Context: Additional background information
- Entities: Who or what is involved (people, teams, projects, systems)
- Confidence: How certain are you? (0.0 to 1.0)
- Source sentence: The sentence or passage where this relationship was found
- Date: When did this happen (if mentioned)
Return the results as a JSON array of objects with this structure:
[
{
"cause": "string",
"effect": "string",
"context": "string (optional)",
"entities": ["string"],
"confidence": 0.0-1.0,
"source_sentence": "string",
"date": "string (optional)"
}
]
Focus on:
- Explicit relationships ("because X, therefore Y")
- Implicit relationships (strongly implied cause-effect)
- Technical and architectural dependencies
- Business decisions and their impacts
- Process flows and sequences"""
# Truncate text to fit within token limits (rough estimate: 1 token ≈ 4 characters)
max_chars = (self.claude_max_input_tokens - 1000) * 4
truncated_text = text[:max_chars] if len(text) > max_chars else text
user_prompt = f"""Analyze the following text and extract ALL causal relationships.
Text:
{truncated_text}
Return a JSON array of causal relationships. Be thorough and find both explicit and implicit relationships."""
# Call Claude API
message = self.claude_client.messages.create(
model=self.claude_model,
max_tokens=self.claude_max_output_tokens,
temperature=0.3, # Lower temperature for more focused extraction
system=system_prompt,
messages=[
{
"role": "user",
"content": user_prompt
}
]
)
# Extract response text
content_blocks = message.content or []
response_text = "".join(
block.text for block in content_blocks
if hasattr(block, "text")
)
if not response_text:
logger.warning("Empty response from Claude AI")
return []
# Parse JSON response
try:
# Try to extract JSON from response (might have markdown code blocks)
json_match = re.search(r'\[.*\]', response_text, re.DOTALL)
if json_match:
json_text = json_match.group(0)
else:
json_text = response_text
claude_results = json.loads(json_text)
# Convert Claude results to CausalRelation objects
for result in claude_results:
cause = result.get("cause", "").strip()
effect = result.get("effect", "").strip()
context = result.get("context", "")
entities = result.get("entities", [])
confidence = float(result.get("confidence", 0.85))
source_sentence = result.get("source_sentence", "")
date = result.get("date", "")
if not cause or not effect:
continue
# Map to Neo4j relationship type (default to CAUSES)
relationship_type = "CAUSES"
explanation = context or f"Extracted by Claude AI"
if entities:
explanation += f" (Entities: {', '.join(entities)})"
relationships.append(CausalRelation(
cause=cause,
effect=effect,
confidence=min(confidence, 0.95), # Cap at 0.95
explanation=explanation,
source_file_id=source_file_id,
source_snippet=source_sentence[:200] if source_sentence else "",
relationship_type=relationship_type,
metadata={
"extraction_method": "claude_ai",
"context": context,
"entities": entities,
"date": date,
"source_sentence": source_sentence
}
))
logger.info("Claude AI successfully extracted %d relationships", len(relationships))
except json.JSONDecodeError as e:
logger.warning("Failed to parse Claude AI response as JSON: %s. Response: %s",
e, response_text[:500])
except Exception as e:
logger.warning("Error processing Claude AI response: %s", e)
except BadRequestError as e:
logger.warning("Claude API error: %s", e)
except Exception as e:
logger.warning("Claude AI extraction failed: %s", e)
return relationships
def _extract_with_patterns(self, text: str, source_file_id: str) -> List[CausalRelation]:
"""
Fallback: Pattern-based extraction (original method).
Returns candidate relationships for DoWhy validation.
"""
if not text or not text.strip():
return []
relationships: List[CausalRelation] = []
seen = set() # Avoid duplicates
# Normalize text
text = re.sub(r'\s+', ' ', text)
sentences = re.split(r'[.!?]\s+', text)
for sentence in sentences:
sentence = sentence.strip()
if len(sentence) < 10: # Skip very short sentences
continue
for pattern, rel_type in self.CAUSE_EFFECT_PATTERNS:
matches = re.finditer(pattern, sentence, re.IGNORECASE)
for match in matches:
cause = match.group(1).strip()
effect = match.group(2).strip()
# Filter out very short or very long phrases (increased limit for technical terms)
if len(cause) < 3 or len(cause) > 150:
continue
if len(effect) < 3 or len(effect) > 150:
continue
# Skip common false positives
if cause.lower() in ["this", "that", "it", "they", "we"]:
continue
if effect.lower() in ["this", "that", "it", "they", "we"]:
continue
# Create unique key
key = (cause.lower(), effect.lower())
if key in seen:
continue
seen.add(key)
# Calculate confidence based on pattern type
confidence = self._calculate_confidence(rel_type, sentence)
# Map pattern type to Neo4j relationship type (uppercase with underscores)
neo4j_rel_type = self._map_to_neo4j_relationship_type(rel_type)
relationships.append(CausalRelation(
cause=cause,
effect=effect,
confidence=confidence,
explanation=f"Extracted from text using pattern: {rel_type}",
source_file_id=source_file_id,
source_snippet=sentence[:200], # First 200 chars
relationship_type=neo4j_rel_type,
metadata={
"extraction_method": "pattern_matching",
"pattern_type": rel_type,
"sentence": sentence
}
))
logger.info("Extracted %d candidate relationships from text (source: %s)",
len(relationships), source_file_id)
return relationships
def _calculate_confidence(self, rel_type: str, sentence: str) -> float:
"""Calculate confidence score based on pattern type and sentence quality."""
base_confidence = {
"causes": 0.8,
"leads_to": 0.75,
"results_in": 0.75,
"triggers": 0.7,
"produces": 0.7,
"depends_on": 0.65,
"requires": 0.65,
"needs": 0.6,
"if_then": 0.8,
"when_then": 0.75,
"implies": 0.7,
"follows": 0.6,
"comes_after": 0.6,
"first_then": 0.7,
"enables": 0.7,
"allows": 0.65,
"facilitates": 0.65,
"relies_on": 0.65,
"uses": 0.6,
"utilizes": 0.6,
"leverages": 0.6,
"connects_to": 0.7,
"communicates_with": 0.7,
"interacts_with": 0.7,
"integrates_with": 0.7,
"provides": 0.7,
"supports": 0.7,
"handles": 0.65,
"manages": 0.65,
"controls": 0.65,
"processes": 0.65,
"generates": 0.7,
"creates": 0.7,
"implements": 0.7,
"delivers": 0.7,
"flows_to": 0.7,
"sends_to": 0.7,
"transmits_to": 0.7,
"receives_from": 0.7,
"ensures": 0.75,
"precedes": 0.6,
"contains": 0.6,
"includes": 0.6,
"consists_of": 0.6,
"affects": 0.65,
"impacts": 0.65,
"influences": 0.65,
}.get(rel_type, 0.5)
# Adjust based on sentence length (longer sentences might be more descriptive)
if len(sentence) > 50:
base_confidence += 0.05
return min(base_confidence, 0.95)
def _map_to_neo4j_relationship_type(self, pattern_type: str) -> str:
"""Map pattern type to Neo4j relationship type (uppercase with underscores)."""
# Map lowercase pattern types to Neo4j relationship types
mapping = {
"causes": "CAUSES",
"leads_to": "LEADS_TO",
"results_in": "RESULTS_IN",
"triggers": "TRIGGERS",
"produces": "PRODUCES",
"depends_on": "DEPENDS_ON",
"requires": "REQUIRES",
"needs": "NEEDS",
"relies_on": "RELIES_ON",
"uses": "USES",
"utilizes": "UTILIZES",
"leverages": "LEVERAGES",
"connects_to": "CONNECTS_TO",
"communicates_with": "COMMUNICATES_WITH",
"interacts_with": "INTERACTS_WITH",
"integrates_with": "INTEGRATES_WITH",
"provides": "PROVIDES",
"supports": "SUPPORTS",
"handles": "HANDLES",
"manages": "MANAGES",
"controls": "CONTROLS",
"processes": "PROCESSES",
"generates": "GENERATES",
"creates": "CREATES",
"implements": "IMPLEMENTS",
"delivers": "DELIVERS",
"flows_to": "FLOWS_TO",
"sends_to": "SENDS_TO",
"transmits_to": "TRANSMITS_TO",
"receives_from": "RECEIVES_FROM",
"if_then": "IF_THEN",
"when_then": "WHEN_THEN",
"implies": "IMPLIES",
"ensures": "ENSURES",
"follows": "FOLLOWS",
"comes_after": "COMES_AFTER",
"first_then": "FIRST_THEN",
"precedes": "PRECEDES",
"contains": "CONTAINS",
"includes": "INCLUDES",
"consists_of": "CONSISTS_OF",
"affects": "AFFECTS",
"impacts": "IMPACTS",
"influences": "INFLUENCES",
"enables": "ENABLES",
"allows": "ALLOWS",
"facilitates": "FACILITATES",
}
return mapping.get(pattern_type, "CAUSES") # Default to CAUSES if not found
def extract_from_qwen_results(self, qwen_results: List[Dict], source_file_id: str) -> List[CausalRelation]:
"""Convert Qwen2.5-VL extraction results to CausalRelation objects."""
relationships: List[CausalRelation] = []
for result in qwen_results:
entity1 = result.get("entity1", "").strip()
entity2 = result.get("entity2", "").strip()
rel_type = result.get("relationship_type", "").strip()
description = result.get("description", "").strip()
confidence = float(result.get("confidence", 0.7))
if not entity1 or not entity2:
continue
# Map relationship type to cause-effect
# For most types, entity1 is cause, entity2 is effect
cause = entity1
effect = entity2
# Some relationship types might need reversal
if rel_type in ["depends_on", "requires", "needs"]:
# If A depends on B, then B is the cause, A is the effect
cause, effect = effect, cause
# Map Qwen relationship type to Neo4j format
neo4j_rel_type = self._map_to_neo4j_relationship_type(rel_type.lower().replace("-", "_"))
relationships.append(CausalRelation(
cause=cause,
effect=effect,
confidence=confidence,
explanation=description or f"Extracted from diagram: {rel_type}",
source_file_id=source_file_id,
source_snippet=description,
relationship_type=neo4j_rel_type,
metadata={
"extraction_method": "qwen2.5-vl",
"relationship_type": rel_type,
"original_entity1": entity1,
"original_entity2": entity2
}
))
return relationships
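A minimal sketch of the three-stage extractor above (SpaCy heuristics, Claude, and the pattern fallback; Claude is skipped automatically when ANTHROPIC_API_KEY is unset, and the sample text is illustrative):

extractor = RelationshipExtractor()
sample = "The API Gateway depends on the Auth Service. Payment failures caused customer complaints."
relations = extractor.extract_from_text(sample, source_file_id="notes.txt")
for rel in relations:
    print(rel.cause, f"-[{rel.relationship_type}]->", rel.effect, round(rel.confidence, 2))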

View File

@ -0,0 +1,570 @@
from __future__ import annotations
import json
import logging
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Set
from anthropic import Anthropic, BadRequestError
from ..config import get_settings
from ..models import CausalRelation, ProjectReport
logger = logging.getLogger(__name__)
# Try to import PDF generation libraries
try:
import markdown
from markdown.extensions import codehilite, fenced_code, tables
HAS_MARKDOWN = True
except ImportError:
HAS_MARKDOWN = False
logger.warning("markdown library not available - PDF conversion will be limited")
try:
from weasyprint import HTML, CSS
from weasyprint.text.fonts import FontConfiguration
HAS_WEASYPRINT = True
except ImportError:
HAS_WEASYPRINT = False
logger.warning("weasyprint not available - PDF conversion will be skipped")
class ReportGenerator:
"""Generate beginner-friendly onboarding reports from knowledge graph."""
def __init__(self, api_key: str | None = None, model: str | None = None):
settings = get_settings()
self.api_key = api_key or settings.anthropic_api_key
self.model = model or settings.claude_model
self.max_output_tokens = settings.claude_max_output_tokens
if not self.api_key:
raise ValueError("Anthropic API key is required for report generation")
self.client = Anthropic(api_key=self.api_key)
def generate_onboarding_report(
self,
job_id: str,
relations: List[CausalRelation],
vector_store,
embedder,
graph_writer=None,
kg_summary: Dict | None = None
) -> ProjectReport:
"""
Generate a beginner-friendly onboarding report from the knowledge graph.
"""
logger.info("Generating onboarding report for job %s", job_id)
# Step 1: Analyze KG structure
key_concepts = self._analyze_kg_structure(relations)
# Step 2: Semantic search for different topics
overview_content = self._search_topic(
"project overview main purpose goals objectives",
vector_store, embedder, job_id, top_k=10
)
concepts_content = self._search_topic(
"core concepts definitions key terms important ideas",
vector_store, embedder, job_id, top_k=15
)
processes_content = self._search_topic(
"how system works processes flows procedures steps",
vector_store, embedder, job_id, top_k=15
)
relationships_content = self._search_topic(
"cause effect dependencies relationships connections",
vector_store, embedder, job_id, top_k=20
)
components_content = self._search_topic(
"components modules systems parts architecture",
vector_store, embedder, job_id, top_k=15
)
# Step 3: Query Neo4j for causal chains (as per README Step 7.3)
causal_chains = []
key_entities = []
if graph_writer:
try:
# Query 1: Get critical causal chains
causal_chains = graph_writer.query_causal_chains(
job_id=job_id,
min_length=2,
max_length=4,
min_confidence=0.8,
limit=20
)
logger.info("Retrieved %d causal chains from Neo4j", len(causal_chains))
# Query 2: Get key entities
key_entities = graph_writer.query_key_entities(job_id=job_id, limit=20)
logger.info("Retrieved %d key entities from Neo4j", len(key_entities))
except Exception as neo4j_exc:
logger.warning("Failed to query Neo4j: %s", neo4j_exc)
# Step 4: Organize content hierarchically
organized_content = self._organize_content(
key_concepts,
overview_content,
concepts_content,
processes_content,
relationships_content,
components_content,
causal_chains,
key_entities
)
# Step 5: Generate report with Claude
report_content = self._claude_generate_report(
job_id=job_id,
relations=relations,
organized_content=organized_content,
kg_summary=kg_summary or {}
)
# Step 6: Parse sections
sections = self._parse_sections(report_content)
# Step 7: Convert to PDF (as per README Step 7.8)
pdf_path = None
if HAS_WEASYPRINT and HAS_MARKDOWN:
try:
pdf_path = self._convert_to_pdf(report_content, job_id)
logger.info("Generated PDF report: %s", pdf_path)
except Exception as pdf_exc:
logger.warning("PDF conversion failed: %s", pdf_exc)
# Estimate pages (rough: ~500 words per page)
word_count = len(report_content.split())
estimated_pages = max(1, word_count // 500)
return ProjectReport(
job_id=job_id,
title="Project Onboarding Guide",
content=report_content,
sections=sections,
key_concepts=list(key_concepts)[:20], # Top 20 concepts
total_pages=estimated_pages,
generated_at=datetime.utcnow(),
metadata={
"total_relations": len(relations),
"total_concepts": len(key_concepts),
"causal_chains_count": len(causal_chains),
"key_entities_count": len(key_entities),
"model": self.model,
"pdf_path": str(pdf_path) if pdf_path else None
}
)
def _analyze_kg_structure(self, relations: List[CausalRelation]) -> Set[str]:
"""Identify key concepts from the knowledge graph."""
concepts = set()
for rel in relations:
concepts.add(rel.cause)
concepts.add(rel.effect)
# Identify high-degree nodes (concepts involved in many relationships)
cause_counts: Dict[str, int] = {}
effect_counts: Dict[str, int] = {}
for rel in relations:
cause_counts[rel.cause] = cause_counts.get(rel.cause, 0) + 1
effect_counts[rel.effect] = effect_counts.get(rel.effect, 0) + 1
# Key concepts are those with high degree (appear in many relationships)
all_counts = {**cause_counts, **effect_counts}
threshold = max(1, len(relations) // 10) # Keep concepts appearing in at least ~10% of relationships
key_concepts = {
concept for concept, count in all_counts.items()
if count >= threshold
}
# If threshold is too high, use top N concepts
if len(key_concepts) < 5:
sorted_concepts = sorted(all_counts.items(), key=lambda x: x[1], reverse=True)
key_concepts = {concept for concept, _ in sorted_concepts[:20]}
logger.info("Identified %d key concepts from %d relationships",
len(key_concepts), len(relations))
return key_concepts
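# Worked example of the threshold above (hypothetical numbers): with 40 relationships,
# threshold = max(1, 40 // 10) = 4, so only concepts taking part in at least 4
# relationships are kept; if that leaves fewer than 5 concepts, the 20 most-connected
# concepts are used instead.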
def _search_topic(
self,
query: str,
vector_store,
embedder,
job_id: str,
top_k: int = 10
) -> List[Dict]:
"""Search for content related to a topic."""
try:
results = vector_store.search_by_text(
query_text=query,
embedder=embedder,
job_id=job_id,
top_k=top_k
)
return results
except Exception as exc:
logger.warning("Search failed for topic '%s': %s", query, exc)
return []
def _organize_content(
self,
key_concepts: Set[str],
overview_content: List[Dict],
concepts_content: List[Dict],
processes_content: List[Dict],
relationships_content: List[Dict],
components_content: List[Dict],
causal_chains: Optional[List[Dict]] = None,
key_entities: Optional[List[Dict]] = None
) -> Dict:
"""Organize retrieved content into a structured format."""
return {
"key_concepts": list(key_concepts),
"overview": [r.get("payload", {}) for r in overview_content],
"concepts": [r.get("payload", {}) for r in concepts_content],
"processes": [r.get("payload", {}) for r in processes_content],
"relationships": [r.get("payload", {}) for r in relationships_content],
"components": [r.get("payload", {}) for r in components_content],
"causal_chains": causal_chains or [],
"key_entities": key_entities or [],
}
def _claude_generate_report(
self,
job_id: str,
relations: List[CausalRelation],
organized_content: Dict,
kg_summary: Dict
) -> str:
"""Generate report using Claude AI."""
# Build KG summary text
kg_summary_text = self._build_kg_summary(relations, organized_content)
# Build system prompt
system_prompt = """You are an expert technical writer specializing in creating beginner-friendly onboarding documentation for new team members.
Your goal is to explain complex project information in simple, clear language that anyone can understand, even without technical background.
Guidelines:
- Use simple, clear language - avoid jargon or explain it when necessary
- Use examples and analogies to make concepts relatable
- Structure information logically (basics first, then advanced)
- Make it engaging and easy to follow
- Cover all important aspects comprehensively
- Write in a friendly, welcoming tone
- Use headings, bullet points, and clear sections
- Explain "why" not just "what"
Generate a comprehensive onboarding document that helps a new team member understand the entire project."""
# Format causal chains from Neo4j
causal_chains_text = self._format_causal_chains(organized_content.get('causal_chains', []))
key_entities_text = self._format_key_entities(organized_content.get('key_entities', []))
# Build user prompt
user_prompt = f"""Generate a comprehensive, beginner-friendly onboarding document for this project.
KNOWLEDGE GRAPH SUMMARY:
{kg_summary_text}
IMPORTANT RELATIONSHIPS:
{self._format_relationships(relations[:50])}
CAUSAL CHAINS (from Knowledge Graph):
{causal_chains_text}
KEY ENTITIES (from Knowledge Graph):
{key_entities_text}
KEY CONCEPTS:
{', '.join(organized_content.get('key_concepts', [])[:30])}
REQUIRED SECTIONS:
1. Project Overview
- What is this project about?
- Main purpose and goals
- Key stakeholders or users
2. Core Concepts (Explained Simply)
- Explain each important concept in simple terms
- Why each concept matters
- How concepts relate to each other
3. How Things Work Together
- System flow (simple explanation)
- Key processes and workflows
- Dependencies explained simply
4. Important Relationships
- Cause → Effect relationships (explained in plain language)
- "When X happens, Y occurs because..."
- Visual flow if possible (describe it)
5. Key Components
- Main modules/systems/components
- What each does (beginner-friendly)
- How they interact
6. Getting Started
- Where to start learning
- What to understand first
- Recommended learning path
7. Common Questions
- FAQ based on the knowledge graph
- Answers in simple terms
Generate the complete onboarding document in Markdown format. Make it comprehensive, beginner-friendly, and easy to follow."""
try:
message = self.client.messages.create(
model=self.model,
max_tokens=self.max_output_tokens,
temperature=0.3, # Slightly creative but focused
system=system_prompt,
messages=[
{
"role": "user",
"content": user_prompt
}
]
)
content_blocks = message.content or []
report_text = "".join(
block.text for block in content_blocks
if hasattr(block, "text")
)
if not report_text:
logger.warning("Empty report generated")
return "# Project Onboarding Guide\n\nNo content available."
logger.info("Generated onboarding report (%d characters)", len(report_text))
return report_text
except BadRequestError as e:
# Handle API credit/authentication errors gracefully
error_msg = str(e)
if "credit balance" in error_msg.lower() or "too low" in error_msg.lower():
logger.error("Claude API credit balance too low. Cannot generate report.")
raise ValueError("Claude API credit balance is too low. Please add credits to your Anthropic account to generate reports.")
elif "invalid_request_error" in error_msg.lower():
logger.error("Claude API invalid request: %s", error_msg)
raise ValueError(f"Claude API request failed: {error_msg}")
else:
raise
except Exception as e:
logger.exception("Failed to generate report: %s", e)
raise
def _build_kg_summary(
self,
relations: List[CausalRelation],
organized_content: Dict
) -> str:
"""Build a text summary of the knowledge graph."""
summary_parts = [
f"Total Relationships: {len(relations)}",
f"Total Concepts: {len(organized_content.get('key_concepts', []))}",
"",
"Top Relationships:",
]
# Show top relationships by confidence
top_relations = sorted(relations, key=lambda r: r.confidence, reverse=True)[:20]
for i, rel in enumerate(top_relations, 1):
summary_parts.append(
f"{i}. {rel.cause}{rel.effect} "
f"(confidence: {rel.confidence:.2f})"
)
return "\n".join(summary_parts)
def _format_relationships(self, relations: List[CausalRelation]) -> str:
"""Format relationships for the prompt."""
if not relations:
return "No relationships found."
lines = []
for rel in relations[:50]: # Limit to 50
line = f"- {rel.cause}{rel.effect}"
if rel.explanation:
line += f" ({rel.explanation[:100]})"
lines.append(line)
return "\n".join(lines)
def _parse_sections(self, content: str) -> Dict[str, str]:
"""Parse markdown content into sections."""
sections = {}
current_section = None
current_content = []
lines = content.split('\n')
for line in lines:
# Check if it's a heading (starts with #)
if line.strip().startswith('#'):
# Save previous section
if current_section:
sections[current_section] = '\n'.join(current_content).strip()
# Start new section
current_section = line.strip().lstrip('#').strip()
current_content = [line]
else:
if current_section:
current_content.append(line)
else:
# Content before first heading
if 'introduction' not in sections:
sections['introduction'] = line
else:
sections['introduction'] += '\n' + line
# Save last section
if current_section:
sections[current_section] = '\n'.join(current_content).strip()
return sections
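# Illustrative example (hypothetical markdown): given the content
#   "Welcome!\n# Project Overview\nThe system ...\n## Goals\nShip fast"
# the parser returns:
#   {"introduction": "Welcome!",
#    "Project Overview": "# Project Overview\nThe system ...",
#    "Goals": "## Goals\nShip fast"}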
def _format_causal_chains(self, causal_chains: List[Dict]) -> str:
"""Format causal chains from Neo4j for the prompt."""
if not causal_chains:
return "No causal chains found in knowledge graph."
lines = []
for i, chain_data in enumerate(causal_chains[:20], 1): # Top 20 chains
chain = chain_data.get("chain", [])
avg_confidence = chain_data.get("avg_confidence", 0.0)
if len(chain) >= 2:
chain_text = "".join(chain)
lines.append(f"{i}. {chain_text} (confidence: {avg_confidence:.2f})")
return "\n".join(lines) if lines else "No causal chains found."
def _format_key_entities(self, key_entities: List[Dict]) -> str:
"""Format key entities from Neo4j for the prompt."""
if not key_entities:
return "No key entities found in knowledge graph."
lines = []
for entity in key_entities[:20]: # Top 20 entities
name = entity.get("name", "")
entity_type = entity.get("type", "Entity")
relation_count = entity.get("relation_count", 0)
lines.append(f"- {name} ({entity_type}): involved in {relation_count} relationships")
return "\n".join(lines) if lines else "No key entities found."
def _convert_to_pdf(self, markdown_content: str, job_id: str) -> Optional[Path]:
"""
Convert Markdown report to PDF as per README Step 7.8.
Uses markdown + weasyprint for PDF generation.
"""
if not HAS_MARKDOWN or not HAS_WEASYPRINT:
return None
try:
# Convert Markdown to HTML
html_content = markdown.markdown(
markdown_content,
extensions=['codehilite', 'fenced_code', 'tables']
)
# Add CSS styling
css_style = """
@page {
size: A4;
margin: 2cm;
}
body {
font-family: 'Georgia', serif;
line-height: 1.6;
color: #333;
}
h1, h2, h3, h4 {
color: #2c3e50;
margin-top: 1.5em;
margin-bottom: 0.5em;
}
h1 { font-size: 2em; border-bottom: 2px solid #3498db; padding-bottom: 0.3em; }
h2 { font-size: 1.5em; border-bottom: 1px solid #95a5a6; padding-bottom: 0.2em; }
h3 { font-size: 1.2em; }
code {
background-color: #f4f4f4;
padding: 2px 4px;
border-radius: 3px;
font-family: 'Courier New', monospace;
}
pre {
background-color: #f4f4f4;
padding: 1em;
border-radius: 5px;
overflow-x: auto;
}
table {
border-collapse: collapse;
width: 100%;
margin: 1em 0;
}
th, td {
border: 1px solid #ddd;
padding: 8px;
text-align: left;
}
th {
background-color: #3498db;
color: white;
}
"""
# Create full HTML document
full_html = f"""
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Project Onboarding Guide</title>
</head>
<body>
{html_content}
</body>
</html>
"""
# Generate PDF
settings = get_settings()
storage_root = Path(settings.storage_root)
reports_dir = storage_root / "reports"
reports_dir.mkdir(parents=True, exist_ok=True)
pdf_path = reports_dir / f"report_{job_id}.pdf"
HTML(string=full_html).write_pdf(
pdf_path,
stylesheets=[CSS(string=css_style)]
)
logger.info("PDF report generated: %s", pdf_path)
return pdf_path
except Exception as exc:
logger.exception("Failed to convert Markdown to PDF: %s", exc)
return None
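# Minimal usage sketch (assumes an Anthropic key is configured via get_settings() and
# that the job's relations, VectorStore, Embedder and GraphWriter already exist):
#
# generator = ReportGenerator()
# report = generator.generate_onboarding_report(
#     job_id="job-123",                 # hypothetical job id
#     relations=relations,
#     vector_store=vector_store,
#     embedder=embedder,
#     graph_writer=graph_writer,        # optional: enables the Neo4j chain/entity queries
# )
# print(report.title, report.total_pages, report.metadata.get("pdf_path"))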

View File

@ -0,0 +1,269 @@
from __future__ import annotations
import logging
from typing import Dict, List, Optional
from uuid import uuid4
from ..config import get_settings
from ..models import CausalRelation
logger = logging.getLogger(__name__)
try:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue
HAS_QDRANT = True
except ImportError:
HAS_QDRANT = False
logger.warning("qdrant-client not available")
class VectorStore:
"""Qdrant vector database client for storing KG embeddings."""
def __init__(
self,
url: str | None = None,
collection_name: str | None = None,
vector_size: int | None = None
):
if not HAS_QDRANT:
raise ImportError("qdrant-client is required for vector storage")
settings = get_settings()
self.url = url or settings.qdrant_url
self.collection_name = collection_name or settings.qdrant_collection_name
self.vector_size = vector_size or settings.qdrant_vector_size
logger.info("Connecting to Qdrant at %s", self.url)
try:
self.client = QdrantClient(url=self.url)
logger.info("Connected to Qdrant")
except Exception as exc:
logger.exception("Failed to connect to Qdrant: %s", exc)
raise
# Ensure collection exists
self._ensure_collection()
def _ensure_collection(self) -> None:
"""Create collection if it doesn't exist."""
try:
collections = self.client.get_collections()
collection_names = [col.name for col in collections.collections]
if self.collection_name not in collection_names:
logger.info("Creating Qdrant collection: %s", self.collection_name)
try:
self.client.create_collection(
collection_name=self.collection_name,
vectors_config=VectorParams(
size=self.vector_size,
distance=Distance.COSINE
)
)
logger.info("Created collection: %s", self.collection_name)
except Exception as create_exc:
# Collection might have been created by another instance
if "already exists" in str(create_exc).lower() or "409" in str(create_exc):
logger.info("Collection %s already exists (created by another instance)", self.collection_name)
else:
raise
else:
logger.debug("Collection %s already exists", self.collection_name)
except Exception as exc:
logger.exception("Failed to ensure collection: %s", exc)
raise
def store_relation(
self,
relation: CausalRelation,
embedding: List[float],
job_id: str
) -> str:
"""Store a relationship embedding in Qdrant."""
point_id = str(uuid4())
payload = {
"job_id": job_id,
"cause": relation.cause,
"effect": relation.effect,
"confidence": relation.confidence,
"source_file_id": relation.source_file_id or "",
"source_snippet": relation.source_snippet or "",
"explanation": relation.explanation or "",
}
point = PointStruct(
id=point_id,
vector=embedding,
payload=payload
)
try:
self.client.upsert(
collection_name=self.collection_name,
points=[point]
)
logger.debug("Stored relation embedding: %s -> %s", relation.cause, relation.effect)
return point_id
except Exception as exc:
logger.warning("Failed to store relation: %s", exc)
return ""
def store_concept(
self,
concept_name: str,
embedding: List[float],
job_id: str,
description: str | None = None
) -> str:
"""Store a concept/node embedding in Qdrant."""
point_id = str(uuid4())
payload = {
"job_id": job_id,
"concept_name": concept_name,
"description": description or "",
"type": "concept"
}
point = PointStruct(
id=point_id,
vector=embedding,
payload=payload
)
try:
self.client.upsert(
collection_name=self.collection_name,
points=[point]
)
logger.debug("Stored concept embedding: %s", concept_name)
return point_id
except Exception as exc:
logger.warning("Failed to store concept: %s", exc)
return ""
def search(
self,
query_embedding: List[float],
job_id: str | None = None,
top_k: int = 10,
score_threshold: float = 0.5
) -> List[Dict]:
"""Search for similar vectors in Qdrant."""
try:
# Build filter if job_id is provided
query_filter = None
if job_id:
query_filter = Filter(
must=[
FieldCondition(
key="job_id",
match=MatchValue(value=job_id)
)
]
)
# Prefer the newer query_points API (qdrant-client >= 1.10); fall back to the
# older (deprecated) search method on earlier client versions.
if hasattr(self.client, 'query_points'):
response = self.client.query_points(
collection_name=self.collection_name,
query=query_embedding,
query_filter=query_filter,
limit=top_k,
score_threshold=score_threshold
)
# query_points wraps the hits in a QueryResponse; unwrap to the list of scored points
results = response.points
elif hasattr(self.client, 'search'):
results = self.client.search(
collection_name=self.collection_name,
query_vector=query_embedding,
query_filter=query_filter,
limit=top_k,
score_threshold=score_threshold
)
else:
logger.error("QdrantClient exposes neither query_points nor search")
return []
# Convert to list of dicts
search_results = []
for result in results:
search_results.append({
"id": str(result.id),
"score": result.score,
"payload": result.payload
})
return search_results
except Exception as exc:
logger.warning("Vector search failed: %s", exc)
import traceback
logger.debug("Search error traceback: %s", traceback.format_exc())
return []
def search_by_text(
self,
query_text: str,
embedder,
job_id: str | None = None,
top_k: int = 10
) -> List[Dict]:
"""Search using text query (embeds it first)."""
query_embedding = embedder.embed_text(query_text)
return self.search(query_embedding, job_id=job_id, top_k=top_k)
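# Usage sketch (illustrative; assumes a running Qdrant instance and an Embedder exposing
# embed_relation/embed_text as used by the pipeline):
#
# store = VectorStore()
# vector = embedder.embed_relation(relation.cause, relation.effect, relation.explanation)
# store.store_relation(relation, vector, job_id="job-123")
# hits = store.search_by_text("what causes payment failures", embedder, job_id="job-123", top_k=5)
# for hit in hits:
#     print(hit["score"], hit["payload"].get("cause"), "->", hit["payload"].get("effect"))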
def delete_job_vectors(self, job_id: str) -> int:
"""Delete all vectors for a specific job."""
try:
# Collect the IDs of all points tagged with this job_id via the scroll API,
# then delete them by ID so we can report an accurate count. Qdrant can also
# delete directly by filter (see the sketch after this method). The single
# scroll call below is capped at 10,000 points; larger jobs would need a
# paginated scroll.
query_filter = Filter(
must=[
FieldCondition(
key="job_id",
match=MatchValue(value=job_id)
)
]
)
# Scroll to get all points
points, _ = self.client.scroll(
collection_name=self.collection_name,
scroll_filter=query_filter,
limit=10000 # Adjust based on expected size
)
if points:
point_ids = [str(point.id) for point in points]
self.client.delete(
collection_name=self.collection_name,
points_selector=point_ids
)
logger.info("Deleted %d vectors for job %s", len(point_ids), job_id)
return len(point_ids)
return 0
except Exception as exc:
logger.warning("Failed to delete job vectors: %s", exc)
return 0
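# Sketch of the filter-based alternative mentioned above (recent qdrant-client versions;
# not used by the pipeline, shown for reference):
#
# from qdrant_client.models import FilterSelector
# self.client.delete(
#     collection_name=self.collection_name,
#     points_selector=FilterSelector(filter=query_filter),
# )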

View File

@ -4,14 +4,19 @@ import logging
from pathlib import Path
from typing import Iterable, List
from ..claude_client import ClaudeCausalExtractor
from ..config import get_settings
from ..extractors.auto import extract_text
from ..extractors.image_extractor import extract_images_from_file
from ..extractors.pymupdf_extractor import extract_all_text, extract_text_with_context
from ..extractors.qwen_vision import QwenVisionClient
from ..jobs import JobStore
from ..models import CausalRelation, JobStage
from ..processors.chunker import TextChunker
from ..processors.dowhy_analyzer import DoWhyAnalyzer
from ..processors.embedder import Embedder
from ..processors.entity_resolver import EntityResolver
from ..processors.graph_writer import GraphWriter
from ..processors.relationship_extractor import RelationshipExtractor
from ..processors.report_generator import ReportGenerator
from ..processors.vector_store import VectorStore
from ..storage import StorageManager
logger = logging.getLogger(__name__)
@ -23,31 +28,60 @@ class JobPipeline:
job_store: JobStore,
storage: StorageManager,
graph_writer: GraphWriter,
claude_extractor: ClaudeCausalExtractor,
):
self.job_store = job_store
self.storage = storage
self.graph_writer = graph_writer
self.claude_extractor = claude_extractor
settings = get_settings()
self.chunker = TextChunker(
model_name=settings.claude_model,
token_target=settings.chunk_token_target,
overlap=settings.chunk_token_overlap,
)
# Initialize extractors
self.qwen_client = QwenVisionClient() # Only for images/diagrams
self.relationship_extractor = RelationshipExtractor() # NLP (SpaCy) + Claude AI for text (as per README)
self.entity_resolver = EntityResolver() # Claude AI entity resolution (as per README Stage 4)
# Initialize processors
try:
self.dowhy_analyzer = DoWhyAnalyzer() if settings.dowhy_enabled else None
except Exception as e:
logger.warning("DoWhy not available: %s", e)
self.dowhy_analyzer = None
try:
self.embedder = Embedder()
self.vector_store = VectorStore()
except Exception as e:
logger.warning("Vector store not available: %s", e)
self.embedder = None
self.vector_store = None
try:
self.report_generator = ReportGenerator()
except Exception as e:
logger.warning("Report generator not available: %s", e)
self.report_generator = None
def process_job(self, job_id: str, saved_files: Iterable[str]) -> None:
job = self.job_store.get(job_id)
logger.info("Processing job %s with %d files", job_id, job.total_files)
relations: List[CausalRelation] = []
all_text_content: List[str] = []
all_relations: List[CausalRelation] = []
try:
self.job_store.update(job_id, stage=JobStage.EXTRACTING, status_message="Extracting content")
# ============================================================
# STEP 1: CONTENT EXTRACTION (PyMuPDF + Qwen2.5-VL)
# ============================================================
self.job_store.update(
job_id,
stage=JobStage.EXTRACTING,
status_message="Extracting content from documents"
)
for count, file_path in enumerate(saved_files, start=1):
file_path_obj = Path(file_path)
file_record = next((f for f in job.files if f.stored_path == file_path), None)
logger.info("Processing %s", file_path_obj.name)
logger.info("Processing %s (%d/%d)", file_path_obj.name, count, job.total_files)
source_file_id = file_record.id if file_record else file_path_obj.name
suffix = file_path_obj.suffix.lower()
@ -55,27 +89,36 @@ class JobPipeline:
is_direct_image = suffix in {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}
try:
# Extract text from document (if not a direct image)
# Step 2.1: IDENTIFY FILE TYPE and route to appropriate extractor
# Step 2.2: Extract text based on file type (as per README)
text = ""
if not is_direct_image:
try:
text = extract_text(file_path_obj)
# extract_all_text() handles routing:
# - PDF → PyMuPDF (Step 2.2a)
# - DOCX → python-docx (Step 2.2b)
# - PPTX → python-pptx (Step 2.2c)
# - CSV/XLSX → pandas (Step 2.2d)
# - Text files → direct read
# Also performs Step 2.3: Text cleaning
text = extract_all_text(file_path_obj)
# Process text if available
if text and text.strip():
# Validate text is readable
# Validate text is readable (basic check)
printable_chars = sum(1 for c in text if c.isprintable() or c.isspace())
total_chars = len(text)
if total_chars > 100 and printable_chars / total_chars < 0.3:
logger.warning("Text from %s appears to be binary, skipping text processing", file_path_obj.name)
logger.warning("Text from %s appears to be binary, skipping", file_path_obj.name)
text = ""
else:
# Step 2.4: STORE EXTRACTED TEXT
all_text_content.append(text)
extracted_path = self.storage.stage_extracted_content(job_id, file_path_obj.name, text)
if file_record:
file_record.extracted_path = str(extracted_path)
logger.info("Successfully extracted %d characters from %s", len(text), file_path_obj.name)
logger.info("Extracted %d characters from %s", len(text), file_path_obj.name)
except Exception as text_exc:
logger.warning("Text extraction failed for %s: %s. Will continue with image extraction if available.", file_path_obj.name, text_exc)
logger.warning("Text extraction failed for %s: %s", file_path_obj.name, text_exc)
text = ""
# Extract images from documents (PDF, DOCX, PPTX)
@ -93,7 +136,25 @@ class JobPipeline:
extracted_images = [file_path_obj]
logger.info("Direct image upload detected: %s", file_path_obj.name)
except Exception as exc: # noqa: BLE001
# Process images with Qwen2.5-VL
if extracted_images:
for image_path in extracted_images:
try:
qwen_results = self.qwen_client.extract_relationships_from_image(
image_path, source_file_id
)
if qwen_results:
# Convert Qwen results to CausalRelation objects
qwen_relations = self.relationship_extractor.extract_from_qwen_results(
qwen_results, source_file_id
)
all_relations.extend(qwen_relations)
logger.info("Extracted %d relations from image %s using Qwen2.5-VL",
len(qwen_relations), image_path.name)
except Exception as img_exc:
logger.warning("Failed to analyze image %s with Qwen: %s", image_path, img_exc)
except Exception as exc:
logger.exception("Extraction failed for %s", file_path_obj)
if file_record:
file_record.error = str(exc)
@ -103,62 +164,188 @@ class JobPipeline:
job_id,
files=job.files,
processed_files=count,
status_message=f"Analyzing causal relations ({count}/{job.total_files})",
stage=JobStage.ANALYZING,
status_message=f"Extracting content ({count}/{job.total_files})",
)
# Process text content
if text and text.strip():
chunks = self.chunker.chunk(text)
text_relations = self.claude_extractor.analyze(chunks, source_file_id=source_file_id)
relations.extend(text_relations)
logger.info("Extracted %d relations from text in %s", len(text_relations), file_path_obj.name)
# ============================================================
# STEP 2: RELATIONSHIP EXTRACTION (NLP + Claude AI as per README)
# ============================================================
logger.info("Extracting relationships from text content using NLP (SpaCy) + Claude AI")
combined_text = "\n\n".join(all_text_content)
# Process images (extracted from documents or direct uploads)
if extracted_images:
for image_path in extracted_images:
try:
image_relations = self.claude_extractor.analyze_image(image_path, source_file_id=source_file_id)
relations.extend(image_relations)
logger.info("Extracted %d relations from image %s", len(image_relations), image_path.name)
except Exception as img_exc:
logger.warning("Failed to analyze image %s: %s", image_path, img_exc)
# Continue with other images
elif not text or not text.strip():
# No text and no images - file might be empty or unsupported
logger.warning("File %s has no extractable text or images", file_path_obj.name)
if file_record:
file_record.error = "No extractable content found (no text or images)"
if combined_text.strip():
# Extract relationships using NLP (Step 3.1) + Claude AI (Step 3.2)
# This implements the flow described in README.md
text_relations = self.relationship_extractor.extract_from_text(
combined_text,
source_file_id="combined_text"
)
all_relations.extend(text_relations)
logger.info("NLP + Claude AI extracted %d relationships from text", len(text_relations))
# Write relations to Neo4j if any were found
if relations:
self.job_store.update(job_id, status_message="Writing to knowledge graph", stage=JobStage.BUILDING_GRAPH)
# ============================================================
# STEP 3: ENTITY RESOLUTION (Claude AI as per README Stage 4)
# ============================================================
if all_relations and self.entity_resolver.client:
logger.info("Resolving entities using Claude AI")
resolved_entities = self.entity_resolver.resolve_entities(all_relations)
if resolved_entities:
# Apply resolution to relationships
all_relations = self.entity_resolver.apply_resolution_to_relations(
all_relations, resolved_entities
)
logger.info("Entity resolution completed: %d canonical entities", len(resolved_entities))
else:
logger.info("Entity resolution returned no results")
else:
if not self.entity_resolver.client:
logger.info("Entity resolution skipped (Claude AI not available)")
# ============================================================
# STEP 4: DOWHY VALIDATION
# ============================================================
if self.dowhy_analyzer and all_relations:
self.job_store.update(
job_id,
status_message="Validating relationships with DoWhy",
stage=JobStage.BUILDING_GRAPH
)
logger.info("Validating %d relationships with DoWhy", len(all_relations))
validated_relations = self.dowhy_analyzer.validate_relationships(
all_relations,
text_data=combined_text
)
all_relations = validated_relations
logger.info("DoWhy validated %d relationships", len(all_relations))
else:
if not self.dowhy_analyzer:
logger.info("DoWhy validation skipped (not available)")
self.job_store.update(
job_id,
status_message="Building knowledge graph",
stage=JobStage.BUILDING_GRAPH
)
# ============================================================
# STEP 5: WRITE TO NEO4J (Documents, Entities, Relationships)
# ============================================================
if all_relations:
try:
self.graph_writer.write_relations(job_id, relations)
logger.info("Wrote %d relations to Neo4j for job %s", len(relations), job_id)
status_message = f"Completed with {len(relations)} causal relationship(s) written to Neo4j"
# Write documents, entities, and relationships with types
self.graph_writer.write_relations(job_id, all_relations, files=job.files)
logger.info("Wrote %d relations to Neo4j for job %s", len(all_relations), job_id)
except Exception as graph_exc:
logger.exception("Failed to write relations to Neo4j for job %s: %s", job_id, graph_exc)
status_message = f"Completed with {len(relations)} relations extracted, but failed to write to Neo4j: {graph_exc}"
else:
logger.warning("Job %s completed with 0 relations - no causal relationships found", job_id)
# Check if any files failed to extract
failed_files = [f for f in job.files if f.error]
if failed_files:
status_message = f"Completed but {len(failed_files)} file(s) failed to extract. No relations found."
else:
status_message = "Completed but no causal relationships were found in the documents."
logger.exception("Failed to write relations to Neo4j: %s", graph_exc)
raise
# ============================================================
# STEP 6: VECTOR DATABASE INDEXING (Qdrant)
# ============================================================
if self.vector_store and self.embedder and all_relations:
self.job_store.update(
job_id,
status_message="Indexing knowledge graph in vector database",
stage=JobStage.INDEXING_VECTORS
)
logger.info("Indexing %d relationships in Qdrant", len(all_relations))
indexed_count = 0
for relation in all_relations:
try:
# Generate embedding for the relationship
embedding = self.embedder.embed_relation(
relation.cause,
relation.effect,
relation.explanation
)
# Store in Qdrant
self.vector_store.store_relation(relation, embedding, job_id)
indexed_count += 1
except Exception as e:
logger.warning("Failed to index relation %s -> %s: %s",
relation.cause, relation.effect, e)
# Also index concepts (nodes)
concepts = set()
for rel in all_relations:
concepts.add(rel.cause)
concepts.add(rel.effect)
for concept in concepts:
try:
embedding = self.embedder.embed_concept(concept)
self.vector_store.store_concept(concept, embedding, job_id)
except Exception as e:
logger.warning("Failed to index concept %s: %s", concept, e)
logger.info("Indexed %d relationships and %d concepts in Qdrant",
indexed_count, len(concepts))
# ============================================================
# STEP 7: GENERATE ONBOARDING REPORT
# ============================================================
if self.report_generator and self.vector_store and self.embedder:
self.job_store.update(
job_id,
status_message="Generating beginner-friendly onboarding report",
stage=JobStage.GENERATING_REPORT
)
logger.info("Generating onboarding report for job %s", job_id)
try:
kg_summary = {
"total_relations": len(all_relations),
"total_files": job.total_files,
"processed_files": job.processed_files
}
report = self.report_generator.generate_onboarding_report(
job_id=job_id,
relations=all_relations,
vector_store=self.vector_store,
embedder=self.embedder,
graph_writer=self.graph_writer, # Pass graph_writer for Neo4j queries
kg_summary=kg_summary
)
logger.info("Generated onboarding report: %d sections, %d pages",
len(report.sections), report.total_pages)
except Exception as report_exc:
logger.exception("Failed to generate report: %s", report_exc)
report = None
# Store report generation error in job metadata
report_error_msg = str(report_exc)
if "credit balance" in report_error_msg.lower() or "too low" in report_error_msg.lower():
report_error_msg = "Report generation failed: Claude API credit balance is too low. Please add credits to your Anthropic account."
self.job_store.update(
job_id,
error=f"Report generation failed: {report_error_msg}"
)
else:
logger.warning("Report generation skipped (components not available)")
report = None
# ============================================================
# FINAL UPDATE
# ============================================================
status_message = f"Completed successfully"
if all_relations:
status_message += f" with {len(all_relations)} relationships"
if report:
status_message += f" and generated onboarding report"
# Final update
self.job_store.update(
job_id,
stage=JobStage.COMPLETED,
status_message=status_message,
relations=relations,
relations=all_relations,
report=report,
processed_files=job.total_files,
)
logger.info("Job %s completed with %d relations", job_id, len(relations))
except Exception as exc: # noqa: BLE001
logger.info("Job %s completed successfully", job_id)
except Exception as exc:
logger.exception("Job %s failed: %s", job_id, exc)
self.job_store.mark_error(job_id, f"Pipeline failed: {exc}")
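# Minimal wiring sketch (assumes job_store, storage, graph_writer and claude_extractor are
# constructed elsewhere in the service, e.g. at application startup):
#
# pipeline = JobPipeline(
#     job_store=job_store,
#     storage=storage,
#     graph_writer=graph_writer,
#     claude_extractor=claude_extractor,
# )
# pipeline.process_job(job_id="job-123", saved_files=["/data/uploads/architecture.pdf"])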