newly added multi doc upload service

This commit is contained in:
Pradeep 2025-11-17 09:04:49 +05:30
parent ad2c27d793
commit 603e9b4b20
23 changed files with 3248 additions and 48 deletions

View File

@ -131,11 +131,11 @@ services:
networks:
- pipeline_network
healthcheck:
test: ["CMD", "cypher-shell", "--username", "neo4j", "--password", "password", "MATCH () RETURN count(*) as count"]
test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:7474 || exit 1"]
interval: 30s
timeout: 10s
retries: 5
start_period: 60s
start_period: 90s
# chromadb:
# image: chromadb/chroma:latest
@ -269,6 +269,7 @@ services:
- SELF_IMPROVING_GENERATOR_URL=http://self-improving-generator:8007
- AI_MOCKUP_URL=http://ai-mockup-service:8021
- AI_ANALYSIS_URL=http://ai-analysis-service:8022
- MULTI_DOCUMENT_UPLOAD_URL=http://multi-document-upload-service:8024
- UNISON_URL=http://unison:8010
- TEMPLATE_MANAGER_AI_URL=http://template-manager:8013
volumes:
@ -775,6 +776,67 @@ services:
retries: 3
start_period: 60s
restart: unless-stopped
# Multi-Document Upload Service
# =====================================
multi-document-upload-service:
build:
context: ./services/multi-document-upload-service
dockerfile: Dockerfile
container_name: pipeline_multi_document_upload
ports:
- "8024:8024"
environment:
- PORT=8024
- HOST=0.0.0.0
- ANTHROPIC_API_KEY=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
- CLAUDE_MODEL=claude-3-5-haiku-latest
# Neo4j Configuration
- NEO4J_URI=bolt://neo4j:7687
- NEO4J_USER=neo4j
- NEO4J_PASSWORD=password
- NEO4J_DATABASE=neo4j
# Storage Configuration
- STORAGE_DIR=/app/storage
# Database configurations (optional, for job tracking)
- POSTGRES_HOST=pipeline_postgres
- POSTGRES_PORT=5432
- POSTGRES_DB=dev_pipeline
- POSTGRES_USER=pipeline_admin
- POSTGRES_PASSWORD=secure_pipeline_2024
- REDIS_HOST=pipeline_redis
- REDIS_PORT=6379
- REDIS_PASSWORD=redis_secure_2024
volumes:
- multi_document_storage:/app/storage
depends_on:
neo4j:
condition: service_healthy
postgres:
condition: service_healthy
redis:
condition: service_healthy
networks:
- pipeline_network
deploy:
resources:
limits:
memory: 4G
reservations:
memory: 2G
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8024/health"]
interval: 30s
timeout: 10s
retries: 3
start_period: 60s
restart: unless-stopped
# =====================================
# Workflow Orchestration
# =====================================
@ -894,6 +956,8 @@ volumes:
driver: local
ai_analysis_temp:
driver: local
multi_document_storage:
driver: local
# =====================================
# Networks

View File

@ -13,8 +13,10 @@ import time
import hashlib
import traceback
import uuid
import re
from pathlib import Path
from typing import Dict, Any, Optional, List, Tuple
from typing import Dict, Any, Optional, List, Tuple, Set
from dataclasses import dataclass, field
from datetime import datetime
from contextlib import asynccontextmanager
@ -53,7 +55,8 @@ from ai_analyze import (
CodeQualityAnalysis,
Issue,
ModuleAnalysis,
ModuleSummary
ModuleSummary,
FileAnalysis
)
# Import enhanced analyzer (backward compatible)
@ -72,6 +75,222 @@ analyzer = None
neo4j_client: Optional[Neo4jGraphClient] = None
USE_KNOWLEDGE_GRAPH = False
CANONICAL_CHUNK_SUFFIX_RE = re.compile(r'(_part|_chunk)\d+$', re.IGNORECASE)
def get_canonical_module_name(raw_name: str) -> str:
"""Normalize chunk/module names so split chunks collapse to one canonical module."""
if not raw_name:
return "unknown"
cleaned = raw_name.strip()
canonical = CANONICAL_CHUNK_SUFFIX_RE.sub("", cleaned)
canonical = canonical.strip("_- ")
return canonical or cleaned
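# Illustrative behaviour of the canonicalization (hypothetical module names):
#   get_canonical_module_name("auth_service_part2")   -> "auth_service"
#   get_canonical_module_name("auth_service_chunk10") -> "auth_service"
#   get_canonical_module_name("")                      -> "unknown"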
def _ensure_list_of_strings(value: Any) -> List[str]:
if value is None:
return []
if isinstance(value, str):
value = value.strip()
return [value] if value else []
if isinstance(value, (list, tuple, set)):
return [str(item).strip() for item in value if item is not None and str(item).strip()]
return []
def _dedupe_preserve_order(items: List[str]) -> List[str]:
seen = set()
result = []
for item in items:
if item not in seen:
seen.add(item)
result.append(item)
return result
def sanitize_file_analysis_for_aggregation(fa: Any) -> FileAnalysis:
"""Create a lightweight, serialization-safe FileAnalysis for aggregation."""
if isinstance(fa, FileAnalysis):
path = str(fa.path) if fa.path else ""
language = fa.language or "unknown"
lines = int(fa.lines_of_code or 0)
complexity = float(fa.complexity_score or 0.0)
severity_value = fa.severity_score
severity = float(severity_value) if isinstance(severity_value, (int, float)) else 5.0
issues = _ensure_list_of_strings(fa.issues_found)
recommendations = _ensure_list_of_strings(fa.recommendations)
detailed = fa.detailed_analysis or ""
elif isinstance(fa, dict):
path = str(fa.get("path") or fa.get("file_path") or "")
language = fa.get("language") or "unknown"
lines = int(fa.get("lines_of_code") or 0)
complexity = float(fa.get("complexity_score") or 0.0)
severity_value = fa.get("severity_score")
severity = float(severity_value) if isinstance(severity_value, (int, float)) else 5.0
issues = _ensure_list_of_strings(fa.get("issues_found", []))
recommendations = _ensure_list_of_strings(fa.get("recommendations", []))
detailed = fa.get("detailed_analysis") or ""
else:
path = str(getattr(fa, "path", "") or "")
language = getattr(fa, "language", "unknown") or "unknown"
lines = int(getattr(fa, "lines_of_code", 0) or 0)
complexity = float(getattr(fa, "complexity_score", 0) or 0.0)
severity_value = getattr(fa, "severity_score", 5.0)
severity = float(severity_value) if isinstance(severity_value, (int, float)) else 5.0
issues = _ensure_list_of_strings(getattr(fa, "issues_found", []))
recommendations = _ensure_list_of_strings(getattr(fa, "recommendations", []))
detailed = getattr(fa, "detailed_analysis", "") or ""
return FileAnalysis(
path=path,
language=language,
lines_of_code=lines,
complexity_score=complexity,
issues_found=issues,
recommendations=recommendations,
detailed_analysis=detailed,
severity_score=severity
)
def merge_file_analyses(existing: FileAnalysis, new: FileAnalysis) -> FileAnalysis:
"""Merge two FileAnalysis objects for the same file path."""
severity = (
(existing.severity_score or 0) + (new.severity_score or 0)
) / 2.0 if isinstance(existing.severity_score, (int, float)) and isinstance(new.severity_score, (int, float)) else (existing.severity_score or new.severity_score or 5.0)
complexity = (
(existing.complexity_score or 0) + (new.complexity_score or 0)
) / 2.0 if isinstance(existing.complexity_score, (int, float)) and isinstance(new.complexity_score, (int, float)) else (existing.complexity_score or new.complexity_score or 0.0)
language = existing.language if existing.language and existing.language != "unknown" else new.language
issues = _ensure_list_of_strings(existing.issues_found) + _ensure_list_of_strings(new.issues_found)
recommendations = _ensure_list_of_strings(existing.recommendations) + _ensure_list_of_strings(new.recommendations)
issues = _dedupe_preserve_order(issues)
recommendations = _dedupe_preserve_order(recommendations)
detailed = existing.detailed_analysis or new.detailed_analysis or ""
return FileAnalysis(
path=existing.path or new.path,
language=language or "unknown",
lines_of_code=max(existing.lines_of_code or 0, new.lines_of_code or 0),
complexity_score=complexity,
issues_found=issues,
recommendations=recommendations,
detailed_analysis=detailed,
severity_score=severity
)
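# Merge semantics in brief: severity and complexity are averaged when both sides
# are numeric, lines_of_code takes the maximum, and issues/recommendations are
# concatenated and then de-duplicated while preserving order.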
@dataclass
class AggregatedModuleData:
canonical_name: str
original_names: Set[str] = field(default_factory=set)
chunk_ids: List[str] = field(default_factory=list)
chunk_types: Set[str] = field(default_factory=set)
file_map: Dict[str, FileAnalysis] = field(default_factory=dict)
quality_scores: List[float] = field(default_factory=list)
overviews: List[str] = field(default_factory=list)
architectures: List[str] = field(default_factory=list)
security_notes: List[str] = field(default_factory=list)
recommendations: Set[str] = field(default_factory=set)
ai_responses: List[str] = field(default_factory=list)
dependencies: Set[str] = field(default_factory=set)
metadata_records: List[Dict[str, Any]] = field(default_factory=list)
context_dependencies: Set[str] = field(default_factory=set)
class ModuleAggregationManager:
"""Collects chunk-level results and exposes aggregated module summaries."""
def __init__(self) -> None:
self._cache: Dict[str, Dict[str, AggregatedModuleData]] = {}
def reset(self, run_id: str) -> None:
self._cache[run_id] = {}
def clear(self, run_id: Optional[str]) -> None:
if run_id and run_id in self._cache:
del self._cache[run_id]
def add_chunk(
self,
run_id: str,
chunk_name: str,
chunk_id: Optional[str],
chunk_type: Optional[str],
chunk: Dict[str, Any],
chunk_analysis: Dict[str, Any],
file_analyses: List[Any],
metadata: Dict[str, Any],
ai_response: str
) -> None:
if not run_id:
return
canonical_name = get_canonical_module_name(chunk_name)
modules = self._cache.setdefault(run_id, {})
module_data = modules.get(canonical_name)
if module_data is None:
module_data = AggregatedModuleData(canonical_name=canonical_name)
modules[canonical_name] = module_data
module_data.original_names.add(chunk_name)
if chunk_id:
module_data.chunk_ids.append(chunk_id)
if chunk_type:
module_data.chunk_types.add(chunk_type)
quality_value = chunk_analysis.get('module_quality_score')
if quality_value is None:
quality_value = chunk_analysis.get('module_quality')
if isinstance(quality_value, (int, float)):
module_data.quality_scores.append(float(quality_value))
overview_text = chunk_analysis.get('module_overview')
if overview_text:
module_data.overviews.append(str(overview_text).strip())
architecture_text = chunk_analysis.get('module_architecture')
if architecture_text:
module_data.architectures.append(str(architecture_text).strip())
security_text = chunk_analysis.get('module_security_assessment')
if security_text:
module_data.security_notes.append(str(security_text).strip())
recommendations = chunk_analysis.get('module_recommendations', [])
module_data.recommendations.update(_ensure_list_of_strings(recommendations))
if ai_response:
module_data.ai_responses.append(ai_response)
module_data.dependencies.update(chunk.get('dependencies', []))
module_data.context_dependencies.update(chunk.get('context_dependencies', []))
for fa in file_analyses:
sanitized = sanitize_file_analysis_for_aggregation(fa)
if not sanitized.path:
continue
existing = module_data.file_map.get(sanitized.path)
if existing:
module_data.file_map[sanitized.path] = merge_file_analyses(existing, sanitized)
else:
module_data.file_map[sanitized.path] = sanitized
if metadata:
module_data.metadata_records.append(metadata)
def get_modules(self, run_id: str) -> Dict[str, AggregatedModuleData]:
return self._cache.get(run_id, {})
module_aggregation_manager = ModuleAggregationManager()
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Lifespan context manager for startup and shutdown events."""
@ -2105,10 +2324,14 @@ def estimate_tokens(files: List[Tuple[str, str]]) -> int:
# Rough estimate: 4 characters per token
return total_chars // 4
def split_by_token_limit(module_files: List[Tuple[str, str]], max_tokens: int = 15000) -> List[List[Tuple[str, str]]]:
def split_by_token_limit(
module_files: List[Tuple[str, str]],
max_tokens: int = 15000,
max_files: int = 12
) -> List[List[Tuple[str, str]]]:
"""Split large module into sub-chunks while preserving related files together."""
sub_chunks = []
current_chunk = []
sub_chunks: List[List[Tuple[str, str]]] = []
current_chunk: List[Tuple[str, str]] = []
current_tokens = 0
for file_path, content in module_files:
@ -2116,9 +2339,15 @@ def split_by_token_limit(module_files: List[Tuple[str, str]], max_tokens: int =
continue
file_tokens = len(content) // 4
should_split = (
current_chunk
and (
current_tokens + file_tokens > max_tokens
or len(current_chunk) >= max_files
)
)
if current_tokens + file_tokens > max_tokens and current_chunk:
# Save current chunk and start new one
if should_split:
sub_chunks.append(current_chunk)
current_chunk = [(file_path, content)]
current_tokens = file_tokens
@ -2158,6 +2387,10 @@ def find_dependencies(chunk_files: List[Tuple[str, str]], dependency_graph: Opti
# For now, return empty list - can be enhanced with actual dependency tracking
return dependencies
MAX_TOKENS_PER_CHUNK = int(os.getenv("MAX_TOKENS_PER_CHUNK", "18000"))
MAX_FILES_PER_CHUNK = int(os.getenv("MAX_FILES_PER_CHUNK", "12"))
def create_intelligent_chunks(files: List[Tuple[str, str]], dependency_graph: Optional[Dict] = None) -> List[Dict]:
"""
Group files by module/feature for semantic analysis.
@ -2192,14 +2425,15 @@ def create_intelligent_chunks(files: List[Tuple[str, str]], dependency_graph: Op
if not module_files:
continue
# Check token limit (increased for better context and fewer chunks)
# With 2000 req/min API limit, we can handle larger chunks
# Increased from 15000 to 25000 tokens for better module-level context
# Check token and file limits to keep prompts manageable for Claude
module_tokens = estimate_tokens(module_files)
MAX_TOKENS_PER_CHUNK = 25000 # Increased for more files per chunk
if module_tokens > MAX_TOKENS_PER_CHUNK:
if module_tokens > MAX_TOKENS_PER_CHUNK or len(module_files) > MAX_FILES_PER_CHUNK:
# Split large modules
sub_chunks = split_by_token_limit(module_files, MAX_TOKENS_PER_CHUNK)
sub_chunks = split_by_token_limit(
module_files,
max_tokens=MAX_TOKENS_PER_CHUNK,
max_files=MAX_FILES_PER_CHUNK
)
for i, sub_chunk in enumerate(sub_chunks):
chunks.append({
'id': f'chunk_{chunk_counter:03d}',
@ -2354,7 +2588,8 @@ def update_state_with_findings(analysis_state: Dict, chunk: Dict, chunk_analysis
Update analysis_state with findings from current chunk analysis.
Returns updated analysis_state.
"""
chunk_name = chunk.get('name', 'unknown')
raw_chunk_name = chunk.get('name', 'unknown')
chunk_name = get_canonical_module_name(raw_chunk_name)
chunk_id = chunk.get('id', 'unknown')
# Initialize state if needed
@ -2522,6 +2757,7 @@ def build_intelligent_chunk_prompt(chunk: Dict, analysis_state: Optional[Dict] =
"## RESPONSE FORMAT:",
"",
"⚠️ CRITICAL: You MUST analyze ALL files listed above. Do NOT skip any files.",
"If a file looks empty or repetitive, still return a JSON entry with notes explaining limited context.",
f"Files to analyze ({len(optimized_files)} total):",
])
for i, file_path in enumerate(file_paths_list, 1):
@ -3271,19 +3507,144 @@ async def store_chunk_analysis_in_memory(chunk: Dict, file_analyses: List, chunk
},
'file_analyses': file_analyses_data
}
metadata['dependencies'] = {
'depends_on_chunks': chunk.get('context_dependencies', []),
'raw_dependencies': chunk.get('dependencies', [])
}
# Prioritize Knowledge Graph storage
canonical_name = get_canonical_module_name(chunk_name)
module_aggregation_manager.add_chunk(
run_id=run_id,
chunk_name=chunk_name,
chunk_id=chunk.get('id'),
chunk_type=chunk_type,
chunk=chunk,
chunk_analysis=chunk_analysis or {},
file_analyses=file_analyses,
metadata=metadata,
ai_response=ai_response
)
print(f" 📦 Aggregated chunk '{chunk_name}' into canonical module '{canonical_name}'")
return canonical_name
except Exception as e:
print(f"❌ [MEMORY] Failed to store chunk analysis: {e}")
import traceback
traceback.print_exc()
return None
async def flush_module_aggregations(run_id: Optional[str], repository_id: str, session_id: Optional[str] = None) -> None:
"""Persist aggregated module data to the knowledge graph (or fallback memory)."""
if not run_id:
return
aggregated_modules = module_aggregation_manager.get_modules(run_id)
if not aggregated_modules:
print(f" [AGGREGATION] No aggregated modules to persist for run {run_id}")
return
print(f"📦 [AGGREGATION] Persisting {len(aggregated_modules)} aggregated modules for run {run_id}")
for canonical_name, module_data in aggregated_modules.items():
file_list = list(module_data.file_map.values())
if not file_list:
print(f" ⚠️ [AGGREGATION] Skipping module '{canonical_name}' (no file analyses aggregated)")
continue
total_files = len(file_list)
total_lines = sum(fa.lines_of_code or 0 for fa in file_list)
total_issues = sum(len(_ensure_list_of_strings(fa.issues_found)) for fa in file_list)
total_recommendations = sum(len(_ensure_list_of_strings(fa.recommendations)) for fa in file_list)
high_quality = len([fa for fa in file_list if isinstance(fa.severity_score, (int, float)) and fa.severity_score >= 8])
medium_quality = len([fa for fa in file_list if isinstance(fa.severity_score, (int, float)) and 5 <= fa.severity_score < 8])
low_quality = len([fa for fa in file_list if isinstance(fa.severity_score, (int, float)) and fa.severity_score < 5])
if module_data.quality_scores:
quality_score = sum(module_data.quality_scores) / max(len(module_data.quality_scores), 1)
elif total_files:
severity_sum = sum(fa.severity_score for fa in file_list if isinstance(fa.severity_score, (int, float)))
quality_score = severity_sum / total_files if total_files else 5.0
else:
quality_score = 5.0
overviews = _dedupe_preserve_order([text for text in module_data.overviews if text])
architectures = _dedupe_preserve_order([text for text in module_data.architectures if text])
security_notes = _dedupe_preserve_order([text for text in module_data.security_notes if text])
recommendations_list = _dedupe_preserve_order(list(module_data.recommendations))
module_overview = "\n\n".join(overviews)
module_architecture = "\n\n".join(architectures)
module_security = "\n\n".join(security_notes)
ai_response_blocks = _dedupe_preserve_order([text for text in module_data.ai_responses if text])
ai_response_text = "\n\n".join(ai_response_blocks) if ai_response_blocks else module_overview
aggregated_chunk_analysis = {
'module_overview': module_overview or f"Aggregated analysis for {canonical_name}",
'module_quality_score': round(quality_score, 2),
'module_architecture': module_architecture,
'module_security_assessment': module_security,
'module_recommendations': recommendations_list
}
file_analyses_for_metadata = [
{
'file_path': fa.path,
'language': fa.language,
'lines_of_code': fa.lines_of_code,
'complexity_score': fa.complexity_score,
'severity_score': fa.severity_score,
'issues_found': _ensure_list_of_strings(fa.issues_found),
'recommendations': _ensure_list_of_strings(fa.recommendations),
'detailed_analysis': fa.detailed_analysis,
}
for fa in file_list
]
metadata = {
'type': 'module_analysis',
'run_id': run_id,
'chunk_name': canonical_name,
'chunk_type': 'module',
'repository_id': repository_id,
'total_files_in_chunk': total_files,
'chunk_metrics': {
'total_issues': total_issues,
'total_recommendations': total_recommendations,
'high_quality_files': high_quality,
'medium_quality_files': medium_quality,
'low_quality_files': low_quality
},
'file_analyses': file_analyses_for_metadata,
'dependencies': {
'depends_on_chunks': sorted(module_data.context_dependencies),
'raw_dependencies': sorted(module_data.dependencies)
},
'source_chunks': sorted(module_data.original_names),
'total_lines': total_lines
}
aggregated_chunk = {
'id': module_data.chunk_ids[0] if module_data.chunk_ids else f'aggregated_{canonical_name}',
'name': canonical_name,
'priority': 2,
'type': 'module',
'context_dependencies': list(module_data.context_dependencies),
'dependencies': list(module_data.dependencies)
}
stored = False
if USE_KNOWLEDGE_GRAPH and neo4j_client:
try:
module_payload = kg_ops.build_module_payload(
run_id=run_id,
repository_id=repository_id,
module_name=chunk_name,
chunk=chunk,
chunk_analysis=chunk_analysis,
file_analyses=file_analyses,
module_name=canonical_name,
chunk=aggregated_chunk,
chunk_analysis=aggregated_chunk_analysis,
file_analyses=file_list,
metadata=metadata,
ai_response=ai_response,
ai_response=ai_response_text,
)
await kg_ops.store_module_analysis(
client=neo4j_client,
@ -3291,33 +3652,30 @@ async def store_chunk_analysis_in_memory(chunk: Dict, file_analyses: List, chunk
repository_id=repository_id,
module_payload=module_payload,
)
print(f"Stored in Neo4j knowledge graph (module: {chunk_name})")
return module_payload["module_props"]["module_id"]
print(f"[AGGREGATION] Stored aggregated module '{canonical_name}' in Neo4j")
stored = True
except Exception as kg_error:
print(f" ⚠️ Failed to store module in knowledge graph: {kg_error}. Falling back to episodic memory.")
print(f" ⚠️ [AGGREGATION] Failed to store '{canonical_name}' in Neo4j: {kg_error}")
# Fallback to Episodic Memory
if not stored and analyzer and hasattr(analyzer, 'memory_manager'):
try:
memory_id = await analyzer.memory_manager.store_episodic_memory(
session_id=session_id,
user_query=user_query,
ai_response=ai_response,
user_query=f"Aggregated analysis for module: {canonical_name}",
ai_response=ai_response_text or module_overview or f"Aggregated analysis for {canonical_name}",
repo_context=repository_id,
metadata=metadata
)
print(f"Stored in episodic memory with ID: {memory_id}")
return memory_id
print(f"[AGGREGATION] Stored aggregated module '{canonical_name}' in episodic memory (ID: {memory_id})")
stored = True
except Exception as memory_error:
print(f" ❌ Failed to store in episodic memory: {memory_error}")
import traceback
print(f" ❌ [AGGREGATION] Failed to store '{canonical_name}' in episodic memory: {memory_error}")
traceback.print_exc()
return None
except Exception as e:
print(f"❌ [MEMORY] Failed to store chunk analysis: {e}")
import traceback
traceback.print_exc()
return None
if not stored:
print(f" ❌ [AGGREGATION] Unable to persist aggregated module '{canonical_name}' in any storage backend")
module_aggregation_manager.clear(run_id)
async def store_cumulative_analysis_state(session_id: str, repository_id: str, analysis_state: Dict, chunk_sequence: int):
"""
@ -5307,6 +5665,7 @@ async def process_chunks_in_parallel_batches(chunks, repository_id, progress_mgr
async def analyze_repository_with_optimizations_parallel(repo_path: str, repository_id: str, user_id: str, max_files: Optional[int] = None, progress_mgr: Optional[AnalysisProgressManager] = None):
"""Analyze repository with PARALLEL BATCH PROCESSING for faster analysis."""
run_id: Optional[str] = None
try:
# Set run_id early so it's available for chunk storage
# Extract analysis_id from progress_mgr if available, otherwise generate
@ -5321,6 +5680,9 @@ async def analyze_repository_with_optimizations_parallel(repo_path: str, reposit
if not hasattr(analyzer, 'session_id') or not analyzer.session_id:
analyzer.session_id = str(uuid.uuid4())
if run_id:
module_aggregation_manager.reset(run_id)
print(f"🔑 [ANALYSIS] Set run_id: {run_id}")
# Get repository files from Git Integration Service API
@ -5447,6 +5809,11 @@ async def analyze_repository_with_optimizations_parallel(repo_path: str, reposit
})
print(f"✅ [STORAGE] All chunk analyses stored")
await flush_module_aggregations(
run_id=run_id,
repository_id=repository_id,
session_id=getattr(analyzer, 'session_id', None) if analyzer else None
)
# ========================================================================
# PHASE 2: CROSS-MODULE SYNTHESIS
@ -5568,11 +5935,15 @@ async def analyze_repository_with_optimizations_parallel(repo_path: str, reposit
except Exception as e:
print(f"Error in parallel analysis: {e}")
raise
finally:
if run_id:
module_aggregation_manager.clear(run_id)
async def analyze_repository_with_optimizations(repo_path: str, repository_id: str, user_id: str, max_files: int = 100, progress_mgr: Optional[AnalysisProgressManager] = None):
"""Analyze repository with SMART BATCHING for maximum efficiency."""
from pathlib import Path
run_id: Optional[str] = None
try:
# Get repository files from Git Integration Service API
files_to_analyze = await get_repository_files_from_api(repository_id, user_id, max_files)
@ -5601,6 +5972,19 @@ async def analyze_repository_with_optimizations(repo_path: str, repository_id: s
for i, chunk in enumerate(chunks, 1):
print(f" Chunk {i}: {chunk['name']} ({chunk['chunk_type']}) - {len(chunk['files'])} files")
if analyzer:
run_id = getattr(analyzer, 'run_id', None)
if not run_id:
run_id = f"repo_analysis_{repository_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
analyzer.run_id = run_id
if not hasattr(analyzer, 'session_id') or not analyzer.session_id:
analyzer.session_id = str(uuid.uuid4())
else:
run_id = f"repo_analysis_{repository_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
if run_id:
module_aggregation_manager.reset(run_id)
# Initialize analysis_state for progressive context
analysis_state = {}
@ -5739,6 +6123,12 @@ async def analyze_repository_with_optimizations(repo_path: str, repository_id: s
print(f"🎉 [INTELLIGENT CHUNKING] Completed all {total_chunks} chunks - {len(file_analyses)} files analyzed")
await flush_module_aggregations(
run_id=run_id,
repository_id=repository_id,
session_id=getattr(analyzer, 'session_id', None) if analyzer else None
)
# ========================================================================
# PHASE 2: CROSS-MODULE SYNTHESIS
# ========================================================================
@ -5868,6 +6258,9 @@ async def analyze_repository_with_optimizations(repo_path: str, repository_id: s
except Exception as e:
print(f"Error in optimized analysis: {e}")
raise
finally:
if run_id:
module_aggregation_manager.clear(run_id)
@app.get("/repository/{repository_id}/info")
async def get_repository_info(repository_id: str, user_id: str):

View File

@ -70,6 +70,7 @@ const serviceTargets = {
AI_MOCKUP_URL: process.env.AI_MOCKUP_URL || 'http://localhost:8021',
AI_ANALYSIS_URL: process.env.AI_ANALYSIS_URL || 'http://localhost:8022',
FAST_AI_ANALYSIS_URL: process.env.FAST_AI_ANALYSIS_URL || 'http://localhost:8023',
MULTI_DOCUMENT_UPLOAD_URL: process.env.MULTI_DOCUMENT_UPLOAD_URL || 'http://localhost:8024',
};
// Log service targets for debugging
@ -944,6 +945,31 @@ app.use('/api/ai/repository',
}
);
// Multi-Document Upload Service - handles large multipart uploads
console.log('🔧 Registering /api/multi-docs proxy route...');
app.use('/api/multi-docs',
createServiceLimiter(120),
(req, res, next) => {
console.log(`📁 [MULTI-DOCS PROXY] ${req.method} ${req.originalUrl}`);
next();
},
createProxyMiddleware({
target: serviceTargets.MULTI_DOCUMENT_UPLOAD_URL,
changeOrigin: true,
pathRewrite: { '^/api/multi-docs': '' },
logLevel: 'warn',
proxyTimeout: 1800000,
timeout: 1800000,
onProxyReq: (proxyReq, req, res) => {
proxyReq.setHeader('X-Forwarded-By', 'api-gateway');
},
onProxyRes: (proxyRes, req, res) => {
res.setHeader('Access-Control-Allow-Origin', req.headers.origin || '*');
res.setHeader('Access-Control-Allow-Credentials', 'true');
}
})
);
// Template Manager AI - expose AI recommendations through the gateway
console.log('🔧 Registering /api/ai/tech-stack proxy route...');
app.use('/api/ai/tech-stack',

View File

@ -0,0 +1,30 @@
FROM python:3.11-slim
ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1
WORKDIR /app
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential \
poppler-utils \
tesseract-ocr \
ffmpeg \
libmagic1 \
&& rm -rf /var/lib/apt/lists/*
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY src ./src
ENV PYTHONPATH=/app/src \
MULTI_DOC_STORAGE_ROOT=/app/storage \
MULTI_DOC_CLAUDE_MODEL=claude-3-5-sonnet-20241022 \
PORT=8024
EXPOSE 8024
CMD ["sh", "-c", "uvicorn multi_document_upload_service.main:app --host 0.0.0.0 --port ${PORT:-8024}"]

View File

@ -0,0 +1,144 @@
# Fix: Empty Graph in Neo4j (No Relationships Found)
## Problem
When querying Neo4j for `CAUSES` relationships, you get "(no changes, no records)" because:
1. **PDF extraction failed** - Missing dependencies (`unstructured[pdf]`)
2. **0 relations extracted** - No text was extracted, so no analysis happened
3. **0 relations written** - Nothing was written to Neo4j (correct behavior)
## Root Cause
The service completed with 0 relations because:
- PDF file extraction failed: `partition_pdf() is not available because one or more dependencies are not installed`
- No text was extracted from the PDF
- No chunks were created
- No Claude analysis happened
- 0 relations were extracted
- 0 relations were written to Neo4j
## Solution
### Step 1: Update Dependencies
The `requirements.txt` has been updated to include:
```
unstructured[pdf]>=0.15.0
unstructured[docx]>=0.15.0
unstructured[pptx]>=0.15.0
unstructured[xlsx]>=0.15.0
```
### Step 2: Rebuild the Service
```bash
cd /home/tech4biz/Desktop/prakash/codenuk/backend_new1/codenuk_backend_mine
# Rebuild the service with new dependencies
docker-compose build multi-document-upload-service
# Restart the service
docker-compose restart multi-document-upload-service
# Check logs to verify it's working
docker-compose logs -f multi-document-upload-service
```
### Step 3: Verify Dependencies
```bash
# Check if unstructured[pdf] is installed
docker-compose exec multi-document-upload-service pip list | grep unstructured
```
### Step 4: Re-upload Documents
1. Go to Project Builder in the frontend
2. Click on "Upload Documents for Knowledge Graph"
3. Upload a PDF or other document
4. Wait for processing to complete
5. Check Neo4j for relationships
### Step 5: Check Neo4j
Run these queries in Neo4j Browser:
```cypher
// Check if any nodes exist
MATCH (n)
RETURN count(n) as node_count
// Check for CAUSES relationships
MATCH (n:Concept)-[r:CAUSES]->(m:Concept)
RETURN n.name as cause, m.name as effect, r.confidence as confidence
LIMIT 50
```
## Expected Behavior After Fix
1. **PDF extraction succeeds** - Text is extracted from PDF files
2. **Text is chunked** - Document is split into manageable chunks
3. **Claude analyzes** - Causal relationships are extracted
4. **Relations are written** - Relationships are stored in Neo4j
5. **Query returns results** - Neo4j query shows relationships
## Verification Steps
1. **Check service logs**:
```bash
docker-compose logs multi-document-upload-service | grep -i "extracted\|relation\|neo4j"
```
2. **Check job status**:
```bash
curl http://localhost:8000/api/multi-docs/jobs/{job_id}
```
Should show: `"processed_files": 1` and a relations count > 0 (see the polling sketch after this list)
3. **Check Neo4j**:
```cypher
MATCH (n:Concept)-[r:CAUSES]->(m:Concept)
RETURN count(r) as relation_count
```
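For a quick programmatic check, a minimal polling sketch (field names follow the status payload shown in the testing guide; the terminal `stage` values and the `JOB_ID` placeholder are assumptions):
```python
import time
import httpx  # httpx is already listed in the service requirements

JOB_ID = "replace-with-real-job-id"  # placeholder
STATUS_URL = f"http://localhost:8000/api/multi-docs/jobs/{JOB_ID}"

while True:
    job = httpx.get(STATUS_URL, timeout=30).json()
    print(job.get("stage"), job.get("status_message"),
          f"{job.get('processed_files')}/{job.get('total_files')} files")
    if job.get("stage") in ("completed", "failed"):  # assumed terminal stage values
        break
    time.sleep(4)  # the frontend polls on the same 4-second cadence
```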
## Improvements Made
1. ✅ **Added PDF dependencies** - `unstructured[pdf]`, `unstructured[docx]`, etc.
2. ✅ **Added fallback extractors** - Uses `pdfplumber` if unstructured fails
3. ✅ **Better error handling** - Shows actual errors in job status
4. ✅ **Improved logging** - More detailed logs for debugging
5. ✅ **Better Neo4j query** - Validates data before writing
## Troubleshooting
If you still see 0 relations after rebuilding:
1. **Check extraction logs**:
```bash
docker-compose logs multi-document-upload-service | grep -i "extract"
```
2. **Check Claude analysis**:
```bash
docker-compose logs multi-document-upload-service | grep -i "claude\|analyze"
```
3. **Check Neo4j connection**:
```bash
docker-compose logs multi-document-upload-service | grep -i "neo4j\|graph"
```
4. **Verify document has causal language**:
- Not all documents contain causal relationships
- Try uploading a document with clear cause-effect statements
- Example: "Smoking causes lung cancer" or "Rain causes flooding"
## Next Steps
1. Rebuild the service with new dependencies
2. Re-upload documents
3. Check Neo4j for relationships
4. If still no results, check service logs for errors
5. Verify the document contains causal language

View File

@ -0,0 +1,176 @@
# Neo4j Diagnostic Queries
## Issue: No relationships found in Neo4j
If you're seeing "(no changes, no records)" when querying for `CAUSES` relationships, here are diagnostic queries to check what's actually in the database.
## Diagnostic Queries
### 1. Check if any nodes exist
```cypher
MATCH (n)
RETURN count(n) as node_count
LIMIT 1
```
### 2. Check if Concept nodes exist
```cypher
MATCH (n:Concept)
RETURN count(n) as concept_count,
collect(DISTINCT labels(n)) as labels,
collect(DISTINCT keys(n)) as properties
LIMIT 10
```
### 3. Check all relationship types
```cypher
CALL db.relationshipTypes() YIELD relationshipType
RETURN relationshipType
```
### 4. Check all node labels
```cypher
CALL db.labels() YIELD label
RETURN label
```
### 5. Check all relationships (any type)
```cypher
MATCH (n)-[r]->(m)
RETURN type(r) as relationship_type,
count(r) as count,
labels(n) as from_labels,
labels(m) as to_labels
LIMIT 50
```
### 6. Check for CAUSES relationships specifically
```cypher
MATCH (n)-[r:CAUSES]->(m)
RETURN n, r, m
LIMIT 50
```
### 7. Check for relationships with lowercase "causes"
```cypher
MATCH (n)-[r]->(m)
WHERE type(r) =~ '(?i)causes'
RETURN type(r) as relationship_type, n, r, m
LIMIT 50
```
### 8. Check all nodes and their relationships
```cypher
MATCH (n)
OPTIONAL MATCH (n)-[r]->(m)
RETURN n, labels(n) as node_labels,
type(r) as relationship_type,
m, labels(m) as target_labels
LIMIT 50
```
### 9. Check for nodes created by the service (by job_id property)
```cypher
MATCH (n)-[r]->(m)
WHERE r.job_id IS NOT NULL
RETURN n, r, m, r.job_id as job_id
LIMIT 50
```
### 10. Check database statistics
```cypher
MATCH (n)
OPTIONAL MATCH (n)-[r]->()
RETURN count(DISTINCT n) as total_nodes,
       count(r) as total_relationships
```
## Common Issues and Solutions
### Issue 1: No nodes at all
**Symptom**: Query 1 returns 0 nodes
**Cause**: Service hasn't written anything to Neo4j, or connection failed
**Solution**:
- Check service logs: `docker-compose logs multi-document-upload-service`
- Verify Neo4j connection in service configuration
- Check if job completed with 0 relations (extraction failed)
### Issue 2: Nodes exist but no relationships
**Symptom**: Query 1 returns nodes, but Query 6 returns no relationships
**Cause**: Relationships weren't created, or different relationship type
**Solution**:
- Check Query 5 to see what relationship types actually exist
- Check service logs for graph writing errors
- Verify the job actually extracted relations (check job status)
### Issue 3: Different relationship type
**Symptom**: Query 5 shows relationships but not `CAUSES`
**Cause**: Service might be using a different relationship type
**Solution**:
- Check Query 3 to see all relationship types
- Update query to use the correct relationship type
### Issue 4: Different node labels
**Symptom**: Query 6 returns no results, but Query 2 shows different labels
**Cause**: Service might be using different node labels
**Solution**:
- Check Query 2 to see what labels exist
- Update query to match actual labels
## Expected Structure
After a successful upload, you should see:
### Nodes
- **Label**: `Concept`
- **Properties**: `name`, `lastSeen`
### Relationships
- **Type**: `CAUSES`
- **Properties**: `confidence`, `explanation`, `source_file_id`, `source_snippet`, `job_id`, `model`, `updated_at`
### Example Query
```cypher
MATCH (cause:Concept)-[r:CAUSES]->(effect:Concept)
RETURN cause.name as cause,
effect.name as effect,
r.confidence as confidence,
r.job_id as job_id,
r.source_file_id as source_file
LIMIT 50
```
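To run the same check from Python, a minimal sketch using the official `neo4j` driver (assumes the bolt port 7687 is exposed on the host, as the browser port 7474 is, and the docker-compose credentials `neo4j` / `password`):
```python
from neo4j import GraphDatabase  # same driver the service itself depends on

driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))

QUERY = """
MATCH (cause:Concept)-[r:CAUSES]->(effect:Concept)
RETURN cause.name AS cause, effect.name AS effect, r.confidence AS confidence
LIMIT 50
"""

with driver.session() as session:
    for record in session.run(QUERY):
        print(f"{record['cause']} -> {record['effect']} (confidence={record['confidence']})")

driver.close()
```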
## Troubleshooting Steps
1. **Check service logs**:
```bash
docker-compose logs -f multi-document-upload-service
```
2. **Check if job completed successfully**:
```bash
curl http://localhost:8000/api/multi-docs/jobs/{job_id}
```
3. **Check Neo4j connection**:
```bash
docker-compose logs neo4j | grep -i error
```
4. **Verify Neo4j is running**:
```bash
docker-compose ps neo4j
```
5. **Test Neo4j connection manually**:
```bash
docker-compose exec neo4j cypher-shell -u neo4j -p password "MATCH (n) RETURN count(n)"
```
## Next Steps
1. Run the diagnostic queries above
2. Check the service logs for errors
3. Verify the job status via API
4. Re-upload documents after fixing dependencies
5. Check if relations were actually extracted (job status should show relation count)

View File

@ -0,0 +1,85 @@
# Quick Testing Guide - Multi-Document Upload
## 🚀 Quick Start Testing
### 1. Start Services
```bash
cd /home/tech4biz/Desktop/prakash/codenuk/backend_new1/codenuk_backend_mine
docker-compose up -d multi-document-upload-service neo4j redis postgres api-gateway
```
### 2. Verify Services
```bash
# Check health
curl http://localhost:8024/health
curl http://localhost:8000/api/multi-docs/health
```
### 3. Test via Frontend
1. **Open Frontend**: `http://localhost:3001`
2. **Login** (if required)
3. **Go to Project Builder**
4. **Complete Steps 1-2** (Project Type & Features)
5. **Step 3: Multi Docs Upload** appears
6. **Upload files**:
- Click upload area
- Select multiple files (PDF, DOCX, etc.)
- Click "Start Upload"
7. **Watch Progress**:
- Progress bar updates
- Status messages appear
- Polls every 4 seconds
8. **Auto-proceeds** when completed
### 4. Verify in Neo4j
Open Neo4j Browser at `http://localhost:7474` and log in with `neo4j` / `password`, then query causal relationships:
```cypher
MATCH (n)-[r:CAUSES]->(m)
RETURN n, r, m
LIMIT 50
```
## 📝 Test Checklist
- [ ] Service starts successfully
- [ ] Health endpoint works
- [ ] Frontend component renders
- [ ] File upload works
- [ ] Progress updates correctly
- [ ] Job completes successfully
- [ ] Neo4j graph contains relationships
- [ ] Error handling works
- [ ] Skip button works
## 🔍 Debug Commands
```bash
# View service logs
docker-compose logs -f multi-document-upload-service
# Check job status (replace {job_id})
curl http://localhost:8000/api/multi-docs/jobs/{job_id}
# Check graph summary
curl http://localhost:8000/api/multi-docs/jobs/{job_id}/graph
```
## ⚠️ Common Issues
1. **502 Bad Gateway**: Service not running → `docker-compose ps`
2. **413 Too Large**: File too big → Reduce file size
3. **No progress**: Check browser console → Check network tab
4. **No relationships**: Check Claude API key → Check service logs
## 🎯 Expected Flow
```
Upload Files → Job Created → Files Saved → Content Extracted →
Claude Analysis → Graph Built → Completed → Auto-proceed to Next Step
```

View File

@ -0,0 +1,36 @@
# Multi Document Upload Service
This service accepts large batches of heterogeneous documents, extracts causal
relationships with Claude 3.5 Sonnet, and writes them into Neo4j as a
knowledge graph.
## Features
- Multipart upload endpoint (`POST /jobs`) capable of handling dozens of files
and mixed formats (PDF, DOCX, PPTX, XLSX/CSV, JSON/XML, images, audio/video).
- Content extraction powered by the `unstructured` library with fallbacks.
- Chunking tuned for Claude Sonnet (800 token target, 200 overlap).
- High-accuracy causal extraction using Anthropic Claude with provenance.
- Neo4j graph writer that upserts `Concept` nodes and `CAUSES` edges.
- Status endpoint (`GET /jobs/{id}`) and graph summary endpoint
(`GET /jobs/{id}/graph`); see the usage sketch below.
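A minimal end-to-end sketch of the upload flow (assumes the service is reachable on port 8024, uses `httpx` from `requirements.txt`, placeholder file names, and assumed terminal `stage` values):
```python
import time
import httpx

BASE = "http://localhost:8024"  # direct service port; via the API gateway use /api/multi-docs

# Upload a batch of documents (the multipart form field name is `files`)
files = [("files", open(path, "rb")) for path in ["spec.pdf", "notes.docx"]]  # placeholders
job = httpx.post(f"{BASE}/jobs", files=files, timeout=300).json()
job_id = job["job_id"]

# Poll until the job reaches a terminal stage
while True:
    status = httpx.get(f"{BASE}/jobs/{job_id}", timeout=30).json()
    print(status.get("stage"), status.get("status_message"))
    if status.get("stage") in ("completed", "failed"):  # assumed terminal stage values
        break
    time.sleep(4)

# Fetch the graph summary once finished
print(httpx.get(f"{BASE}/jobs/{job_id}/graph", timeout=30).json())
```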
## Configuration
Environment variables:
- `ANTHROPIC_API_KEY` (required)
- `MULTI_DOC_CLAUDE_MODEL` (default `claude-3-5-sonnet-20241022`)
- `NEO4J_URI` (default `bolt://localhost:7687`)
- `NEO4J_USER` / `NEO4J_PASSWORD` (default `neo4j` / `neo4j`)
- `MULTI_DOC_STORAGE_ROOT` (default `storage` inside project)
## Run locally
```bash
uvicorn multi_document_upload_service.main:app --reload --host 0.0.0.0 --port 8035
```
Ensure Neo4j is reachable and Anthropic credentials are exported before
starting the service.

View File

@ -0,0 +1,152 @@
# Rebuild Instructions - Multi-Document Upload Service
## Issue: Empty Graph in Neo4j
**Problem**: Query returns "(no changes, no records)" because the job completed with 0 relations.
**Root Cause**: PDF extraction failed due to missing dependencies (`unstructured[pdf]`).
## Fixes Applied
1. ✅ Added PDF dependencies (`unstructured[pdf]`, `unstructured[docx]`, etc.)
2. ✅ Added fallback extractors (pdfplumber, python-docx, python-pptx)
3. ✅ Improved error handling and logging
4. ✅ Fixed Neo4j query syntax
5. ✅ Better status messages
## Rebuild Steps
### Step 1: Rebuild the Service
```bash
cd /home/tech4biz/Desktop/prakash/codenuk/backend_new1/codenuk_backend_mine
# Stop the service
docker-compose stop multi-document-upload-service
# Rebuild with new dependencies
docker-compose build --no-cache multi-document-upload-service
# Start the service
docker-compose up -d multi-document-upload-service
# Check logs to verify it's starting correctly
docker-compose logs -f multi-document-upload-service
```
### Step 2: Verify Dependencies
```bash
# Check if unstructured[pdf] is installed
docker-compose exec multi-document-upload-service pip list | grep unstructured
# You should see the unstructured package listed, along with the extra
# dependencies its optional groups pull in (for example pdfminer.six,
# python-docx, python-pptx)
```
### Step 3: Test the Service
```bash
# Check health endpoint
curl http://localhost:8024/health
# Should return:
# {
# "status": "ok",
# "claude_model": "claude-3-5-haiku-latest",
# ...
# }
```
### Step 4: Re-upload Documents
1. Open frontend: `http://localhost:3001/project-builder`
2. Go to Step 1: Project Type
3. Find "Upload Documents for Knowledge Graph" section
4. Upload a PDF or other document
5. Wait for processing to complete
6. Check status - should show relation count > 0
### Step 5: Verify in Neo4j
Run these queries in Neo4j Browser (`http://localhost:7474`):
```cypher
// Check if any nodes exist
MATCH (n)
RETURN count(n) as node_count
// Check for CAUSES relationships
MATCH (n:Concept)-[r:CAUSES]->(m:Concept)
RETURN n.name as cause,
m.name as effect,
r.confidence as confidence,
r.job_id as job_id
LIMIT 50
```
## Expected Results
After rebuilding and re-uploading:
1. **PDF extraction succeeds**
2. **Text is extracted**
3. **Relations are extracted**
4. **Relations are written to Neo4j**
5. **Query returns results**
## Troubleshooting
If you still see 0 relations:
1. **Check service logs**:
```bash
docker-compose logs multi-document-upload-service | tail -50
```
2. **Check extraction logs**:
```bash
docker-compose logs multi-document-upload-service | grep -i "extract\|pdf"
```
3. **Check Claude analysis**:
```bash
docker-compose logs multi-document-upload-service | grep -i "claude\|analyze\|relation"
```
4. **Check Neo4j connection**:
```bash
docker-compose logs multi-document-upload-service | grep -i "neo4j\|graph\|write"
```
5. **Verify document has causal language**:
- Not all documents contain causal relationships
- Try uploading a document with clear cause-effect statements
- Example: "Smoking causes lung cancer"
## Quick Test
Test with a simple text file:
1. Create a test file `test_causal.txt`:
```
Smoking cigarettes causes lung cancer.
Heavy rain causes flooding.
Exercise improves health.
```
2. Upload it via the frontend
3. Check Neo4j for relationships
4. Should see 3 causal relationships
## Next Steps
1. Rebuild the service
2. Re-upload documents
3. Check Neo4j for relationships
4. If still no results, check service logs
5. Verify the document contains causal language

View File

@ -0,0 +1,300 @@
# Multi-Document Upload Service - Frontend Testing Guide
## Prerequisites
1. **Backend Services Running**:
```bash
cd /home/tech4biz/Desktop/prakash/codenuk/backend_new1/codenuk_backend_mine
docker-compose up -d
```
2. **Verify Services are Running**:
- API Gateway: `http://localhost:8000/health`
- Multi-Document Upload Service: `http://localhost:8024/health`
- Neo4j: `http://localhost:7474` (Browser interface)
- Frontend: `http://localhost:3001` (or your frontend port)
3. **Check Service Health**:
```bash
# Check API Gateway
curl http://localhost:8000/health
# Check Multi-Document Upload Service directly
curl http://localhost:8024/health
# Check via API Gateway proxy
curl http://localhost:8000/api/multi-docs/health
```
## Frontend Testing Steps
### Step 1: Navigate to Project Builder
1. Open your browser and go to: `http://localhost:3001` (or your frontend URL)
2. Log in if required
3. Click on **"Project Builder"** in the navigation
### Step 2: Go to Multi Docs Upload Step
1. In the Project Builder, you should see the workflow steps:
- **Step 1**: Project Type
- **Step 2**: Features
- **Step 3**: Multi Docs Upload ← **This is the new step**
- **Step 4**: Business Context
- **Step 5**: Generate
- **Step 6**: Architecture
2. Complete Steps 1 and 2 (Project Type and Features selection)
3. You will automatically be taken to **Step 3: Multi Docs Upload**
### Step 3: Upload Documents
1. **Click on the upload area** or **drag and drop files**
2. **Select multiple files** (you can mix different formats):
- PDF files (`.pdf`)
- Word documents (`.doc`, `.docx`)
- PowerPoint (`.ppt`, `.pptx`)
- Excel files (`.xls`, `.xlsx`)
- JSON files (`.json`)
- XML files (`.xml`)
- Markdown files (`.md`)
- Images (`.png`, `.jpg`, `.jpeg`) - will use OCR
- Audio files (`.mp3`, `.wav`) - will be transcribed
- Video files (`.mp4`, `.avi`) - will be transcribed
3. **View selected files**: You should see a list of all selected files with:
- File icon
- File name
- Remove button for each file
4. **Click "Start Upload"** button
### Step 4: Monitor Upload Progress
After clicking "Start Upload", you should see:
1. **Upload Status**:
- Button shows "Uploading..." with spinner
- Progress bar appears
- Stage messages appear:
- "Job received"
- "Saving files"
- "Extracting document content"
- "Calling Claude for causal relations"
- "Writing to Neo4j knowledge graph"
- "Completed"
2. **Progress Indicators**:
- Progress percentage (0-100%)
- Status message showing current stage
- Processed files count vs total files count
3. **Polling**: The frontend automatically polls the job status every 4 seconds
### Step 5: Verify Results
Once the job is completed:
1. **Check Neo4j Graph**:
- Open Neo4j Browser: `http://localhost:7474`
- Login with:
- Username: `neo4j`
- Password: `password`
- Run Cypher query to see the graph:
```cypher
MATCH (n)-[r:CAUSES]->(m)
RETURN n, r, m
LIMIT 50
```
2. **Check Job Status via API**:
```bash
# Replace {job_id} with the actual job ID from the frontend
curl http://localhost:8000/api/multi-docs/jobs/{job_id}
```
3. **Get Graph Summary**:
```bash
curl http://localhost:8000/api/multi-docs/jobs/{job_id}/graph
```
## Testing Different Scenarios
### Scenario 1: Single PDF File
- Upload one PDF file
- Verify it processes correctly
- Check Neo4j for causal relationships
### Scenario 2: Multiple Mixed Format Files
- Upload 3-5 files of different formats (PDF, DOCX, JSON, image)
- Verify all files are processed
- Check that progress updates correctly
### Scenario 3: Large Files
- Upload a large PDF (10+ MB)
- Verify it handles large files correctly
- Check processing time
### Scenario 4: Error Handling
- Try uploading an unsupported file type
- Verify error message appears
- Check that the error is displayed clearly
### Scenario 5: Skip Option
- Upload files
- Click "Skip" button before completion
- Verify you can proceed to the next step
- Job continues processing in the background
## Browser Developer Tools
### Check Network Requests
1. **Open Developer Tools** (F12)
2. **Go to Network tab**
3. **Filter by "multi-docs"**
4. **Monitor requests**:
- `POST /api/multi-docs/jobs` - Upload files
- `GET /api/multi-docs/jobs/{job_id}` - Poll job status
- `GET /api/multi-docs/jobs/{job_id}/graph` - Get graph summary
### Check Console Logs
1. **Open Console tab**
2. **Look for**:
- Upload progress logs
- Job status updates
- Any error messages
### Check Response Data
Verify the API responses:
```javascript
// Upload response should be:
{
"job_id": "uuid-here",
"stage": "received",
"total_files": 3,
"created_at": "2024-01-01T00:00:00Z"
}
// Status response should be:
{
"job_id": "uuid-here",
"stage": "extracting",
"status_message": "Extracting document content",
"total_files": 3,
"processed_files": 1,
"error": null,
"created_at": "2024-01-01T00:00:00Z",
"updated_at": "2024-01-01T00:01:00Z",
"files": [...]
}
```
## Troubleshooting
### Issue: Upload fails with 502 Bad Gateway
**Solution**:
- Check if multi-document-upload-service is running:
```bash
docker-compose ps multi-document-upload-service
```
- Check service logs:
```bash
docker-compose logs multi-document-upload-service
```
### Issue: Upload fails with 413 Request Entity Too Large
**Solution**:
- Check file sizes (max 500MB total per job)
- Reduce number of files or file sizes
- Check API Gateway body size limits
### Issue: Status polling stops working
**Solution**:
- Check browser console for errors
- Verify job ID is correct
- Check if job completed or failed
- Check network tab for failed requests
### Issue: No causal relationships found
**Solution**:
- Check Claude API key is configured correctly
- Check service logs for Claude API errors
- Verify documents contain causal language
- Check Neo4j connection
### Issue: Frontend shows "Failed" status
**Solution**:
- Check the error message in the frontend
- Check backend service logs:
```bash
docker-compose logs -f multi-document-upload-service
```
- Verify all dependencies are running (Neo4j, Redis, Postgres)
## Expected Behavior
### Successful Flow:
1. ✅ Files upload successfully
2. ✅ Job ID is returned
3. ✅ Status polling starts automatically
4. ✅ Progress updates every 4 seconds
5. ✅ Stage changes are displayed
6. ✅ Progress bar updates
7. ✅ Job completes successfully
8. ✅ Frontend automatically proceeds to next step
9. ✅ Neo4j contains causal relationships
### Error Flow:
1. ✅ Error message is displayed clearly
2. ✅ User can retry upload
3. ✅ User can skip and proceed
4. ✅ Error details are logged in console
## API Endpoints Reference
### Upload Files
```bash
POST /api/multi-docs/jobs
Content-Type: multipart/form-data
Form Data:
- files: File[] (multiple files)
- job_name: string (optional)
```
### Get Job Status
```bash
GET /api/multi-docs/jobs/{job_id}
```
### Get Graph Summary
```bash
GET /api/multi-docs/jobs/{job_id}/graph
```
### Health Check
```bash
GET /api/multi-docs/health
```
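Putting the reference together, a minimal client sketch in Python (the gateway base path and the optional `job_name` field come from the reference above; file names are placeholders and the terminal stage handling is an assumption):
```python
import httpx

GATEWAY = "http://localhost:8000/api/multi-docs"

# Health check through the gateway
print(httpx.get(f"{GATEWAY}/health", timeout=10).json())

# Upload two placeholder files with an optional job name
with open("requirements.pdf", "rb") as f1, open("architecture.png", "rb") as f2:
    resp = httpx.post(
        f"{GATEWAY}/jobs",
        files=[("files", f1), ("files", f2)],
        data={"job_name": "frontend-smoke-test"},
        timeout=1800,  # uploads can be long-running; the gateway proxy timeout is 30 minutes
    )
job_id = resp.json()["job_id"]

# Poll status and fetch the graph summary
print(httpx.get(f"{GATEWAY}/jobs/{job_id}", timeout=30).json()["stage"])
print(httpx.get(f"{GATEWAY}/jobs/{job_id}/graph", timeout=30).json())
```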
## Next Steps After Testing
1. **Verify Neo4j Graph**: Check that causal relationships are stored correctly
2. **Check Storage**: Verify files are stored in the persistent volume
3. **Monitor Performance**: Check processing times for different file types
4. **Test Error Scenarios**: Verify error handling works correctly
5. **Test Large Batches**: Upload 50+ files to test scalability
## Support
If you encounter issues:
1. Check service logs: `docker-compose logs multi-document-upload-service`
2. Check API Gateway logs: `docker-compose logs api-gateway`
3. Check Neo4j logs: `docker-compose logs neo4j`
4. Verify all environment variables are set correctly
5. Check network connectivity between services

View File

@ -0,0 +1,34 @@
fastapi>=0.110.0
uvicorn[standard]>=0.29.0
anthropic>=0.33.0
neo4j>=5.23.0
python-multipart>=0.0.9
pydantic>=2.7.0
pydantic-settings>=2.2.1
aiofiles>=23.2.1
tenacity>=8.2.3
python-dotenv>=1.0.1
unstructured[pdf]>=0.15.0
unstructured[docx]>=0.15.0
unstructured[pptx]>=0.15.0
unstructured[xlsx]>=0.15.0
pdfplumber>=0.11.0
python-docx>=1.1.0
python-pptx>=0.6.23
pandas>=2.2.2
openpyxl>=3.1.2
xlrd>=2.0.1
pytesseract>=0.3.10
Pillow>=10.3.0
opencv-python-headless>=4.9.0.80
PyMuPDF>=1.23.0
pdf2image>=1.16.3
faster-whisper>=0.10.0
ffmpeg-python>=0.2.0
pydub>=0.25.1
beautifulsoup4>=4.12.3
lxml>=5.2.1
sqlalchemy>=2.0.25
httpx>=0.27.0
tiktoken>=0.7.0

View File

@ -0,0 +1,4 @@
"""
Multi Document Upload Service package.
"""

View File

@ -0,0 +1,328 @@
from __future__ import annotations
import base64
import json
import logging
import re
from pathlib import Path
from typing import Iterable, List
from anthropic import Anthropic, BadRequestError
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential, RetryCallState
from .models import CausalRelation
logger = logging.getLogger(__name__)
def is_billing_error(exception: Exception) -> bool:
"""Check if the exception is a billing/credit related error that shouldn't be retried."""
if isinstance(exception, BadRequestError):
error_message = str(exception).lower()
billing_keywords = ["credit", "balance", "too low", "billing", "upgrade", "purchase credits"]
return any(keyword in error_message for keyword in billing_keywords)
return False
def should_retry_exception(retry_state: RetryCallState) -> bool:
"""Custom retry condition that excludes billing errors."""
exception = retry_state.outcome.exception()
if exception is None:
return False
# Don't retry billing errors - they won't be resolved by retrying
if is_billing_error(exception):
return False
# Retry other exceptions
return True
CLAUDE_PROMPT_TEMPLATE = """You are an expert analyst extracting causal relationships from documents.
Given the following text chunk, identify all explicit or strongly implied cause and effect pairs.
Return JSON with the schema:
[
{
"cause": "<short phrase>",
"effect": "<short phrase>",
"confidence": 0-1 float,
"explanation": "<why this is causal>",
"source_snippet": "<exact quote or paraphrase>"
}
]
Only include items when the causal direction is clear.
If none are found, return an empty list [].
Text chunk:
```
<<<CHUNK_PLACEHOLDER>>>
```"""
IMAGE_PROMPT_TEMPLATE = """You are an expert analyst extracting causal relationships from images, diagrams, and visual content.
Analyze this image/diagram for causal relationships. Look for:
- Architecture flows (A B C)
- Dependency relationships
- Cause-effect chains in diagrams
- Process flows
- System interactions
- Data flows
- Sequential relationships
- Visual connections between components
Return JSON with the schema:
[
{
"cause": "<short phrase describing the cause>",
"effect": "<short phrase describing the effect>",
"confidence": 0-1 float,
"explanation": "<why this is causal, referencing visual elements>",
"source_snippet": "<description of what you see in the image that shows this relationship>"
}
]
Only include items when the causal direction is clear from the visual structure.
If none are found, return an empty list []."""
class ClaudeCausalExtractor:
def __init__(self, api_key: str, model: str, max_output_tokens: int = 4000):
self.client = Anthropic(api_key=api_key)
self.model = model
self.max_output_tokens = max_output_tokens
@retry(
retry=should_retry_exception,
wait=wait_exponential(multiplier=1, min=1, max=10),
stop=stop_after_attempt(3),
reraise=True,
)
def analyze_chunk(self, chunk: str, source_file_id: str) -> List[CausalRelation]:
logger.debug("Analyzing chunk with Claude model %s", self.model)
# Validate chunk is not empty and is readable text
if not chunk or not chunk.strip():
logger.warning("Empty or whitespace-only chunk, skipping")
return []
# Check if chunk contains mostly readable text (not binary data)
# Simple heuristic: if >50% of characters are non-printable or control chars, skip it
printable_chars = sum(1 for c in chunk if c.isprintable() or c.isspace())
if len(chunk) > 100 and printable_chars / len(chunk) < 0.5:
logger.warning("Chunk appears to contain binary data, skipping analysis")
return []
# Use string replacement with a unique placeholder to avoid KeyError with braces in content
# This prevents Python's .format() from interpreting braces in the chunk text as format placeholders
prompt_text = CLAUDE_PROMPT_TEMPLATE.replace("<<<CHUNK_PLACEHOLDER>>>", chunk)
try:
message = self.client.messages.create(
model=self.model,
max_tokens=self.max_output_tokens,
temperature=0.0,
system="You extract causal (cause→effect) relations with high precision.",
messages=[
{
"role": "user",
"content": [{"type": "text", "text": prompt_text}],
}
],
)
except BadRequestError as e:
# Check if it's a billing error
if is_billing_error(e):
error_msg = (
"Anthropic API credit balance is too low. "
"Please go to Plans & Billing to upgrade or purchase credits. "
f"Error: {str(e)}"
)
logger.error(error_msg)
raise RuntimeError(error_msg) from e
# Re-raise other BadRequestErrors
raise
content_blocks = message.content or []
raw_text = "".join(block.text for block in content_blocks if hasattr(block, "text")) # type: ignore[attr-defined]
if not raw_text:
return []
# Try to extract JSON from markdown code blocks if present
json_text = raw_text.strip()
# Look for JSON in markdown code blocks (```json ... ```)
json_match = re.search(r'```(?:json)?\s*(\[.*?\])\s*```', json_text, re.DOTALL)
if json_match:
json_text = json_match.group(1)
else:
# Look for JSON array/object at the start or end
json_match = re.search(r'(\[.*?\]|{.*?})', json_text, re.DOTALL)
if json_match:
json_text = json_match.group(1)
try:
data = json.loads(json_text)
if not isinstance(data, list):
logger.warning("Claude response is not a list: %s", type(data))
return []
relations: List[CausalRelation] = []
for item in data:
if not isinstance(item, dict):
continue
cause = item.get("cause", "").strip()
effect = item.get("effect", "").strip()
if not cause or not effect:
continue # Skip invalid relations
relations.append(
CausalRelation(
cause=cause,
effect=effect,
confidence=float(item.get("confidence", 0.0)),
explanation=item.get("explanation"),
source_file_id=source_file_id,
source_snippet=item.get("source_snippet"),
metadata={"model": self.model},
)
)
logger.info("Extracted %d relations from Claude response", len(relations))
return relations
except json.JSONDecodeError as e:
logger.warning("Failed to parse Claude response as JSON: %s. Raw text: %s", e, raw_text[:200])
return []
def analyze(self, chunks: Iterable[str], source_file_id: str) -> List[CausalRelation]:
relations: List[CausalRelation] = []
for chunk in chunks:
relations.extend(self.analyze_chunk(chunk, source_file_id=source_file_id))
return relations
@retry(
retry=should_retry_exception,
wait=wait_exponential(multiplier=1, min=1, max=10),
stop=stop_after_attempt(3),
reraise=True,
)
def analyze_image(self, image_path: Path, source_file_id: str) -> List[CausalRelation]:
"""
Analyze an image using Claude Vision API to extract causal relationships.
Sends image directly to Claude (no OCR).
"""
logger.info("Analyzing image with Claude Vision: %s", image_path.name)
try:
# Read and encode image as base64
with open(image_path, "rb") as image_file:
image_data = image_file.read()
# Determine media type
suffix = image_path.suffix.lower()
media_type_map = {
".png": "image/png",
".jpg": "image/jpeg",
".jpeg": "image/jpeg",
".gif": "image/gif",
".webp": "image/webp",
}
media_type = media_type_map.get(suffix, "image/png")
# Encode to base64
base64_image = base64.b64encode(image_data).decode("utf-8")
# Prepare content for Claude Vision API
content = [
{
"type": "image",
"source": {
"type": "base64",
"media_type": media_type,
"data": base64_image,
},
},
{
"type": "text",
"text": IMAGE_PROMPT_TEMPLATE,
},
]
# Call Claude Vision API
try:
message = self.client.messages.create(
model=self.model, # Claude models support vision
max_tokens=self.max_output_tokens,
temperature=0.0,
system="You extract causal (cause→effect) relations from visual content with high precision.",
messages=[
{
"role": "user",
"content": content,
}
],
)
except BadRequestError as e:
# Check if it's a billing error
if is_billing_error(e):
error_msg = (
"Anthropic API credit balance is too low. "
"Please go to Plans & Billing to upgrade or purchase credits. "
f"Error: {str(e)}"
)
logger.error(error_msg)
raise RuntimeError(error_msg) from e
# Re-raise other BadRequestErrors
raise
# Parse response
content_blocks = message.content or []
raw_text = "".join(block.text for block in content_blocks if hasattr(block, "text")) # type: ignore[attr-defined]
if not raw_text:
logger.warning("No text response from Claude Vision for image %s", image_path.name)
return []
# Extract JSON from response
json_text = raw_text.strip()
json_match = re.search(r'```(?:json)?\s*(\[.*?\])\s*```', json_text, re.DOTALL)
if json_match:
json_text = json_match.group(1)
else:
json_match = re.search(r'(\[.*?\]|{.*?})', json_text, re.DOTALL)
if json_match:
json_text = json_match.group(1)
try:
data = json.loads(json_text)
if not isinstance(data, list):
logger.warning("Claude Vision response is not a list: %s", type(data))
return []
relations: List[CausalRelation] = []
for item in data:
if not isinstance(item, dict):
continue
cause = item.get("cause", "").strip()
effect = item.get("effect", "").strip()
if not cause or not effect:
continue
relations.append(
CausalRelation(
cause=cause,
effect=effect,
confidence=float(item.get("confidence", 0.0)),
explanation=item.get("explanation"),
source_file_id=source_file_id,
source_snippet=item.get("source_snippet") or f"Image: {image_path.name}",
metadata={"model": self.model, "content_type": "image", "image_path": str(image_path)},
)
)
logger.info("Extracted %d relations from image %s", len(relations), image_path.name)
return relations
except json.JSONDecodeError as e:
logger.warning("Failed to parse Claude Vision response as JSON: %s. Raw text: %s", e, raw_text[:200])
return []
except Exception as exc:
logger.exception("Failed to analyze image %s: %s", image_path, exc)
return []
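
As an illustration (not part of the committed file), a minimal sketch of driving the extractor above, assuming ANTHROPIC_API_KEY is set in the environment; the chunk text and image path are placeholders:

import os
from pathlib import Path

extractor = ClaudeCausalExtractor(
    api_key=os.environ["ANTHROPIC_API_KEY"],
    model=os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest"),
)

# Text chunks are analyzed one at a time and the relations are concatenated.
relations = extractor.analyze(
    chunks=["Rising interest rates reduce consumer spending."],
    source_file_id="example-chunk",
)

# Images are sent to Claude Vision directly; no OCR step is involved.
relations += extractor.analyze_image(Path("diagram.png"), source_file_id="example-image")

for rel in relations:
    print(f"{rel.cause} -> {rel.effect} ({rel.confidence:.2f})")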

View File

@@ -0,0 +1,52 @@
from __future__ import annotations
import os
from functools import lru_cache
from pathlib import Path
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
DEFAULT_STORAGE_ROOT = Path(
os.getenv("MULTI_DOC_STORAGE_ROOT", Path(__file__).resolve().parent.parent.parent / "storage")
)
DEFAULT_STORAGE_ROOT.mkdir(parents=True, exist_ok=True)
class Settings(BaseSettings):
"""Application configuration loaded from environment variables."""
model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore")
anthropic_api_key: str | None = Field(default=None, validation_alias="ANTHROPIC_API_KEY")
claude_model: str = Field(default=os.getenv("MULTI_DOC_CLAUDE_MODEL", "claude-3-5-sonnet-20241022"))
claude_max_input_tokens: int = Field(default=200_000)
claude_max_output_tokens: int = Field(default=16_000)
neo4j_uri: str = Field(default=os.getenv("NEO4J_URI", "bolt://localhost:7687"))
neo4j_user: str = Field(default=os.getenv("NEO4J_USER", "neo4j"))
neo4j_password: str = Field(default=os.getenv("NEO4J_PASSWORD", "neo4j"))
storage_root: Path = Field(default=DEFAULT_STORAGE_ROOT)
max_upload_size_mb: int = Field(default=500)
max_files_per_job: int = Field(default=200)
chunk_token_target: int = Field(default=800)
chunk_token_overlap: int = Field(default=200)
job_retention_days: int = Field(default=30)
def ensure_storage_dirs(self) -> None:
(self.storage_root / "jobs").mkdir(parents=True, exist_ok=True)
(self.storage_root / "uploads").mkdir(parents=True, exist_ok=True)
(self.storage_root / "extracted").mkdir(parents=True, exist_ok=True)
(self.storage_root / "images").mkdir(parents=True, exist_ok=True)
@lru_cache
def get_settings() -> Settings:
settings = Settings()
settings.ensure_storage_dirs()
return settings
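
A brief sketch of consuming these settings; the override shown is an assumption for local use and relies on pydantic-settings' default field-name to environment-variable matching (e.g. NEO4J_URI -> neo4j_uri):

import os

os.environ.setdefault("NEO4J_URI", "bolt://localhost:7687")  # hypothetical local override

settings = get_settings()  # cached via lru_cache; storage directories are created on first call
print(settings.claude_model, settings.neo4j_uri, settings.storage_root)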

View File

@@ -0,0 +1,168 @@
from __future__ import annotations
import logging
from pathlib import Path
from typing import List
logger = logging.getLogger(__name__)
# Try to import unstructured, but fall back to alternatives if not available
try:
from unstructured.partition.auto import partition
HAS_UNSTRUCTURED = True
except ImportError:
HAS_UNSTRUCTURED = False
logger.warning("unstructured not available, will use fallback extractors")
# Fallback extractors
try:
import pdfplumber
HAS_PDFPLUMBER = True
except ImportError:
HAS_PDFPLUMBER = False
try:
from docx import Document as DocxDocument
HAS_DOCX = True
except ImportError:
HAS_DOCX = False
try:
from pptx import Presentation
HAS_PPTX = True
except ImportError:
HAS_PPTX = False
# Image processing libraries
try:
from PIL import Image
import pytesseract
HAS_OCR = True
except ImportError:
HAS_OCR = False
logger.warning("OCR libraries not available, image extraction will be limited")
def extract_text(path: Path) -> str:
"""
Extract text from a file using multiple strategies.
Falls back through: unstructured -> format-specific -> plain text read.
"""
suffix = path.suffix.lower()
# Validate PDF file before processing
if suffix == ".pdf":
# Quick validation: check if file starts with PDF magic bytes
try:
with path.open("rb") as f:
header = f.read(4)
if header != b"%PDF":
raise ValueError(
f"File {path.name} does not appear to be a valid PDF. "
f"PDF files must start with '%PDF' magic bytes. "
f"Got: {header[:20] if len(header) > 0 else 'empty file'}"
)
except Exception as exc:
if isinstance(exc, ValueError):
raise
logger.warning("Could not validate PDF header: %s", exc)
# Image files - return empty text (will be processed directly with Claude Vision)
# We skip OCR and send images directly to Claude Vision API
if suffix in {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}:
logger.info("Image file detected: %s. Will be processed directly with Claude Vision (no OCR)", path.name)
# Return empty string - images will be handled separately in pipeline
return ""
# Plain text files - direct read
if suffix in {".txt", ".md", ".json", ".xml", ".html", ".csv"}:
try:
return path.read_text(encoding="utf-8", errors="ignore")
except Exception as exc:
logger.warning("Failed to read %s as text: %s", path, exc)
raise
# Try unstructured first (if available)
if HAS_UNSTRUCTURED:
try:
elements = partition(filename=str(path))
lines: List[str] = []
for element in elements:
text = getattr(element, "text", None)
if text:
lines.append(text.strip())
if lines:
logger.info("Extracted %d lines using unstructured", len(lines))
return "\n".join(lines)
except Exception as exc:
logger.warning("unstructured extraction failed for %s: %s", path, exc)
# Continue to fallback methods
# Fallback: PDF with pdfplumber
if suffix == ".pdf" and HAS_PDFPLUMBER:
try:
with pdfplumber.open(path) as pdf:
text_parts = []
for page in pdf.pages:
page_text = page.extract_text()
if page_text:
text_parts.append(page_text)
if text_parts:
logger.info("Extracted PDF using pdfplumber")
return "\n".join(text_parts)
except Exception as exc:
logger.warning("pdfplumber extraction failed for %s: %s", path, exc)
# Fallback: DOCX
if suffix == ".docx" and HAS_DOCX:
try:
doc = DocxDocument(path)
paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
if paragraphs:
logger.info("Extracted DOCX using python-docx")
return "\n".join(paragraphs)
except Exception as exc:
logger.warning("python-docx extraction failed for %s: %s", path, exc)
# Fallback: PPTX
if suffix in {".pptx", ".ppt"} and HAS_PPTX:
try:
prs = Presentation(path)
text_parts = []
for slide in prs.slides:
for shape in slide.shapes:
if hasattr(shape, "text") and shape.text:
text_parts.append(shape.text.strip())
if text_parts:
logger.info("Extracted PPTX using python-pptx")
return "\n".join(text_parts)
except Exception as exc:
logger.warning("python-pptx extraction failed for %s: %s", path, exc)
# Last resort: try to read as text anyway, but validate it's readable
try:
content = path.read_text(encoding="utf-8", errors="ignore")
if content.strip():
# Check if content is actually readable text (not binary data)
# Simple heuristic: if >30% of characters are printable, consider it text
printable_chars = sum(1 for c in content if c.isprintable() or c.isspace())
total_chars = len(content)
if total_chars > 0 and printable_chars / total_chars > 0.3:
logger.warning("Read %s as plain text (may contain binary data)", path)
return content
else:
logger.error("Content from %s appears to be binary data, cannot extract text", path)
raise ValueError(f"File {path} appears to be binary or corrupted. Cannot extract readable text.")
except Exception as exc:
if isinstance(exc, ValueError):
raise
logger.warning("Failed to read %s as text: %s", path, exc)
# If all else fails, raise an error
raise ValueError(
f"Could not extract text from {path}. "
f"File type may not be supported, file may be corrupted, or dependencies are missing. "
f"Supported formats: PDF, DOCX, PPTX, XLSX, TXT, MD, JSON, XML, HTML, CSV, PNG, JPG, JPEG (with OCR)"
)
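
For illustration, a short sketch of the fallback chain above in use; the file names are hypothetical:

from pathlib import Path

for name in ["report.pdf", "notes.docx", "diagram.png"]:
    path = Path(name)
    try:
        text = extract_text(path)
        if not text:
            print(f"{name}: no text returned (images are routed to Claude Vision instead)")
        else:
            print(f"{name}: extracted {len(text)} characters")
    except (ValueError, OSError) as exc:
        print(f"{name}: {exc}")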

View File

@@ -0,0 +1,514 @@
from __future__ import annotations
import logging
from pathlib import Path
from typing import List, Tuple
from io import BytesIO
from PIL import Image
logger = logging.getLogger(__name__)
# Header/Footer detection thresholds
HEADER_THRESHOLD = 0.15 # Top 15% of page is considered header
FOOTER_THRESHOLD = 0.15 # Bottom 15% of page is considered footer
MIN_CONTENT_HEIGHT = 0.3 # Minimum 30% of page height for content area
# Try to import PDF libraries
try:
import fitz # PyMuPDF
HAS_PYMUPDF = True
except ImportError:
HAS_PYMUPDF = False
logger.warning("PyMuPDF not available, PDF image extraction will be limited")
try:
from pdf2image import convert_from_path
HAS_PDF2IMAGE = True
except ImportError:
HAS_PDF2IMAGE = False
# DOCX image extraction
try:
from docx import Document as DocxDocument
HAS_DOCX = True
except ImportError:
HAS_DOCX = False
# PPTX image extraction
try:
from pptx import Presentation
HAS_PPTX = True
except ImportError:
HAS_PPTX = False
def is_header_footer_image(bbox: Tuple[float, float, float, float], page_height: float, page_width: float) -> bool:
"""
Check if an image is in header or footer region.
bbox: (x0, y0, x1, y1) - image bounding box coordinates
Returns True if image is in header/footer, False otherwise (i.e., in body/content area).
"""
x0, y0, x1, y1 = bbox
# Calculate relative positions
top_ratio = y0 / page_height if page_height > 0 else 0
bottom_ratio = y1 / page_height if page_height > 0 else 0
height_ratio = (y1 - y0) / page_height if page_height > 0 else 0
# Aggressive header/footer detection: these local thresholds deliberately shadow the
# module-level defaults with a 25% band at the top and bottom, so only images in the
# middle ~50% of the page are treated as body content.
HEADER_THRESHOLD = 0.25 # Top 25% is header
FOOTER_THRESHOLD = 0.25 # Bottom 25% is footer
BODY_END = 1.0 - FOOTER_THRESHOLD # Body ends at 75%
# PRIMARY CHECK: Image must be ENTIRELY in the body area (middle 50%)
# If ANY part of the image is in header or footer, skip it
image_center_y = (y0 + y1) / 2.0 / page_height if page_height > 0 else 0
# Check if image is completely in header region (top 25%)
if bottom_ratio <= HEADER_THRESHOLD:
logger.info("Image in header region (top: %.2f%%, bottom: %.2f%%)", top_ratio * 100, bottom_ratio * 100)
return True
# Check if image is completely in footer region (bottom 25%)
if top_ratio >= BODY_END:
logger.info("Image in footer region (top: %.2f%%, bottom: %.2f%%)", top_ratio * 100, bottom_ratio * 100)
return True
# Check if image overlaps header (starts in header, even if extends into body)
if top_ratio < HEADER_THRESHOLD:
logger.info("Image overlaps header region (top: %.2f%%, bottom: %.2f%%)", top_ratio * 100, bottom_ratio * 100)
return True
# Check if image overlaps footer (ends in footer, even if starts in body)
if bottom_ratio > BODY_END:
logger.info("Image overlaps footer region (top: %.2f%%, bottom: %.2f%%)", top_ratio * 100, bottom_ratio * 100)
return True
# Check if image center is in header or footer (even if image spans both)
if image_center_y < HEADER_THRESHOLD or image_center_y > BODY_END:
logger.info("Image center in header/footer (center: %.2f%%)", image_center_y * 100)
return True
# Check if image is very small and near edges (likely logo/icon)
if height_ratio < 0.10: # Less than 10% of page height
# If it's small and in top 30% or bottom 30%, likely header/footer
if top_ratio < 0.30 or bottom_ratio > 0.70:
logger.info("Small image near header/footer (height: %.2f%%, top: %.2f%%, bottom: %.2f%%)",
height_ratio * 100, top_ratio * 100, bottom_ratio * 100)
return True
# Image is in body/content area - allow it
return False
def crop_header_footer(image_path: Path, output_path: Path, header_ratio: float = HEADER_THRESHOLD, footer_ratio: float = FOOTER_THRESHOLD) -> bool:
"""
Crop header and footer regions from a full-page image.
Returns True if cropping was successful, False otherwise.
"""
try:
img = Image.open(image_path)
width, height = img.size
# Calculate crop boundaries
header_pixels = int(height * header_ratio)
footer_pixels = int(height * footer_ratio)
# Ensure there's enough content height left after cropping
remaining_height = height - header_pixels - footer_pixels
remaining_ratio = remaining_height / height
if remaining_ratio < MIN_CONTENT_HEIGHT:
logger.warning("Cropping would remove too much content from %s (remaining: %.2f%% < %.2f%%), skipping crop",
image_path.name, remaining_ratio * 100, MIN_CONTENT_HEIGHT * 100)
return False
# Crop: remove top (header) and bottom (footer)
cropped = img.crop((0, header_pixels, width, height - footer_pixels))
# Save cropped image
cropped.save(output_path)
logger.info("Cropped header/footer from %s (removed %dpx top, %dpx bottom, remaining: %.2f%%)",
image_path.name, header_pixels, footer_pixels, remaining_ratio * 100)
return True
except Exception as exc:
logger.warning("Failed to crop header/footer from %s: %s", image_path, exc)
return False
def extract_images_from_pdf(pdf_path: Path, output_dir: Path) -> List[Path]:
"""
Extract all images from a PDF file.
Returns list of paths to extracted image files.
"""
extracted_images: List[Path] = []
if not HAS_PYMUPDF:
logger.warning("PyMuPDF not available, cannot extract images from PDF")
return extracted_images
try:
doc = fitz.open(pdf_path)
image_count = 0
skipped_count = 0
for page_num, page in enumerate(doc):
page_rect = page.rect
page_height = page_rect.height
page_width = page_rect.width
# Extract embedded images
image_list = page.get_images()
# Log total images found on this page BEFORE filtering
logger.info("Page %d: Found %d embedded images (page size: %.0fx%.0f)",
page_num, len(image_list), page_width, page_height)
for img_index, img in enumerate(image_list):
try:
xref = img[0]
base_image = doc.extract_image(xref)
image_bytes = base_image["image"]
image_ext = base_image["ext"]
logger.debug("Processing image %d from page %d (xref: %d, ext: %s, size: %d bytes)",
img_index, page_num, xref, image_ext, len(image_bytes))
# Get image position and size for header/footer detection
is_header_footer = False
image_rect = None
img_width, img_height = 0, 0
position_detection_succeeded = False
size_detection_succeeded = False
aspect_ratio = 0.0
img_height_ratio = 0.0
img_width_ratio = 0.0
# PRIMARY METHOD: Check position FIRST (most reliable for header/footer detection)
# Position-based detection is the most accurate way to determine if image is in body area
try:
# PyMuPDF exposes get_image_rects() (plural), which returns every Rect the xref occupies on this page
rects = page.get_image_rects(xref)
image_rect = rects[0] if rects else None
if image_rect and not image_rect.is_empty and image_rect.width > 0 and image_rect.height > 0:
position_detection_succeeded = True
# Check if image is in header/footer based on position
bbox = (image_rect.x0, image_rect.y0, image_rect.x1, image_rect.y1)
if is_header_footer_image(bbox, page_height, page_width):
logger.info("Skipping header/footer image %d from page %d (position: y0=%.1f, y1=%.1f, height=%.1f, width=%.1f)",
img_index, page_num, image_rect.y0, image_rect.y1, image_rect.height, image_rect.width)
skipped_count += 1
is_header_footer = True
except Exception as bbox_exc:
logger.debug("Could not get image rect for image %d on page %d: %s", img_index, page_num, bbox_exc)
position_detection_succeeded = False
# SECONDARY METHOD: Check size (only if position check didn't catch it or failed)
# Use size-based detection as a fallback for banner-like images
if not is_header_footer:
try:
# Check image dimensions - useful for catching banners
from PIL import Image as PILImage
from io import BytesIO
img_obj = PILImage.open(BytesIO(image_bytes))
img_width, img_height = img_obj.size
size_detection_succeeded = True
# Calculate relative size
img_height_ratio = img_height / page_height if page_height > 0 else 0
img_width_ratio = img_width / page_width if page_width > 0 else 0
aspect_ratio = img_width / img_height if img_height > 0 else 0
# Size-based filtering: Skip banner-like images
# These checks catch wide banners and small logos/icons
# 1. Very small absolute height (< 300px) - catches logos and small banners
is_very_small_height = img_height < 300
# 2. Banner aspect ratio (width >> height) - catches wide banners
is_banner_aspect = aspect_ratio > 2.5
# 3. Short relative to page (< 30% of page height) - catches banners
is_short_relative = img_height_ratio < 0.30
# 4. Tiny relative size (< 20% height AND < 50% width) - catches icons/logos
is_tiny_relative = (img_height_ratio < 0.20 and img_width_ratio < 0.50)
# 5. Wide banner pattern: short height (< 400px) AND wide (width > 2x height)
is_wide_banner_pattern = (img_height < 400 and img_width > img_height * 2.0)
# 6. Typical banner size: very wide (> 1000px) AND short (< 300px)
is_typical_banner_size = (img_width > 1000 and img_height < 300)
# 7. Very wide images: width > 800px AND height < 250px
is_very_wide = (img_width > 800 and img_height < 250)
# 8. Short and wide: height < 250px AND width > 600px
is_short_wide = (img_height < 250 and img_width > 600)
# 9. Very common banner: width > 600px AND height < 200px
is_common_banner = (img_width > 600 and img_height < 200)
# Combine checks - skip if it looks like a banner or header/footer element
is_likely_header_footer = (
is_very_small_height or
is_banner_aspect or
is_short_relative or
is_tiny_relative or
is_wide_banner_pattern or
is_typical_banner_size or
is_very_wide or
is_short_wide or
is_common_banner or
# If short AND wide, definitely skip
(is_short_relative and is_banner_aspect) or
# Final catch-all: if width is much larger than height, skip
(img_width > img_height * 2.0 and img_height < 400)
)
if is_likely_header_footer:
logger.info("Skipping header/footer image %d from page %d (size-based: %dx%d, aspect: %.2f, height_ratio: %.2f%%, width_ratio: %.2f%%)",
img_index, page_num, img_width, img_height, aspect_ratio,
img_height_ratio * 100, img_width_ratio * 100)
skipped_count += 1
is_header_footer = True
except Exception as size_exc:
logger.debug("Could not analyze image size for image %d on page %d: %s", img_index, page_num, size_exc)
size_detection_succeeded = False
# FINAL SAFETY: If position detection failed, be more aggressive
# If we can't verify position, skip images that are suspicious
if not position_detection_succeeded and size_detection_succeeded and not is_header_footer:
# Skip images larger than the page (likely background/header/footer images)
if img_height_ratio > 1.0 or img_width_ratio > 1.0:
logger.info("Skipping image %d from page %d (position unknown, but image larger than page: height_ratio=%.1f%%, width_ratio=%.1f%%)",
img_index, page_num, img_height_ratio * 100, img_width_ratio * 100)
skipped_count += 1
is_header_footer = True
# Also skip if image is very large relative to page (likely background)
elif img_height_ratio > 0.80 or img_width_ratio > 0.80:
logger.info("Skipping image %d from page %d (position unknown, but image very large relative to page: height_ratio=%.1f%%, width_ratio=%.1f%%)",
img_index, page_num, img_height_ratio * 100, img_width_ratio * 100)
skipped_count += 1
is_header_footer = True
# FINAL SAFETY: If we can't determine position AND size, skip the image (conservative approach)
# This prevents unknown images from slipping through
if not position_detection_succeeded and not size_detection_succeeded and not is_header_footer:
logger.warning("Cannot determine position or size for image %d on page %d, skipping for safety (cannot verify it's in body area)", img_index, page_num)
skipped_count += 1
is_header_footer = True
# Skip this image if it's in header/footer
if is_header_footer:
continue
# Save image (not in header/footer, passed all checks - must be in body area)
image_filename = f"page_{page_num}_img_{img_index}.{image_ext}"
image_path = output_dir / image_filename
# Get position info for logging
position_info = ""
if image_rect:
# Calculate relative position to show it's in body area
y0_ratio = image_rect.y0 / page_height if page_height > 0 else 0
y1_ratio = image_rect.y1 / page_height if page_height > 0 else 0
position_info = f", position: y0={image_rect.y0:.1f} ({y0_ratio*100:.1f}%), y1={image_rect.y1:.1f} ({y1_ratio*100:.1f}%) [BODY AREA]"
elif size_detection_succeeded:
position_info = f", size: {img_width}x{img_height}, aspect_ratio={aspect_ratio:.2f}, height_ratio={img_height_ratio*100:.1f}%"
with open(image_path, "wb") as img_file:
img_file.write(image_bytes)
extracted_images.append(image_path)
image_count += 1
logger.info("Extracted image %s from PDF page %d (BODY CONTENT image, size: %dx%d%s)",
image_filename, page_num, img_width if img_width > 0 else 0, img_height if img_height > 0 else 0, position_info)
except Exception as exc:
logger.warning("Failed to extract image %d from page %d: %s", img_index, page_num, exc)
# DO NOT extract full-page images - only extract embedded images
# Full-page images often contain headers/footers and are not needed
# We only want actual embedded images from the document content
logger.debug("Skipping full-page image extraction for page %d (only extracting embedded images)", page_num)
doc.close()
if skipped_count > 0:
logger.info("Extracted %d images from PDF %s (skipped %d header/footer images)",
image_count, pdf_path.name, skipped_count)
else:
logger.info("Extracted %d images from PDF %s", image_count, pdf_path.name)
return extracted_images
except Exception as exc:
logger.exception("Failed to extract images from PDF %s: %s", pdf_path, exc)
return extracted_images
def extract_images_from_docx(docx_path: Path, output_dir: Path) -> List[Path]:
"""
Extract all embedded images from a DOCX file.
Returns list of paths to extracted image files.
"""
extracted_images: List[Path] = []
if not HAS_DOCX:
logger.warning("python-docx not available, cannot extract images from DOCX")
return extracted_images
try:
doc = DocxDocument(docx_path)
image_count = 0
# Access document relationships to find images
for rel_id, rel in doc.part.rels.items():
# Check if relationship is an image
if "image" in rel.target_ref or rel.target_part.content_type.startswith("image/"):
try:
image_part = rel.target_part
image_bytes = image_part.blob
# Determine image extension from content type
content_type = image_part.content_type
ext_map = {
"image/png": "png",
"image/jpeg": "jpg",
"image/jpg": "jpg",
"image/gif": "gif",
"image/bmp": "bmp",
"image/webp": "webp",
}
ext = ext_map.get(content_type, "png")
# Check image size - small images are likely logos/icons (header/footer)
try:
from PIL import Image as PILImage
from io import BytesIO
img_obj = PILImage.open(BytesIO(image_bytes))
img_width, img_height = img_obj.size
# Skip very small images (likely logos/icons in headers/footers)
if img_width < 200 and img_height < 200:
logger.debug("Skipping small image from DOCX (likely header/footer logo, size: %dx%d)",
img_width, img_height)
continue
except Exception:
pass # Continue with extraction if size check fails
# Save image
image_filename = f"docx_img_{image_count}.{ext}"
image_path = output_dir / image_filename
with open(image_path, "wb") as img_file:
img_file.write(image_bytes)
extracted_images.append(image_path)
image_count += 1
logger.debug("Extracted image %s from DOCX", image_filename)
except Exception as exc:
logger.warning("Failed to extract image from DOCX: %s", exc)
logger.info("Extracted %d images from DOCX %s", image_count, docx_path.name)
return extracted_images
except Exception as exc:
logger.exception("Failed to extract images from DOCX %s: %s", docx_path, exc)
return extracted_images
def extract_images_from_pptx(pptx_path: Path, output_dir: Path) -> List[Path]:
"""
Extract all images from a PPTX file.
Returns list of paths to extracted image files.
"""
extracted_images: List[Path] = []
if not HAS_PPTX:
logger.warning("python-pptx not available, cannot extract images from PPTX")
return extracted_images
try:
prs = Presentation(pptx_path)
image_count = 0
for slide_num, slide in enumerate(prs.slides):
for shape_num, shape in enumerate(slide.shapes):
# Check if shape is a picture
if hasattr(shape, "image"):
try:
image = shape.image
image_bytes = image.blob
# Determine extension from content type
ext = image.ext # Usually 'png', 'jpg', etc.
if not ext:
ext = "png"
# Check image size and position
# Small images at edges are likely logos/icons
try:
from PIL import Image as PILImage
from io import BytesIO
img_obj = PILImage.open(BytesIO(image_bytes))
img_width, img_height = img_obj.size
# Get shape position (if available)
shape_left = shape.left if hasattr(shape, 'left') else 0
shape_top = shape.top if hasattr(shape, 'top') else 0
slide_width = slide.slide_width if hasattr(slide, 'slide_width') else 10000
slide_height = slide.slide_height if hasattr(slide, 'slide_height') else 10000
# Check if small image is in corner (likely logo)
is_small = img_width < 200 and img_height < 200
is_in_corner = (
(shape_left < slide_width * 0.1 and shape_top < slide_height * 0.1) or # Top-left
(shape_left > slide_width * 0.9 and shape_top < slide_height * 0.1) or # Top-right
(shape_left < slide_width * 0.1 and shape_top > slide_height * 0.9) or # Bottom-left
(shape_left > slide_width * 0.9 and shape_top > slide_height * 0.9) # Bottom-right
)
if is_small and is_in_corner:
logger.debug("Skipping small corner image from slide %d (likely header/footer logo)", slide_num)
continue
except Exception:
pass # Continue with extraction if check fails
# Save image
image_filename = f"slide_{slide_num}_img_{shape_num}.{ext}"
image_path = output_dir / image_filename
with open(image_path, "wb") as img_file:
img_file.write(image_bytes)
extracted_images.append(image_path)
image_count += 1
logger.debug("Extracted image %s from slide %d", image_filename, slide_num)
except Exception as exc:
logger.warning("Failed to extract image from shape: %s", exc)
logger.info("Extracted %d images from PPTX %s", image_count, pptx_path.name)
return extracted_images
except Exception as exc:
logger.exception("Failed to extract images from PPTX %s: %s", pptx_path, exc)
return extracted_images
def extract_images_from_file(file_path: Path, output_dir: Path) -> List[Path]:
"""
Extract images from a file based on its type.
Returns list of paths to extracted image files.
"""
suffix = file_path.suffix.lower()
output_dir.mkdir(parents=True, exist_ok=True)
if suffix == ".pdf":
return extract_images_from_pdf(file_path, output_dir)
elif suffix == ".docx":
return extract_images_from_docx(file_path, output_dir)
elif suffix in {".pptx", ".ppt"}:
return extract_images_from_pptx(file_path, output_dir)
else:
logger.debug("No image extraction needed for file type: %s", suffix)
return []
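
A minimal sketch of the dispatcher above; the input document and output directory are placeholders, and only embedded images that survive the header/footer filters are returned:

from pathlib import Path

images = extract_images_from_file(Path("report.pdf"), Path("./extracted_images"))
for img in images:
    print("kept body image:", img.name)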

View File

@@ -0,0 +1,93 @@
from __future__ import annotations
import json
import threading
import uuid
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, Optional
from .models import JobRecord, JobStage
class JobStore:
"""Simple persistent job store backed by a JSON file."""
def __init__(self, storage_root: Path):
self._storage_root = Path(storage_root)
self._jobs_dir = self._storage_root / "jobs"
self._jobs_dir.mkdir(parents=True, exist_ok=True)
self._index_path = self._jobs_dir / "index.json"
self._lock = threading.Lock()
self._jobs: Dict[str, JobRecord] = {}
self._load()
def _load(self) -> None:
if self._index_path.exists():
try:
data = json.loads(self._index_path.read_text())
self._jobs = {job_id: JobRecord.model_validate(job_data) for job_id, job_data in data.items()}
except Exception as exc: # noqa: BLE001
print(f"[JobStore] Failed to load job index: {exc}")
self._jobs = {}
def _persist(self) -> None:
serializable = {job_id: job.model_dump(mode="json") for job_id, job in self._jobs.items()}
tmp_path = self._index_path.with_suffix(".json.tmp")
tmp_path.write_text(json.dumps(serializable, indent=2, default=str))
tmp_path.replace(self._index_path)
def create(self, name: Optional[str], total_files: int) -> JobRecord:
with self._lock:
job_id = uuid.uuid4().hex
job = JobRecord(id=job_id, name=name, total_files=total_files)
self._jobs[job_id] = job
self._persist()
return job
def update(self, job_id: str, **kwargs) -> JobRecord:
with self._lock:
job = self._jobs[job_id]
for key, value in kwargs.items():
setattr(job, key, value)
job.updated_at = datetime.utcnow()
self._jobs[job_id] = job
self._persist()
return job
def get(self, job_id: str) -> JobRecord:
with self._lock:
return self._jobs[job_id]
def exists(self, job_id: str) -> bool:
with self._lock:
return job_id in self._jobs
def list_jobs(self) -> Dict[str, JobRecord]:
with self._lock:
return dict(self._jobs)
def mark_error(self, job_id: str, message: str) -> JobRecord:
return self.update(
job_id,
stage=JobStage.FAILED,
status_message=message,
error=message,
)
def cleanup(self, older_than_days: int) -> int:
"""Remove jobs older than the retention threshold."""
cutoff = datetime.utcnow() - timedelta(days=older_than_days)
removed = 0
with self._lock:
for job_id in list(self._jobs.keys()):
if self._jobs[job_id].created_at < cutoff:
removed += 1
del self._jobs[job_id]
if removed:
self._persist()
return removed
__all__ = ["JobStore"]
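
A small sketch of the job store lifecycle, assuming a local ./storage directory:

from pathlib import Path

store = JobStore(Path("./storage"))
job = store.create(name="demo", total_files=2)

store.update(job.id, stage=JobStage.ANALYZING, status_message="Analyzing", processed_files=1)
print(store.get(job.id).stage)  # JobStage.ANALYZING

removed = store.cleanup(older_than_days=30)
print(f"removed {removed} stale job(s)")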

View File

@@ -0,0 +1,189 @@
from __future__ import annotations
import logging
from dataclasses import dataclass
from typing import List, Optional
from fastapi import BackgroundTasks, Depends, FastAPI, File, Form, HTTPException, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from .claude_client import ClaudeCausalExtractor
from .config import Settings, get_settings
from .jobs import JobStore
from .models import CreateJobResponse, JobGraphSummary, JobStage, JobStatusResponse
from .processors.graph_writer import GraphWriter
from .storage import StorageManager
from .workflows.pipeline import JobPipeline
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
app = FastAPI(
title="Multi Document Upload Service",
version="0.1.0",
description="Processes multi-format documents to build causal knowledge graphs using Claude.",
)
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
@dataclass
class ServiceContainer:
settings: Settings
storage: StorageManager
job_store: JobStore
graph_writer: GraphWriter
claude_extractor: ClaudeCausalExtractor
pipeline: JobPipeline
_container: ServiceContainer | None = None
def get_container() -> ServiceContainer:
global _container
if _container is None:
settings = get_settings()
if not settings.anthropic_api_key:
raise HTTPException(status_code=500, detail="ANTHROPIC_API_KEY is not configured")
storage = StorageManager(settings.storage_root)
job_store = JobStore(settings.storage_root)
graph_writer = GraphWriter(settings.neo4j_uri, settings.neo4j_user, settings.neo4j_password)
claude_extractor = ClaudeCausalExtractor(
api_key=settings.anthropic_api_key,
model=settings.claude_model,
max_output_tokens=min(settings.claude_max_output_tokens, 4000),
)
pipeline = JobPipeline(
job_store=job_store,
storage=storage,
graph_writer=graph_writer,
claude_extractor=claude_extractor,
)
_container = ServiceContainer(
settings=settings,
storage=storage,
job_store=job_store,
graph_writer=graph_writer,
claude_extractor=claude_extractor,
pipeline=pipeline,
)
return _container
def get_dependencies() -> ServiceContainer:
return get_container()
@app.post("/jobs", response_model=CreateJobResponse, status_code=202)
async def create_job(
background_tasks: BackgroundTasks,
files: List[UploadFile] = File(...),
job_name: Optional[str] = Form(default=None),
container: ServiceContainer = Depends(get_dependencies),
) -> CreateJobResponse:
settings = container.settings
storage = container.storage
job_store = container.job_store
pipeline = container.pipeline
if not files:
raise HTTPException(status_code=400, detail="At least one file must be uploaded.")
if len(files) > settings.max_files_per_job:
raise HTTPException(status_code=400, detail="Too many files uploaded for a single job.")
total_size_bytes = 0
for file in files:
file.file.seek(0, 2)
total_size_bytes += file.file.tell()
file.file.seek(0)
if total_size_bytes > settings.max_upload_size_mb * 1024 * 1024:
raise HTTPException(status_code=400, detail="Uploaded files exceed maximum allowed size.")
job = job_store.create(job_name, total_files=len(files))
job.stage = JobStage.SAVING_FILES
saved_paths: List[str] = []
for upload in files:
file_record = storage.save_upload(job.id, upload)
saved_paths.append(file_record.stored_path)
job.files.append(file_record)
job_store.update(
job.id,
stage=JobStage.EXTRACTING,
status_message="Files saved; extraction queued.",
files=job.files,
)
background_tasks.add_task(pipeline.process_job, job.id, saved_paths)
return CreateJobResponse(
job_id=job.id,
stage=job.stage,
total_files=job.total_files,
created_at=job.created_at,
)
@app.get("/jobs/{job_id}", response_model=JobStatusResponse)
async def get_job_status(job_id: str, container: ServiceContainer = Depends(get_dependencies)) -> JobStatusResponse:
job_store = container.job_store
if not job_store.exists(job_id):
raise HTTPException(status_code=404, detail="Job not found")
job = job_store.get(job_id)
return JobStatusResponse(
job_id=job.id,
stage=job.stage,
status_message=job.status_message,
total_files=job.total_files,
processed_files=job.processed_files,
error=job.error,
created_at=job.created_at,
updated_at=job.updated_at,
files=job.files,
)
@app.get("/jobs/{job_id}/graph", response_model=JobGraphSummary)
async def get_job_graph(job_id: str, container: ServiceContainer = Depends(get_dependencies)) -> JobGraphSummary:
job_store = container.job_store
if not job_store.exists(job_id):
raise HTTPException(status_code=404, detail="Job not found")
job = job_store.get(job_id)
if job.stage != JobStage.COMPLETED:
raise HTTPException(status_code=409, detail="Job not completed yet")
return JobGraphSummary(
job_id=job.id,
relations=job.relations,
node_count=len({rel.cause for rel in job.relations} | {rel.effect for rel in job.relations}),
edge_count=len(job.relations),
generated_at=job.updated_at,
)
@app.get("/health")
async def healthcheck(container: ServiceContainer = Depends(get_dependencies)):
settings = container.settings
return {
"status": "ok",
"claude_model": settings.claude_model,
"max_input_tokens_per_min": settings.claude_max_input_tokens,
"max_output_tokens_per_min": settings.claude_max_output_tokens,
}
@app.on_event("shutdown")
async def shutdown_event() -> None:
container = _container
if container:
container.graph_writer.close()
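
For reference, a hedged client-side sketch of the HTTP flow these endpoints define, assuming the service is reachable on localhost:8024 (per the docker-compose entry) and using the requests library; the uploaded file name is a placeholder:

import time
import requests

BASE = "http://localhost:8024"

# 1. Upload one or more documents to create a job.
with open("report.pdf", "rb") as fh:
    resp = requests.post(
        f"{BASE}/jobs",
        files=[("files", ("report.pdf", fh, "application/pdf"))],
        data={"job_name": "demo"},
    )
resp.raise_for_status()
job_id = resp.json()["job_id"]

# 2. Poll the job until the background pipeline finishes.
while True:
    status = requests.get(f"{BASE}/jobs/{job_id}").json()
    if status["stage"] in ("completed", "failed"):
        break
    time.sleep(5)

# 3. Fetch the causal graph summary once the job has completed.
if status["stage"] == "completed":
    graph = requests.get(f"{BASE}/jobs/{job_id}/graph").json()
    print(graph["node_count"], "nodes,", graph["edge_count"], "edges")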

View File

@@ -0,0 +1,84 @@
from __future__ import annotations
from datetime import datetime
from enum import Enum
from typing import Any, Dict, List, Optional
from pydantic import BaseModel, Field
class JobStage(str, Enum):
RECEIVED = "received"
SAVING_FILES = "saving_files"
EXTRACTING = "extracting"
ANALYZING = "analyzing"
BUILDING_GRAPH = "building_graph"
COMPLETED = "completed"
FAILED = "failed"
class FileRecord(BaseModel):
id: str
filename: str
content_type: str | None = None
size_bytes: int
stored_path: str
extracted_path: str | None = None
error: str | None = None
class CausalRelation(BaseModel):
cause: str
effect: str
confidence: float = Field(default=0.0, ge=0.0, le=1.0)
explanation: Optional[str] = None
source_file_id: Optional[str] = None
source_snippet: Optional[str] = None
metadata: Dict[str, Any] = Field(default_factory=dict)
class JobRecord(BaseModel):
id: str
name: str | None = None
stage: JobStage = JobStage.RECEIVED
status_message: str | None = None
files: List[FileRecord] = Field(default_factory=list)
total_files: int = 0
processed_files: int = 0
relations: List[CausalRelation] = Field(default_factory=list)
created_at: datetime = Field(default_factory=datetime.utcnow)
updated_at: datetime = Field(default_factory=datetime.utcnow)
error: str | None = None
metadata: Dict[str, Any] = Field(default_factory=dict)
@property
def is_finished(self) -> bool:
return self.stage in {JobStage.COMPLETED, JobStage.FAILED}
class CreateJobResponse(BaseModel):
job_id: str
stage: JobStage
total_files: int
created_at: datetime
class JobStatusResponse(BaseModel):
job_id: str
stage: JobStage
status_message: str | None = None
total_files: int
processed_files: int
error: str | None = None
created_at: datetime
updated_at: datetime
files: List[FileRecord]
class JobGraphSummary(BaseModel):
job_id: str
relations: List[CausalRelation]
node_count: int
edge_count: int
generated_at: datetime

View File

@@ -0,0 +1,24 @@
from __future__ import annotations
from typing import Iterable, List
import tiktoken
class TextChunker:
def __init__(self, model_name: str, token_target: int = 800, overlap: int = 200):
# Claude models are not in tiktoken's registry, so fall back to cl100k_base for them;
# otherwise approximate token counts with the gpt-4o encoding.
self.encoder = tiktoken.get_encoding("cl100k_base") if "claude" in model_name else tiktoken.encoding_for_model("gpt-4o")
self.token_target = token_target
self.overlap = overlap
def chunk(self, text: str) -> Iterable[str]:
tokens = self.encoder.encode(text)
step = max(self.token_target - self.overlap, 1)
chunks: List[str] = []
for start in range(0, len(tokens), step):
end = min(start + self.token_target, len(tokens))
chunk_tokens = tokens[start:end]
chunk_text = self.encoder.decode(chunk_tokens)
chunks.append(chunk_text)
return chunks
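
A quick sketch of the sliding-window behaviour above: with a token target of 10 and an overlap of 4, consecutive chunks repeat the trailing tokens of the previous one (toy numbers chosen purely for illustration):

chunker = TextChunker(model_name="claude-3-5-haiku-latest", token_target=10, overlap=4)

sample = "Heavy rainfall raised the river level, which in turn flooded the downstream villages."
for i, chunk in enumerate(chunker.chunk(sample)):
    print(i, repr(chunk))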

View File

@@ -0,0 +1,81 @@
from __future__ import annotations
import logging
from typing import Iterable
from neo4j import GraphDatabase, Transaction
from ..models import CausalRelation
logger = logging.getLogger(__name__)
MERGE_QUERY = """
MERGE (cause:Concept {name: $cause})
ON CREATE SET cause.created_at = timestamp(), cause.lastSeen = timestamp()
ON MATCH SET cause.lastSeen = timestamp()
MERGE (effect:Concept {name: $effect})
ON CREATE SET effect.created_at = timestamp(), effect.lastSeen = timestamp()
ON MATCH SET effect.lastSeen = timestamp()
MERGE (cause)-[r:CAUSES]->(effect)
ON CREATE SET r.confidence = $confidence,
r.explanation = $explanation,
r.source_file_id = $source_file_id,
r.source_snippet = $source_snippet,
r.job_id = $job_id,
r.model = $model,
r.created_at = timestamp(),
r.updated_at = timestamp()
ON MATCH SET r.confidence = $confidence,
r.explanation = $explanation,
r.source_file_id = $source_file_id,
r.source_snippet = $source_snippet,
r.job_id = $job_id,
r.model = $model,
r.updated_at = timestamp()
"""
class GraphWriter:
def __init__(self, uri: str, user: str, password: str):
self._driver = GraphDatabase.driver(uri, auth=(user, password))
def close(self) -> None:
self._driver.close()
def write_relations(self, job_id: str, relations: Iterable[CausalRelation]) -> None:
relations_list = list(relations)
if not relations_list:
logger.warning("No relations to write for job %s", job_id)
return
logger.info("Writing %d relations to Neo4j for job %s", len(relations_list), job_id)
with self._driver.session() as session:
def _write(tx: Transaction) -> None:
count = 0
for relation in relations_list:
if not relation.cause or not relation.effect:
logger.warning("Skipping relation with empty cause or effect: %s -> %s", relation.cause, relation.effect)
continue
try:
tx.run(
MERGE_QUERY,
cause=relation.cause.strip(),
effect=relation.effect.strip(),
confidence=float(relation.confidence) if relation.confidence else 0.0,
explanation=relation.explanation or "",
source_file_id=relation.source_file_id or "",
source_snippet=relation.source_snippet or "",
job_id=job_id,
model=relation.metadata.get("model") or "",
)
count += 1
logger.debug("Wrote relation: %s -> %s (confidence: %s)", relation.cause, relation.effect, relation.confidence)
except Exception as exc:
logger.exception("Failed to write relation %s -> %s: %s", relation.cause, relation.effect, exc)
logger.info("Successfully wrote %d/%d relations to Neo4j", count, len(relations_list))
session.execute_write(_write)
logger.info("Persisted causal relations for job %s", job_id)

View File

@@ -0,0 +1,59 @@
from __future__ import annotations
import shutil
from pathlib import Path
from typing import Iterable, Tuple
from fastapi import UploadFile
from .models import FileRecord
class StorageManager:
def __init__(self, root: Path):
self.root = Path(root)
self.upload_dir = self.root / "uploads"
self.extract_dir = self.root / "extracted"
self.images_dir = self.root / "images"
self.upload_dir.mkdir(parents=True, exist_ok=True)
self.extract_dir.mkdir(parents=True, exist_ok=True)
self.images_dir.mkdir(parents=True, exist_ok=True)
def save_upload(self, job_id: str, upload: UploadFile) -> FileRecord:
job_dir = self.upload_dir / job_id
job_dir.mkdir(parents=True, exist_ok=True)
destination = job_dir / upload.filename
upload.file.seek(0)
with destination.open("wb") as out_file:
shutil.copyfileobj(upload.file, out_file)
size_bytes = destination.stat().st_size
return FileRecord(
id=destination.stem,
filename=upload.filename,
content_type=upload.content_type,
size_bytes=size_bytes,
stored_path=str(destination),
)
def stage_extracted_content(self, job_id: str, file_name: str, content: str) -> Path:
job_dir = self.extract_dir / job_id
job_dir.mkdir(parents=True, exist_ok=True)
safe_name = f"{Path(file_name).stem}.txt"
destination = job_dir / safe_name
destination.write_text(content, encoding="utf-8")
return destination
def list_saved_files(self, job_id: str) -> Iterable[Tuple[str, Path]]:
job_dir = self.upload_dir / job_id
if not job_dir.exists():
return []
return [(file.name, file) for file in job_dir.iterdir() if file.is_file()]
def get_images_dir(self, job_id: str) -> Path:
"""Get or create directory for extracted images."""
images_dir = self.root / "images" / job_id
images_dir.mkdir(parents=True, exist_ok=True)
return images_dir
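
A brief sketch of the directory layout this class manages; the job id and content are placeholders:

from pathlib import Path

storage = StorageManager(Path("./storage"))
extracted = storage.stage_extracted_content("demo-job", "report.pdf", "extracted text goes here")
print(extracted)                           # ./storage/extracted/demo-job/report.txt
print(storage.get_images_dir("demo-job"))  # ./storage/images/demo-job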

View File

@@ -0,0 +1,164 @@
from __future__ import annotations
import logging
from pathlib import Path
from typing import Iterable, List
from ..claude_client import ClaudeCausalExtractor
from ..config import get_settings
from ..extractors.auto import extract_text
from ..extractors.image_extractor import extract_images_from_file
from ..jobs import JobStore
from ..models import CausalRelation, JobStage
from ..processors.chunker import TextChunker
from ..processors.graph_writer import GraphWriter
from ..storage import StorageManager
logger = logging.getLogger(__name__)
class JobPipeline:
def __init__(
self,
job_store: JobStore,
storage: StorageManager,
graph_writer: GraphWriter,
claude_extractor: ClaudeCausalExtractor,
):
self.job_store = job_store
self.storage = storage
self.graph_writer = graph_writer
self.claude_extractor = claude_extractor
settings = get_settings()
self.chunker = TextChunker(
model_name=settings.claude_model,
token_target=settings.chunk_token_target,
overlap=settings.chunk_token_overlap,
)
def process_job(self, job_id: str, saved_files: Iterable[str]) -> None:
job = self.job_store.get(job_id)
logger.info("Processing job %s with %d files", job_id, job.total_files)
relations: List[CausalRelation] = []
try:
self.job_store.update(job_id, stage=JobStage.EXTRACTING, status_message="Extracting content")
for count, file_path in enumerate(saved_files, start=1):
file_path_obj = Path(file_path)
file_record = next((f for f in job.files if f.stored_path == file_path), None)
logger.info("Processing %s", file_path_obj.name)
source_file_id = file_record.id if file_record else file_path_obj.name
suffix = file_path_obj.suffix.lower()
# Check if this is a direct image upload
is_direct_image = suffix in {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}
try:
# Extract text from document (if not a direct image)
text = ""
if not is_direct_image:
try:
text = extract_text(file_path_obj)
# Process text if available
if text and text.strip():
# Validate text is readable
printable_chars = sum(1 for c in text if c.isprintable() or c.isspace())
total_chars = len(text)
if total_chars > 100 and printable_chars / total_chars < 0.3:
logger.warning("Text from %s appears to be binary, skipping text processing", file_path_obj.name)
text = ""
else:
extracted_path = self.storage.stage_extracted_content(job_id, file_path_obj.name, text)
if file_record:
file_record.extracted_path = str(extracted_path)
logger.info("Successfully extracted %d characters from %s", len(text), file_path_obj.name)
except Exception as text_exc:
logger.warning("Text extraction failed for %s: %s. Will continue with image extraction if available.", file_path_obj.name, text_exc)
text = ""
# Extract images from documents (PDF, DOCX, PPTX)
extracted_images: List[Path] = []
if suffix in {".pdf", ".docx", ".pptx", ".ppt"}:
try:
images_dir = self.storage.get_images_dir(job_id)
extracted_images = extract_images_from_file(file_path_obj, images_dir)
logger.info("Extracted %d images from %s", len(extracted_images), file_path_obj.name)
except Exception as img_exc:
logger.warning("Failed to extract images from %s: %s", file_path_obj.name, img_exc)
# For direct image uploads, add the file itself to images list
if is_direct_image:
extracted_images = [file_path_obj]
logger.info("Direct image upload detected: %s", file_path_obj.name)
except Exception as exc: # noqa: BLE001
logger.exception("Extraction failed for %s", file_path_obj)
if file_record:
file_record.error = str(exc)
continue
self.job_store.update(
job_id,
files=job.files,
processed_files=count,
status_message=f"Analyzing causal relations ({count}/{job.total_files})",
stage=JobStage.ANALYZING,
)
# Process text content
if text and text.strip():
chunks = self.chunker.chunk(text)
text_relations = self.claude_extractor.analyze(chunks, source_file_id=source_file_id)
relations.extend(text_relations)
logger.info("Extracted %d relations from text in %s", len(text_relations), file_path_obj.name)
# Process images (extracted from documents or direct uploads)
if extracted_images:
for image_path in extracted_images:
try:
image_relations = self.claude_extractor.analyze_image(image_path, source_file_id=source_file_id)
relations.extend(image_relations)
logger.info("Extracted %d relations from image %s", len(image_relations), image_path.name)
except Exception as img_exc:
logger.warning("Failed to analyze image %s: %s", image_path, img_exc)
# Continue with other images
elif not text or not text.strip():
# No text and no images - file might be empty or unsupported
logger.warning("File %s has no extractable text or images", file_path_obj.name)
if file_record:
file_record.error = "No extractable content found (no text or images)"
# Write relations to Neo4j if any were found
if relations:
self.job_store.update(job_id, status_message="Writing to knowledge graph", stage=JobStage.BUILDING_GRAPH)
try:
self.graph_writer.write_relations(job_id, relations)
logger.info("Wrote %d relations to Neo4j for job %s", len(relations), job_id)
status_message = f"Completed with {len(relations)} causal relationship(s) written to Neo4j"
except Exception as graph_exc:
logger.exception("Failed to write relations to Neo4j for job %s: %s", job_id, graph_exc)
status_message = f"Completed with {len(relations)} relations extracted, but failed to write to Neo4j: {graph_exc}"
else:
logger.warning("Job %s completed with 0 relations - no causal relationships found", job_id)
# Check if any files failed to extract
failed_files = [f for f in job.files if f.error]
if failed_files:
status_message = f"Completed but {len(failed_files)} file(s) failed to extract. No relations found."
else:
status_message = "Completed but no causal relationships were found in the documents."
# Final update
self.job_store.update(
job_id,
stage=JobStage.COMPLETED,
status_message=status_message,
relations=relations,
processed_files=job.total_files,
)
logger.info("Job %s completed with %d relations", job_id, len(relations))
except Exception as exc: # noqa: BLE001
logger.exception("Job %s failed: %s", job_id, exc)
self.job_store.mark_error(job_id, f"Pipeline failed: {exc}")
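
Finally, a hedged sketch of wiring the pipeline together outside FastAPI (for example in a batch script); every value here is an assumption, and the upload is expected to have been copied into place beforehand:

import os
from pathlib import Path

settings = get_settings()
storage = StorageManager(settings.storage_root)
job_store = JobStore(settings.storage_root)
graph_writer = GraphWriter(settings.neo4j_uri, settings.neo4j_user, settings.neo4j_password)
extractor = ClaudeCausalExtractor(api_key=os.environ["ANTHROPIC_API_KEY"], model=settings.claude_model)

pipeline = JobPipeline(
    job_store=job_store,
    storage=storage,
    graph_writer=graph_writer,
    claude_extractor=extractor,
)

job = job_store.create(name="offline-run", total_files=1)
# Assumes report.pdf was already copied into the job's upload directory.
upload_path = settings.storage_root / "uploads" / job.id / "report.pdf"
pipeline.process_job(job.id, [str(upload_path)])
graph_writer.close()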