newly added multi doc upload service
This commit is contained in:
parent ad2c27d793
commit 603e9b4b20
@@ -131,11 +131,11 @@ services:
     networks:
       - pipeline_network
     healthcheck:
-      test: ["CMD", "cypher-shell", "--username", "neo4j", "--password", "password", "MATCH () RETURN count(*) as count"]
+      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:7474 || exit 1"]
       interval: 30s
       timeout: 10s
       retries: 5
-      start_period: 60s
+      start_period: 90s
 
   # chromadb:
   #   image: chromadb/chroma:latest
@@ -269,6 +269,7 @@ services:
       - SELF_IMPROVING_GENERATOR_URL=http://self-improving-generator:8007
       - AI_MOCKUP_URL=http://ai-mockup-service:8021
       - AI_ANALYSIS_URL=http://ai-analysis-service:8022
+      - MULTI_DOCUMENT_UPLOAD_URL=http://multi-document-upload-service:8024
       - UNISON_URL=http://unison:8010
       - TEMPLATE_MANAGER_AI_URL=http://template-manager:8013
     volumes:
@@ -775,6 +776,67 @@ services:
       retries: 3
       start_period: 60s
     restart: unless-stopped
 
+  # Multi-Document Upload Service
+  # =====================================
+
+  multi-document-upload-service:
+    build:
+      context: ./services/multi-document-upload-service
+      dockerfile: Dockerfile
+    container_name: pipeline_multi_document_upload
+    ports:
+      - "8024:8024"
+    environment:
+      - PORT=8024
+      - HOST=0.0.0.0
+      - ANTHROPIC_API_KEY=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
+      - CLAUDE_MODEL=claude-3-5-haiku-latest
+
+      # Neo4j Configuration
+      - NEO4J_URI=bolt://neo4j:7687
+      - NEO4J_USER=neo4j
+      - NEO4J_PASSWORD=password
+      - NEO4J_DATABASE=neo4j
+
+      # Storage Configuration
+      - STORAGE_DIR=/app/storage
+
+      # Database configurations (optional, for job tracking)
+      - POSTGRES_HOST=pipeline_postgres
+      - POSTGRES_PORT=5432
+      - POSTGRES_DB=dev_pipeline
+      - POSTGRES_USER=pipeline_admin
+      - POSTGRES_PASSWORD=secure_pipeline_2024
+
+      - REDIS_HOST=pipeline_redis
+      - REDIS_PORT=6379
+      - REDIS_PASSWORD=redis_secure_2024
+    volumes:
+      - multi_document_storage:/app/storage
+    depends_on:
+      neo4j:
+        condition: service_healthy
+      postgres:
+        condition: service_healthy
+      redis:
+        condition: service_healthy
+    networks:
+      - pipeline_network
+    deploy:
+      resources:
+        limits:
+          memory: 4G
+        reservations:
+          memory: 2G
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8024/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 60s
+    restart: unless-stopped
+
   # =====================================
   # Workflow Orchestration
   # =====================================
@@ -894,6 +956,8 @@ volumes:
     driver: local
   ai_analysis_temp:
     driver: local
+  multi_document_storage:
+    driver: local
 
 # =====================================
 # Networks
@@ -13,8 +13,10 @@ import time
 import hashlib
 import traceback
 import uuid
+import re
 from pathlib import Path
-from typing import Dict, Any, Optional, List, Tuple
+from typing import Dict, Any, Optional, List, Tuple, Set
+from dataclasses import dataclass, field
 from datetime import datetime
 from contextlib import asynccontextmanager
 
@@ -53,7 +55,8 @@ from ai_analyze import (
     CodeQualityAnalysis,
     Issue,
     ModuleAnalysis,
-    ModuleSummary
+    ModuleSummary,
+    FileAnalysis
 )
 
 # Import enhanced analyzer (backward compatible)
@@ -72,6 +75,222 @@ analyzer = None
 neo4j_client: Optional[Neo4jGraphClient] = None
 USE_KNOWLEDGE_GRAPH = False
 
+CANONICAL_CHUNK_SUFFIX_RE = re.compile(r'(_part|_chunk)\d+$', re.IGNORECASE)
+
+
+def get_canonical_module_name(raw_name: str) -> str:
+    """Normalize chunk/module names so split chunks collapse to one canonical module."""
+    if not raw_name:
+        return "unknown"
+    cleaned = raw_name.strip()
+    canonical = CANONICAL_CHUNK_SUFFIX_RE.sub("", cleaned)
+    canonical = canonical.strip("_- ")
+    return canonical or cleaned
+
+
+def _ensure_list_of_strings(value: Any) -> List[str]:
+    if value is None:
+        return []
+    if isinstance(value, str):
+        value = value.strip()
+        return [value] if value else []
+    if isinstance(value, (list, tuple, set)):
+        return [str(item).strip() for item in value if item is not None and str(item).strip()]
+    return []
+
+
+def _dedupe_preserve_order(items: List[str]) -> List[str]:
+    seen = set()
+    result = []
+    for item in items:
+        if item not in seen:
+            seen.add(item)
+            result.append(item)
+    return result
+
+
+def sanitize_file_analysis_for_aggregation(fa: Any) -> FileAnalysis:
+    """Create a lightweight, serialization-safe FileAnalysis for aggregation."""
+    if isinstance(fa, FileAnalysis):
+        path = str(fa.path) if fa.path else ""
+        language = fa.language or "unknown"
+        lines = int(fa.lines_of_code or 0)
+        complexity = float(fa.complexity_score or 0.0)
+        severity_value = fa.severity_score
+        severity = float(severity_value) if isinstance(severity_value, (int, float)) else 5.0
+        issues = _ensure_list_of_strings(fa.issues_found)
+        recommendations = _ensure_list_of_strings(fa.recommendations)
+        detailed = fa.detailed_analysis or ""
+    elif isinstance(fa, dict):
+        path = str(fa.get("path") or fa.get("file_path") or "")
+        language = fa.get("language") or "unknown"
+        lines = int(fa.get("lines_of_code") or 0)
+        complexity = float(fa.get("complexity_score") or 0.0)
+        severity_value = fa.get("severity_score")
+        severity = float(severity_value) if isinstance(severity_value, (int, float)) else 5.0
+        issues = _ensure_list_of_strings(fa.get("issues_found", []))
+        recommendations = _ensure_list_of_strings(fa.get("recommendations", []))
+        detailed = fa.get("detailed_analysis") or ""
+    else:
+        path = str(getattr(fa, "path", "") or "")
+        language = getattr(fa, "language", "unknown") or "unknown"
+        lines = int(getattr(fa, "lines_of_code", 0) or 0)
+        complexity = float(getattr(fa, "complexity_score", 0) or 0.0)
+        severity_value = getattr(fa, "severity_score", 5.0)
+        severity = float(severity_value) if isinstance(severity_value, (int, float)) else 5.0
+        issues = _ensure_list_of_strings(getattr(fa, "issues_found", []))
+        recommendations = _ensure_list_of_strings(getattr(fa, "recommendations", []))
+        detailed = getattr(fa, "detailed_analysis", "") or ""
+
+    return FileAnalysis(
+        path=path,
+        language=language,
+        lines_of_code=lines,
+        complexity_score=complexity,
+        issues_found=issues,
+        recommendations=recommendations,
+        detailed_analysis=detailed,
+        severity_score=severity
+    )
+
+
+def merge_file_analyses(existing: FileAnalysis, new: FileAnalysis) -> FileAnalysis:
+    """Merge two FileAnalysis objects for the same file path."""
+    severity = (
+        (existing.severity_score or 0) + (new.severity_score or 0)
+    ) / 2.0 if isinstance(existing.severity_score, (int, float)) and isinstance(new.severity_score, (int, float)) else (existing.severity_score or new.severity_score or 5.0)
+
+    complexity = (
+        (existing.complexity_score or 0) + (new.complexity_score or 0)
+    ) / 2.0 if isinstance(existing.complexity_score, (int, float)) and isinstance(new.complexity_score, (int, float)) else (existing.complexity_score or new.complexity_score or 0.0)
+
+    language = existing.language if existing.language and existing.language != "unknown" else new.language
+
+    issues = _ensure_list_of_strings(existing.issues_found) + _ensure_list_of_strings(new.issues_found)
+    recommendations = _ensure_list_of_strings(existing.recommendations) + _ensure_list_of_strings(new.recommendations)
+
+    issues = _dedupe_preserve_order(issues)
+    recommendations = _dedupe_preserve_order(recommendations)
+
+    detailed = existing.detailed_analysis or new.detailed_analysis or ""
+
+    return FileAnalysis(
+        path=existing.path or new.path,
+        language=language or "unknown",
+        lines_of_code=max(existing.lines_of_code or 0, new.lines_of_code or 0),
+        complexity_score=complexity,
+        issues_found=issues,
+        recommendations=recommendations,
+        detailed_analysis=detailed,
+        severity_score=severity
+    )
+
+
+@dataclass
+class AggregatedModuleData:
+    canonical_name: str
+    original_names: Set[str] = field(default_factory=set)
+    chunk_ids: List[str] = field(default_factory=list)
+    chunk_types: Set[str] = field(default_factory=set)
+    file_map: Dict[str, FileAnalysis] = field(default_factory=dict)
+    quality_scores: List[float] = field(default_factory=list)
+    overviews: List[str] = field(default_factory=list)
+    architectures: List[str] = field(default_factory=list)
+    security_notes: List[str] = field(default_factory=list)
+    recommendations: Set[str] = field(default_factory=set)
+    ai_responses: List[str] = field(default_factory=list)
+    dependencies: Set[str] = field(default_factory=set)
+    metadata_records: List[Dict[str, Any]] = field(default_factory=list)
+    context_dependencies: Set[str] = field(default_factory=set)
+
+
+class ModuleAggregationManager:
+    """Collects chunk-level results and exposes aggregated module summaries."""
+
+    def __init__(self) -> None:
+        self._cache: Dict[str, Dict[str, AggregatedModuleData]] = {}
+
+    def reset(self, run_id: str) -> None:
+        self._cache[run_id] = {}
+
+    def clear(self, run_id: Optional[str]) -> None:
+        if run_id and run_id in self._cache:
+            del self._cache[run_id]
+
+    def add_chunk(
+        self,
+        run_id: str,
+        chunk_name: str,
+        chunk_id: Optional[str],
+        chunk_type: Optional[str],
+        chunk: Dict[str, Any],
+        chunk_analysis: Dict[str, Any],
+        file_analyses: List[Any],
+        metadata: Dict[str, Any],
+        ai_response: str
+    ) -> None:
+        if not run_id:
+            return
+
+        canonical_name = get_canonical_module_name(chunk_name)
+        modules = self._cache.setdefault(run_id, {})
+        module_data = modules.get(canonical_name)
+        if module_data is None:
+            module_data = AggregatedModuleData(canonical_name=canonical_name)
+            modules[canonical_name] = module_data
+
+        module_data.original_names.add(chunk_name)
+        if chunk_id:
+            module_data.chunk_ids.append(chunk_id)
+        if chunk_type:
+            module_data.chunk_types.add(chunk_type)
+
+        quality_value = chunk_analysis.get('module_quality_score')
+        if quality_value is None:
+            quality_value = chunk_analysis.get('module_quality')
+        if isinstance(quality_value, (int, float)):
+            module_data.quality_scores.append(float(quality_value))
+
+        overview_text = chunk_analysis.get('module_overview')
+        if overview_text:
+            module_data.overviews.append(str(overview_text).strip())
+
+        architecture_text = chunk_analysis.get('module_architecture')
+        if architecture_text:
+            module_data.architectures.append(str(architecture_text).strip())
+
+        security_text = chunk_analysis.get('module_security_assessment')
+        if security_text:
+            module_data.security_notes.append(str(security_text).strip())
+
+        recommendations = chunk_analysis.get('module_recommendations', [])
+        module_data.recommendations.update(_ensure_list_of_strings(recommendations))
+
+        if ai_response:
+            module_data.ai_responses.append(ai_response)
+
+        module_data.dependencies.update(chunk.get('dependencies', []))
+        module_data.context_dependencies.update(chunk.get('context_dependencies', []))
+
+        for fa in file_analyses:
+            sanitized = sanitize_file_analysis_for_aggregation(fa)
+            if not sanitized.path:
+                continue
+            existing = module_data.file_map.get(sanitized.path)
+            if existing:
+                module_data.file_map[sanitized.path] = merge_file_analyses(existing, sanitized)
+            else:
+                module_data.file_map[sanitized.path] = sanitized
+
+        if metadata:
+            module_data.metadata_records.append(metadata)
+
+    def get_modules(self, run_id: str) -> Dict[str, AggregatedModuleData]:
+        return self._cache.get(run_id, {})
+
+
+module_aggregation_manager = ModuleAggregationManager()
+
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     """Lifespan context manager for startup and shutdown events."""
@@ -2105,10 +2324,14 @@ def estimate_tokens(files: List[Tuple[str, str]]) -> int:
     # Rough estimate: 4 characters per token
     return total_chars // 4
 
-def split_by_token_limit(module_files: List[Tuple[str, str]], max_tokens: int = 15000) -> List[List[Tuple[str, str]]]:
+def split_by_token_limit(
+    module_files: List[Tuple[str, str]],
+    max_tokens: int = 15000,
+    max_files: int = 12
+) -> List[List[Tuple[str, str]]]:
     """Split large module into sub-chunks while preserving related files together."""
-    sub_chunks = []
-    current_chunk = []
+    sub_chunks: List[List[Tuple[str, str]]] = []
+    current_chunk: List[Tuple[str, str]] = []
     current_tokens = 0
 
     for file_path, content in module_files:
@@ -2116,9 +2339,15 @@ def split_by_token_limit(module_files: List[Tuple[str, str]], max_tokens: int =
             continue
 
         file_tokens = len(content) // 4
+        should_split = (
+            current_chunk
+            and (
+                current_tokens + file_tokens > max_tokens
+                or len(current_chunk) >= max_files
+            )
+        )
 
-        if current_tokens + file_tokens > max_tokens and current_chunk:
-            # Save current chunk and start new one
+        if should_split:
             sub_chunks.append(current_chunk)
             current_chunk = [(file_path, content)]
             current_tokens = file_tokens
@@ -2158,6 +2387,10 @@ def find_dependencies(chunk_files: List[Tuple[str, str]], dependency_graph: Opti
     # For now, return empty list - can be enhanced with actual dependency tracking
     return dependencies
 
+MAX_TOKENS_PER_CHUNK = int(os.getenv("MAX_TOKENS_PER_CHUNK", "18000"))
+MAX_FILES_PER_CHUNK = int(os.getenv("MAX_FILES_PER_CHUNK", "12"))
+
+
 def create_intelligent_chunks(files: List[Tuple[str, str]], dependency_graph: Optional[Dict] = None) -> List[Dict]:
     """
     Group files by module/feature for semantic analysis.
@@ -2192,14 +2425,15 @@ def create_intelligent_chunks(files: List[Tuple[str, str]], dependency_graph: Op
         if not module_files:
             continue
 
-        # Check token limit (increased for better context and fewer chunks)
-        # With 2000 req/min API limit, we can handle larger chunks
-        # Increased from 15000 to 25000 tokens for better module-level context
+        # Check token and file limits to keep prompts manageable for Claude
         module_tokens = estimate_tokens(module_files)
-        MAX_TOKENS_PER_CHUNK = 25000  # Increased for more files per chunk
-        if module_tokens > MAX_TOKENS_PER_CHUNK:
+        if module_tokens > MAX_TOKENS_PER_CHUNK or len(module_files) > MAX_FILES_PER_CHUNK:
             # Split large modules
-            sub_chunks = split_by_token_limit(module_files, MAX_TOKENS_PER_CHUNK)
+            sub_chunks = split_by_token_limit(
+                module_files,
+                max_tokens=MAX_TOKENS_PER_CHUNK,
+                max_files=MAX_FILES_PER_CHUNK
+            )
             for i, sub_chunk in enumerate(sub_chunks):
                 chunks.append({
                     'id': f'chunk_{chunk_counter:03d}',
@@ -2354,7 +2588,8 @@ def update_state_with_findings(analysis_state: Dict, chunk: Dict, chunk_analysis
     Update analysis_state with findings from current chunk analysis.
     Returns updated analysis_state.
     """
-    chunk_name = chunk.get('name', 'unknown')
+    raw_chunk_name = chunk.get('name', 'unknown')
+    chunk_name = get_canonical_module_name(raw_chunk_name)
     chunk_id = chunk.get('id', 'unknown')
 
     # Initialize state if needed
@@ -2522,6 +2757,7 @@ def build_intelligent_chunk_prompt(chunk: Dict, analysis_state: Optional[Dict] =
         "## RESPONSE FORMAT:",
         "",
         "⚠️ CRITICAL: You MUST analyze ALL files listed above. Do NOT skip any files.",
+        "If a file looks empty or repetitive, still return a JSON entry with notes explaining limited context.",
         f"Files to analyze ({len(optimized_files)} total):",
     ])
     for i, file_path in enumerate(file_paths_list, 1):
@@ -3271,19 +3507,144 @@ async def store_chunk_analysis_in_memory(chunk: Dict, file_analyses: List, chunk
             },
             'file_analyses': file_analyses_data
         }
+        metadata['dependencies'] = {
+            'depends_on_chunks': chunk.get('context_dependencies', []),
+            'raw_dependencies': chunk.get('dependencies', [])
+        }
 
-        # Prioritize Knowledge Graph storage
+        canonical_name = get_canonical_module_name(chunk_name)
+        module_aggregation_manager.add_chunk(
+            run_id=run_id,
+            chunk_name=chunk_name,
+            chunk_id=chunk.get('id'),
+            chunk_type=chunk_type,
+            chunk=chunk,
+            chunk_analysis=chunk_analysis or {},
+            file_analyses=file_analyses,
+            metadata=metadata,
+            ai_response=ai_response
+        )
+        print(f" 📦 Aggregated chunk '{chunk_name}' into canonical module '{canonical_name}'")
+        return canonical_name
+
+    except Exception as e:
+        print(f"❌ [MEMORY] Failed to store chunk analysis: {e}")
+        import traceback
+        traceback.print_exc()
+        return None
+
+
+async def flush_module_aggregations(run_id: Optional[str], repository_id: str, session_id: Optional[str] = None) -> None:
+    """Persist aggregated module data to the knowledge graph (or fallback memory)."""
+    if not run_id:
+        return
+
+    aggregated_modules = module_aggregation_manager.get_modules(run_id)
+    if not aggregated_modules:
+        print(f"ℹ️ [AGGREGATION] No aggregated modules to persist for run {run_id}")
+        return
+
+    print(f"📦 [AGGREGATION] Persisting {len(aggregated_modules)} aggregated modules for run {run_id}")
+
+    for canonical_name, module_data in aggregated_modules.items():
+        file_list = list(module_data.file_map.values())
+        if not file_list:
+            print(f" ⚠️ [AGGREGATION] Skipping module '{canonical_name}' (no file analyses aggregated)")
+            continue
+
+        total_files = len(file_list)
+        total_lines = sum(fa.lines_of_code or 0 for fa in file_list)
+        total_issues = sum(len(_ensure_list_of_strings(fa.issues_found)) for fa in file_list)
+        total_recommendations = sum(len(_ensure_list_of_strings(fa.recommendations)) for fa in file_list)
+        high_quality = len([fa for fa in file_list if isinstance(fa.severity_score, (int, float)) and fa.severity_score >= 8])
+        medium_quality = len([fa for fa in file_list if isinstance(fa.severity_score, (int, float)) and 5 <= fa.severity_score < 8])
+        low_quality = len([fa for fa in file_list if isinstance(fa.severity_score, (int, float)) and fa.severity_score < 5])
+
+        if module_data.quality_scores:
+            quality_score = sum(module_data.quality_scores) / max(len(module_data.quality_scores), 1)
+        elif total_files:
+            severity_sum = sum(fa.severity_score for fa in file_list if isinstance(fa.severity_score, (int, float)))
+            quality_score = severity_sum / total_files if total_files else 5.0
+        else:
+            quality_score = 5.0
+
+        overviews = _dedupe_preserve_order([text for text in module_data.overviews if text])
+        architectures = _dedupe_preserve_order([text for text in module_data.architectures if text])
+        security_notes = _dedupe_preserve_order([text for text in module_data.security_notes if text])
+        recommendations_list = _dedupe_preserve_order(list(module_data.recommendations))
+
+        module_overview = "\n\n".join(overviews)
+        module_architecture = "\n\n".join(architectures)
+        module_security = "\n\n".join(security_notes)
+
+        ai_response_blocks = _dedupe_preserve_order([text for text in module_data.ai_responses if text])
+        ai_response_text = "\n\n".join(ai_response_blocks) if ai_response_blocks else module_overview
+
+        aggregated_chunk_analysis = {
+            'module_overview': module_overview or f"Aggregated analysis for {canonical_name}",
+            'module_quality_score': round(quality_score, 2),
+            'module_architecture': module_architecture,
+            'module_security_assessment': module_security,
+            'module_recommendations': recommendations_list
+        }
+
+        file_analyses_for_metadata = [
+            {
+                'file_path': fa.path,
+                'language': fa.language,
+                'lines_of_code': fa.lines_of_code,
+                'complexity_score': fa.complexity_score,
+                'severity_score': fa.severity_score,
+                'issues_found': _ensure_list_of_strings(fa.issues_found),
+                'recommendations': _ensure_list_of_strings(fa.recommendations),
+                'detailed_analysis': fa.detailed_analysis,
+            }
+            for fa in file_list
+        ]
+
+        metadata = {
+            'type': 'module_analysis',
+            'run_id': run_id,
+            'chunk_name': canonical_name,
+            'chunk_type': 'module',
+            'repository_id': repository_id,
+            'total_files_in_chunk': total_files,
+            'chunk_metrics': {
+                'total_issues': total_issues,
+                'total_recommendations': total_recommendations,
+                'high_quality_files': high_quality,
+                'medium_quality_files': medium_quality,
+                'low_quality_files': low_quality
+            },
+            'file_analyses': file_analyses_for_metadata,
+            'dependencies': {
+                'depends_on_chunks': sorted(module_data.context_dependencies),
+                'raw_dependencies': sorted(module_data.dependencies)
+            },
+            'source_chunks': sorted(module_data.original_names),
+            'total_lines': total_lines
+        }
+
+        aggregated_chunk = {
+            'id': module_data.chunk_ids[0] if module_data.chunk_ids else f'aggregated_{canonical_name}',
+            'name': canonical_name,
+            'priority': 2,
+            'type': 'module',
+            'context_dependencies': list(module_data.context_dependencies),
+            'dependencies': list(module_data.dependencies)
+        }
+
+        stored = False
         if USE_KNOWLEDGE_GRAPH and neo4j_client:
             try:
                 module_payload = kg_ops.build_module_payload(
                     run_id=run_id,
                     repository_id=repository_id,
-                    module_name=chunk_name,
-                    chunk=chunk,
-                    chunk_analysis=chunk_analysis,
-                    file_analyses=file_analyses,
+                    module_name=canonical_name,
+                    chunk=aggregated_chunk,
+                    chunk_analysis=aggregated_chunk_analysis,
+                    file_analyses=file_list,
                     metadata=metadata,
-                    ai_response=ai_response,
+                    ai_response=ai_response_text,
                 )
                 await kg_ops.store_module_analysis(
                     client=neo4j_client,
@@ -3291,33 +3652,30 @@ async def store_chunk_analysis_in_memory(chunk: Dict, file_analyses: List, chunk
                     repository_id=repository_id,
                     module_payload=module_payload,
                 )
-                print(f" ✅ Stored in Neo4j knowledge graph (module: {chunk_name})")
-                return module_payload["module_props"]["module_id"]
+                print(f" ✅ [AGGREGATION] Stored aggregated module '{canonical_name}' in Neo4j")
+                stored = True
             except Exception as kg_error:
-                print(f" ⚠️ Failed to store module in knowledge graph: {kg_error}. Falling back to episodic memory.")
+                print(f" ⚠️ [AGGREGATION] Failed to store '{canonical_name}' in Neo4j: {kg_error}")
 
-        # Fallback to Episodic Memory
+        if not stored and analyzer and hasattr(analyzer, 'memory_manager'):
             try:
                 memory_id = await analyzer.memory_manager.store_episodic_memory(
                     session_id=session_id,
-                    user_query=user_query,
-                    ai_response=ai_response,
+                    user_query=f"Aggregated analysis for module: {canonical_name}",
+                    ai_response=ai_response_text or module_overview or f"Aggregated analysis for {canonical_name}",
                     repo_context=repository_id,
                     metadata=metadata
                 )
-                print(f" ✅ Stored in episodic memory with ID: {memory_id}")
-                return memory_id
+                print(f" ✅ [AGGREGATION] Stored aggregated module '{canonical_name}' in episodic memory (ID: {memory_id})")
+                stored = True
             except Exception as memory_error:
-                print(f" ❌ Failed to store in episodic memory: {memory_error}")
-                import traceback
+                print(f" ❌ [AGGREGATION] Failed to store '{canonical_name}' in episodic memory: {memory_error}")
                 traceback.print_exc()
-                return None
 
-    except Exception as e:
-        print(f"❌ [MEMORY] Failed to store chunk analysis: {e}")
-        import traceback
-        traceback.print_exc()
-        return None
+        if not stored:
+            print(f" ❌ [AGGREGATION] Unable to persist aggregated module '{canonical_name}' in any storage backend")
+
+    module_aggregation_manager.clear(run_id)
 
 async def store_cumulative_analysis_state(session_id: str, repository_id: str, analysis_state: Dict, chunk_sequence: int):
     """
@@ -5307,6 +5665,7 @@ async def process_chunks_in_parallel_batches(chunks, repository_id, progress_mgr
 
 async def analyze_repository_with_optimizations_parallel(repo_path: str, repository_id: str, user_id: str, max_files: Optional[int] = None, progress_mgr: Optional[AnalysisProgressManager] = None):
     """Analyze repository with PARALLEL BATCH PROCESSING for faster analysis."""
+    run_id: Optional[str] = None
     try:
         # Set run_id early so it's available for chunk storage
         # Extract analysis_id from progress_mgr if available, otherwise generate
@@ -5321,6 +5680,9 @@ async def analyze_repository_with_optimizations_parallel(repo_path: str, reposit
         if not hasattr(analyzer, 'session_id') or not analyzer.session_id:
             analyzer.session_id = str(uuid.uuid4())
 
+        if run_id:
+            module_aggregation_manager.reset(run_id)
+
         print(f"🔑 [ANALYSIS] Set run_id: {run_id}")
 
         # Get repository files from Git Integration Service API
@@ -5447,6 +5809,11 @@ async def analyze_repository_with_optimizations_parallel(repo_path: str, reposit
             })
 
         print(f"✅ [STORAGE] All chunk analyses stored")
+        await flush_module_aggregations(
+            run_id=run_id,
+            repository_id=repository_id,
+            session_id=getattr(analyzer, 'session_id', None) if analyzer else None
+        )
 
         # ========================================================================
         # PHASE 2: CROSS-MODULE SYNTHESIS
@@ -5568,11 +5935,15 @@ async def analyze_repository_with_optimizations_parallel(repo_path: str, reposit
     except Exception as e:
         print(f"Error in parallel analysis: {e}")
         raise
+    finally:
+        if run_id:
+            module_aggregation_manager.clear(run_id)
+
 
 async def analyze_repository_with_optimizations(repo_path: str, repository_id: str, user_id: str, max_files: int = 100, progress_mgr: Optional[AnalysisProgressManager] = None):
     """Analyze repository with SMART BATCHING for maximum efficiency."""
     from pathlib import Path
 
+    run_id: Optional[str] = None
     try:
         # Get repository files from Git Integration Service API
         files_to_analyze = await get_repository_files_from_api(repository_id, user_id, max_files)
@@ -5601,6 +5972,19 @@ async def analyze_repository_with_optimizations(repo_path: str, repository_id: s
     for i, chunk in enumerate(chunks, 1):
         print(f" Chunk {i}: {chunk['name']} ({chunk['chunk_type']}) - {len(chunk['files'])} files")
 
+    if analyzer:
+        run_id = getattr(analyzer, 'run_id', None)
+        if not run_id:
+            run_id = f"repo_analysis_{repository_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+            analyzer.run_id = run_id
+        if not hasattr(analyzer, 'session_id') or not analyzer.session_id:
+            analyzer.session_id = str(uuid.uuid4())
+    else:
+        run_id = f"repo_analysis_{repository_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+
+    if run_id:
+        module_aggregation_manager.reset(run_id)
+
     # Initialize analysis_state for progressive context
     analysis_state = {}
 
@@ -5739,6 +6123,12 @@ async def analyze_repository_with_optimizations(repo_path: str, repository_id: s
 
         print(f"🎉 [INTELLIGENT CHUNKING] Completed all {total_chunks} chunks - {len(file_analyses)} files analyzed")
 
+        await flush_module_aggregations(
+            run_id=run_id,
+            repository_id=repository_id,
+            session_id=getattr(analyzer, 'session_id', None) if analyzer else None
+        )
+
         # ========================================================================
         # PHASE 2: CROSS-MODULE SYNTHESIS
         # ========================================================================
@@ -5868,6 +6258,9 @@ async def analyze_repository_with_optimizations(repo_path: str, repository_id: s
     except Exception as e:
         print(f"Error in optimized analysis: {e}")
         raise
+    finally:
+        if run_id:
+            module_aggregation_manager.clear(run_id)
+
 
 @app.get("/repository/{repository_id}/info")
 async def get_repository_info(repository_id: str, user_id: str):
@@ -70,6 +70,7 @@ const serviceTargets = {
   AI_MOCKUP_URL: process.env.AI_MOCKUP_URL || 'http://localhost:8021',
   AI_ANALYSIS_URL: process.env.AI_ANALYSIS_URL || 'http://localhost:8022',
   FAST_AI_ANALYSIS_URL: process.env.FAST_AI_ANALYSIS_URL || 'http://localhost:8023',
+  MULTI_DOCUMENT_UPLOAD_URL: process.env.MULTI_DOCUMENT_UPLOAD_URL || 'http://localhost:8024',
 };
 
 // Log service targets for debugging
|||||||
}
|
}
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Multi-Document Upload Service - handles large multipart uploads
|
||||||
|
console.log('🔧 Registering /api/multi-docs proxy route...');
|
||||||
|
app.use('/api/multi-docs',
|
||||||
|
createServiceLimiter(120),
|
||||||
|
(req, res, next) => {
|
||||||
|
console.log(`📁 [MULTI-DOCS PROXY] ${req.method} ${req.originalUrl}`);
|
||||||
|
next();
|
||||||
|
},
|
||||||
|
createProxyMiddleware({
|
||||||
|
target: serviceTargets.MULTI_DOCUMENT_UPLOAD_URL,
|
||||||
|
changeOrigin: true,
|
||||||
|
pathRewrite: { '^/api/multi-docs': '' },
|
||||||
|
logLevel: 'warn',
|
||||||
|
proxyTimeout: 1800000,
|
||||||
|
timeout: 1800000,
|
||||||
|
onProxyReq: (proxyReq, req, res) => {
|
||||||
|
proxyReq.setHeader('X-Forwarded-By', 'api-gateway');
|
||||||
|
},
|
||||||
|
onProxyRes: (proxyRes, req, res) => {
|
||||||
|
res.setHeader('Access-Control-Allow-Origin', req.headers.origin || '*');
|
||||||
|
res.setHeader('Access-Control-Allow-Credentials', 'true');
|
||||||
|
}
|
||||||
|
})
|
||||||
|
);
|
||||||
|
|
||||||
// Template Manager AI - expose AI recommendations through the gateway
|
// Template Manager AI - expose AI recommendations through the gateway
|
||||||
console.log('🔧 Registering /api/ai/tech-stack proxy route...');
|
console.log('🔧 Registering /api/ai/tech-stack proxy route...');
|
||||||
app.use('/api/ai/tech-stack',
|
app.use('/api/ai/tech-stack',
|
||||||
|
|||||||
services/multi-document-upload-service/Dockerfile (new file, 30 lines)
@@ -0,0 +1,30 @@
FROM python:3.11-slim

ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

WORKDIR /app

RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        build-essential \
        poppler-utils \
        tesseract-ocr \
        ffmpeg \
        libmagic1 \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY src ./src

ENV PYTHONPATH=/app/src \
    MULTI_DOC_STORAGE_ROOT=/app/storage \
    MULTI_DOC_CLAUDE_MODEL=claude-3-5-sonnet-20241022 \
    PORT=8024

EXPOSE 8024

CMD ["sh", "-c", "uvicorn multi_document_upload_service.main:app --host 0.0.0.0 --port ${PORT:-8024}"]
services/multi-document-upload-service/FIX_EMPTY_GRAPH.md (new file, 144 lines)
@@ -0,0 +1,144 @@
# Fix: Empty Graph in Neo4j (No Relationships Found)

## Problem

When querying Neo4j for `CAUSES` relationships, you get "(no changes, no records)" because:

1. **PDF extraction failed** - Missing dependencies (`unstructured[pdf]`)
2. **0 relations extracted** - No text was extracted, so no analysis happened
3. **0 relations written** - Nothing was written to Neo4j (correct behavior)

## Root Cause

The service completed with 0 relations because:
- PDF file extraction failed: `partition_pdf() is not available because one or more dependencies are not installed`
- No text was extracted from the PDF
- No chunks were created
- No Claude analysis happened
- 0 relations were extracted
- 0 relations were written to Neo4j

## Solution

### Step 1: Update Dependencies

The `requirements.txt` has been updated to include:
```
unstructured[pdf]>=0.15.0
unstructured[docx]>=0.15.0
unstructured[pptx]>=0.15.0
unstructured[xlsx]>=0.15.0
```

### Step 2: Rebuild the Service

```bash
cd /home/tech4biz/Desktop/prakash/codenuk/backend_new1/codenuk_backend_mine

# Rebuild the service with new dependencies
docker-compose build multi-document-upload-service

# Restart the service
docker-compose restart multi-document-upload-service

# Check logs to verify it's working
docker-compose logs -f multi-document-upload-service
```

### Step 3: Verify Dependencies

```bash
# Check if unstructured[pdf] is installed
docker-compose exec multi-document-upload-service pip list | grep unstructured
```

### Step 4: Re-upload Documents

1. Go to Project Builder in the frontend
2. Click on "Upload Documents for Knowledge Graph"
3. Upload a PDF or other document
4. Wait for processing to complete
5. Check Neo4j for relationships

### Step 5: Check Neo4j

Run these queries in Neo4j Browser:

```cypher
// Check if any nodes exist
MATCH (n)
RETURN count(n) as node_count

// Check for CAUSES relationships
MATCH (n:Concept)-[r:CAUSES]->(m:Concept)
RETURN n.name as cause, m.name as effect, r.confidence as confidence
LIMIT 50
```

## Expected Behavior After Fix

1. **PDF extraction succeeds** - Text is extracted from PDF files
2. **Text is chunked** - Document is split into manageable chunks
3. **Claude analyzes** - Causal relationships are extracted
4. **Relations are written** - Relationships are stored in Neo4j
5. **Query returns results** - Neo4j query shows relationships

## Verification Steps

1. **Check service logs**:
   ```bash
   docker-compose logs multi-document-upload-service | grep -i "extracted\|relation\|neo4j"
   ```

2. **Check job status**:
   ```bash
   curl http://localhost:8000/api/multi-docs/jobs/{job_id}
   ```
   Should show: `"processed_files": 1` and relations count > 0

3. **Check Neo4j**:
   ```cypher
   MATCH (n:Concept)-[r:CAUSES]->(m:Concept)
   RETURN count(r) as relation_count
   ```

## Improvements Made

1. ✅ **Added PDF dependencies** - `unstructured[pdf]`, `unstructured[docx]`, etc.
2. ✅ **Added fallback extractors** - Uses `pdfplumber` if unstructured fails (see the sketch below)
3. ✅ **Better error handling** - Shows actual errors in job status
4. ✅ **Improved logging** - More detailed logs for debugging
5. ✅ **Better Neo4j query** - Validates data before writing
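The fallback path from item 2 follows roughly this pattern. This is a minimal sketch for illustration only, not the service's actual extractor module: function names and the exact call sequence in the service may differ.

```python
# Illustrative sketch of the fallback pattern described above (not the
# service's actual code): try `unstructured` first, then fall back to
# pdfplumber for PDFs when the optional extras are missing.
from pathlib import Path


def extract_text(path: str) -> str:
    try:
        from unstructured.partition.auto import partition
        elements = partition(filename=path)
        return "\n".join(str(el) for el in elements)
    except Exception:
        if Path(path).suffix.lower() == ".pdf":
            import pdfplumber  # fallback extractor
            with pdfplumber.open(path) as pdf:
                return "\n".join(page.extract_text() or "" for page in pdf.pages)
        raise
```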

## Troubleshooting

If you still see 0 relations after rebuilding:

1. **Check extraction logs**:
   ```bash
   docker-compose logs multi-document-upload-service | grep -i "extract"
   ```

2. **Check Claude analysis**:
   ```bash
   docker-compose logs multi-document-upload-service | grep -i "claude\|analyze"
   ```

3. **Check Neo4j connection**:
   ```bash
   docker-compose logs multi-document-upload-service | grep -i "neo4j\|graph"
   ```

4. **Verify document has causal language**:
   - Not all documents contain causal relationships
   - Try uploading a document with clear cause-effect statements
   - Example: "Smoking causes lung cancer" or "Rain causes flooding"

## Next Steps

1. Rebuild the service with new dependencies
2. Re-upload documents
3. Check Neo4j for relationships
4. If still no results, check service logs for errors
5. Verify the document contains causal language
(new file, 176 lines)
@@ -0,0 +1,176 @@
# Neo4j Diagnostic Queries

## Issue: No relationships found in Neo4j

If you're seeing "(no changes, no records)" when querying for `CAUSES` relationships, here are diagnostic queries to check what's actually in the database.

## Diagnostic Queries

### 1. Check if any nodes exist
```cypher
MATCH (n)
RETURN count(n) as node_count
LIMIT 1
```

### 2. Check if Concept nodes exist
```cypher
MATCH (n:Concept)
RETURN count(n) as concept_count,
       collect(DISTINCT labels(n)) as labels,
       collect(DISTINCT keys(n)) as properties
LIMIT 10
```

### 3. Check all relationship types
```cypher
CALL db.relationshipTypes() YIELD relationshipType
RETURN relationshipType
```

### 4. Check all node labels
```cypher
CALL db.labels() YIELD label
RETURN label
```

### 5. Check all relationships (any type)
```cypher
MATCH (n)-[r]->(m)
RETURN type(r) as relationship_type,
       count(r) as count,
       labels(n) as from_labels,
       labels(m) as to_labels
LIMIT 50
```

### 6. Check for CAUSES relationships specifically
```cypher
MATCH (n)-[r:CAUSES]->(m)
RETURN n, r, m
LIMIT 50
```

### 7. Check for relationships with lowercase "causes"
```cypher
MATCH (n)-[r]->(m)
WHERE type(r) =~ '(?i)causes'
RETURN type(r) as relationship_type, n, r, m
LIMIT 50
```

### 8. Check all nodes and their relationships
```cypher
MATCH (n)
OPTIONAL MATCH (n)-[r]->(m)
RETURN n, labels(n) as node_labels,
       type(r) as relationship_type,
       m, labels(m) as target_labels
LIMIT 50
```

### 9. Check for nodes created by the service (by job_id property)
```cypher
MATCH (n)-[r]->(m)
WHERE r.job_id IS NOT NULL
RETURN n, r, m, r.job_id as job_id
LIMIT 50
```

### 10. Check database statistics
```cypher
MATCH (n)
RETURN count(n) as total_nodes,
       size([(n)-[r]->() | r]) as total_relationships
```

## Common Issues and Solutions

### Issue 1: No nodes at all
**Symptom**: Query 1 returns 0 nodes
**Cause**: Service hasn't written anything to Neo4j, or connection failed
**Solution**:
- Check service logs: `docker-compose logs multi-document-upload-service`
- Verify Neo4j connection in service configuration
- Check if job completed with 0 relations (extraction failed)

### Issue 2: Nodes exist but no relationships
**Symptom**: Query 1 returns nodes, but Query 6 returns no relationships
**Cause**: Relationships weren't created, or different relationship type
**Solution**:
- Check Query 5 to see what relationship types actually exist
- Check service logs for graph writing errors
- Verify the job actually extracted relations (check job status)

### Issue 3: Different relationship type
**Symptom**: Query 5 shows relationships but not `CAUSES`
**Cause**: Service might be using a different relationship type
**Solution**:
- Check Query 3 to see all relationship types
- Update query to use the correct relationship type

### Issue 4: Different node labels
**Symptom**: Query 6 returns no results, but Query 2 shows different labels
**Cause**: Service might be using different node labels
**Solution**:
- Check Query 2 to see what labels exist
- Update query to match actual labels

## Expected Structure

After a successful upload, you should see the following structure (a write-side sketch follows the lists below):

### Nodes
- **Label**: `Concept`
- **Properties**: `name`, `lastSeen`

### Relationships
- **Type**: `CAUSES`
- **Properties**: `confidence`, `explanation`, `source_file_id`, `source_snippet`, `job_id`, `model`, `updated_at`
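An upsert that produces this structure could look like the sketch below. This is an assumption-level illustration using the official `neo4j` Python driver, not the service's actual graph writer; the node label, relationship type, and property names mirror the lists above, and the URI/credentials assume the local docker-compose defaults.

```python
# Minimal write-side sketch (not the service's actual writer): upsert one
# CAUSES edge between two Concept nodes using the official neo4j driver.
from neo4j import GraphDatabase

UPSERT_CAUSES = """
MERGE (cause:Concept {name: $cause})
MERGE (effect:Concept {name: $effect})
MERGE (cause)-[r:CAUSES]->(effect)
SET r.confidence = $confidence,
    r.explanation = $explanation,
    r.job_id = $job_id,
    r.updated_at = datetime()
"""


def upsert_relation(driver, cause, effect, confidence, explanation, job_id):
    # MERGE keeps the graph idempotent: re-running a job updates edge
    # properties instead of duplicating nodes or relationships.
    with driver.session() as session:
        session.run(UPSERT_CAUSES, cause=cause, effect=effect,
                    confidence=confidence, explanation=explanation, job_id=job_id)


if __name__ == "__main__":
    drv = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))
    upsert_relation(drv, "Rain", "Flooding", 0.9, "example relation", "job-123")
    drv.close()
```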
### Example Query
```cypher
MATCH (cause:Concept)-[r:CAUSES]->(effect:Concept)
RETURN cause.name as cause,
       effect.name as effect,
       r.confidence as confidence,
       r.job_id as job_id,
       r.source_file_id as source_file
LIMIT 50
```

## Troubleshooting Steps

1. **Check service logs**:
   ```bash
   docker-compose logs -f multi-document-upload-service
   ```

2. **Check if job completed successfully**:
   ```bash
   curl http://localhost:8000/api/multi-docs/jobs/{job_id}
   ```

3. **Check Neo4j connection**:
   ```bash
   docker-compose logs neo4j | grep -i error
   ```

4. **Verify Neo4j is running**:
   ```bash
   docker-compose ps neo4j
   ```

5. **Test Neo4j connection manually**:
   ```bash
   docker-compose exec neo4j cypher-shell -u neo4j -p password "MATCH (n) RETURN count(n)"
   ```

## Next Steps

1. Run the diagnostic queries above
2. Check the service logs for errors
3. Verify the job status via API
4. Re-upload documents after fixing dependencies
5. Check if relations were actually extracted (job status should show relation count)
services/multi-document-upload-service/QUICK_TEST.md (new file, 85 lines)
@@ -0,0 +1,85 @@
# Quick Testing Guide - Multi-Document Upload

## 🚀 Quick Start Testing

### 1. Start Services
```bash
cd /home/tech4biz/Desktop/prakash/codenuk/backend_new1/codenuk_backend_mine
docker-compose up -d multi-document-upload-service neo4j redis postgres api-gateway
```

### 2. Verify Services
```bash
# Check health
curl http://localhost:8024/health
curl http://localhost:8000/api/multi-docs/health
```

### 3. Test via Frontend

1. **Open Frontend**: `http://localhost:3001`
2. **Login** (if required)
3. **Go to Project Builder**
4. **Complete Steps 1-2** (Project Type & Features)
5. **Step 3: Multi Docs Upload** appears
6. **Upload files**:
   - Click upload area
   - Select multiple files (PDF, DOCX, etc.)
   - Click "Start Upload"
7. **Watch Progress**:
   - Progress bar updates
   - Status messages appear
   - Polls every 4 seconds (a manual polling sketch follows this list)
8. **Auto-proceeds** when completed
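To mirror the frontend's 4-second polling from the command line, a minimal sketch (assumes a `job_id` returned when the upload was created; the `status` field values are assumptions to verify against the service's job responses):

```python
# Quick poll sketch mirroring the frontend's 4-second status polling.
# Replace JOB_ID with the id returned by the upload request.
import time
import requests

JOB_ID = "<job_id>"
URL = f"http://localhost:8000/api/multi-docs/jobs/{JOB_ID}"

while True:
    job = requests.get(URL).json()
    print(job.get("status"), job)
    if job.get("status") in ("completed", "failed"):
        break
    time.sleep(4)
```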

### 4. Verify in Neo4j

```bash
# Open Neo4j Browser: http://localhost:7474
# Login: neo4j / password

# Query causal relationships:
MATCH (n)-[r:CAUSES]->(m)
RETURN n, r, m
LIMIT 50
```

## 📝 Test Checklist

- [ ] Service starts successfully
- [ ] Health endpoint works
- [ ] Frontend component renders
- [ ] File upload works
- [ ] Progress updates correctly
- [ ] Job completes successfully
- [ ] Neo4j graph contains relationships
- [ ] Error handling works
- [ ] Skip button works

## 🔍 Debug Commands

```bash
# View service logs
docker-compose logs -f multi-document-upload-service

# Check job status (replace {job_id})
curl http://localhost:8000/api/multi-docs/jobs/{job_id}

# Check graph summary
curl http://localhost:8000/api/multi-docs/jobs/{job_id}/graph
```

## ⚠️ Common Issues

1. **502 Bad Gateway**: Service not running → `docker-compose ps`
2. **413 Too Large**: File too big → Reduce file size
3. **No progress**: Check browser console → Check network tab
4. **No relationships**: Check Claude API key → Check service logs

## 🎯 Expected Flow

```
Upload Files → Job Created → Files Saved → Content Extracted →
Claude Analysis → Graph Built → Completed → Auto-proceed to Next Step
```
services/multi-document-upload-service/README.md (new file, 36 lines)
@@ -0,0 +1,36 @@
# Multi Document Upload Service

This service accepts large batches of heterogeneous documents, extracts causal
relationships with Claude 3.5 Sonnet, and writes them into Neo4j as a
knowledge graph.

## Features

- Multipart upload endpoint (`POST /jobs`) capable of handling dozens of files
  and mixed formats (PDF, DOCX, PPTX, XLSX/CSV, JSON/XML, images, audio/video);
  see the client sketch after this list.
- Content extraction powered by the `unstructured` library with fallbacks.
- Chunking tuned for Claude Sonnet (800 token target, 200 overlap).
- High-accuracy causal extraction using Anthropic Claude with provenance.
- Neo4j graph writer that upserts `Concept` nodes and `CAUSES` edges.
- Status endpoint (`GET /jobs/{id}`) and graph summary endpoint
  (`GET /jobs/{id}/graph`).
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
Environment variables:
|
||||||
|
|
||||||
|
- `ANTHROPIC_API_KEY` (required)
|
||||||
|
- `MULTI_DOC_CLAUDE_MODEL` (default `claude-3-5-sonnet-20241022`)
|
||||||
|
- `NEO4J_URI` (default `bolt://localhost:7687`)
|
||||||
|
- `NEO4J_USER` / `NEO4J_PASSWORD` (default `neo4j` / `neo4j`)
|
||||||
|
- `MULTI_DOC_STORAGE_ROOT` (default `storage` inside project)
|
||||||
|
|
||||||
|
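
For local runs these can simply be exported in the shell before starting the service; the values below are placeholders showing the defaults listed above, not real credentials:

```bash
export ANTHROPIC_API_KEY="sk-ant-..."        # required
export MULTI_DOC_CLAUDE_MODEL="claude-3-5-sonnet-20241022"
export NEO4J_URI="bolt://localhost:7687"
export NEO4J_USER="neo4j"
export NEO4J_PASSWORD="neo4j"
export MULTI_DOC_STORAGE_ROOT="./storage"
```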

## Run locally

```bash
uvicorn multi_document_upload_service.main:app --reload --host 0.0.0.0 --port 8035
```

Ensure Neo4j is reachable and Anthropic credentials are exported before
starting the service.
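
Once the service is up, a minimal smoke test of the upload endpoint might look like the following (the `files` form field matches the job API described in the testing guide; the file paths are purely illustrative):

```bash
curl -X POST http://localhost:8035/jobs \
  -F "files=@./docs/architecture.pdf" \
  -F "files=@./docs/requirements.docx"
```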

152  services/multi-document-upload-service/REBUILD_INSTRUCTIONS.md  Normal file
@ -0,0 +1,152 @@
# Rebuild Instructions - Multi-Document Upload Service

## Issue: Empty Graph in Neo4j

**Problem**: The Cypher query returns "(no changes, no records)" because the job completed with 0 relations.

**Root Cause**: PDF extraction failed due to missing dependencies (`unstructured[pdf]`).

## Fixes Applied

1. ✅ Added PDF dependencies (`unstructured[pdf]`, `unstructured[docx]`, etc.)
2. ✅ Added fallback extractors (pdfplumber, python-docx, python-pptx)
3. ✅ Improved error handling and logging
4. ✅ Fixed Neo4j query syntax
5. ✅ Better status messages

## Rebuild Steps

### Step 1: Rebuild the Service

```bash
cd /home/tech4biz/Desktop/prakash/codenuk/backend_new1/codenuk_backend_mine

# Stop the service
docker-compose stop multi-document-upload-service

# Rebuild with new dependencies
docker-compose build --no-cache multi-document-upload-service

# Start the service
docker-compose up -d multi-document-upload-service

# Check logs to verify it's starting correctly
docker-compose logs -f multi-document-upload-service
```

### Step 2: Verify Dependencies

```bash
# Check that unstructured and its extras are installed
docker-compose exec multi-document-upload-service pip list | grep -i unstructured

# You should see the unstructured package listed; the [pdf]/[docx] extras are
# installed as additional dependencies rather than separate top-level packages.
```

### Step 3: Test the Service

```bash
# Check health endpoint
curl http://localhost:8024/health

# Should return:
# {
#   "status": "ok",
#   "claude_model": "claude-3-5-haiku-latest",
#   ...
# }
```

### Step 4: Re-upload Documents

1. Open the frontend: `http://localhost:3001/project-builder`
2. Go to Step 1: Project Type
3. Find the "Upload Documents for Knowledge Graph" section
4. Upload a PDF or other document
5. Wait for processing to complete
6. Check the status - it should show a relation count > 0

### Step 5: Verify in Neo4j

Run these queries in the Neo4j Browser (`http://localhost:7474`):

```cypher
// Check if any nodes exist
MATCH (n)
RETURN count(n) as node_count

// Check for CAUSES relationships
MATCH (n:Concept)-[r:CAUSES]->(m:Concept)
RETURN n.name as cause,
       m.name as effect,
       r.confidence as confidence,
       r.job_id as job_id
LIMIT 50
```

## Expected Results

After rebuilding and re-uploading:

1. **PDF extraction succeeds** ✅
2. **Text is extracted** ✅
3. **Relations are extracted** ✅
4. **Relations are written to Neo4j** ✅
5. **The query returns results** ✅

## Troubleshooting

If you still see 0 relations:

1. **Check service logs**:
   ```bash
   docker-compose logs multi-document-upload-service | tail -50
   ```

2. **Check extraction logs**:
   ```bash
   docker-compose logs multi-document-upload-service | grep -i "extract\|pdf"
   ```

3. **Check Claude analysis**:
   ```bash
   docker-compose logs multi-document-upload-service | grep -i "claude\|analyze\|relation"
   ```

4. **Check the Neo4j connection**:
   ```bash
   docker-compose logs multi-document-upload-service | grep -i "neo4j\|graph\|write"
   ```

5. **Verify the document contains causal language**:
   - Not all documents contain causal relationships
   - Try uploading a document with clear cause-effect statements
   - Example: "Smoking causes lung cancer"

## Quick Test

Test with a simple text file (a terminal-only variant follows this list):

1. Create a test file `test_causal.txt`:
   ```
   Smoking cigarettes causes lung cancer.
   Heavy rain causes flooding.
   Exercise improves health.
   ```

2. Upload it via the frontend
3. Check Neo4j for relationships
4. You should see 3 causal relationships
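
If you would rather run the same quick test from the terminal, here is a sketch using the gateway endpoints from the debug commands above (the `{job_id}` placeholder comes from the upload response):

```bash
cat > test_causal.txt <<'EOF'
Smoking cigarettes causes lung cancer.
Heavy rain causes flooding.
Exercise improves health.
EOF

# Upload through the API Gateway and note the returned job_id
curl -X POST http://localhost:8000/api/multi-docs/jobs -F "files=@test_causal.txt"

# Poll the job until it reports completion, then inspect the graph summary
curl http://localhost:8000/api/multi-docs/jobs/{job_id}
curl http://localhost:8000/api/multi-docs/jobs/{job_id}/graph
```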

## Next Steps

1. Rebuild the service
2. Re-upload documents
3. Check Neo4j for relationships
4. If still no results, check service logs
5. Verify the document contains causal language

300  services/multi-document-upload-service/TESTING_GUIDE.md  Normal file
@ -0,0 +1,300 @@
# Multi-Document Upload Service - Frontend Testing Guide

## Prerequisites

1. **Backend Services Running**:
   ```bash
   cd /home/tech4biz/Desktop/prakash/codenuk/backend_new1/codenuk_backend_mine
   docker-compose up -d
   ```

2. **Verify Services are Running**:
   - API Gateway: `http://localhost:8000/health`
   - Multi-Document Upload Service: `http://localhost:8024/health`
   - Neo4j: `http://localhost:7474` (browser interface)
   - Frontend: `http://localhost:3001` (or your frontend port)

3. **Check Service Health**:
   ```bash
   # Check API Gateway
   curl http://localhost:8000/health

   # Check Multi-Document Upload Service directly
   curl http://localhost:8024/health

   # Check via API Gateway proxy
   curl http://localhost:8000/api/multi-docs/health
   ```

## Frontend Testing Steps

### Step 1: Navigate to Project Builder

1. Open your browser and go to: `http://localhost:3001` (or your frontend URL)
2. Log in if required
3. Click on **"Project Builder"** in the navigation

### Step 2: Go to the Multi Docs Upload Step

1. In the Project Builder, you should see the workflow steps:
   - **Step 1**: Project Type
   - **Step 2**: Features
   - **Step 3**: Multi Docs Upload ← **This is the new step**
   - **Step 4**: Business Context
   - **Step 5**: Generate
   - **Step 6**: Architecture

2. Complete Steps 1 and 2 (Project Type and Features selection)
3. You will automatically be taken to **Step 3: Multi Docs Upload**

### Step 3: Upload Documents

1. **Click on the upload area** or **drag and drop files**
2. **Select multiple files** (you can mix different formats):
   - PDF files (`.pdf`)
   - Word documents (`.doc`, `.docx`)
   - PowerPoint (`.ppt`, `.pptx`)
   - Excel files (`.xls`, `.xlsx`)
   - JSON files (`.json`)
   - XML files (`.xml`)
   - Markdown files (`.md`)
   - Images (`.png`, `.jpg`, `.jpeg`) - will use OCR
   - Audio files (`.mp3`, `.wav`) - will be transcribed
   - Video files (`.mp4`, `.avi`) - will be transcribed

3. **View selected files**: you should see a list of all selected files with:
   - File icon
   - File name
   - Remove button for each file

4. **Click the "Start Upload" button**

### Step 4: Monitor Upload Progress

After clicking "Start Upload", you should see:

1. **Upload Status**:
   - Button shows "Uploading..." with a spinner
   - Progress bar appears
   - Stage messages appear:
     - "Job received"
     - "Saving files"
     - "Extracting document content"
     - "Calling Claude for causal relations"
     - "Writing to Neo4j knowledge graph"
     - "Completed"

2. **Progress Indicators**:
   - Progress percentage (0-100%)
   - Status message showing the current stage
   - Processed files count vs total files count

3. **Polling**: the frontend automatically polls the job status every 4 seconds

### Step 5: Verify Results

Once the job is completed:

1. **Check the Neo4j graph**:
   - Open Neo4j Browser: `http://localhost:7474`
   - Log in with:
     - Username: `neo4j`
     - Password: `password`
   - Run a Cypher query to see the graph:
     ```cypher
     MATCH (n)-[r:CAUSES]->(m)
     RETURN n, r, m
     LIMIT 50
     ```

2. **Check job status via the API**:
   ```bash
   # Replace {job_id} with the actual job ID from the frontend
   curl http://localhost:8000/api/multi-docs/jobs/{job_id}
   ```

3. **Get the graph summary**:
   ```bash
   curl http://localhost:8000/api/multi-docs/jobs/{job_id}/graph
   ```

## Testing Different Scenarios

### Scenario 1: Single PDF File
- Upload one PDF file
- Verify it processes correctly
- Check Neo4j for causal relationships

### Scenario 2: Multiple Mixed Format Files
- Upload 3-5 files of different formats (PDF, DOCX, JSON, image)
- Verify all files are processed
- Check that progress updates correctly

### Scenario 3: Large Files
- Upload a large PDF (10+ MB)
- Verify it handles large files correctly
- Check processing time

### Scenario 4: Error Handling
- Try uploading an unsupported file type
- Verify an error message appears
- Check that the error is displayed clearly

### Scenario 5: Skip Option
- Upload files
- Click the "Skip" button before completion
- Verify you can proceed to the next step
- The job continues processing in the background

## Browser Developer Tools

### Check Network Requests

1. **Open Developer Tools** (F12)
2. **Go to the Network tab**
3. **Filter by "multi-docs"**
4. **Monitor requests**:
   - `POST /api/multi-docs/jobs` - upload files
   - `GET /api/multi-docs/jobs/{job_id}` - poll job status
   - `GET /api/multi-docs/jobs/{job_id}/graph` - get graph summary

### Check Console Logs

1. **Open the Console tab**
2. **Look for**:
   - Upload progress logs
   - Job status updates
   - Any error messages

### Check Response Data

Verify the API responses:

```javascript
// Upload response should be:
{
  "job_id": "uuid-here",
  "stage": "received",
  "total_files": 3,
  "created_at": "2024-01-01T00:00:00Z"
}

// Status response should be:
{
  "job_id": "uuid-here",
  "stage": "extracting",
  "status_message": "Extracting document content",
  "total_files": 3,
  "processed_files": 1,
  "error": null,
  "created_at": "2024-01-01T00:00:00Z",
  "updated_at": "2024-01-01T00:01:00Z",
  "files": [...]
}
```

## Troubleshooting

### Issue: Upload fails with 502 Bad Gateway
**Solution**:
- Check if multi-document-upload-service is running:
  ```bash
  docker-compose ps multi-document-upload-service
  ```
- Check service logs:
  ```bash
  docker-compose logs multi-document-upload-service
  ```

### Issue: Upload fails with 413 Request Entity Too Large
**Solution**:
- Check file sizes (max 500 MB total per job)
- Reduce the number of files or the file sizes
- Check the API Gateway body size limits

### Issue: Status polling stops working
**Solution**:
- Check the browser console for errors
- Verify the job ID is correct
- Check whether the job completed or failed
- Check the network tab for failed requests

### Issue: No causal relationships found
**Solution**:
- Check that the Claude API key is configured correctly
- Check the service logs for Claude API errors
- Verify the documents contain causal language
- Check the Neo4j connection

### Issue: Frontend shows "Failed" status
**Solution**:
- Check the error message in the frontend
- Check the backend service logs:
  ```bash
  docker-compose logs -f multi-document-upload-service
  ```
- Verify all dependencies are running (Neo4j, Redis, Postgres)

## Expected Behavior

### Successful Flow:
1. ✅ Files upload successfully
2. ✅ Job ID is returned
3. ✅ Status polling starts automatically
4. ✅ Progress updates every 4 seconds
5. ✅ Stage changes are displayed
6. ✅ Progress bar updates
7. ✅ Job completes successfully
8. ✅ Frontend automatically proceeds to the next step
9. ✅ Neo4j contains causal relationships

### Error Flow:
1. ✅ Error message is displayed clearly
2. ✅ User can retry the upload
3. ✅ User can skip and proceed
4. ✅ Error details are logged in the console

## API Endpoints Reference

### Upload Files
```bash
POST /api/multi-docs/jobs
Content-Type: multipart/form-data

Form Data:
- files: File[] (multiple files)
- job_name: string (optional)
```
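
For example, a multipart upload via `curl` might look like this (file names are illustrative; `job_name` is optional as noted above):

```bash
curl -X POST http://localhost:8000/api/multi-docs/jobs \
  -F "files=@report.pdf" \
  -F "files=@notes.docx" \
  -F "job_name=test-upload"
```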

### Get Job Status
```bash
GET /api/multi-docs/jobs/{job_id}
```

### Get Graph Summary
```bash
GET /api/multi-docs/jobs/{job_id}/graph
```

### Health Check
```bash
GET /api/multi-docs/health
```

## Next Steps After Testing

1. **Verify the Neo4j graph**: check that causal relationships are stored correctly
2. **Check storage**: verify files are stored in the persistent volume
3. **Monitor performance**: check processing times for different file types
4. **Test error scenarios**: verify error handling works correctly
5. **Test large batches**: upload 50+ files to test scalability

## Support

If you encounter issues:
1. Check service logs: `docker-compose logs multi-document-upload-service`
2. Check API Gateway logs: `docker-compose logs api-gateway`
3. Check Neo4j logs: `docker-compose logs neo4j`
4. Verify all environment variables are set correctly
5. Check network connectivity between services

34  services/multi-document-upload-service/requirements.txt  Normal file
@ -0,0 +1,34 @@
fastapi>=0.110.0
uvicorn[standard]>=0.29.0
anthropic>=0.33.0
neo4j>=5.23.0
python-multipart>=0.0.9
pydantic>=2.7.0
pydantic-settings>=2.2.1
aiofiles>=23.2.1
tenacity>=8.2.3
python-dotenv>=1.0.1
unstructured[pdf]>=0.15.0
unstructured[docx]>=0.15.0
unstructured[pptx]>=0.15.0
unstructured[xlsx]>=0.15.0
pdfplumber>=0.11.0
python-docx>=1.1.0
python-pptx>=0.6.23
pandas>=2.2.2
openpyxl>=3.1.2
xlrd>=2.0.1
pytesseract>=0.3.10
Pillow>=10.3.0
opencv-python-headless>=4.9.0.80
PyMuPDF>=1.23.0
pdf2image>=1.16.3
faster-whisper>=0.10.0
ffmpeg-python>=0.2.0
pydub>=0.25.1
beautifulsoup4>=4.12.3
lxml>=5.2.1
sqlalchemy>=2.0.25
httpx>=0.27.0
tiktoken>=0.7.0

@ -0,0 +1,4 @@
"""
Multi Document Upload Service package.
"""

@ -0,0 +1,328 @@
from __future__ import annotations

import base64
import json
import logging
import re
from pathlib import Path
from typing import Iterable, List

from anthropic import Anthropic, BadRequestError
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential, RetryCallState

from .models import CausalRelation

logger = logging.getLogger(__name__)


def is_billing_error(exception: Exception) -> bool:
    """Check if the exception is a billing/credit related error that shouldn't be retried."""
    if isinstance(exception, BadRequestError):
        error_message = str(exception).lower()
        billing_keywords = ["credit", "balance", "too low", "billing", "upgrade", "purchase credits"]
        return any(keyword in error_message for keyword in billing_keywords)
    return False


def should_retry_exception(retry_state: RetryCallState) -> bool:
    """Custom retry condition that excludes billing errors."""
    exception = retry_state.outcome.exception()
    if exception is None:
        return False
    # Don't retry billing errors - they won't be resolved by retrying
    if is_billing_error(exception):
        return False
    # Retry other exceptions
    return True


CLAUDE_PROMPT_TEMPLATE = """You are an expert analyst extracting causal relationships from documents.

Given the following text chunk, identify all explicit or strongly implied cause and effect pairs.
Return JSON with the schema:
[
  {
    "cause": "<short phrase>",
    "effect": "<short phrase>",
    "confidence": 0-1 float,
    "explanation": "<why this is causal>",
    "source_snippet": "<exact quote or paraphrase>"
  }
]

Only include items when the causal direction is clear.
If none are found, return an empty list [].

Text chunk:
```
<<<CHUNK_PLACEHOLDER>>>
```"""


IMAGE_PROMPT_TEMPLATE = """You are an expert analyst extracting causal relationships from images, diagrams, and visual content.

Analyze this image/diagram for causal relationships. Look for:
- Architecture flows (A → B → C)
- Dependency relationships
- Cause-effect chains in diagrams
- Process flows
- System interactions
- Data flows
- Sequential relationships
- Visual connections between components

Return JSON with the schema:
[
  {
    "cause": "<short phrase describing the cause>",
    "effect": "<short phrase describing the effect>",
    "confidence": 0-1 float,
    "explanation": "<why this is causal, referencing visual elements>",
    "source_snippet": "<description of what you see in the image that shows this relationship>"
  }
]

Only include items when the causal direction is clear from the visual structure.
If none are found, return an empty list []."""


class ClaudeCausalExtractor:
    def __init__(self, api_key: str, model: str, max_output_tokens: int = 4000):
        self.client = Anthropic(api_key=api_key)
        self.model = model
        self.max_output_tokens = max_output_tokens

    @retry(
        retry=should_retry_exception,
        wait=wait_exponential(multiplier=1, min=1, max=10),
        stop=stop_after_attempt(3),
        reraise=True,
    )
    def analyze_chunk(self, chunk: str, source_file_id: str) -> List[CausalRelation]:
        logger.debug("Analyzing chunk with Claude model %s", self.model)

        # Validate chunk is not empty and is readable text
        if not chunk or not chunk.strip():
            logger.warning("Empty or whitespace-only chunk, skipping")
            return []

        # Check if chunk contains mostly readable text (not binary data)
        # Simple heuristic: if >50% of characters are non-printable or control chars, skip it
        printable_chars = sum(1 for c in chunk if c.isprintable() or c.isspace())
        if len(chunk) > 100 and printable_chars / len(chunk) < 0.5:
            logger.warning("Chunk appears to contain binary data, skipping analysis")
            return []

        # Use string replacement with a unique placeholder to avoid KeyError with braces in content
        # This prevents Python's .format() from interpreting braces in the chunk text as format placeholders
        prompt_text = CLAUDE_PROMPT_TEMPLATE.replace("<<<CHUNK_PLACEHOLDER>>>", chunk)

        try:
            message = self.client.messages.create(
                model=self.model,
                max_tokens=self.max_output_tokens,
                temperature=0.0,
                system="You extract causal (cause→effect) relations with high precision.",
                messages=[
                    {
                        "role": "user",
                        "content": [{"type": "text", "text": prompt_text}],
                    }
                ],
            )
        except BadRequestError as e:
            # Check if it's a billing error
            if is_billing_error(e):
                error_msg = (
                    "Anthropic API credit balance is too low. "
                    "Please go to Plans & Billing to upgrade or purchase credits. "
                    f"Error: {str(e)}"
                )
                logger.error(error_msg)
                raise RuntimeError(error_msg) from e
            # Re-raise other BadRequestErrors
            raise

        content_blocks = message.content or []
        raw_text = "".join(block.text for block in content_blocks if hasattr(block, "text"))  # type: ignore[attr-defined]
        if not raw_text:
            return []

        # Try to extract JSON from markdown code blocks if present
        json_text = raw_text.strip()

        # Look for JSON in markdown code blocks (```json ... ```)
        json_match = re.search(r'```(?:json)?\s*(\[.*?\])\s*```', json_text, re.DOTALL)
        if json_match:
            json_text = json_match.group(1)
        else:
            # Look for JSON array/object at the start or end
            json_match = re.search(r'(\[.*?\]|{.*?})', json_text, re.DOTALL)
            if json_match:
                json_text = json_match.group(1)

        try:
            data = json.loads(json_text)
            if not isinstance(data, list):
                logger.warning("Claude response is not a list: %s", type(data))
                return []

            relations: List[CausalRelation] = []
            for item in data:
                if not isinstance(item, dict):
                    continue
                cause = item.get("cause", "").strip()
                effect = item.get("effect", "").strip()
                if not cause or not effect:
                    continue  # Skip invalid relations

                relations.append(
                    CausalRelation(
                        cause=cause,
                        effect=effect,
                        confidence=float(item.get("confidence", 0.0)),
                        explanation=item.get("explanation"),
                        source_file_id=source_file_id,
                        source_snippet=item.get("source_snippet"),
                        metadata={"model": self.model},
                    )
                )
            logger.info("Extracted %d relations from Claude response", len(relations))
            return relations
        except json.JSONDecodeError as e:
            logger.warning("Failed to parse Claude response as JSON: %s. Raw text: %s", e, raw_text[:200])
            return []

    def analyze(self, chunks: Iterable[str], source_file_id: str) -> List[CausalRelation]:
        relations: List[CausalRelation] = []
        for chunk in chunks:
            relations.extend(self.analyze_chunk(chunk, source_file_id=source_file_id))
        return relations

    @retry(
        retry=should_retry_exception,
        wait=wait_exponential(multiplier=1, min=1, max=10),
        stop=stop_after_attempt(3),
        reraise=True,
    )
    def analyze_image(self, image_path: Path, source_file_id: str) -> List[CausalRelation]:
        """
        Analyze an image using Claude Vision API to extract causal relationships.
        Sends image directly to Claude (no OCR).
        """
        logger.info("Analyzing image with Claude Vision: %s", image_path.name)

        try:
            # Read and encode image as base64
            with open(image_path, "rb") as image_file:
                image_data = image_file.read()

            # Determine media type
            suffix = image_path.suffix.lower()
            media_type_map = {
                ".png": "image/png",
                ".jpg": "image/jpeg",
                ".jpeg": "image/jpeg",
                ".gif": "image/gif",
                ".webp": "image/webp",
            }
            media_type = media_type_map.get(suffix, "image/png")

            # Encode to base64
            base64_image = base64.b64encode(image_data).decode("utf-8")

            # Prepare content for Claude Vision API
            content = [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": media_type,
                        "data": base64_image,
                    },
                },
                {
                    "type": "text",
                    "text": IMAGE_PROMPT_TEMPLATE,
                },
            ]

            # Call Claude Vision API
            try:
                message = self.client.messages.create(
                    model=self.model,  # Claude models support vision
                    max_tokens=self.max_output_tokens,
                    temperature=0.0,
                    system="You extract causal (cause→effect) relations from visual content with high precision.",
                    messages=[
                        {
                            "role": "user",
                            "content": content,
                        }
                    ],
                )
            except BadRequestError as e:
                # Check if it's a billing error
                if is_billing_error(e):
                    error_msg = (
                        "Anthropic API credit balance is too low. "
                        "Please go to Plans & Billing to upgrade or purchase credits. "
                        f"Error: {str(e)}"
                    )
                    logger.error(error_msg)
                    raise RuntimeError(error_msg) from e
                # Re-raise other BadRequestErrors
                raise

            # Parse response
            content_blocks = message.content or []
            raw_text = "".join(block.text for block in content_blocks if hasattr(block, "text"))  # type: ignore[attr-defined]
            if not raw_text:
                logger.warning("No text response from Claude Vision for image %s", image_path.name)
                return []

            # Extract JSON from response
            json_text = raw_text.strip()
            json_match = re.search(r'```(?:json)?\s*(\[.*?\])\s*```', json_text, re.DOTALL)
            if json_match:
                json_text = json_match.group(1)
            else:
                json_match = re.search(r'(\[.*?\]|{.*?})', json_text, re.DOTALL)
                if json_match:
                    json_text = json_match.group(1)

            try:
                data = json.loads(json_text)
                if not isinstance(data, list):
                    logger.warning("Claude Vision response is not a list: %s", type(data))
                    return []

                relations: List[CausalRelation] = []
                for item in data:
                    if not isinstance(item, dict):
                        continue
                    cause = item.get("cause", "").strip()
                    effect = item.get("effect", "").strip()
                    if not cause or not effect:
                        continue

                    relations.append(
                        CausalRelation(
                            cause=cause,
                            effect=effect,
                            confidence=float(item.get("confidence", 0.0)),
                            explanation=item.get("explanation"),
                            source_file_id=source_file_id,
                            source_snippet=item.get("source_snippet") or f"Image: {image_path.name}",
                            metadata={"model": self.model, "content_type": "image", "image_path": str(image_path)},
                        )
                    )
                logger.info("Extracted %d relations from image %s", len(relations), image_path.name)
                return relations
            except json.JSONDecodeError as e:
                logger.warning("Failed to parse Claude Vision response as JSON: %s. Raw text: %s", e, raw_text[:200])
                return []

        except Exception as exc:
            logger.exception("Failed to analyze image %s: %s", image_path, exc)
            return []

@ -0,0 +1,52 @@
from __future__ import annotations

import os
from functools import lru_cache
from pathlib import Path

from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict


DEFAULT_STORAGE_ROOT = Path(
    os.getenv("MULTI_DOC_STORAGE_ROOT", Path(__file__).resolve().parent.parent.parent / "storage")
)
DEFAULT_STORAGE_ROOT.mkdir(parents=True, exist_ok=True)


class Settings(BaseSettings):
    """Application configuration loaded from environment variables."""

    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore")

    anthropic_api_key: str | None = Field(default=None, validation_alias="ANTHROPIC_API_KEY")
    claude_model: str = Field(default=os.getenv("MULTI_DOC_CLAUDE_MODEL", "claude-3-5-sonnet-20241022"))
    claude_max_input_tokens: int = Field(default=200_000)
    claude_max_output_tokens: int = Field(default=16_000)

    neo4j_uri: str = Field(default=os.getenv("NEO4J_URI", "bolt://localhost:7687"))
    neo4j_user: str = Field(default=os.getenv("NEO4J_USER", "neo4j"))
    neo4j_password: str = Field(default=os.getenv("NEO4J_PASSWORD", "neo4j"))

    storage_root: Path = Field(default=DEFAULT_STORAGE_ROOT)
    max_upload_size_mb: int = Field(default=500)
    max_files_per_job: int = Field(default=200)

    chunk_token_target: int = Field(default=800)
    chunk_token_overlap: int = Field(default=200)

    job_retention_days: int = Field(default=30)

    def ensure_storage_dirs(self) -> None:
        (self.storage_root / "jobs").mkdir(parents=True, exist_ok=True)
        (self.storage_root / "uploads").mkdir(parents=True, exist_ok=True)
        (self.storage_root / "extracted").mkdir(parents=True, exist_ok=True)
        (self.storage_root / "images").mkdir(parents=True, exist_ok=True)


@lru_cache
def get_settings() -> Settings:
    settings = Settings()
    settings.ensure_storage_dirs()
    return settings

@ -0,0 +1,168 @@
from __future__ import annotations

import logging
from pathlib import Path
from typing import List

logger = logging.getLogger(__name__)

# Try to import unstructured, but fall back to alternatives if not available
try:
    from unstructured.partition.auto import partition
    HAS_UNSTRUCTURED = True
except ImportError:
    HAS_UNSTRUCTURED = False
    logger.warning("unstructured not available, will use fallback extractors")

# Fallback extractors
try:
    import pdfplumber
    HAS_PDFPLUMBER = True
except ImportError:
    HAS_PDFPLUMBER = False

try:
    from docx import Document as DocxDocument
    HAS_DOCX = True
except ImportError:
    HAS_DOCX = False

try:
    from pptx import Presentation
    HAS_PPTX = True
except ImportError:
    HAS_PPTX = False

# Image processing libraries
try:
    from PIL import Image
    import pytesseract
    HAS_OCR = True
except ImportError:
    HAS_OCR = False
    logger.warning("OCR libraries not available, image extraction will be limited")


def extract_text(path: Path) -> str:
    """
    Extract text from a file using multiple strategies.
    Falls back through: unstructured -> format-specific -> plain text read.
    """
    suffix = path.suffix.lower()

    # Validate PDF file before processing
    if suffix == ".pdf":
        # Quick validation: check if file starts with PDF magic bytes
        try:
            with path.open("rb") as f:
                header = f.read(4)
                if header != b"%PDF":
                    raise ValueError(
                        f"File {path.name} does not appear to be a valid PDF. "
                        f"PDF files must start with '%PDF' magic bytes. "
                        f"Got: {header[:20] if len(header) > 0 else 'empty file'}"
                    )
        except Exception as exc:
            if isinstance(exc, ValueError):
                raise
            logger.warning("Could not validate PDF header: %s", exc)

    # Image files - return empty text (will be processed directly with Claude Vision)
    # We skip OCR and send images directly to Claude Vision API
    if suffix in {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}:
        logger.info("Image file detected: %s. Will be processed directly with Claude Vision (no OCR)", path.name)
        # Return empty string - images will be handled separately in pipeline
        return ""

    # Plain text files - direct read
    if suffix in {".txt", ".md", ".json", ".xml", ".html", ".csv"}:
        try:
            return path.read_text(encoding="utf-8", errors="ignore")
        except Exception as exc:
            logger.warning("Failed to read %s as text: %s", path, exc)
            raise

    # Try unstructured first (if available)
    if HAS_UNSTRUCTURED:
        try:
            elements = partition(filename=str(path))
            lines: List[str] = []
            for element in elements:
                text = getattr(element, "text", None)
                if text:
                    lines.append(text.strip())
            if lines:
                logger.info("Extracted %d lines using unstructured", len(lines))
                return "\n".join(lines)
        except Exception as exc:
            logger.warning("unstructured extraction failed for %s: %s", path, exc)
            # Continue to fallback methods

    # Fallback: PDF with pdfplumber
    if suffix == ".pdf" and HAS_PDFPLUMBER:
        try:
            with pdfplumber.open(path) as pdf:
                text_parts = []
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text_parts.append(page_text)
                if text_parts:
                    logger.info("Extracted PDF using pdfplumber")
                    return "\n".join(text_parts)
        except Exception as exc:
            logger.warning("pdfplumber extraction failed for %s: %s", path, exc)

    # Fallback: DOCX
    if suffix == ".docx" and HAS_DOCX:
        try:
            doc = DocxDocument(path)
            paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
            if paragraphs:
                logger.info("Extracted DOCX using python-docx")
                return "\n".join(paragraphs)
        except Exception as exc:
            logger.warning("python-docx extraction failed for %s: %s", path, exc)

    # Fallback: PPTX
    if suffix in {".pptx", ".ppt"} and HAS_PPTX:
        try:
            prs = Presentation(path)
            text_parts = []
            for slide in prs.slides:
                for shape in slide.shapes:
                    if hasattr(shape, "text") and shape.text:
                        text_parts.append(shape.text.strip())
            if text_parts:
                logger.info("Extracted PPTX using python-pptx")
                return "\n".join(text_parts)
        except Exception as exc:
            logger.warning("python-pptx extraction failed for %s: %s", path, exc)

    # Last resort: try to read as text anyway, but validate it's readable
    try:
        content = path.read_text(encoding="utf-8", errors="ignore")
        if content.strip():
            # Check if content is actually readable text (not binary data)
            # Simple heuristic: if >30% of characters are printable, consider it text
            printable_chars = sum(1 for c in content if c.isprintable() or c.isspace())
            total_chars = len(content)

            if total_chars > 0 and printable_chars / total_chars > 0.3:
                logger.warning("Read %s as plain text (may contain binary data)", path)
                return content
            else:
                logger.error("Content from %s appears to be binary data, cannot extract text", path)
                raise ValueError(f"File {path} appears to be binary or corrupted. Cannot extract readable text.")
    except Exception as exc:
        if isinstance(exc, ValueError):
            raise
        logger.warning("Failed to read %s as text: %s", path, exc)

    # If all else fails, raise an error
    raise ValueError(
        f"Could not extract text from {path}. "
        f"File type may not be supported, file may be corrupted, or dependencies are missing. "
        f"Supported formats: PDF, DOCX, PPTX, XLSX, TXT, MD, JSON, XML, HTML, CSV, PNG, JPG, JPEG (with OCR)"
    )
@ -0,0 +1,514 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Tuple
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Header/Footer detection thresholds
|
||||||
|
HEADER_THRESHOLD = 0.15 # Top 15% of page is considered header
|
||||||
|
FOOTER_THRESHOLD = 0.15 # Bottom 15% of page is considered footer
|
||||||
|
MIN_CONTENT_HEIGHT = 0.3 # Minimum 30% of page height for content area
|
||||||
|
|
||||||
|
# Try to import PDF libraries
|
||||||
|
try:
|
||||||
|
import fitz # PyMuPDF
|
||||||
|
HAS_PYMUPDF = True
|
||||||
|
except ImportError:
|
||||||
|
HAS_PYMUPDF = False
|
||||||
|
logger.warning("PyMuPDF not available, PDF image extraction will be limited")
|
||||||
|
|
||||||
|
try:
|
||||||
|
from pdf2image import convert_from_path
|
||||||
|
HAS_PDF2IMAGE = True
|
||||||
|
except ImportError:
|
||||||
|
HAS_PDF2IMAGE = False
|
||||||
|
|
||||||
|
# DOCX image extraction
|
||||||
|
try:
|
||||||
|
from docx import Document as DocxDocument
|
||||||
|
HAS_DOCX = True
|
||||||
|
except ImportError:
|
||||||
|
HAS_DOCX = False
|
||||||
|
|
||||||
|
# PPTX image extraction
|
||||||
|
try:
|
||||||
|
from pptx import Presentation
|
||||||
|
HAS_PPTX = True
|
||||||
|
except ImportError:
|
||||||
|
HAS_PPTX = False
|
||||||
|
|
||||||
|
|
||||||
|
def is_header_footer_image(bbox: Tuple[float, float, float, float], page_height: float, page_width: float) -> bool:
|
||||||
|
"""
|
||||||
|
Check if an image is in header or footer region.
|
||||||
|
bbox: (x0, y0, x1, y1) - image bounding box coordinates
|
||||||
|
Returns True if image is in header/footer, False otherwise (i.e., in body/content area).
|
||||||
|
"""
|
||||||
|
x0, y0, x1, y1 = bbox
|
||||||
|
|
||||||
|
# Calculate relative positions
|
||||||
|
top_ratio = y0 / page_height if page_height > 0 else 0
|
||||||
|
bottom_ratio = y1 / page_height if page_height > 0 else 0
|
||||||
|
height_ratio = (y1 - y0) / page_height if page_height > 0 else 0
|
||||||
|
|
||||||
|
# AGGRESSIVE header/footer detection - use 25% threshold for top and bottom
|
||||||
|
# This ensures we only extract images from the middle 50% of the page (body area)
|
||||||
|
HEADER_THRESHOLD = 0.25 # Top 25% is header
|
||||||
|
FOOTER_THRESHOLD = 0.25 # Bottom 25% is footer
|
||||||
|
BODY_START = HEADER_THRESHOLD # Body starts at 25%
|
||||||
|
BODY_END = 1.0 - FOOTER_THRESHOLD # Body ends at 75%
|
||||||
|
|
||||||
|
# PRIMARY CHECK: Image must be ENTIRELY in the body area (middle 50%)
|
||||||
|
# If ANY part of the image is in header or footer, skip it
|
||||||
|
image_center_y = (y0 + y1) / 2.0 / page_height if page_height > 0 else 0
|
||||||
|
|
||||||
|
# Check if image is completely in header region (top 25%)
|
||||||
|
if bottom_ratio <= HEADER_THRESHOLD:
|
||||||
|
logger.info("Image in header region (top: %.2f%%, bottom: %.2f%%)", top_ratio * 100, bottom_ratio * 100)
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Check if image is completely in footer region (bottom 25%)
|
||||||
|
if top_ratio >= BODY_END:
|
||||||
|
logger.info("Image in footer region (top: %.2f%%, bottom: %.2f%%)", top_ratio * 100, bottom_ratio * 100)
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Check if image overlaps header (starts in header, even if extends into body)
|
||||||
|
if top_ratio < HEADER_THRESHOLD:
|
||||||
|
logger.info("Image overlaps header region (top: %.2f%%, bottom: %.2f%%)", top_ratio * 100, bottom_ratio * 100)
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Check if image overlaps footer (ends in footer, even if starts in body)
|
||||||
|
if bottom_ratio > BODY_END:
|
||||||
|
logger.info("Image overlaps footer region (top: %.2f%%, bottom: %.2f%%)", top_ratio * 100, bottom_ratio * 100)
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Check if image center is in header or footer (even if image spans both)
|
||||||
|
if image_center_y < HEADER_THRESHOLD or image_center_y > BODY_END:
|
||||||
|
logger.info("Image center in header/footer (center: %.2f%%)", image_center_y * 100)
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Check if image is very small and near edges (likely logo/icon)
|
||||||
|
if height_ratio < 0.10: # Less than 10% of page height
|
||||||
|
# If it's small and in top 30% or bottom 30%, likely header/footer
|
||||||
|
if top_ratio < 0.30 or bottom_ratio > 0.70:
|
||||||
|
logger.info("Small image near header/footer (height: %.2f%%, top: %.2f%%, bottom: %.2f%%)",
|
||||||
|
height_ratio * 100, top_ratio * 100, bottom_ratio * 100)
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Image is in body/content area - allow it
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def crop_header_footer(image_path: Path, output_path: Path, header_ratio: float = HEADER_THRESHOLD, footer_ratio: float = FOOTER_THRESHOLD) -> bool:
|
||||||
|
"""
|
||||||
|
Crop header and footer regions from a full-page image.
|
||||||
|
Returns True if cropping was successful, False otherwise.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
img = Image.open(image_path)
|
||||||
|
width, height = img.size
|
||||||
|
|
||||||
|
# Calculate crop boundaries
|
||||||
|
header_pixels = int(height * header_ratio)
|
||||||
|
footer_pixels = int(height * footer_ratio)
|
||||||
|
|
||||||
|
# Ensure there's enough content height left after cropping
|
||||||
|
remaining_height = height - header_pixels - footer_pixels
|
||||||
|
remaining_ratio = remaining_height / height
|
||||||
|
|
||||||
|
if remaining_ratio < MIN_CONTENT_HEIGHT:
|
||||||
|
logger.warning("Cropping would remove too much content from %s (remaining: %.2f%% < %.2f%%), skipping crop",
|
||||||
|
image_path.name, remaining_ratio * 100, MIN_CONTENT_HEIGHT * 100)
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Crop: remove top (header) and bottom (footer)
|
||||||
|
cropped = img.crop((0, header_pixels, width, height - footer_pixels))
|
||||||
|
|
||||||
|
# Save cropped image
|
||||||
|
cropped.save(output_path)
|
||||||
|
logger.info("Cropped header/footer from %s (removed %dpx top, %dpx bottom, remaining: %.2f%%)",
|
||||||
|
image_path.name, header_pixels, footer_pixels, remaining_ratio * 100)
|
||||||
|
return True
|
||||||
|
except Exception as exc:
|
||||||
|
logger.warning("Failed to crop header/footer from %s: %s", image_path, exc)
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def extract_images_from_pdf(pdf_path: Path, output_dir: Path) -> List[Path]:
|
||||||
|
"""
|
||||||
|
Extract all images from a PDF file.
|
||||||
|
Returns list of paths to extracted image files.
|
||||||
|
"""
|
||||||
|
extracted_images: List[Path] = []
|
||||||
|
|
||||||
|
if not HAS_PYMUPDF:
|
||||||
|
logger.warning("PyMuPDF not available, cannot extract images from PDF")
|
||||||
|
return extracted_images
|
||||||
|
|
||||||
|
try:
|
||||||
|
doc = fitz.open(pdf_path)
|
||||||
|
image_count = 0
|
||||||
|
skipped_count = 0
|
||||||
|
|
||||||
|
for page_num, page in enumerate(doc):
|
||||||
|
page_rect = page.rect
|
||||||
|
page_height = page_rect.height
|
||||||
|
page_width = page_rect.width
|
||||||
|
|
||||||
|
# Extract embedded images
|
||||||
|
image_list = page.get_images()
|
||||||
|
|
||||||
|
# Log total images found on this page BEFORE filtering
|
||||||
|
logger.info("Page %d: Found %d embedded images (page size: %.0fx%.0f)",
|
||||||
|
page_num, len(image_list), page_width, page_height)
|
||||||
|
|
||||||
|
for img_index, img in enumerate(image_list):
|
||||||
|
try:
|
||||||
|
xref = img[0]
|
||||||
|
base_image = doc.extract_image(xref)
|
||||||
|
image_bytes = base_image["image"]
|
||||||
|
image_ext = base_image["ext"]
|
||||||
|
|
||||||
|
logger.debug("Processing image %d from page %d (xref: %d, ext: %s, size: %d bytes)",
|
||||||
|
img_index, page_num, xref, image_ext, len(image_bytes))
|
||||||
|
|
||||||
|
# Get image position and size for header/footer detection
|
||||||
|
is_header_footer = False
|
||||||
|
image_rect = None
|
||||||
|
img_width, img_height = 0, 0
|
||||||
|
position_detection_succeeded = False
|
||||||
|
size_detection_succeeded = False
|
||||||
|
aspect_ratio = 0.0
|
||||||
|
img_height_ratio = 0.0
|
||||||
|
img_width_ratio = 0.0
|
||||||
|
|
||||||
|
# PRIMARY METHOD: Check position FIRST (most reliable for header/footer detection)
|
||||||
|
# Position-based detection is the most accurate way to determine if image is in body area
|
||||||
|
try:
|
||||||
|
image_rect = page.get_image_rect(xref)
|
||||||
|
if image_rect and not image_rect.is_empty and image_rect.width > 0 and image_rect.height > 0:
|
||||||
|
position_detection_succeeded = True
|
||||||
|
# Check if image is in header/footer based on position
|
||||||
|
bbox = (image_rect.x0, image_rect.y0, image_rect.x1, image_rect.y1)
|
||||||
|
if is_header_footer_image(bbox, page_height, page_width):
|
||||||
|
logger.info("Skipping header/footer image %d from page %d (position: y0=%.1f, y1=%.1f, height=%.1f, width=%.1f)",
|
||||||
|
img_index, page_num, image_rect.y0, image_rect.y1, image_rect.height, image_rect.width)
|
||||||
|
skipped_count += 1
|
||||||
|
is_header_footer = True
|
||||||
|
except Exception as bbox_exc:
|
||||||
|
logger.debug("Could not get image rect for image %d on page %d: %s", img_index, page_num, bbox_exc)
|
||||||
|
position_detection_succeeded = False
|
||||||
|
|
||||||
|
# SECONDARY METHOD: Check size (only if position check didn't catch it or failed)
|
||||||
|
# Use size-based detection as a fallback for banner-like images
|
||||||
|
if not is_header_footer:
|
||||||
|
try:
|
||||||
|
# Check image dimensions - useful for catching banners
|
||||||
|
from PIL import Image as PILImage
|
||||||
|
from io import BytesIO
|
||||||
|
img_obj = PILImage.open(BytesIO(image_bytes))
|
||||||
|
img_width, img_height = img_obj.size
|
||||||
|
size_detection_succeeded = True
|
||||||
|
|
||||||
|
# Calculate relative size
|
||||||
|
img_height_ratio = img_height / page_height if page_height > 0 else 0
|
||||||
|
img_width_ratio = img_width / page_width if page_width > 0 else 0
|
||||||
|
aspect_ratio = img_width / img_height if img_height > 0 else 0
|
||||||
|
|
||||||
|
# Size-based filtering: Skip banner-like images
|
||||||
|
# These checks catch wide banners and small logos/icons
|
||||||
|
|
||||||
|
# 1. Very small absolute height (< 300px) - catches logos and small banners
|
||||||
|
is_very_small_height = img_height < 300
|
||||||
|
|
||||||
|
# 2. Banner aspect ratio (width >> height) - catches wide banners
|
||||||
|
is_banner_aspect = aspect_ratio > 2.5
|
||||||
|
|
||||||
|
# 3. Short relative to page (< 30% of page height) - catches banners
|
||||||
|
is_short_relative = img_height_ratio < 0.30
|
||||||
|
|
||||||
|
# 4. Tiny relative size (< 20% height AND < 50% width) - catches icons/logos
|
||||||
|
is_tiny_relative = (img_height_ratio < 0.20 and img_width_ratio < 0.50)
|
||||||
|
|
||||||
|
# 5. Wide banner pattern: short height (< 400px) AND wide (width > 2x height)
|
||||||
|
is_wide_banner_pattern = (img_height < 400 and img_width > img_height * 2.0)
|
||||||
|
|
||||||
|
# 6. Typical banner size: very wide (> 1000px) AND short (< 300px)
|
||||||
|
is_typical_banner_size = (img_width > 1000 and img_height < 300)
|
||||||
|
|
||||||
|
# 7. Very wide images: width > 800px AND height < 250px
|
||||||
|
is_very_wide = (img_width > 800 and img_height < 250)
|
||||||
|
|
||||||
|
# 8. Short and wide: height < 250px AND width > 600px
|
||||||
|
is_short_wide = (img_height < 250 and img_width > 600)
|
||||||
|
|
||||||
|
# 9. Very common banner: width > 600px AND height < 200px
|
||||||
|
is_common_banner = (img_width > 600 and img_height < 200)
|
||||||
|
|
||||||
|
# Combine checks - skip if it looks like a banner or header/footer element
|
||||||
|
is_likely_header_footer = (
|
||||||
|
is_very_small_height or
|
||||||
|
is_banner_aspect or
|
||||||
|
is_short_relative or
|
||||||
|
is_tiny_relative or
|
||||||
|
is_wide_banner_pattern or
|
||||||
|
is_typical_banner_size or
|
||||||
|
is_very_wide or
|
||||||
|
is_short_wide or
|
||||||
|
is_common_banner or
|
||||||
|
                            # If short AND wide, definitely skip
                            (is_short_relative and is_banner_aspect) or
                            # Final catch-all: if width is much larger than height, skip
                            (img_width > img_height * 2.0 and img_height < 400)
                        )

                        if is_likely_header_footer:
                            logger.info("Skipping header/footer image %d from page %d (size-based: %dx%d, aspect: %.2f, height_ratio: %.2f%%, width_ratio: %.2f%%)",
                                        img_index, page_num, img_width, img_height, aspect_ratio,
                                        img_height_ratio * 100, img_width_ratio * 100)
                            skipped_count += 1
                            is_header_footer = True
                    except Exception as size_exc:
                        logger.debug("Could not analyze image size for image %d on page %d: %s", img_index, page_num, size_exc)
                        size_detection_succeeded = False

                    # FINAL SAFETY: If position detection failed, be more aggressive
                    # If we can't verify position, skip images that are suspicious
                    if not position_detection_succeeded and size_detection_succeeded and not is_header_footer:
                        # Skip images larger than the page (likely background/header/footer images)
                        if img_height_ratio > 1.0 or img_width_ratio > 1.0:
                            logger.info("Skipping image %d from page %d (position unknown, but image larger than page: height_ratio=%.1f%%, width_ratio=%.1f%%)",
                                        img_index, page_num, img_height_ratio * 100, img_width_ratio * 100)
                            skipped_count += 1
                            is_header_footer = True
                        # Also skip if image is very large relative to page (likely background)
                        elif img_height_ratio > 0.80 or img_width_ratio > 0.80:
                            logger.info("Skipping image %d from page %d (position unknown, but image very large relative to page: height_ratio=%.1f%%, width_ratio=%.1f%%)",
                                        img_index, page_num, img_height_ratio * 100, img_width_ratio * 100)
                            skipped_count += 1
                            is_header_footer = True

                    # FINAL SAFETY: If we can't determine position AND size, skip the image (conservative approach)
                    # This prevents unknown images from slipping through
                    if not position_detection_succeeded and not size_detection_succeeded and not is_header_footer:
                        logger.warning("Cannot determine position or size for image %d on page %d, skipping for safety (cannot verify it's in body area)", img_index, page_num)
                        skipped_count += 1
                        is_header_footer = True

                    # Skip this image if it's in header/footer
                    if is_header_footer:
                        continue

                    # Save image (not in header/footer, passed all checks - must be in body area)
                    image_filename = f"page_{page_num}_img_{img_index}.{image_ext}"
                    image_path = output_dir / image_filename

                    # Get position info for logging
                    position_info = ""
                    if image_rect:
                        # Calculate relative position to show it's in body area
                        y0_ratio = image_rect.y0 / page_height if page_height > 0 else 0
                        y1_ratio = image_rect.y1 / page_height if page_height > 0 else 0
                        position_info = f", position: y0={image_rect.y0:.1f} ({y0_ratio*100:.1f}%), y1={image_rect.y1:.1f} ({y1_ratio*100:.1f}%) [BODY AREA]"
                    elif size_detection_succeeded:
                        position_info = f", size: {img_width}x{img_height}, aspect_ratio={aspect_ratio:.2f}, height_ratio={img_height_ratio*100:.1f}%"

                    with open(image_path, "wb") as img_file:
                        img_file.write(image_bytes)

                    extracted_images.append(image_path)
                    image_count += 1
                    logger.info("Extracted image %s from PDF page %d (BODY CONTENT image, size: %dx%d%s)",
                                image_filename, page_num, img_width if img_width > 0 else 0, img_height if img_height > 0 else 0, position_info)
                except Exception as exc:
                    logger.warning("Failed to extract image %d from page %d: %s", img_index, page_num, exc)

            # DO NOT extract full-page images - only extract embedded images
            # Full-page images often contain headers/footers and are not needed
            # We only want actual embedded images from the document content
            logger.debug("Skipping full-page image extraction for page %d (only extracting embedded images)", page_num)

        doc.close()
        if skipped_count > 0:
            logger.info("Extracted %d images from PDF %s (skipped %d header/footer images)",
                        image_count, pdf_path.name, skipped_count)
        else:
            logger.info("Extracted %d images from PDF %s", image_count, pdf_path.name)
        return extracted_images

    except Exception as exc:
        logger.exception("Failed to extract images from PDF %s: %s", pdf_path, exc)
        return extracted_images


def extract_images_from_docx(docx_path: Path, output_dir: Path) -> List[Path]:
    """
    Extract all embedded images from a DOCX file.
    Returns list of paths to extracted image files.
    """
    extracted_images: List[Path] = []

    if not HAS_DOCX:
        logger.warning("python-docx not available, cannot extract images from DOCX")
        return extracted_images

    try:
        doc = DocxDocument(docx_path)
        image_count = 0

        # Access document relationships to find images
        for rel_id, rel in doc.part.rels.items():
            # Skip external relationships (e.g. hyperlinks) - they have no target_part to read
            if rel.is_external:
                continue
            # Check if relationship is an image
            if "image" in rel.target_ref or rel.target_part.content_type.startswith("image/"):
                try:
                    image_part = rel.target_part
                    image_bytes = image_part.blob

                    # Determine image extension from content type
                    content_type = image_part.content_type
                    ext_map = {
                        "image/png": "png",
                        "image/jpeg": "jpg",
                        "image/jpg": "jpg",
                        "image/gif": "gif",
                        "image/bmp": "bmp",
                        "image/webp": "webp",
                    }
                    ext = ext_map.get(content_type, "png")

                    # Check image size - small images are likely logos/icons (header/footer)
                    try:
                        from PIL import Image as PILImage
                        from io import BytesIO
                        img_obj = PILImage.open(BytesIO(image_bytes))
                        img_width, img_height = img_obj.size
                        # Skip very small images (likely logos/icons in headers/footers)
                        if img_width < 200 and img_height < 200:
                            logger.debug("Skipping small image from DOCX (likely header/footer logo, size: %dx%d)",
                                         img_width, img_height)
                            continue
                    except Exception:
                        pass  # Continue with extraction if size check fails

                    # Save image
                    image_filename = f"docx_img_{image_count}.{ext}"
                    image_path = output_dir / image_filename

                    with open(image_path, "wb") as img_file:
                        img_file.write(image_bytes)

                    extracted_images.append(image_path)
                    image_count += 1
                    logger.debug("Extracted image %s from DOCX", image_filename)
                except Exception as exc:
                    logger.warning("Failed to extract image from DOCX: %s", exc)

        logger.info("Extracted %d images from DOCX %s", image_count, docx_path.name)
        return extracted_images

    except Exception as exc:
        logger.exception("Failed to extract images from DOCX %s: %s", docx_path, exc)
        return extracted_images


def extract_images_from_pptx(pptx_path: Path, output_dir: Path) -> List[Path]:
    """
    Extract all images from a PPTX file.
    Returns list of paths to extracted image files.
    """
    extracted_images: List[Path] = []

    if not HAS_PPTX:
        logger.warning("python-pptx not available, cannot extract images from PPTX")
        return extracted_images

    try:
        prs = Presentation(pptx_path)
        image_count = 0

        for slide_num, slide in enumerate(prs.slides):
            for shape_num, shape in enumerate(slide.shapes):
                # Check if shape is a picture
                if hasattr(shape, "image"):
                    try:
                        image = shape.image
                        image_bytes = image.blob

                        # Determine extension from content type
                        ext = image.ext  # Usually 'png', 'jpg', etc.
                        if not ext:
                            ext = "png"

                        # Check image size and position
                        # Small images at edges are likely logos/icons
                        try:
                            from PIL import Image as PILImage
                            from io import BytesIO
                            img_obj = PILImage.open(BytesIO(image_bytes))
                            img_width, img_height = img_obj.size

                            # Get shape position (if available)
                            shape_left = shape.left if hasattr(shape, 'left') else 0
                            shape_top = shape.top if hasattr(shape, 'top') else 0
                            # Slide dimensions live on the presentation object, not on the slide
                            slide_width = prs.slide_width if prs.slide_width else 10000
                            slide_height = prs.slide_height if prs.slide_height else 10000

                            # Check if small image is in corner (likely logo)
                            is_small = img_width < 200 and img_height < 200
                            is_in_corner = (
                                (shape_left < slide_width * 0.1 and shape_top < slide_height * 0.1) or  # Top-left
                                (shape_left > slide_width * 0.9 and shape_top < slide_height * 0.1) or  # Top-right
                                (shape_left < slide_width * 0.1 and shape_top > slide_height * 0.9) or  # Bottom-left
                                (shape_left > slide_width * 0.9 and shape_top > slide_height * 0.9)  # Bottom-right
                            )

                            if is_small and is_in_corner:
                                logger.debug("Skipping small corner image from slide %d (likely header/footer logo)", slide_num)
                                continue
                        except Exception:
                            pass  # Continue with extraction if check fails

                        # Save image
                        image_filename = f"slide_{slide_num}_img_{shape_num}.{ext}"
                        image_path = output_dir / image_filename

                        with open(image_path, "wb") as img_file:
                            img_file.write(image_bytes)

                        extracted_images.append(image_path)
                        image_count += 1
                        logger.debug("Extracted image %s from slide %d", image_filename, slide_num)
                    except Exception as exc:
                        logger.warning("Failed to extract image from shape: %s", exc)

        logger.info("Extracted %d images from PPTX %s", image_count, pptx_path.name)
        return extracted_images

    except Exception as exc:
        logger.exception("Failed to extract images from PPTX %s: %s", pptx_path, exc)
        return extracted_images


def extract_images_from_file(file_path: Path, output_dir: Path) -> List[Path]:
    """
    Extract images from a file based on its type.
    Returns list of paths to extracted image files.
    """
    suffix = file_path.suffix.lower()
    output_dir.mkdir(parents=True, exist_ok=True)

    if suffix == ".pdf":
        return extract_images_from_pdf(file_path, output_dir)
    elif suffix == ".docx":
        return extract_images_from_docx(file_path, output_dir)
    elif suffix in {".pptx", ".ppt"}:
        return extract_images_from_pptx(file_path, output_dir)
    else:
        logger.debug("No image extraction needed for file type: %s", suffix)
        return []
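
The three extractors above are reached through this dispatcher. A minimal usage sketch (not part of the commit; the file paths are placeholders):

from pathlib import Path

# Dispatches on the file suffix to the PDF, DOCX, or PPTX extractor defined above.
images = extract_images_from_file(Path("samples/report.pdf"), Path("/tmp/extracted_images"))
for image_path in images:
    print(image_path.name)  # e.g. page_0_img_1.png for an embedded PDF image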
@ -0,0 +1,93 @@
from __future__ import annotations

import json
import threading
import uuid
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, Optional

from .models import JobRecord, JobStage


class JobStore:
    """Simple persistent job store backed by a JSON file."""

    def __init__(self, storage_root: Path):
        self._storage_root = Path(storage_root)
        self._jobs_dir = self._storage_root / "jobs"
        self._jobs_dir.mkdir(parents=True, exist_ok=True)
        self._index_path = self._jobs_dir / "index.json"
        self._lock = threading.Lock()
        self._jobs: Dict[str, JobRecord] = {}
        self._load()

    def _load(self) -> None:
        if self._index_path.exists():
            try:
                data = json.loads(self._index_path.read_text())
                self._jobs = {job_id: JobRecord.model_validate(job_data) for job_id, job_data in data.items()}
            except Exception as exc:  # noqa: BLE001
                print(f"[JobStore] Failed to load job index: {exc}")
                self._jobs = {}

    def _persist(self) -> None:
        serializable = {job_id: job.model_dump(mode="json") for job_id, job in self._jobs.items()}
        tmp_path = self._index_path.with_suffix(".json.tmp")
        tmp_path.write_text(json.dumps(serializable, indent=2, default=str))
        tmp_path.replace(self._index_path)

    def create(self, name: Optional[str], total_files: int) -> JobRecord:
        with self._lock:
            job_id = uuid.uuid4().hex
            job = JobRecord(id=job_id, name=name, total_files=total_files)
            self._jobs[job_id] = job
            self._persist()
            return job

    def update(self, job_id: str, **kwargs) -> JobRecord:
        with self._lock:
            job = self._jobs[job_id]
            for key, value in kwargs.items():
                setattr(job, key, value)
            job.updated_at = datetime.utcnow()
            self._jobs[job_id] = job
            self._persist()
            return job

    def get(self, job_id: str) -> JobRecord:
        with self._lock:
            return self._jobs[job_id]

    def exists(self, job_id: str) -> bool:
        with self._lock:
            return job_id in self._jobs

    def list_jobs(self) -> Dict[str, JobRecord]:
        with self._lock:
            return dict(self._jobs)

    def mark_error(self, job_id: str, message: str) -> JobRecord:
        return self.update(
            job_id,
            stage=JobStage.FAILED,
            status_message=message,
            error=message,
        )

    def cleanup(self, older_than_days: int) -> int:
        """Remove jobs older than the retention threshold."""
        cutoff = datetime.utcnow() - timedelta(days=older_than_days)
        removed = 0
        with self._lock:
            for job_id in list(self._jobs.keys()):
                if self._jobs[job_id].created_at < cutoff:
                    removed += 1
                    del self._jobs[job_id]
            if removed:
                self._persist()
        return removed


__all__ = ["JobStore"]
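
A hedged usage sketch for the store above (illustrative only; the storage path is a placeholder, and JobStage comes from the same .models module imported here):

from pathlib import Path

store = JobStore(Path("/tmp/mdu_storage"))      # index persisted at <root>/jobs/index.json
job = store.create(name="demo", total_files=2)  # new JobRecord, stage defaults to RECEIVED
store.update(job.id, stage=JobStage.ANALYZING, status_message="Analyzing causal relations (1/2)")
assert store.get(job.id).stage is JobStage.ANALYZING
store.cleanup(older_than_days=30)               # prune jobs older than the retention threshold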
@ -0,0 +1,189 @@
from __future__ import annotations

import logging
from dataclasses import dataclass
from typing import List, Optional

from fastapi import BackgroundTasks, Depends, FastAPI, File, Form, HTTPException, UploadFile
from fastapi.middleware.cors import CORSMiddleware

from .claude_client import ClaudeCausalExtractor
from .config import Settings, get_settings
from .jobs import JobStore
from .models import CreateJobResponse, JobGraphSummary, JobStage, JobStatusResponse
from .processors.graph_writer import GraphWriter
from .storage import StorageManager
from .workflows.pipeline import JobPipeline

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

app = FastAPI(
    title="Multi Document Upload Service",
    version="0.1.0",
    description="Processes multi-format documents to build causal knowledge graphs using Claude.",
)


app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@dataclass
class ServiceContainer:
    settings: Settings
    storage: StorageManager
    job_store: JobStore
    graph_writer: GraphWriter
    claude_extractor: ClaudeCausalExtractor
    pipeline: JobPipeline


_container: ServiceContainer | None = None


def get_container() -> ServiceContainer:
    global _container
    if _container is None:
        settings = get_settings()
        if not settings.anthropic_api_key:
            raise HTTPException(status_code=500, detail="ANTHROPIC_API_KEY is not configured")

        storage = StorageManager(settings.storage_root)
        job_store = JobStore(settings.storage_root)
        graph_writer = GraphWriter(settings.neo4j_uri, settings.neo4j_user, settings.neo4j_password)
        claude_extractor = ClaudeCausalExtractor(
            api_key=settings.anthropic_api_key,
            model=settings.claude_model,
            max_output_tokens=min(settings.claude_max_output_tokens, 4000),
        )
        pipeline = JobPipeline(
            job_store=job_store,
            storage=storage,
            graph_writer=graph_writer,
            claude_extractor=claude_extractor,
        )
        _container = ServiceContainer(
            settings=settings,
            storage=storage,
            job_store=job_store,
            graph_writer=graph_writer,
            claude_extractor=claude_extractor,
            pipeline=pipeline,
        )
    return _container


def get_dependencies() -> ServiceContainer:
    return get_container()


@app.post("/jobs", response_model=CreateJobResponse, status_code=202)
async def create_job(
    background_tasks: BackgroundTasks,
    files: List[UploadFile] = File(...),
    job_name: Optional[str] = Form(default=None),
    container: ServiceContainer = Depends(get_dependencies),
) -> CreateJobResponse:
    settings = container.settings
    storage = container.storage
    job_store = container.job_store
    pipeline = container.pipeline

    if not files:
        raise HTTPException(status_code=400, detail="At least one file must be uploaded.")
    if len(files) > settings.max_files_per_job:
        raise HTTPException(status_code=400, detail="Too many files uploaded for a single job.")

    total_size_bytes = 0
    for file in files:
        file.file.seek(0, 2)
        total_size_bytes += file.file.tell()
        file.file.seek(0)
    if total_size_bytes > settings.max_upload_size_mb * 1024 * 1024:
        raise HTTPException(status_code=400, detail="Uploaded files exceed maximum allowed size.")

    job = job_store.create(job_name, total_files=len(files))
    job.stage = JobStage.SAVING_FILES

    saved_paths: List[str] = []
    for upload in files:
        file_record = storage.save_upload(job.id, upload)
        saved_paths.append(file_record.stored_path)
        job.files.append(file_record)

    job_store.update(
        job.id,
        stage=JobStage.EXTRACTING,
        status_message="Files saved; extraction queued.",
        files=job.files,
    )

    background_tasks.add_task(pipeline.process_job, job.id, saved_paths)

    return CreateJobResponse(
        job_id=job.id,
        stage=job.stage,
        total_files=job.total_files,
        created_at=job.created_at,
    )


@app.get("/jobs/{job_id}", response_model=JobStatusResponse)
async def get_job_status(job_id: str, container: ServiceContainer = Depends(get_dependencies)) -> JobStatusResponse:
    job_store = container.job_store
    if not job_store.exists(job_id):
        raise HTTPException(status_code=404, detail="Job not found")
    job = job_store.get(job_id)
    return JobStatusResponse(
        job_id=job.id,
        stage=job.stage,
        status_message=job.status_message,
        total_files=job.total_files,
        processed_files=job.processed_files,
        error=job.error,
        created_at=job.created_at,
        updated_at=job.updated_at,
        files=job.files,
    )


@app.get("/jobs/{job_id}/graph", response_model=JobGraphSummary)
async def get_job_graph(job_id: str, container: ServiceContainer = Depends(get_dependencies)) -> JobGraphSummary:
    job_store = container.job_store
    if not job_store.exists(job_id):
        raise HTTPException(status_code=404, detail="Job not found")
    job = job_store.get(job_id)
    if job.stage != JobStage.COMPLETED:
        raise HTTPException(status_code=409, detail="Job not completed yet")
    return JobGraphSummary(
        job_id=job.id,
        relations=job.relations,
        node_count=len({rel.cause for rel in job.relations} | {rel.effect for rel in job.relations}),
        edge_count=len(job.relations),
        generated_at=job.updated_at,
    )


@app.get("/health")
async def healthcheck(container: ServiceContainer = Depends(get_dependencies)):
    settings = container.settings
    return {
        "status": "ok",
        "claude_model": settings.claude_model,
        "max_input_tokens_per_min": settings.claude_max_input_tokens,
        "max_output_tokens_per_min": settings.claude_max_output_tokens,
    }


@app.on_event("shutdown")
async def shutdown_event() -> None:
    container = _container
    if container:
        container.graph_writer.close()
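
A hedged client-side sketch of the job lifecycle exposed above (`requests` is an assumption used only for illustration, not a service dependency; the file names are placeholders and port 8024 comes from the compose file):

import requests

base = "http://localhost:8024"
with open("report.pdf", "rb") as f1, open("slides.pptx", "rb") as f2:
    resp = requests.post(
        f"{base}/jobs",
        files=[("files", ("report.pdf", f1)), ("files", ("slides.pptx", f2))],
        data={"job_name": "demo"},
    )
job_id = resp.json()["job_id"]

status = requests.get(f"{base}/jobs/{job_id}").json()          # stage, processed_files, errors per file
if status["stage"] == "completed":
    graph = requests.get(f"{base}/jobs/{job_id}/graph").json()  # relations, node_count, edge_count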
@ -0,0 +1,84 @@
from __future__ import annotations

from datetime import datetime
from enum import Enum
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, Field


class JobStage(str, Enum):
    RECEIVED = "received"
    SAVING_FILES = "saving_files"
    EXTRACTING = "extracting"
    ANALYZING = "analyzing"
    BUILDING_GRAPH = "building_graph"
    COMPLETED = "completed"
    FAILED = "failed"


class FileRecord(BaseModel):
    id: str
    filename: str
    content_type: str | None = None
    size_bytes: int
    stored_path: str
    extracted_path: str | None = None
    error: str | None = None


class CausalRelation(BaseModel):
    cause: str
    effect: str
    confidence: float = Field(default=0.0, ge=0.0, le=1.0)
    explanation: Optional[str] = None
    source_file_id: Optional[str] = None
    source_snippet: Optional[str] = None
    metadata: Dict[str, Any] = Field(default_factory=dict)


class JobRecord(BaseModel):
    id: str
    name: str | None = None
    stage: JobStage = JobStage.RECEIVED
    status_message: str | None = None
    files: List[FileRecord] = Field(default_factory=list)
    total_files: int = 0
    processed_files: int = 0
    relations: List[CausalRelation] = Field(default_factory=list)
    created_at: datetime = Field(default_factory=datetime.utcnow)
    updated_at: datetime = Field(default_factory=datetime.utcnow)
    error: str | None = None
    metadata: Dict[str, Any] = Field(default_factory=dict)

    @property
    def is_finished(self) -> bool:
        return self.stage in {JobStage.COMPLETED, JobStage.FAILED}


class CreateJobResponse(BaseModel):
    job_id: str
    stage: JobStage
    total_files: int
    created_at: datetime


class JobStatusResponse(BaseModel):
    job_id: str
    stage: JobStage
    status_message: str | None = None
    total_files: int
    processed_files: int
    error: str | None = None
    created_at: datetime
    updated_at: datetime
    files: List[FileRecord]


class JobGraphSummary(BaseModel):
    job_id: str
    relations: List[CausalRelation]
    node_count: int
    edge_count: int
    generated_at: datetime
@ -0,0 +1,24 @@
from __future__ import annotations

from typing import Iterable, List

import tiktoken


class TextChunker:
    def __init__(self, model_name: str, token_target: int = 800, overlap: int = 200):
        self.encoder = tiktoken.encoding_for_model("gpt-4o") if "claude" not in model_name else tiktoken.get_encoding("cl100k_base")
        self.token_target = token_target
        self.overlap = overlap

    def chunk(self, text: str) -> Iterable[str]:
        tokens = self.encoder.encode(text)
        step = max(self.token_target - self.overlap, 1)
        chunks: List[str] = []
        for start in range(0, len(tokens), step):
            end = min(start + self.token_target, len(tokens))
            chunk_tokens = tokens[start:end]
            chunk_text = self.encoder.decode(chunk_tokens)
            chunks.append(chunk_text)
        return chunks
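
A short illustration of the windowing above: with the defaults (token_target=800, overlap=200) the step is 600 tokens, so consecutive chunks share roughly 200 tokens and a causal statement that straddles a boundary still appears whole in at least one chunk. Illustrative only; the model name mirrors the compose file:

chunker = TextChunker(model_name="claude-3-5-haiku-latest")  # Claude models fall back to the cl100k_base encoding
chunks = list(chunker.chunk("High interest rates reduce borrowing. " * 500))
# Window starts land at token 0, 600, 1200, ... so each chunk overlaps the previous one by ~200 tokens.
print(len(chunks), len(chunks[0]))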
@ -0,0 +1,81 @@
from __future__ import annotations

import logging
from typing import Iterable

from neo4j import GraphDatabase, Transaction

from ..models import CausalRelation

logger = logging.getLogger(__name__)


MERGE_QUERY = """
MERGE (cause:Concept {name: $cause})
ON CREATE SET cause.created_at = timestamp(), cause.lastSeen = timestamp()
ON MATCH SET cause.lastSeen = timestamp()
MERGE (effect:Concept {name: $effect})
ON CREATE SET effect.created_at = timestamp(), effect.lastSeen = timestamp()
ON MATCH SET effect.lastSeen = timestamp()
MERGE (cause)-[r:CAUSES]->(effect)
ON CREATE SET r.confidence = $confidence,
              r.explanation = $explanation,
              r.source_file_id = $source_file_id,
              r.source_snippet = $source_snippet,
              r.job_id = $job_id,
              r.model = $model,
              r.created_at = timestamp(),
              r.updated_at = timestamp()
ON MATCH SET r.confidence = $confidence,
             r.explanation = $explanation,
             r.source_file_id = $source_file_id,
             r.source_snippet = $source_snippet,
             r.job_id = $job_id,
             r.model = $model,
             r.updated_at = timestamp()
"""


class GraphWriter:
    def __init__(self, uri: str, user: str, password: str):
        self._driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self) -> None:
        self._driver.close()

    def write_relations(self, job_id: str, relations: Iterable[CausalRelation]) -> None:
        relations_list = list(relations)
        if not relations_list:
            logger.warning("No relations to write for job %s", job_id)
            return

        logger.info("Writing %d relations to Neo4j for job %s", len(relations_list), job_id)

        with self._driver.session() as session:
            def _write(tx: Transaction) -> None:
                count = 0
                for relation in relations_list:
                    if not relation.cause or not relation.effect:
                        logger.warning("Skipping relation with empty cause or effect: %s -> %s", relation.cause, relation.effect)
                        continue
                    try:
                        result = tx.run(
                            MERGE_QUERY,
                            cause=relation.cause.strip(),
                            effect=relation.effect.strip(),
                            confidence=float(relation.confidence) if relation.confidence else 0.0,
                            explanation=relation.explanation or "",
                            source_file_id=relation.source_file_id or "",
                            source_snippet=relation.source_snippet or "",
                            job_id=job_id,
                            model=relation.metadata.get("model") or "",
                        )
                        count += 1
                        logger.debug("Wrote relation: %s -> %s (confidence: %s)", relation.cause, relation.effect, relation.confidence)
                    except Exception as exc:
                        logger.exception("Failed to write relation %s -> %s: %s", relation.cause, relation.effect, exc)
                logger.info("Successfully wrote %d/%d relations to Neo4j", count, len(relations_list))

            session.execute_write(_write)
        logger.info("Persisted causal relations for job %s", job_id)
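
A hedged read-back sketch for the graph written above (it mirrors GraphWriter's driver usage; the bolt URL and credentials are the ones wired in the compose file, and <job_id> is a placeholder):

from neo4j import GraphDatabase

driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))
with driver.session() as session:
    rows = session.run(
        "MATCH (c:Concept)-[r:CAUSES]->(e:Concept) WHERE r.job_id = $job_id "
        "RETURN c.name AS cause, e.name AS effect, r.confidence AS confidence",
        job_id="<job_id>",
    )
    for row in rows:
        print(row["cause"], "->", row["effect"], row["confidence"])
driver.close()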
@ -0,0 +1,59 @@
from __future__ import annotations

import shutil
from pathlib import Path
from typing import Iterable, Tuple

from fastapi import UploadFile

from .models import FileRecord


class StorageManager:
    def __init__(self, root: Path):
        self.root = Path(root)
        self.upload_dir = self.root / "uploads"
        self.extract_dir = self.root / "extracted"
        self.images_dir = self.root / "images"
        self.upload_dir.mkdir(parents=True, exist_ok=True)
        self.extract_dir.mkdir(parents=True, exist_ok=True)
        self.images_dir.mkdir(parents=True, exist_ok=True)

    def save_upload(self, job_id: str, upload: UploadFile) -> FileRecord:
        job_dir = self.upload_dir / job_id
        job_dir.mkdir(parents=True, exist_ok=True)

        # Use only the base name so a client-supplied filename cannot escape the job directory
        destination = job_dir / Path(upload.filename).name
        upload.file.seek(0)
        with destination.open("wb") as out_file:
            shutil.copyfileobj(upload.file, out_file)

        size_bytes = destination.stat().st_size
        return FileRecord(
            id=destination.stem,
            filename=upload.filename,
            content_type=upload.content_type,
            size_bytes=size_bytes,
            stored_path=str(destination),
        )

    def stage_extracted_content(self, job_id: str, file_name: str, content: str) -> Path:
        job_dir = self.extract_dir / job_id
        job_dir.mkdir(parents=True, exist_ok=True)
        safe_name = f"{Path(file_name).stem}.txt"
        destination = job_dir / safe_name
        destination.write_text(content, encoding="utf-8")
        return destination

    def list_saved_files(self, job_id: str) -> Iterable[Tuple[str, Path]]:
        job_dir = self.upload_dir / job_id
        if not job_dir.exists():
            return []
        return [(file.name, file) for file in job_dir.iterdir() if file.is_file()]

    def get_images_dir(self, job_id: str) -> Path:
        """Get or create directory for extracted images."""
        images_dir = self.root / "images" / job_id
        images_dir.mkdir(parents=True, exist_ok=True)
        return images_dir
@ -0,0 +1,164 @@
from __future__ import annotations

import logging
from pathlib import Path
from typing import Iterable, List

from ..claude_client import ClaudeCausalExtractor
from ..config import get_settings
from ..extractors.auto import extract_text
from ..extractors.image_extractor import extract_images_from_file
from ..jobs import JobStore
from ..models import CausalRelation, JobStage
from ..processors.chunker import TextChunker
from ..processors.graph_writer import GraphWriter
from ..storage import StorageManager

logger = logging.getLogger(__name__)


class JobPipeline:
    def __init__(
        self,
        job_store: JobStore,
        storage: StorageManager,
        graph_writer: GraphWriter,
        claude_extractor: ClaudeCausalExtractor,
    ):
        self.job_store = job_store
        self.storage = storage
        self.graph_writer = graph_writer
        self.claude_extractor = claude_extractor
        settings = get_settings()
        self.chunker = TextChunker(
            model_name=settings.claude_model,
            token_target=settings.chunk_token_target,
            overlap=settings.chunk_token_overlap,
        )

    def process_job(self, job_id: str, saved_files: Iterable[str]) -> None:
        job = self.job_store.get(job_id)
        logger.info("Processing job %s with %d files", job_id, job.total_files)

        relations: List[CausalRelation] = []

        try:
            self.job_store.update(job_id, stage=JobStage.EXTRACTING, status_message="Extracting content")
            for count, file_path in enumerate(saved_files, start=1):
                file_path_obj = Path(file_path)
                file_record = next((f for f in job.files if f.stored_path == file_path), None)
                logger.info("Processing %s", file_path_obj.name)
                source_file_id = file_record.id if file_record else file_path_obj.name
                suffix = file_path_obj.suffix.lower()

                # Check if this is a direct image upload
                is_direct_image = suffix in {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}

                try:
                    # Extract text from document (if not a direct image)
                    text = ""
                    if not is_direct_image:
                        try:
                            text = extract_text(file_path_obj)

                            # Process text if available
                            if text and text.strip():
                                # Validate text is readable
                                printable_chars = sum(1 for c in text if c.isprintable() or c.isspace())
                                total_chars = len(text)
                                if total_chars > 100 and printable_chars / total_chars < 0.3:
                                    logger.warning("Text from %s appears to be binary, skipping text processing", file_path_obj.name)
                                    text = ""
                                else:
                                    extracted_path = self.storage.stage_extracted_content(job_id, file_path_obj.name, text)
                                    if file_record:
                                        file_record.extracted_path = str(extracted_path)
                                    logger.info("Successfully extracted %d characters from %s", len(text), file_path_obj.name)
                        except Exception as text_exc:
                            logger.warning("Text extraction failed for %s: %s. Will continue with image extraction if available.", file_path_obj.name, text_exc)
                            text = ""

                    # Extract images from documents (PDF, DOCX, PPTX)
                    extracted_images: List[Path] = []
                    if suffix in {".pdf", ".docx", ".pptx", ".ppt"}:
                        try:
                            images_dir = self.storage.get_images_dir(job_id)
                            extracted_images = extract_images_from_file(file_path_obj, images_dir)
                            logger.info("Extracted %d images from %s", len(extracted_images), file_path_obj.name)
                        except Exception as img_exc:
                            logger.warning("Failed to extract images from %s: %s", file_path_obj.name, img_exc)

                    # For direct image uploads, add the file itself to images list
                    if is_direct_image:
                        extracted_images = [file_path_obj]
                        logger.info("Direct image upload detected: %s", file_path_obj.name)

                except Exception as exc:  # noqa: BLE001
                    logger.exception("Extraction failed for %s", file_path_obj)
                    if file_record:
                        file_record.error = str(exc)
                    continue

                self.job_store.update(
                    job_id,
                    files=job.files,
                    processed_files=count,
                    status_message=f"Analyzing causal relations ({count}/{job.total_files})",
                    stage=JobStage.ANALYZING,
                )

                # Process text content
                if text and text.strip():
                    chunks = self.chunker.chunk(text)
                    text_relations = self.claude_extractor.analyze(chunks, source_file_id=source_file_id)
                    relations.extend(text_relations)
                    logger.info("Extracted %d relations from text in %s", len(text_relations), file_path_obj.name)

                # Process images (extracted from documents or direct uploads)
                if extracted_images:
                    for image_path in extracted_images:
                        try:
                            image_relations = self.claude_extractor.analyze_image(image_path, source_file_id=source_file_id)
                            relations.extend(image_relations)
                            logger.info("Extracted %d relations from image %s", len(image_relations), image_path.name)
                        except Exception as img_exc:
                            logger.warning("Failed to analyze image %s: %s", image_path, img_exc)
                            # Continue with other images
                elif not text or not text.strip():
                    # No text and no images - file might be empty or unsupported
                    logger.warning("File %s has no extractable text or images", file_path_obj.name)
                    if file_record:
                        file_record.error = "No extractable content found (no text or images)"

            # Write relations to Neo4j if any were found
            if relations:
                self.job_store.update(job_id, status_message="Writing to knowledge graph", stage=JobStage.BUILDING_GRAPH)
                try:
                    self.graph_writer.write_relations(job_id, relations)
                    logger.info("Wrote %d relations to Neo4j for job %s", len(relations), job_id)
                    status_message = f"Completed with {len(relations)} causal relationship(s) written to Neo4j"
                except Exception as graph_exc:
                    logger.exception("Failed to write relations to Neo4j for job %s: %s", job_id, graph_exc)
                    status_message = f"Completed with {len(relations)} relations extracted, but failed to write to Neo4j: {graph_exc}"
            else:
                logger.warning("Job %s completed with 0 relations - no causal relationships found", job_id)
                # Check if any files failed to extract
                failed_files = [f for f in job.files if f.error]
                if failed_files:
                    status_message = f"Completed but {len(failed_files)} file(s) failed to extract. No relations found."
                else:
                    status_message = "Completed but no causal relationships were found in the documents."

            # Final update
            self.job_store.update(
                job_id,
                stage=JobStage.COMPLETED,
                status_message=status_message,
                relations=relations,
                processed_files=job.total_files,
            )
            logger.info("Job %s completed with %d relations", job_id, len(relations))
        except Exception as exc:  # noqa: BLE001
            logger.exception("Job %s failed: %s", job_id, exc)
            self.job_store.mark_error(job_id, f"Pipeline failed: {exc}")
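
For reference, a hedged sketch of running the pipeline outside FastAPI, mirroring the wiring in get_container(); the settings fields are the ones the service already reads, and the upload path is a placeholder (imports omitted because the top-level package name is not shown in this diff):

settings = get_settings()
store = JobStore(settings.storage_root)
pipeline = JobPipeline(
    job_store=store,
    storage=StorageManager(settings.storage_root),
    graph_writer=GraphWriter(settings.neo4j_uri, settings.neo4j_user, settings.neo4j_password),
    claude_extractor=ClaudeCausalExtractor(
        api_key=settings.anthropic_api_key,
        model=settings.claude_model,
        max_output_tokens=min(settings.claude_max_output_tokens, 4000),
    ),
)
job = store.create(name="offline-run", total_files=1)
pipeline.process_job(job.id, ["/path/to/uploaded/report.pdf"])  # runs synchronously; FastAPI schedules it as a background task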