diff --git a/docker-compose.yml b/docker-compose.yml index 4617e1a..be74735 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -196,27 +196,45 @@ services: # retries: 5 # start_period: 60s - chromadb: - image: chromadb/chroma:latest - container_name: pipeline_chromadb + # chromadb: + # image: chromadb/chroma:latest + # container_name: pipeline_chromadb + # ports: + # - "8010:8000" + # environment: + # - CHROMA_SERVER_HOST=0.0.0.0 + # - CHROMA_SERVER_HTTP_PORT=8000 + # - IS_PERSISTENT=TRUE + # - PERSIST_DIRECTORY=/chroma/chroma + # - ANONYMIZED_TELEMETRY=TRUE + # volumes: + # - chromadb_data:/chroma/chroma + # networks: + # - pipeline_network + # healthcheck: + # test: ["CMD-SHELL", "timeout 5 bash -c ' 0 + sample_file = frontend_files[0] if frontend_files else None + sample_path = sample_file.path if sample_file else "" + sample_content = getattr(sample_file, 'content', '')[:1000] if sample_file else "" + + # Allocate persona - prefer state management if state files exist + if has_state_files: + # Try to get state management persona + persona = allocate_code_persona("store/state.ts", sample_content, "frontend_state") + if "state" not in persona.get("role", "").lower(): + # Fallback to UI persona + persona = allocate_code_persona(sample_path, sample_content, "frontend_ui") + else: + persona = allocate_code_persona(sample_path, sample_content, "frontend_ui") + + assignment_context = f"CTO has assigned you to analyze the frontend codebase for this project. You are analyzing {len(frontend_files)} frontend files including components, routing, state management, and configuration." + front_end_prompt = f""" -You are a Senior Frontend Architect and Technical Writer with 20+ years of experience. Analyze this frontend codebase and produce a comprehensive, technically precise report. The audience includes senior engineers and stakeholders who expect evidence-based, objective findings. +Analyze this frontend codebase and produce a comprehensive, technically precise report. The audience includes senior engineers and stakeholders who expect evidence-based, objective findings. STRICT STYLE RULES: - Use professional, technical language only. Do not use analogies, metaphors, storytelling, or colloquial comparisons. @@ -7211,6 +7232,9 @@ FINAL REQUIREMENTS: - Ensure total length between 2000-3000 words. """ + # Enhance prompt with persona + enhanced_prompt = build_code_analysis_persona_prompt(front_end_prompt, persona, assignment_context) + try: print(f"🤖 [FRONTEND AI] Calling Claude API for comprehensive frontend analysis...") print(f"🤖 [FRONTEND AI] Analyzing {len(frontend_files)} frontend files...") @@ -7220,7 +7244,7 @@ FINAL REQUIREMENTS: model=os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest"), max_tokens=8000, # Increased from 6000 to 8000 for more detailed analysis temperature=0.1, - messages=[{"role": "user", "content": front_end_prompt}] + messages=[{"role": "user", "content": enhanced_prompt}] ) ai_analysis = message.content[0].text.strip() @@ -7230,7 +7254,7 @@ FINAL REQUIREMENTS: if not ai_analysis or len(ai_analysis) < 100: print("⚠️ [FRONTEND AI] AI analysis too short, regenerating...") # Retry with more emphasis on detail - retry_prompt = front_end_prompt + "\n\nIMPORTANT: Provide a VERY DETAILED analysis. The previous response was too short. Please provide at least 2000 words of detailed explanation." + retry_prompt = enhanced_prompt + "\n\nIMPORTANT: Provide a VERY DETAILED analysis. The previous response was too short. 
Please provide at least 2000 words of detailed explanation." message = self.client.messages.create( model=os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest"), max_tokens=8000, diff --git a/services/ai-analysis-service/enhanced_chunking.py b/services/ai-analysis-service/enhanced_chunking.py index 9aaba44..2dceb99 100644 --- a/services/ai-analysis-service/enhanced_chunking.py +++ b/services/ai-analysis-service/enhanced_chunking.py @@ -524,7 +524,11 @@ class ChunkAnalyzer: def _build_chunk_analysis_prompt(self, file_path: str, chunk: ChunkInfo, chunk_index: int, total_chunks: int, context_memories: Dict[str, Any]) -> str: - """Build comprehensive analysis prompt for a chunk.""" + """Build comprehensive analysis prompt for a chunk with persona.""" + from persona_system import allocate_code_persona, build_code_analysis_persona_prompt + + # Allocate persona based on file path and chunk content + persona = allocate_code_persona(file_path, chunk.content, chunk.chunk_type) # Build context information context_info = "" @@ -538,8 +542,10 @@ class ChunkAnalyzer: for practice in context_memories['best_practices'][:3]: context_info += f"- {practice['content'][:100]}...\n" + assignment_context = f"CTO has assigned you to analyze chunk {chunk_index + 1} of {total_chunks} from file: {file_path}. This is a {chunk.chunk_type} chunk covering lines {chunk.start_line}-{chunk.end_line}." + prompt = f""" -You are a senior software engineer analyzing chunk {chunk_index + 1} of {total_chunks} from file: {file_path} +Analyzing chunk {chunk_index + 1} of {total_chunks} from file: {file_path} CHUNK INFORMATION: - Chunk Type: {chunk.chunk_type} @@ -564,7 +570,10 @@ Provide a focused analysis of this specific chunk, considering: Focus on actionable insights for this specific code section. """ - return prompt + + # Enhance with persona + enhanced_prompt = build_code_analysis_persona_prompt(prompt, persona, assignment_context) + return enhanced_prompt def _detect_language_from_path(self, file_path: str) -> str: """Detect language from file path.""" diff --git a/services/ai-analysis-service/persona_system.py b/services/ai-analysis-service/persona_system.py new file mode 100644 index 0000000..7ea28b1 --- /dev/null +++ b/services/ai-analysis-service/persona_system.py @@ -0,0 +1,755 @@ +""" +World-Class Persona System for AI Analysis +Simulates real-world team allocation with domain-specific experts from top companies. 
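+
+Typical usage (illustrative only; the file path and variable names below are placeholders, not part of the calling services):
+
+    persona = allocate_code_persona("src/api/routes.py", file_text, "module")
+    prompt = build_code_analysis_persona_prompt(base_prompt, persona,
+                                                "CTO has assigned you to review the API layer.")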
+""" + +from typing import Dict, List, Optional, Tuple +import re + + +# ============================================================================ +# CODE ANALYSIS PERSONAS (for AI Analysis Service) +# ============================================================================ + +CODE_ANALYSIS_PERSONAS = { + # BACKEND DOMAINS + "backend_api": { + "role": "Senior Backend API Architect", + "companies": ["Google", "Amazon", "Stripe"], + "expertise": ["REST APIs", "GraphQL", "gRPC", "API Gateway", "Microservices"], + "experience_years": "18+", + "achievements": [ + "Designed APIs at Google Cloud Platform handling 10M+ requests/day", + "Built scalable API infrastructure at Amazon AWS serving millions of customers", + "Led API architecture at Stripe processing billions in transactions" + ], + "detection_keywords": ["api", "controller", "route", "endpoint", "service", "rest", "graphql"], + "focus_areas": [ + "API design patterns and best practices", + "API versioning and backward compatibility", + "Rate limiting and throttling strategies", + "API documentation quality", + "Security vulnerabilities in API endpoints" + ] + }, + + "backend_database": { + "role": "Senior Database Architect", + "companies": ["Amazon", "Oracle", "MongoDB"], + "expertise": ["SQL", "NoSQL", "Database Design", "Query Optimization", "Data Modeling"], + "experience_years": "20+", + "achievements": [ + "Designed database systems at Amazon handling petabytes of data", + "Optimized databases at Oracle for enterprise-scale applications", + "Built distributed databases at MongoDB for global scale" + ], + "detection_keywords": ["database", "db", "model", "schema", "migration", "repository", "orm", "query"], + "focus_areas": [ + "Database schema design and normalization", + "Query performance and optimization", + "Data integrity and constraints", + "Indexing strategies", + "Transaction management" + ] + }, + + "backend_business": { + "role": "Senior Backend Business Logic Architect", + "companies": ["Microsoft", "Salesforce", "SAP"], + "expertise": ["Business Logic", "Domain Modeling", "Design Patterns", "Service Layer"], + "experience_years": "17+", + "achievements": [ + "Architected business logic systems at Microsoft for enterprise applications", + "Designed domain models at Salesforce for CRM platforms", + "Built service layers at SAP for ERP systems" + ], + "detection_keywords": ["service", "business", "logic", "domain", "entity", "dto", "handler"], + "focus_areas": [ + "Code organization and structure", + "Design patterns implementation", + "Business logic maintainability", + "Domain modeling quality", + "Service layer architecture" + ] + }, + + # FRONTEND DOMAINS + "frontend_ui": { + "role": "Senior Frontend UI Architect", + "companies": ["Apple", "Meta", "Netflix"], + "expertise": ["React", "Vue", "Angular", "Component Design", "UI/UX"], + "experience_years": "15+", + "achievements": [ + "Built user interfaces at Apple used by millions daily", + "Led React architecture at Meta (Facebook) for large-scale applications", + "Designed performance-optimized UIs at Netflix for 200M+ users" + ], + "detection_keywords": ["component", "ui", "view", "page", "jsx", "tsx", "vue", "template"], + "focus_areas": [ + "Component architecture and reusability", + "User experience and accessibility", + "UI performance optimization", + "Design system consistency", + "Responsive design implementation" + ] + }, + + "frontend_state": { + "role": "Senior Frontend State Management Architect", + "companies": ["Meta", "Netflix", "Airbnb"], + 
"expertise": ["Redux", "Zustand", "Context API", "State Management", "Data Flow"], + "experience_years": "14+", + "achievements": [ + "Architected state management at Meta for complex applications", + "Designed data flow patterns at Netflix for real-time updates", + "Built state systems at Airbnb for booking platforms" + ], + "detection_keywords": ["store", "state", "redux", "context", "recoil", "zustand", "mobx"], + "focus_areas": [ + "State architecture and patterns", + "Data flow optimization", + "State synchronization", + "Performance in state updates", + "State management best practices" + ] + }, + + # DEVOPS DOMAINS + "devops_ci_cd": { + "role": "Senior DevOps CI/CD Architect", + "companies": ["Google", "Netflix", "Uber"], + "expertise": ["CI/CD", "Jenkins", "GitHub Actions", "GitLab CI", "Deployment Automation"], + "experience_years": "12+", + "achievements": [ + "Built CI/CD pipelines at Google handling 50K+ deployments/day", + "Designed deployment systems at Netflix for zero-downtime releases", + "Architected automation at Uber for global scale" + ], + "detection_keywords": ["ci", "cd", "pipeline", "jenkins", "github-actions", "gitlab", "deploy"], + "focus_areas": [ + "CI/CD pipeline efficiency", + "Deployment strategy and automation", + "Quality gates and testing", + "Rollback strategies", + "Build optimization" + ] + }, + + "devops_infrastructure": { + "role": "Senior Infrastructure Architect", + "companies": ["Amazon", "Google", "Microsoft"], + "expertise": ["Kubernetes", "Docker", "Terraform", "Cloud Infrastructure", "Scalability"], + "experience_years": "16+", + "achievements": [ + "Designed infrastructure at Amazon AWS for global scale", + "Built container orchestration at Google for millions of containers", + "Architected cloud systems at Microsoft Azure with 99.99% uptime" + ], + "detection_keywords": ["docker", "kubernetes", "terraform", "infrastructure", "cloud", "aws", "gcp", "azure"], + "focus_areas": [ + "Infrastructure scalability", + "System reliability and uptime", + "Cost optimization", + "Security in infrastructure", + "Monitoring and observability" + ] + }, + + # SECURITY DOMAINS + "security_engineer": { + "role": "Senior Security Engineer", + "companies": ["Google", "Microsoft", "Cloudflare"], + "expertise": ["Security", "Vulnerability Assessment", "Penetration Testing", "Security Architecture"], + "experience_years": "15+", + "achievements": [ + "Led security initiatives at Google protecting billions of users", + "Designed security systems at Microsoft for enterprise applications", + "Built security infrastructure at Cloudflare for DDoS protection" + ], + "detection_keywords": ["security", "auth", "encryption", "jwt", "oauth", "ssl", "tls", "cors"], + "focus_areas": [ + "Security vulnerabilities and threats", + "Authentication and authorization", + "Data encryption and protection", + "Security best practices", + "Compliance and regulations" + ] + }, + + # DATA DOMAINS + "data_engineer": { + "role": "Senior Data Engineer", + "companies": ["Google", "Netflix", "Uber"], + "expertise": ["Data Pipelines", "ETL", "Big Data", "Data Warehousing", "Spark"], + "experience_years": "13+", + "achievements": [ + "Built data pipelines at Google processing petabytes daily", + "Designed ETL systems at Netflix for real-time analytics", + "Architected data infrastructure at Uber for millions of rides" + ], + "detection_keywords": ["data", "pipeline", "etl", "warehouse", "spark", "hadoop", "kafka"], + "focus_areas": [ + "Data architecture and pipelines", + "ETL performance and 
optimization", + "Data quality and validation", + "Scalability in data processing", + "Data governance" + ] + }, + + "ml_engineer": { + "role": "Senior ML/AI Engineer", + "companies": ["OpenAI", "Anthropic", "Google DeepMind"], + "expertise": ["Machine Learning", "Deep Learning", "AI Systems", "Model Training"], + "experience_years": "12+", + "achievements": [ + "Developed ML models at OpenAI for language understanding", + "Built AI systems at Anthropic for safety-critical applications", + "Designed training pipelines at Google DeepMind for large-scale models" + ], + "detection_keywords": ["ml", "ai", "model", "training", "neural", "tensorflow", "pytorch", "learning"], + "focus_areas": [ + "ML model architecture", + "Training pipeline optimization", + "Model performance and accuracy", + "Scalability in ML systems", + "AI safety and ethics" + ] + }, + + # TESTING DOMAINS + "qa_automation": { + "role": "Senior QA Automation Architect", + "companies": ["Google", "Microsoft", "Amazon"], + "expertise": ["Test Automation", "Selenium", "Cypress", "Jest", "Testing Strategy"], + "experience_years": "14+", + "achievements": [ + "Built test automation at Google for thousands of test cases", + "Designed testing frameworks at Microsoft for enterprise software", + "Architected QA systems at Amazon for e-commerce platforms" + ], + "detection_keywords": ["test", "spec", "jest", "cypress", "selenium", "pytest", "testing"], + "focus_areas": [ + "Test coverage and quality", + "Automation strategy", + "Test maintainability", + "Performance testing", + "Testing best practices" + ] + }, + + "performance_engineer": { + "role": "Senior Performance Engineer", + "companies": ["Google", "Netflix", "Amazon"], + "expertise": ["Performance Optimization", "Load Testing", "Profiling", "Scalability"], + "experience_years": "16+", + "achievements": [ + "Optimized systems at Google handling billions of requests", + "Designed performance solutions at Netflix for streaming at scale", + "Built performance infrastructure at Amazon for peak traffic" + ], + "detection_keywords": ["performance", "load", "stress", "benchmark", "profiling", "optimization"], + "focus_areas": [ + "Performance bottlenecks", + "Optimization strategies", + "Scalability concerns", + "Resource utilization", + "Performance testing" + ] + }, + + # CTO (for synthesis) + "cto": { + "role": "Chief Technology Officer", + "companies": ["Google", "Microsoft", "Amazon"], + "expertise": ["Strategic Planning", "System Architecture", "Team Leadership", "Technology Strategy"], + "experience_years": "25+", + "achievements": [ + "Former VP of Engineering at Google, leading teams of 500+ engineers", + "CTO at Microsoft Azure, responsible for cloud infrastructure strategy", + "Strategic advisor at Amazon Web Services for enterprise architecture" + ], + "focus_areas": [ + "Strategic technology insights", + "System-wide risk assessment", + "Architectural recommendations", + "Cross-domain synthesis", + "Executive-level analysis" + ] + } +} + + +# ============================================================================ +# DOCUMENT ANALYSIS PERSONAS (for Multi-Document Upload Service) +# ============================================================================ + +DOCUMENT_ANALYSIS_PERSONAS = { + "technical_doc_analyst": { + "role": "Senior Technical Documentation Analyst", + "companies": ["Google", "Stripe", "Microsoft"], + "expertise_domain": "technical documentation and API specifications", + "document_types": ["API docs", "technical specs", "developer guides"], + 
"experience_years": "15+", + "achievements": [ + "Analyzed technical documentation at Google for millions of API integrations", + "Led documentation analysis at Stripe for developer experience", + "Mapped technical relationships at Microsoft for enterprise systems" + ], + "focus_areas": [ + "Technical dependencies and relationships", + "System integration points", + "API contract relationships", + "Technical process flows", + "Code-to-documentation mappings" + ], + "visual_focus_areas": [ + "API flow diagrams", + "System integration diagrams", + "Technical architecture flows" + ], + "detection_keywords": ["api", "technical", "specification", "documentation", "guide", "reference", "developer"] + }, + + "business_process_analyst": { + "role": "Senior Business Process Analyst", + "companies": ["McKinsey", "Deloitte", "Accenture"], + "expertise_domain": "business processes and stakeholder requirements", + "document_types": ["business requirements", "user stories", "business plans"], + "experience_years": "18+", + "achievements": [ + "Analyzed business processes at McKinsey for Fortune 500 companies", + "Led process mapping at Deloitte for enterprise transformations", + "Mapped stakeholder relationships at Accenture for global projects" + ], + "focus_areas": [ + "Business process flows", + "Requirement dependencies", + "Stakeholder impact chains", + "Business decision consequences", + "Organizational impact analysis" + ], + "visual_focus_areas": [ + "Business process diagrams", + "Stakeholder impact maps", + "Decision flowcharts" + ], + "detection_keywords": ["business", "requirement", "stakeholder", "user story", "process", "workflow", "business plan"] + }, + + "system_architecture_analyst": { + "role": "Senior System Architecture Document Analyst", + "companies": ["Google", "Amazon", "Microsoft"], + "expertise_domain": "system architecture and design documents", + "document_types": ["architecture docs", "design documents", "system designs"], + "experience_years": "20+", + "achievements": [ + "Analyzed architecture documents at Google for large-scale distributed systems", + "Mapped system relationships at Amazon for cloud infrastructure", + "Led architecture analysis at Microsoft for enterprise solutions" + ], + "focus_areas": [ + "Architecture relationships", + "Component dependencies", + "System interaction flows", + "Design decision impacts", + "Scalability relationships" + ], + "visual_focus_areas": [ + "Architecture diagrams", + "Component interaction diagrams", + "System dependency maps" + ], + "detection_keywords": ["architecture", "design", "system", "component", "diagram", "architectural"] + }, + + "requirements_analyst": { + "role": "Senior Requirements & Specification Analyst", + "companies": ["IBM", "Oracle", "SAP"], + "expertise_domain": "requirements and functional specifications", + "document_types": ["requirements docs", "functional specs", "feature specs"], + "experience_years": "17+", + "achievements": [ + "Analyzed requirements at IBM for enterprise software implementations", + "Mapped specifications at Oracle for database systems", + "Led requirement analysis at SAP for ERP platforms" + ], + "focus_areas": [ + "Requirement dependencies", + "Feature relationships", + "Specification impacts", + "Change propagation", + "Implementation dependencies" + ], + "visual_focus_areas": [ + "Requirement traceability diagrams", + "Feature dependency maps", + "Impact analysis charts" + ], + "detection_keywords": ["requirement", "specification", "feature", "functional", "traceability", 
"spec"] + }, + + "process_flow_analyst": { + "role": "Senior Process Flow Analyst", + "companies": ["Amazon", "Netflix", "Uber"], + "expertise_domain": "operational processes and workflows", + "document_types": ["process docs", "workflows", "operational manuals"], + "experience_years": "14+", + "achievements": [ + "Analyzed processes at Amazon for fulfillment operations", + "Mapped workflows at Netflix for content delivery", + "Led process analysis at Uber for ride-sharing operations" + ], + "focus_areas": [ + "Process step relationships", + "Workflow dependencies", + "Sequential cause-effects", + "Decision impacts", + "Operational dependencies" + ], + "visual_focus_areas": [ + "Process flowcharts", + "Workflow diagrams", + "Decision trees", + "Operational flow maps" + ], + "detection_keywords": ["process", "workflow", "procedure", "operational", "manual", "step", "flow"] + }, + + "visual_architecture_analyst": { + "role": "Senior Visual Architecture Analyst", + "companies": ["Google", "Microsoft", "Apple"], + "expertise_domain": "visual diagrams and architecture drawings", + "document_types": ["diagrams", "flowcharts", "architecture drawings"], + "experience_years": "16+", + "achievements": [ + "Analyzed visual diagrams at Google for complex system mappings", + "Mapped architecture drawings at Microsoft for enterprise solutions", + "Led visual analysis at Apple for product architecture" + ], + "focus_areas": [ + "Visual relationship extraction", + "Diagram dependency mapping", + "Flow analysis", + "Component interactions", + "Visual pattern recognition" + ], + "visual_focus_areas": [ + "All types of visual diagrams", + "Architecture drawings", + "Flowcharts and process diagrams", + "Component and sequence diagrams" + ], + "detection_keywords": ["diagram", "flowchart", "visual", "drawing", "chart", "map", "image"] + } +} + + +# ============================================================================ +# DOCUMENT TYPE MAPPING +# ============================================================================ + +DOCUMENT_PERSONA_MAPPING = { + # Technical Documents + "api_documentation": "technical_doc_analyst", + "technical_specification": "technical_doc_analyst", + "code_documentation": "technical_doc_analyst", + "developer_guide": "technical_doc_analyst", + + # Business Documents + "business_requirements": "business_process_analyst", + "user_stories": "business_process_analyst", + "business_plan": "business_process_analyst", + "product_specification": "business_process_analyst", + "stakeholder_document": "business_process_analyst", + + # Architecture Documents + "architecture_document": "system_architecture_analyst", + "system_design": "system_architecture_analyst", + "design_document": "system_architecture_analyst", + "technical_design": "system_architecture_analyst", + + # Requirements Documents + "requirements_document": "requirements_analyst", + "functional_specification": "requirements_analyst", + "feature_specification": "requirements_analyst", + + # Process Documents + "process_document": "process_flow_analyst", + "workflow_document": "process_flow_analyst", + "procedure_guide": "process_flow_analyst", + "operational_manual": "process_flow_analyst", + + # Visual/Diagram Documents + "architecture_diagram": "visual_architecture_analyst", + "flowchart": "visual_architecture_analyst", + "sequence_diagram": "visual_architecture_analyst", + "component_diagram": "visual_architecture_analyst", + "process_diagram": "visual_architecture_analyst", + "system_diagram": 
"visual_architecture_analyst", +} + + +# ============================================================================ +# PERSONA ALLOCATION FUNCTIONS +# ============================================================================ + +def allocate_code_persona(file_path: str, content: str, chunk_type: str = "module") -> Dict: + """ + Intelligently allocates code analysis persona based on file path, content, and type. + Returns persona config with prompt context. + """ + file_lower = file_path.lower() + content_lower = content.lower()[:2000] if content else "" # Sample content + + # Score each persona based on detection rules + persona_scores = {} + + for persona_id, persona_config in CODE_ANALYSIS_PERSONAS.items(): + if persona_id == "cto": # Skip CTO for individual analysis + continue + + score = 0 + detection_keywords = persona_config.get("detection_keywords", []) + + # Check file path (higher weight) + for keyword in detection_keywords: + if keyword in file_lower: + score += 15 + + # Check content (medium weight) + for keyword in detection_keywords: + if keyword in content_lower: + score += 8 + + # Check chunk type + if chunk_type and chunk_type.lower() in detection_keywords: + score += 10 + + # Domain-specific boosts + if "test" in file_lower and "qa" in persona_id: + score += 20 + if "security" in file_lower and "security" in persona_id: + score += 20 + if "performance" in file_lower and "performance" in persona_id: + score += 20 + + if score > 0: + persona_scores[persona_id] = score + + # Select top persona + if persona_scores: + selected_id = max(persona_scores, key=persona_scores.get) + return CODE_ANALYSIS_PERSONAS[selected_id] + + # Default fallback to backend business logic + return CODE_ANALYSIS_PERSONAS.get("backend_business", {}) + + +def allocate_document_persona(file_path: str, content: str, file_type: str = "text") -> Dict: + """ + Intelligently allocates document analysis persona based on file path, content, and type. + Returns persona config for document analysis. 
+ """ + file_lower = file_path.lower() + content_lower = content.lower()[:2000] if content else "" + + # Check if it's an image/diagram + if file_type == "image" or any(ext in file_lower for ext in [".png", ".jpg", ".jpeg", ".gif", ".svg", ".pdf"]): + return DOCUMENT_ANALYSIS_PERSONAS.get("visual_architecture_analyst", {}) + + # Score each persona based on detection rules + persona_scores = {} + + for persona_id, persona_config in DOCUMENT_ANALYSIS_PERSONAS.items(): + score = 0 + detection_keywords = persona_config.get("detection_keywords", []) + + # Check file path (higher weight) + for keyword in detection_keywords: + if keyword in file_lower: + score += 15 + + # Check content (medium weight) + for keyword in detection_keywords: + if keyword in content_lower: + score += 8 + + # Check document type mapping + for doc_type, mapped_persona in DOCUMENT_PERSONA_MAPPING.items(): + if doc_type in file_lower and mapped_persona == persona_id: + score += 20 + + if score > 0: + persona_scores[persona_id] = score + + # Select top persona + if persona_scores: + selected_id = max(persona_scores, key=persona_scores.get) + return DOCUMENT_ANALYSIS_PERSONAS[selected_id] + + # Default fallback to technical doc analyst + return DOCUMENT_ANALYSIS_PERSONAS.get("technical_doc_analyst", {}) + + +def get_cto_persona() -> Dict: + """Returns CTO persona for synthesis and high-level analysis.""" + return CODE_ANALYSIS_PERSONAS.get("cto", {}) + + +# ============================================================================ +# PROMPT BUILDING FUNCTIONS +# ============================================================================ + +def build_persona_intro(persona: Dict, assignment_context: str = "", analysis_type: str = "code") -> str: + """ + Builds persona introduction section for prompts. + Works for both code and document analysis. + """ + if not persona: + return "" + + role = persona.get("role", "Senior Engineer") + companies = persona.get("companies", []) + experience = persona.get("experience_years", "15+") + achievements = persona.get("achievements", []) + focus_areas = persona.get("focus_areas", []) + + # Build company background + company_bg = "" + if companies: + company_bg = f"- Previously worked at {', '.join(companies[:2])}" + if len(companies) > 2: + company_bg += f" and {companies[2]}" + + # Build achievements section + achievements_text = "" + if achievements: + achievements_text = "\n".join([f"- {achievement}" for achievement in achievements[:2]]) + + # Build focus areas + focus_text = "" + if focus_areas: + focus_text = "\n".join([f"- {focus}" for focus in focus_areas[:5]]) + + intro = f"""You are {role} with {experience} years of experience. + +COMPANY BACKGROUND: +{company_bg} + +KEY ACHIEVEMENTS: +{achievements_text} + +YOUR ASSIGNMENT: +{assignment_context if assignment_context else 'Analyze the provided code/document for quality, issues, and recommendations.'} + +YOUR FOCUS AREAS: +{focus_text} + +--- +""" + return intro + + +def build_code_analysis_persona_prompt(base_prompt: str, persona: Dict, + assignment_context: str = "") -> str: + """ + Enhances code analysis prompt with persona context. + """ + if not persona: + return base_prompt + + persona_intro = build_persona_intro(persona, assignment_context, "code") + return persona_intro + base_prompt + + +def build_document_analysis_persona_prompt(base_prompt: str, persona: Dict, + document_type: str = "document", + assignment_context: str = "") -> str: + """ + Enhances document analysis prompt with persona context. 
+ """ + if not persona: + return base_prompt + + role = persona.get("role", "Senior Analyst") + companies = persona.get("companies", []) + expertise_domain = persona.get("expertise_domain", "document analysis") + experience = persona.get("experience_years", "15+") + achievements = persona.get("achievements", []) + focus_areas = persona.get("focus_areas", []) + + company_bg = f"- Previously worked at {', '.join(companies[:2])}" if companies else "" + achievements_text = "\n".join([f"- {achievement}" for achievement in achievements[:2]]) if achievements else "" + focus_text = "\n".join([f"- {focus}" for focus in focus_areas[:5]]) if focus_areas else "" + + intro = f"""You are {role}, a specialist in analyzing {expertise_domain} with {experience} years of experience. + +COMPANY BACKGROUND: +{company_bg} + +KEY ACHIEVEMENTS: +{achievements_text} + +YOUR SPECIALIZATION: +You excel at identifying: +{focus_text} + +YOUR ASSIGNMENT: +{assignment_context if assignment_context else f'Analyze this {document_type} to extract causal relationships and dependencies.'} + +--- +""" + return intro + base_prompt + + +def build_cto_synthesis_prompt(base_prompt: str, team_findings: List[Dict] = None) -> str: + """ + Builds CTO-level synthesis prompt with team allocation context. + """ + cto_persona = get_cto_persona() + + if not cto_persona: + return base_prompt + + role = cto_persona.get("role", "Chief Technology Officer") + companies = cto_persona.get("companies", []) + experience = cto_persona.get("experience_years", "25+") + achievements = cto_persona.get("achievements", []) + focus_areas = cto_persona.get("focus_areas", []) + + company_bg = f"- Former VP of Engineering at {companies[0] if companies else 'Google'}, leading teams of 500+ engineers" + if len(companies) > 1: + company_bg += f"\n- CTO at {companies[1]}, responsible for cloud infrastructure strategy" + + achievements_text = "\n".join([f"- {achievement}" for achievement in achievements[:2]]) if achievements else "" + focus_text = "\n".join([f"- {focus}" for focus in focus_areas[:5]]) if focus_areas else "" + + team_allocation = "" + if team_findings: + team_allocation = "\n\nTEAM ALLOCATION:\n" + team_allocation += "You have allocated your expert team to analyze different domains:\n" + for finding in team_findings[:5]: + domain = finding.get("domain", "unknown") + team_allocation += f"- {domain}: Expert analysis completed\n" + + intro = f"""You are {role} with {experience} years of experience. + +COMPANY BACKGROUND: +{company_bg} + +KEY ACHIEVEMENTS: +{achievements_text} +{team_allocation} + +YOUR ROLE: +You have received this project and allocated your expert team to analyze different domains. +Now, synthesize all team findings into strategic recommendations. + +YOUR FOCUS AREAS: +{focus_text} + +--- +""" + return intro + base_prompt + diff --git a/services/ai-analysis-service/server.py b/services/ai-analysis-service/server.py index 9e1998f..7af0750 100644 --- a/services/ai-analysis-service/server.py +++ b/services/ai-analysis-service/server.py @@ -2673,8 +2673,10 @@ def build_intelligent_chunk_prompt(chunk: Dict, analysis_state: Optional[Dict] = """ Build comprehensive prompt for analyzing a semantically grouped chunk. Generates detailed module-level analysis with context awareness. - Now includes progressive context from previous chunks. + Now includes progressive context from previous chunks and world-class persona. 
""" + from persona_system import allocate_code_persona, build_code_analysis_persona_prompt + chunk_name = chunk.get('name', 'unknown') chunk_type = chunk.get('chunk_type', 'module') files_batch = chunk.get('files', []) @@ -2694,15 +2696,22 @@ def build_intelligent_chunk_prompt(chunk: Dict, analysis_state: Optional[Dict] = optimized_files.append((file_path, optimized_content)) + # Allocate appropriate persona based on files in chunk + # Use the first file to determine persona (or combine if multiple domains) + primary_file_path = optimized_files[0][0] if optimized_files else "" + primary_content = optimized_files[0][1] if optimized_files else "" + persona = allocate_code_persona(primary_file_path, primary_content, chunk_type) + # Build context from previous analyses (progressive learning) context_section = build_context_from_state(analysis_state, chunk) + # Build assignment context + assignment_context = f"CTO has assigned you to analyze the '{chunk_name}' module/chunk for this project. This is a {chunk_type} type chunk containing {len(optimized_files)} files." + # Build comprehensive prompt with module context prompt_parts = [ f"# COMPREHENSIVE ANALYSIS: {chunk_name.upper()}", f"Chunk Type: {chunk_type}", - "", - "You are a senior software architect with 30+ years of experience. Analyze this module/chunk comprehensively.", "" ] @@ -2794,7 +2803,12 @@ def build_intelligent_chunk_prompt(chunk: Dict, analysis_state: Optional[Dict] = "Focus on providing detailed, actionable insights that help understand the complete module context." ]) - return "\n".join(prompt_parts) + base_prompt = "\n".join(prompt_parts) + + # Enhance with persona + enhanced_prompt = build_code_analysis_persona_prompt(base_prompt, persona, assignment_context) + + return enhanced_prompt def build_smart_batch_prompt(files_batch: List[Tuple[str, str]]) -> str: """Legacy function: Build prompt for simple batch (backward compatibility).""" @@ -4719,13 +4733,13 @@ def build_synthesis_prompt(analysis_state: Dict, all_chunk_analyses: List[Dict] """ Build comprehensive prompt for cross-module synthesis analysis. Synthesizes all individual module analyses into system-level insights. + Uses CTO persona for executive-level synthesis. """ + from persona_system import get_cto_persona, build_cto_synthesis_prompt + prompt_parts = [ "# CROSS-MODULE SYNTHESIS ANALYSIS", "", - "You are a senior software architect with 30+ years of experience. Your task is to synthesize", - "findings from multiple module-level analyses into comprehensive system-level insights.", - "", "## CONTEXT: PREVIOUSLY ANALYZED MODULES", "" ] @@ -4842,7 +4856,19 @@ def build_synthesis_prompt(analysis_state: Dict, all_chunk_analyses: List[Dict] "across all analyzed modules, not just repeating individual module findings." 
]) - return "\n".join(prompt_parts) + base_prompt = "\n".join(prompt_parts) + + # Get team findings for CTO context + team_findings = [] + if all_chunk_analyses: + for chunk_analysis in all_chunk_analyses: + module_name = chunk_analysis.get('module_name', 'unknown') + team_findings.append({"domain": module_name, "analysis": chunk_analysis}) + + # Enhance with CTO persona + enhanced_prompt = build_cto_synthesis_prompt(base_prompt, team_findings) + + return enhanced_prompt def parse_synthesis_response(response_text: str) -> Dict: """Parse synthesis response from Claude API.""" diff --git a/services/git-integration/src/routes/github-oauth.js b/services/git-integration/src/routes/github-oauth.js index 985161d..a589511 100644 --- a/services/git-integration/src/routes/github-oauth.js +++ b/services/git-integration/src/routes/github-oauth.js @@ -141,17 +141,19 @@ router.get('/auth/github/callback', async (req, res) => { setImmediate(async () => { try { console.log('[GitHub OAuth] Starting background repository attachment for:', repoContext.repoUrl); + console.log('[GitHub OAuth] Using newly stored token for user:', user_id); const GitHubIntegrationService = require('../services/github-integration.service'); const database = require('../config/database'); const githubService = new GitHubIntegrationService(); const { owner, repo, branch } = githubService.parseGitHubUrl(repoContext.repoUrl); - // Get metadata using authenticated Octokit - const repositoryData = await githubService.fetchRepositoryMetadata(owner, repo); + // Get metadata using authenticated Octokit with the specific user's token + // Pass userId to ensure we use the newly stored token + const repositoryData = await githubService.fetchRepositoryMetadata(owner, repo, false, user_id); let actualBranch = repoContext.branchName || branch || repositoryData.default_branch || 'main'; - // Attempt analysis and sync with fallback - const codebaseAnalysis = await githubService.analyzeCodebase(owner, repo, actualBranch, false); + // Attempt analysis and sync with fallback - use userId to ensure correct token + const codebaseAnalysis = await githubService.analyzeCodebase(owner, repo, actualBranch, false, user_id); const insertQuery = ` INSERT INTO all_repositories ( repository_url, repository_name, owner_name, @@ -170,14 +172,14 @@ router.get('/auth/github/callback', async (req, res) => { JSON.stringify(codebaseAnalysis), 'syncing', repositoryData.visibility === 'private', - repoContext.userId || null, + user_id || repoContext.userId || null, // Use user_id from OAuth callback (most reliable) 'github' // This is GitHub OAuth callback, so provider is always github ]; const insertResult = await database.query(insertQuery, insertValues); const repositoryRecord = insertResult.rows[0]; - // Clone repository - const downloadResult = await githubService.syncRepositoryWithFallback(owner, repo, actualBranch, repositoryRecord.id, repositoryData.visibility !== 'private'); + // Clone repository - use userId to ensure correct token + const downloadResult = await githubService.syncRepositoryWithFallback(owner, repo, actualBranch, repositoryRecord.id, repositoryData.visibility !== 'private', user_id); const finalSyncStatus = downloadResult.success ? 
'synced' : 'error'; await database.query('UPDATE all_repositories SET sync_status = $1, updated_at = NOW() WHERE id = $2', [finalSyncStatus, repositoryRecord.id]); diff --git a/services/git-integration/src/routes/vcs.routes.js b/services/git-integration/src/routes/vcs.routes.js index dfe6770..528fee7 100644 --- a/services/git-integration/src/routes/vcs.routes.js +++ b/services/git-integration/src/routes/vcs.routes.js @@ -162,13 +162,29 @@ router.post('/:provider/attach-repository', async (req, res) => { const provider = getProvider(req); const { template_id, repository_url, branch_name } = req.body; const userId = req.headers['x-user-id'] || req.query.user_id || req.body.user_id || (req.user && (req.user.id || req.user.userId)); + + console.log(`[VCS Attach] Extracted userId:`, userId, `from headers:`, req.headers['x-user-id'], `query:`, req.query.user_id, `body:`, req.body.user_id); // Validate input - only repository_url is required (like GitHub) if (!repository_url) { return res.status(400).json({ success: false, message: 'Repository URL is required' }); } - const { owner, repo, branch } = provider.parseRepoUrl(repository_url); + // Clean and normalize the repository URL (trim whitespace, decode URL encoding) + let cleanedUrl = repository_url.trim(); + // Decode URL-encoded characters (like %20 for spaces) + try { + cleanedUrl = decodeURIComponent(cleanedUrl); + } catch (e) { + // If decoding fails, use original URL + console.warn(`[VCS Attach] Failed to decode URL, using original: ${cleanedUrl}`); + } + // Trim again after decoding + cleanedUrl = cleanedUrl.trim(); + + console.log(`[VCS Attach] Original URL: ${repository_url}, Cleaned URL: ${cleanedUrl}`); + + const { owner, repo, branch } = provider.parseRepoUrl(cleanedUrl); // Enhanced flow: Detect private repos and redirect to OAuth immediately const providerKey = (req.params.provider || '').toLowerCase(); @@ -247,8 +263,45 @@ router.post('/:provider/attach-repository', async (req, res) => { // For public repos or authenticated private repos, proceed with normal flow const accessCheck = await provider.checkRepositoryAccess(owner, repo, userId); + + console.log(`[VCS Attach] Access check result for ${owner}/${repo}:`, { + hasAccess: accessCheck.hasAccess, + requiresAuth: accessCheck.requiresAuth, + authError: accessCheck.authError, + error: accessCheck.error, + exists: accessCheck.exists, + github_username: accessCheck.github_username + }); if (!accessCheck.hasAccess) { + // If access check failed but requires auth, trigger OAuth flow + if (accessCheck.requiresAuth || accessCheck.authError) { + const oauthService = getOAuthService(providerKey); + if (oauthService) { + console.log(`🔒 [VCS Attach] Token exists but cannot access repository (or no valid token), redirecting to OAuth: ${repository_url}`); + console.log(`🔒 [VCS Attach] Reason: ${accessCheck.error || 'Authentication required'}, userId: ${userId}`); + + // Generate OAuth URL with repository context in state + const stateBase = Math.random().toString(36).substring(7); + const state = `${stateBase}|uid=${userId || 'unknown'}|repo=${encodeURIComponent(repository_url)}|branch=${encodeURIComponent(branch_name || 'main')}|private_repo=true`; + + const authUrl = oauthService.getAuthUrl(state, userId); + + console.log(`🔒 [VCS Attach] Generated OAuth URL for ${providerKey}, returning requires_auth response`); + + return res.json({ + success: false, + message: `${providerKey.charAt(0).toUpperCase() + providerKey.slice(1)} authentication required for private repository`, + 
requires_auth: true, + is_private_repo: true, + auth_url: authUrl, + state: state + }); + } + } + + // If it's not an auth issue, return 404 + console.log(`[VCS Attach] Access check failed without auth requirement, returning 404`); return res.status(404).json({ success: false, message: accessCheck.error || 'Repository not accessible' }); } diff --git a/services/git-integration/src/services/github-integration.service.js b/services/git-integration/src/services/github-integration.service.js index c602078..64348a9 100644 --- a/services/git-integration/src/services/github-integration.service.js +++ b/services/git-integration/src/services/github-integration.service.js @@ -21,8 +21,8 @@ class GitHubIntegrationService { } // Get authenticated Octokit instance - async getAuthenticatedOctokit() { - return await this.oauthService.getAuthenticatedOctokit(); + async getAuthenticatedOctokit(userId = null) { + return await this.oauthService.getAuthenticatedOctokit(userId); } // Extract owner, repo, and branch from GitHub URL using parse-github-url library @@ -31,8 +31,15 @@ class GitHubIntegrationService { throw new Error('URL must be a non-empty string'); } - // Normalize the URL first + // Normalize the URL first - trim and decode URL encoding let normalizedUrl = url.trim(); + // Decode URL-encoded characters (like %20 for spaces) + try { + normalizedUrl = decodeURIComponent(normalizedUrl).trim(); + } catch (e) { + // If decoding fails, just trim + normalizedUrl = normalizedUrl.trim(); + } // Remove trailing slashes and .git extensions normalizedUrl = normalizedUrl.replace(/\/+$/, '').replace(/\.git$/, ''); @@ -216,7 +223,7 @@ class GitHubIntegrationService { }; } - // No token found - try unauthenticated access first to check if it's public + // No token found that can access this repo - try unauthenticated access to check if it's public try { const unauthenticatedOctokit = new Octokit({ userAgent: 'CodeNuk-GitIntegration/1.0.0', @@ -234,13 +241,18 @@ class GitHubIntegrationService { }; } catch (unauthenticatedError) { if (unauthenticatedError.status === 404) { - // Repository truly doesn't exist + // 404 from unauthenticated access could mean: + // 1. Repository truly doesn't exist + // 2. 
Repository is private and requires authentication + // Since we already tried to find a token and none could access it, + // and we're being called from a private repo flow, assume it requires auth + console.log(`🔒 [GitHub] 404 from unauthenticated access - assuming private repo requires authentication`); return { - exists: false, + exists: null, // Unknown - could be missing or private isPrivate: null, hasAccess: false, - requiresAuth: false, - error: 'Repository not found' + requiresAuth: true, // Changed from false to true - trigger OAuth + error: 'Repository not found or requires authentication' }; } else if (unauthenticatedError.status === 401 || unauthenticatedError.status === 403) { // Repository exists but requires authentication (private) - generate auth URL @@ -289,13 +301,13 @@ class GitHubIntegrationService { } // Get repository information from GitHub - async fetchRepositoryMetadata(owner, repo, skipAuth = false) { + async fetchRepositoryMetadata(owner, repo, skipAuth = false, userId = null) { // If skipAuth is true, try with unauthenticated octokit first to check visibility let octokit; if (skipAuth) { octokit = this.octokit; // Use unauthenticated instance } else { - octokit = await this.getAuthenticatedOctokit(); + octokit = await this.getAuthenticatedOctokit(userId); } const safe = async (fn, fallback) => { @@ -309,26 +321,41 @@ class GitHubIntegrationService { let repoData; try { + console.log(`🔍 [GitHub] fetchRepositoryMetadata: skipAuth=${skipAuth}, calling octokit.repos.get for ${owner}/${repo}`); const response = await octokit.repos.get({ owner, repo }); - if (skipAuth) { - if (response.status === 401 || response.status === 403) { - throw new Error('Authentication required to access repository'); - } else if (response.status === 404) { - throw new Error('Repository not found'); - } - } repoData = response.data; + console.log(`✅ [GitHub] Successfully fetched repository data: ${repoData?.full_name || 'no full_name'}`); + + // Validate we got real data + if (!repoData || !repoData.full_name) { + console.log(`❌ [GitHub] Invalid repository data received, throwing error`); + throw new Error('Invalid repository data received'); + } } catch (error) { - console.log(`🔍 [GitHub] Error in fetchRepositoryMetadata:`, error.message, error.status); + // Check error status from various possible locations + const status = error.status || error.response?.status || error.code; + const errorMessage = error.message || ''; + const is404 = status === 404 || status === '404' || errorMessage.includes('404') || errorMessage.includes('Not Found'); + const isAuthError = status === 401 || status === 403 || status === '401' || status === '403'; + + console.log(`🔍 [GitHub] Error in fetchRepositoryMetadata CATCH BLOCK:`, errorMessage, `Status: ${status || 'unknown'}`, `is404: ${is404}`, `isAuthError: ${isAuthError}`, `skipAuth: ${skipAuth}`); + console.log(`🔍 [GitHub] Error object:`, JSON.stringify({ + status: error.status, + responseStatus: error.response?.status, + code: error.code, + message: error.message, + name: error.name + })); + if (skipAuth) { - // For GitHub, any error when skipAuth=true likely means private repo - if (error.status === 401 || error.status === 403 || error.status === 404) { - throw new Error('Authentication required to access repository'); - } - // For other errors, also assume private repo + // For GitHub, any error when skipAuth=true means private repo or doesn't exist + // Always throw authentication required - let the caller decide if it's truly missing or private + 
console.log(`🔒 [GitHub] skipAuth=true, THROWING authentication required error - NOT using safe fallback`); throw new Error('Authentication required to access repository'); } - // For other errors, use safe fallback + + // For authenticated requests, use safe fallback (but only if skipAuth is false) + console.log(`⚠️ [GitHub] skipAuth=false, using safe fallback`); repoData = await safe( async () => { const response = await octokit.repos.get({ owner, repo }); @@ -336,6 +363,12 @@ class GitHubIntegrationService { }, {} ); + + // If safe fallback also failed, throw + if (!repoData || !repoData.full_name) { + console.log(`❌ [GitHub] Safe fallback also failed, throwing Repository not found`); + throw new Error('Repository not found'); + } } const languages = await safe( @@ -364,7 +397,7 @@ class GitHubIntegrationService { } // Analyze codebase structure - async analyzeCodebase(owner, repo, branch, isPublicRepo = false) { + async analyzeCodebase(owner, repo, branch, isPublicRepo = false, userId = null) { try { // Use appropriate octokit instance based on repository type let octokit; @@ -374,8 +407,8 @@ class GitHubIntegrationService { userAgent: 'CodeNuk-GitIntegration/1.0.0', }); } else { - // For private repos, use authenticated octokit - octokit = await this.getAuthenticatedOctokit(); + // For private repos, use authenticated octokit with userId + octokit = await this.getAuthenticatedOctokit(userId); } // Get the commit SHA for the branch @@ -519,7 +552,7 @@ class GitHubIntegrationService { } // Git-based: clone or update local repo and re-index into DB - async syncRepositoryWithGit(owner, repo, branch, repositoryId, isPublicRepo = false) { + async syncRepositoryWithGit(owner, repo, branch, repositoryId, isPublicRepo = false, userId = null) { const database = require('../config/database'); const localPath = this.gitRepoService.getLocalRepoPath(owner, repo, branch); let storageRecord = null; @@ -544,7 +577,7 @@ class GitHubIntegrationService { console.warn(`Failed to clone public repo without auth: ${error.message}`); // Fallback to authenticated clone if available try { - const tokenRecord = await this.oauthService.getToken(); + const tokenRecord = userId ? await this.oauthService.getTokenForUser(userId) : await this.oauthService.getToken(); if (tokenRecord?.access_token) { repoPath = await this.gitRepoService.cloneIfMissingWithAuth( owner, @@ -560,7 +593,7 @@ class GitHubIntegrationService { } else { // For private repos, try authenticated clone first try { - const tokenRecord = await this.oauthService.getToken(); + const tokenRecord = userId ? await this.oauthService.getTokenForUser(userId) : await this.oauthService.getToken(); if (tokenRecord?.access_token) { repoPath = await this.gitRepoService.cloneIfMissingWithAuth( owner, @@ -628,7 +661,7 @@ class GitHubIntegrationService { try { // Try to ensure repo exists for the preferred branch try { - const tokenRecord = await this.oauthService.getToken().catch(() => null); + const tokenRecord = userId ? 
await this.oauthService.getTokenForUser(userId).catch(() => null) : await this.oauthService.getToken().catch(() => null); if (tokenRecord?.access_token) { repoPath = await this.gitRepoService.cloneIfMissingWithAuth(owner, repo, preferredBranch, 'github.com', tokenRecord.access_token, 'oauth2'); } else { @@ -637,7 +670,7 @@ class GitHubIntegrationService { } catch (cloneErr) { // If the branch doesn't exist (e.g., refs/heads not found), try the alternate branch try { - const tokenRecordAlt = await this.oauthService.getToken().catch(() => null); + const tokenRecordAlt = userId ? await this.oauthService.getTokenForUser(userId).catch(() => null) : await this.oauthService.getToken().catch(() => null); repoPath = tokenRecordAlt?.access_token ? await this.gitRepoService.cloneIfMissingWithAuth(owner, repo, alternateBranch, 'github.com', tokenRecordAlt.access_token, 'oauth2') : await this.gitRepoService.cloneIfMissing(owner, repo, alternateBranch); @@ -679,7 +712,7 @@ class GitHubIntegrationService { try { // Ensure repo exists similarly to diff flow try { - const tokenRecord = await this.oauthService.getToken().catch(() => null); + const tokenRecord = userId ? await this.oauthService.getTokenForUser(userId).catch(() => null) : await this.oauthService.getToken().catch(() => null); if (tokenRecord?.access_token) { repoPath = await this.gitRepoService.cloneIfMissingWithAuth(owner, repo, preferredBranch, 'github.com', tokenRecord.access_token, 'oauth2'); } else { @@ -687,7 +720,7 @@ class GitHubIntegrationService { } } catch (_) { try { - const tokenRecordAlt = await this.oauthService.getToken().catch(() => null); + const tokenRecordAlt = userId ? await this.oauthService.getTokenForUser(userId).catch(() => null) : await this.oauthService.getToken().catch(() => null); repoPath = tokenRecordAlt?.access_token ? 
await this.gitRepoService.cloneIfMissingWithAuth(owner, repo, alternateBranch, 'github.com', tokenRecordAlt.access_token, 'oauth2') : await this.gitRepoService.cloneIfMissing(owner, repo, alternateBranch); @@ -720,15 +753,15 @@ class GitHubIntegrationService { } // Try git-based sync first, fall back to GitHub API download on failure - async syncRepositoryWithFallback(owner, repo, branch, repositoryId, isPublicRepo = false) { + async syncRepositoryWithFallback(owner, repo, branch, repositoryId, isPublicRepo = false, userId = null) { // First attempt: full git clone/fetch and index - const gitResult = await this.syncRepositoryWithGit(owner, repo, branch, repositoryId, isPublicRepo); + const gitResult = await this.syncRepositoryWithGit(owner, repo, branch, repositoryId, isPublicRepo, userId); if (gitResult && gitResult.success) { return { method: 'git', ...gitResult }; } // Fallback: API-based download and storage - const apiResult = await this.downloadRepositoryWithStorage(owner, repo, branch, repositoryId, isPublicRepo); + const apiResult = await this.downloadRepositoryWithStorage(owner, repo, branch, repositoryId, isPublicRepo, userId); if (apiResult && apiResult.success) { return { method: 'api', ...apiResult, git_error: gitResult?.error }; } @@ -737,7 +770,7 @@ class GitHubIntegrationService { } // Download repository files locally and store in database - async downloadRepositoryWithStorage(owner, repo, branch, repositoryId, isPublicRepo = false) { + async downloadRepositoryWithStorage(owner, repo, branch, repositoryId, isPublicRepo = false, userId = null) { const targetDir = path.join( process.env.ATTACHED_REPOS_DIR, `${owner}__${repo}__${branch}` @@ -765,8 +798,8 @@ class GitHubIntegrationService { userAgent: 'CodeNuk-GitIntegration/1.0.0', }); } else { - // For private repos, use authenticated octokit - octokit = await this.getAuthenticatedOctokit(); + // For private repos, use authenticated octokit with userId + octokit = await this.getAuthenticatedOctokit(userId); } // Get the commit SHA for the branch diff --git a/services/git-integration/src/services/github-oauth.js b/services/git-integration/src/services/github-oauth.js index bf251ea..6960b7f 100644 --- a/services/git-integration/src/services/github-oauth.js +++ b/services/git-integration/src/services/github-oauth.js @@ -199,8 +199,16 @@ class GitHubOAuthService { } // Create authenticated Octokit instance - async getAuthenticatedOctokit() { - const tokenRecord = await this.getToken(); + async getAuthenticatedOctokit(userId = null) { + // If userId is provided, get the newest token for that user + // Otherwise, get the newest token overall + let tokenRecord; + if (userId) { + tokenRecord = await this.getTokenForUser(userId); + console.log(`[GitHub OAuth] Using token for user ${userId}: ${tokenRecord?.github_username || 'none'}`); + } else { + tokenRecord = await this.getToken(); + } if (!tokenRecord) { throw new Error('No GitHub token found. 
Please authenticate with GitHub first.'); diff --git a/services/git-integration/src/services/provider-registry.js b/services/git-integration/src/services/provider-registry.js index d842d80..8832ea6 100644 --- a/services/git-integration/src/services/provider-registry.js +++ b/services/git-integration/src/services/provider-registry.js @@ -15,7 +15,11 @@ class GithubAdapter { return this.impl.parseGitHubUrl(url); } - async checkRepositoryAccess(owner, repo) { + async checkRepositoryAccess(owner, repo, userId = null) { + // Use user-specific method if userId is provided + if (userId) { + return await this.impl.checkRepositoryAccessWithUser(owner, repo, userId); + } return await this.impl.checkRepositoryAccess(owner, repo); } diff --git a/services/multi-document-upload-service/.dockerignore b/services/multi-document-upload-service/.dockerignore new file mode 100644 index 0000000..5fc6e85 --- /dev/null +++ b/services/multi-document-upload-service/.dockerignore @@ -0,0 +1,58 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +*.egg-info/ +dist/ +build/ +*.egg + +# Virtual environments +venv/ +env/ +ENV/ +.venv + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# Documentation +*.md +!README.md + +# Testing +.pytest_cache/ +.coverage +htmlcov/ +*.log + +# Storage and temporary files +storage/ +*.tmp +*.temp + +# Git +.git/ +.gitignore + +# Docker +Dockerfile* +docker-compose*.yml +.dockerignore + +# Environment files +.env +.env.local +*.env + +# OS +.DS_Store +Thumbs.db + + diff --git a/services/multi-document-upload-service/Dockerfile b/services/multi-document-upload-service/Dockerfile index a741f09..3c36ac9 100644 --- a/services/multi-document-upload-service/Dockerfile +++ b/services/multi-document-upload-service/Dockerfile @@ -1,29 +1,60 @@ -FROM python:3.11-slim +# Build stage - install dependencies that require compilation +FROM python:3.11-slim as builder ENV PYTHONDONTWRITEBYTECODE=1 \ PYTHONUNBUFFERED=1 WORKDIR /app +# Install build dependencies only RUN apt-get update && \ apt-get install -y --no-install-recommends \ build-essential \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Copy and install Python dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir --user -r requirements.txt && \ + pip cache purge + +# Download SpaCy English model +RUN python -m spacy download en_core_web_sm + +# Runtime stage - minimal image with only runtime dependencies +FROM python:3.11-slim + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PYTHONPATH=/app/src \ + PATH=/root/.local/bin:$PATH \ + MULTI_DOC_STORAGE_ROOT=/app/storage \ + MULTI_DOC_CLAUDE_MODEL=claude-3-5-haiku-latest \ + CLAUDE_MODEL=claude-3-5-haiku-latest \ + PORT=8024 + +WORKDIR /app + +# Install only runtime dependencies (no build tools) +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ poppler-utils \ tesseract-ocr \ ffmpeg \ libmagic1 \ - && rm -rf /var/lib/apt/lists/* + curl \ + # Required for some Python packages at runtime + libgomp1 \ + libglib2.0-0 \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean -COPY requirements.txt . 
-RUN pip install --no-cache-dir -r requirements.txt +# Copy Python packages from builder stage (includes spacy model) +COPY --from=builder /root/.local /root/.local +# Copy application code COPY src ./src -ENV PYTHONPATH=/app/src \ - MULTI_DOC_STORAGE_ROOT=/app/storage \ - MULTI_DOC_CLAUDE_MODEL=claude-3-5-sonnet-20241022 \ - PORT=8024 - EXPOSE 8024 CMD ["sh", "-c", "uvicorn multi_document_upload_service.main:app --host 0.0.0.0 --port ${PORT:-8024}"] diff --git a/services/multi-document-upload-service/FIX_EMPTY_GRAPH.md b/services/multi-document-upload-service/FIX_EMPTY_GRAPH.md deleted file mode 100644 index 3110aa1..0000000 --- a/services/multi-document-upload-service/FIX_EMPTY_GRAPH.md +++ /dev/null @@ -1,144 +0,0 @@ -# Fix: Empty Graph in Neo4j (No Relationships Found) - -## Problem - -When querying Neo4j for `CAUSES` relationships, you get "(no changes, no records)" because: - -1. **PDF extraction failed** - Missing dependencies (`unstructured[pdf]`) -2. **0 relations extracted** - No text was extracted, so no analysis happened -3. **0 relations written** - Nothing was written to Neo4j (correct behavior) - -## Root Cause - -The service completed with 0 relations because: -- PDF file extraction failed: `partition_pdf() is not available because one or more dependencies are not installed` -- No text was extracted from the PDF -- No chunks were created -- No Claude analysis happened -- 0 relations were extracted -- 0 relations were written to Neo4j - -## Solution - -### Step 1: Update Dependencies - -The `requirements.txt` has been updated to include: -``` -unstructured[pdf]>=0.15.0 -unstructured[docx]>=0.15.0 -unstructured[pptx]>=0.15.0 -unstructured[xlsx]>=0.15.0 -``` - -### Step 2: Rebuild the Service - -```bash -cd /home/tech4biz/Desktop/prakash/codenuk/backend_new1/codenuk_backend_mine - -# Rebuild the service with new dependencies -docker-compose build multi-document-upload-service - -# Restart the service -docker-compose restart multi-document-upload-service - -# Check logs to verify it's working -docker-compose logs -f multi-document-upload-service -``` - -### Step 3: Verify Dependencies - -```bash -# Check if unstructured[pdf] is installed -docker-compose exec multi-document-upload-service pip list | grep unstructured -``` - -### Step 4: Re-upload Documents - -1. Go to Project Builder in the frontend -2. Click on "Upload Documents for Knowledge Graph" -3. Upload a PDF or other document -4. Wait for processing to complete -5. Check Neo4j for relationships - -### Step 5: Check Neo4j - -Run these queries in Neo4j Browser: - -```cypher -// Check if any nodes exist -MATCH (n) -RETURN count(n) as node_count - -// Check for CAUSES relationships -MATCH (n:Concept)-[r:CAUSES]->(m:Concept) -RETURN n.name as cause, m.name as effect, r.confidence as confidence -LIMIT 50 -``` - -## Expected Behavior After Fix - -1. **PDF extraction succeeds** - Text is extracted from PDF files -2. **Text is chunked** - Document is split into manageable chunks -3. **Claude analyzes** - Causal relationships are extracted -4. **Relations are written** - Relationships are stored in Neo4j -5. **Query returns results** - Neo4j query shows relationships - -## Verification Steps - -1. **Check service logs**: - ```bash - docker-compose logs multi-document-upload-service | grep -i "extracted\|relation\|neo4j" - ``` - -2. **Check job status**: - ```bash - curl http://localhost:8000/api/multi-docs/jobs/{job_id} - ``` - Should show: `"processed_files": 1` and relations count > 0 - -3. 
**Check Neo4j**: - ```cypher - MATCH (n:Concept)-[r:CAUSES]->(m:Concept) - RETURN count(r) as relation_count - ``` - -## Improvements Made - -1. ✅ **Added PDF dependencies** - `unstructured[pdf]`, `unstructured[docx]`, etc. -2. ✅ **Added fallback extractors** - Uses `pdfplumber` if unstructured fails -3. ✅ **Better error handling** - Shows actual errors in job status -4. ✅ **Improved logging** - More detailed logs for debugging -5. ✅ **Better Neo4j query** - Validates data before writing - -## Troubleshooting - -If you still see 0 relations after rebuilding: - -1. **Check extraction logs**: - ```bash - docker-compose logs multi-document-upload-service | grep -i "extract" - ``` - -2. **Check Claude analysis**: - ```bash - docker-compose logs multi-document-upload-service | grep -i "claude\|analyze" - ``` - -3. **Check Neo4j connection**: - ```bash - docker-compose logs multi-document-upload-service | grep -i "neo4j\|graph" - ``` - -4. **Verify document has causal language**: - - Not all documents contain causal relationships - - Try uploading a document with clear cause-effect statements - - Example: "Smoking causes lung cancer" or "Rain causes flooding" - -## Next Steps - -1. Rebuild the service with new dependencies -2. Re-upload documents -3. Check Neo4j for relationships -4. If still no results, check service logs for errors -5. Verify the document contains causal language - diff --git a/services/multi-document-upload-service/NEO4J_DIAGNOSTIC_QUERIES.md b/services/multi-document-upload-service/NEO4J_DIAGNOSTIC_QUERIES.md deleted file mode 100644 index 1b96d67..0000000 --- a/services/multi-document-upload-service/NEO4J_DIAGNOSTIC_QUERIES.md +++ /dev/null @@ -1,176 +0,0 @@ -# Neo4j Diagnostic Queries - -## Issue: No relationships found in Neo4j - -If you're seeing "(no changes, no records)" when querying for `CAUSES` relationships, here are diagnostic queries to check what's actually in the database. - -## Diagnostic Queries - -### 1. Check if any nodes exist -```cypher -MATCH (n) -RETURN count(n) as node_count -LIMIT 1 -``` - -### 2. Check if Concept nodes exist -```cypher -MATCH (n:Concept) -RETURN count(n) as concept_count, - collect(DISTINCT labels(n)) as labels, - collect(DISTINCT keys(n)) as properties -LIMIT 10 -``` - -### 3. Check all relationship types -```cypher -CALL db.relationshipTypes() YIELD relationshipType -RETURN relationshipType -``` - -### 4. Check all node labels -```cypher -CALL db.labels() YIELD label -RETURN label -``` - -### 5. Check all relationships (any type) -```cypher -MATCH (n)-[r]->(m) -RETURN type(r) as relationship_type, - count(r) as count, - labels(n) as from_labels, - labels(m) as to_labels -LIMIT 50 -``` - -### 6. Check for CAUSES relationships specifically -```cypher -MATCH (n)-[r:CAUSES]->(m) -RETURN n, r, m -LIMIT 50 -``` - -### 7. Check for relationships with lowercase "causes" -```cypher -MATCH (n)-[r]->(m) -WHERE type(r) =~ '(?i)causes' -RETURN type(r) as relationship_type, n, r, m -LIMIT 50 -``` - -### 8. Check all nodes and their relationships -```cypher -MATCH (n) -OPTIONAL MATCH (n)-[r]->(m) -RETURN n, labels(n) as node_labels, - type(r) as relationship_type, - m, labels(m) as target_labels -LIMIT 50 -``` - -### 9. Check for nodes created by the service (by job_id property) -```cypher -MATCH (n)-[r]->(m) -WHERE r.job_id IS NOT NULL -RETURN n, r, m, r.job_id as job_id -LIMIT 50 -``` - -### 10. 
Check database statistics -```cypher -MATCH (n) -RETURN count(n) as total_nodes, - size([(n)-[r]->() | r]) as total_relationships -``` - -## Common Issues and Solutions - -### Issue 1: No nodes at all -**Symptom**: Query 1 returns 0 nodes -**Cause**: Service hasn't written anything to Neo4j, or connection failed -**Solution**: -- Check service logs: `docker-compose logs multi-document-upload-service` -- Verify Neo4j connection in service configuration -- Check if job completed with 0 relations (extraction failed) - -### Issue 2: Nodes exist but no relationships -**Symptom**: Query 1 returns nodes, but Query 6 returns no relationships -**Cause**: Relationships weren't created, or different relationship type -**Solution**: -- Check Query 5 to see what relationship types actually exist -- Check service logs for graph writing errors -- Verify the job actually extracted relations (check job status) - -### Issue 3: Different relationship type -**Symptom**: Query 5 shows relationships but not `CAUSES` -**Cause**: Service might be using a different relationship type -**Solution**: -- Check Query 3 to see all relationship types -- Update query to use the correct relationship type - -### Issue 4: Different node labels -**Symptom**: Query 6 returns no results, but Query 2 shows different labels -**Cause**: Service might be using different node labels -**Solution**: -- Check Query 2 to see what labels exist -- Update query to match actual labels - -## Expected Structure - -After a successful upload, you should see: - -### Nodes -- **Label**: `Concept` -- **Properties**: `name`, `lastSeen` - -### Relationships -- **Type**: `CAUSES` -- **Properties**: `confidence`, `explanation`, `source_file_id`, `source_snippet`, `job_id`, `model`, `updated_at` - -### Example Query -```cypher -MATCH (cause:Concept)-[r:CAUSES]->(effect:Concept) -RETURN cause.name as cause, - effect.name as effect, - r.confidence as confidence, - r.job_id as job_id, - r.source_file_id as source_file -LIMIT 50 -``` - -## Troubleshooting Steps - -1. **Check service logs**: - ```bash - docker-compose logs -f multi-document-upload-service - ``` - -2. **Check if job completed successfully**: - ```bash - curl http://localhost:8000/api/multi-docs/jobs/{job_id} - ``` - -3. **Check Neo4j connection**: - ```bash - docker-compose logs neo4j | grep -i error - ``` - -4. **Verify Neo4j is running**: - ```bash - docker-compose ps neo4j - ``` - -5. **Test Neo4j connection manually**: - ```bash - docker-compose exec neo4j cypher-shell -u neo4j -p password "MATCH (n) RETURN count(n)" - ``` - -## Next Steps - -1. Run the diagnostic queries above -2. Check the service logs for errors -3. Verify the job status via API -4. Re-upload documents after fixing dependencies -5. Check if relations were actually extracted (job status should show relation count) - diff --git a/services/multi-document-upload-service/QUICK_TEST.md b/services/multi-document-upload-service/QUICK_TEST.md deleted file mode 100644 index af656eb..0000000 --- a/services/multi-document-upload-service/QUICK_TEST.md +++ /dev/null @@ -1,85 +0,0 @@ -# Quick Testing Guide - Multi-Document Upload - -## 🚀 Quick Start Testing - -### 1. Start Services -```bash -cd /home/tech4biz/Desktop/prakash/codenuk/backend_new1/codenuk_backend_mine -docker-compose up -d multi-document-upload-service neo4j redis postgres api-gateway -``` - -### 2. Verify Services -```bash -# Check health -curl http://localhost:8024/health -curl http://localhost:8000/api/multi-docs/health -``` - -### 3. Test via Frontend - -1. 
**Open Frontend**: `http://localhost:3001` -2. **Login** (if required) -3. **Go to Project Builder** -4. **Complete Steps 1-2** (Project Type & Features) -5. **Step 3: Multi Docs Upload** appears -6. **Upload files**: - - Click upload area - - Select multiple files (PDF, DOCX, etc.) - - Click "Start Upload" -7. **Watch Progress**: - - Progress bar updates - - Status messages appear - - Polls every 4 seconds -8. **Auto-proceeds** when completed - -### 4. Verify in Neo4j - -```bash -# Open Neo4j Browser: http://localhost:7474 -# Login: neo4j / password - -# Query causal relationships: -MATCH (n)-[r:CAUSES]->(m) -RETURN n, r, m -LIMIT 50 -``` - -## 📝 Test Checklist - -- [ ] Service starts successfully -- [ ] Health endpoint works -- [ ] Frontend component renders -- [ ] File upload works -- [ ] Progress updates correctly -- [ ] Job completes successfully -- [ ] Neo4j graph contains relationships -- [ ] Error handling works -- [ ] Skip button works - -## 🔍 Debug Commands - -```bash -# View service logs -docker-compose logs -f multi-document-upload-service - -# Check job status (replace {job_id}) -curl http://localhost:8000/api/multi-docs/jobs/{job_id} - -# Check graph summary -curl http://localhost:8000/api/multi-docs/jobs/{job_id}/graph -``` - -## ⚠️ Common Issues - -1. **502 Bad Gateway**: Service not running → `docker-compose ps` -2. **413 Too Large**: File too big → Reduce file size -3. **No progress**: Check browser console → Check network tab -4. **No relationships**: Check Claude API key → Check service logs - -## 🎯 Expected Flow - -``` -Upload Files → Job Created → Files Saved → Content Extracted → -Claude Analysis → Graph Built → Completed → Auto-proceed to Next Step -``` - diff --git a/services/multi-document-upload-service/README.md b/services/multi-document-upload-service/README.md index cab9672..3621a7e 100644 --- a/services/multi-document-upload-service/README.md +++ b/services/multi-document-upload-service/README.md @@ -1,36 +1,996 @@ -# Multi Document Upload Service +COMPLETE END-TO-END FLOW: Multi-Document Analysis to Report Generation +Let me give you the most detailed explanation possible with theory, diagrams, and step-by-step breakdown. -This service accepts large batches of heterogeneous documents, extracts causal -relationships with Claude Sonnet 3.5, and writes them into Neo4j as a -knowledge graph. +🎯 SYSTEM OVERVIEW +What We're Building: +A system that takes 100+ documents (PDFs, DOCX, PPT, images, etc.) and generates a comprehensive onboarding report by understanding causal relationships and connections across all documents. +Key Components: -## Features +Document Storage - Store uploaded files +Content Extraction - Get text from different formats +Causal Analysis - Understand cause-effect relationships (with Claude) +Knowledge Graph - Store relationships in Neo4j +Vector Database - Enable semantic search in Qdrant +Report Generation - Create final report (with Claude) -- Multipart upload endpoint (`POST /jobs`) capable of handling dozens of files - and mixed formats (PDF, DOCX, PPTX, XLSX/CSV, JSON/XML, images, audio/video). -- Content extraction powered by the `unstructured` library with fallbacks. -- Chunking tuned for Claude Sonnet (800 token target, 200 overlap). -- High-accuracy causal extraction using Anthropic Claude with provenance. -- Neo4j graph writer that upserts `Concept` nodes and `CAUSES` edges. -- Status endpoint (`GET /jobs/{id}`) and graph summary endpoint - (`GET /jobs/{id}/graph`). 
-## Configuration +📊 COMPLETE ARCHITECTURE DIAGRAM -Environment variables: +┌─────────────────────────────────────────────────────────────────────────────┐ +│ USER INTERFACE │ +│ ┌────────────────────────┐ ┌────────────────────────┐ │ +│ │ Upload Documents │ │ Generate Report │ │ +│ │ (100+ files) │ │ Button │ │ +│ └───────────┬────────────┘ └────────────┬───────────┘ │ +└──────────────┼───────────────────────────────────────┼─────────────────────┘ + │ │ + ▼ ▼ +┌──────────────────────────────────────────────────────────────────────────────┐ +│ APPLICATION LAYER │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ DOCUMENT UPLOAD SERVICE │ │ +│ │ • Validate file types │ │ +│ │ • Calculate file hash (deduplication) │ │ +│ │ • Store metadata in PostgreSQL │ │ +│ │ • Save files to storage (Local) │ │ +│ └────────────────────────────┬────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ EXTRACTION ORCHESTRATOR │ │ +│ │ • Routes files to appropriate extractors │ │ +│ │ • Manages extraction queue │ │ +│ │ • Handles failures and retries │ │ +│ └─┬───────────────┬───────────────┬──────────────┬────────────────────┘ │ +│ │ │ │ │ │ +│ ▼ ▼ ▼ ▼ │ +│ ┌─────┐ ┌──────┐ ┌──────┐ ┌───────┐ │ +│ │ PDF │ │ DOCX │ │ PPTX │ │ Image │ │ +│ │Extr.│ │Extr. │ │Extr. │ │Extr. │ │ +│ └──┬──┘ └───┬──┘ └───┬──┘ └───┬───┘ │ +│ │ │ │ │ │ +│ └──────────────┴──────────────┴──────────────┘ │ +│ │ │ +│ ▼ │ +│ [Extracted Text for each document] │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ 🤖 CLAUDE AI - CAUSAL EXTRACTION │ │ +│ │ For each document: │ │ +│ │ Input: Extracted text + metadata │ │ +│ │ Output: List of causal relationships │ │ +│ │ │ │ +│ │ Example Output: │ │ +│ │ { │ │ +│ │ "cause": "Budget cut by 30%", │ │ +│ │ "effect": "ML features postponed", │ │ +│ │ "confidence": 0.92, │ │ +│ │ "entities": ["Finance Team", "ML Team"] │ │ +│ │ } │ │ +│ └────────────────────────────┬────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ [Causal Relationships Database] │ +│ (Temporary PostgreSQL table) │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ 🤖 CLAUDE AI - ENTITY RESOLUTION │ │ +│ │ Resolve entity mentions across all documents │ │ +│ │ │ │ +│ │ Input: All entity mentions ["John", "J. Smith", "John Smith"] │ │ +│ │ Output: Resolved entities {"John Smith": ["John", "J. 
Smith"]} │ │ +│ └────────────────────────────┬────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ KNOWLEDGE GRAPH BUILDER │ │ +│ │ Build Neo4j graph from causal relationships │ │ +│ └────────────────────────────┬────────────────────────────────────────┘ │ +└────────────────────────────────┼──────────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────────────────────────────┐ +│ STORAGE LAYER │ +│ │ +│ ┌────────────────┐ ┌────────────────┐ ┌────────────────┐ │ +│ │ PostgreSQL │ │ Neo4j │ │ Qdrant │ │ +│ │ │ │ │ │ │ │ +│ │ • Metadata │ │ • Nodes: │ │ • Vectors │ │ +│ │ • File paths │ │ - Events │ │ • Enriched │ │ +│ │ • Status │ │ - Entities │ │ chunks │ │ +│ │ │ │ - Documents │ │ • Metadata │ │ +│ │ │ │ │ │ │ │ +│ │ │ │ • Edges: │ │ │ │ +│ │ │ │ - CAUSES │ │ │ │ +│ │ │ │ - INVOLVES │ │ │ │ +│ └────────────────┘ │ - MENTIONS │ │ │ │ +│ └────────────────┘ └────────────────┘ │ +│ │ │ │ +└─────────────────────────────────┼─────────────────────┼───────────────────────┘ + │ │ + ▼ ▼ +┌──────────────────────────────────────────────────────────────────────────────┐ +│ KG TO QDRANT ENRICHMENT PIPELINE │ +│ │ +│ ┌────────────────────────────────────────────────────────────────┐ │ +│ │ 1. Query Neo4j for causal chains │ │ +│ │ MATCH (a)-[:CAUSES*1..3]->(b) │ │ +│ │ │ │ +│ │ 2. Convert to enriched text chunks │ │ +│ │ "Budget cut → ML postponed → Timeline shifted" │ │ +│ │ │ │ +│ │ 3. Generate embeddings (OpenAI) │ │ +│ │ │ │ +│ │ 4. Store in Qdrant with metadata from KG │ │ +│ │ - Original causal chain │ │ +│ │ - Entities involved │ │ +│ │ - Confidence scores │ │ +│ │ - Source documents │ │ +│ └────────────────────────────────────────────────────────────────┘ │ +└──────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────────────────────────────┐ +│ REPORT GENERATION PHASE │ +│ │ +│ User clicks "Generate Report" │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────┐ │ +│ │ RETRIEVAL ORCHESTRATOR │ │ +│ │ │ │ +│ │ Step 1: Semantic Search (Qdrant) │ │ +│ │ Query: "project overview timeline decisions" │ │ +│ │ Returns: Top 50 most relevant chunks │ │ +│ │ │ │ +│ │ Step 2: Graph Traversal (Neo4j) │ │ +│ │ Query: Critical causal chains with confidence > 0.8 │ │ +│ │ Returns: Important decision paths │ │ +│ │ │ │ +│ │ Step 3: Entity Analysis (Neo4j) │ │ +│ │ Query: Key people, teams, projects │ │ +│ │ Returns: Entity profiles │ │ +│ └───────────────────────────┬─────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ [Aggregated Context Package] │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────┐ │ +│ │ 🤖 CLAUDE AI - FINAL REPORT GENERATION │ │ +│ │ │ │ +│ │ Input: │ │ +│ │ • 50 semantic chunks from Qdrant │ │ +│ │ • 20 causal chains from Neo4j │ │ +│ │ • Entity profiles │ │ +│ │ • Report template │ │ +│ │ │ │ +│ │ Prompt: │ │ +│ │ "You are creating an onboarding report. 
│ │ +│ │ Based on 100+ documents, synthesize: │ │ +│ │ - Project overview │ │ +│ │ - Key decisions and WHY they were made │ │ +│ │ - Critical causal chains │ │ +│ │ - Timeline and milestones │ │ +│ │ - Current status and next steps" │ │ +│ │ │ │ +│ │ Output: Comprehensive Markdown report │ │ +│ └───────────────────────────┬─────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────┐ │ +│ │ PDF GENERATION │ │ +│ │ • Convert Markdown to PDF │ │ +│ │ • Add formatting, table of contents │ │ +│ │ • Include citations to source documents │ │ +│ └───────────────────────────┬─────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ [Final PDF Report] │ +│ │ │ +│ ▼ │ +│ Download to user │ +└──────────────────────────────────────────────────────────────────────────────┘ -- `ANTHROPIC_API_KEY` (required) -- `MULTI_DOC_CLAUDE_MODEL` (default `claude-3-5-sonnet-20241022`) -- `NEO4J_URI` (default `bolt://localhost:7687`) -- `NEO4J_USER` / `NEO4J_PASSWORD` (default `neo4j` / `neo4j`) -- `MULTI_DOC_STORAGE_ROOT` (default `storage` inside project) -## Run locally +📚 COMPLETE THEORY-WISE STEP-BY-STEP FLOW +Let me explain the entire system in pure theory - how it works, why each step exists, and what problem it solves. -```bash -uvicorn multi_document_upload_service.main:app --reload --host 0.0.0.0 --port 8035 +🎯 THE BIG PICTURE (Theory) +The Problem: +A new person joins a project that has 100+ documents (meeting notes, technical specs, design docs, emails, presentations). Reading all of them would take weeks. They need to understand: + +WHAT happened in the project +WHY decisions were made (causal relationships) +WHO is involved +WHEN things happened +HOW everything connects + +The Solution: +Build an intelligent system that: + +Reads all documents automatically +Understands cause-and-effect relationships +Connects related information across documents +Generates a comprehensive summary report + + +🔄 COMPLETE FLOW (Theory Explanation) + +STAGE 1: DOCUMENT INGESTION +Theory: Why This Stage Exists +Problem: We have 100+ documents in different formats (PDF, Word, PowerPoint, Excel, images). We need to get them into the system. +Goal: + +Accept all document types +Organize them +Prevent duplicates +Track processing status + +What Happens: + +USER ACTION: +└─> User uploads 100 files through web interface + +SYSTEM ACTIONS: + +Step 1.1: FILE VALIDATION +├─> Check: Is this a supported file type? +├─> Check: Is file size acceptable? +└─> Decision: Accept or Reject + +Step 1.2: DEDUPLICATION +├─> Calculate unique hash (fingerprint) of file content +├─> Check: Have we seen this exact file before? +└─> Decision: Store as new OR link to existing + +Step 1.3: METADATA STORAGE +├─> Store: filename, type, upload date, size +├─> Store: who uploaded it, when +└─> Assign: unique document ID + +Step 1.4: PHYSICAL STORAGE +├─> Save file to disk/cloud storage +└─> Record: where file is stored + +Step 1.5: QUEUE FOR PROCESSING +├─> Add document to processing queue +└─> Status: "waiting for extraction" + +STAGE 2: CONTENT EXTRACTION +Theory: Why This Stage Exists +Problem: Documents are in binary formats (PDF, DOCX, PPTX). We can't directly read them - we need to extract the text content. 
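+The Goal and the detailed "What Happens" steps for this stage follow below. As a quick illustration of the Stage 1 fingerprinting and the extension-based routing those steps describe, here is a minimal Python sketch. The helper names and the strategy table are illustrative only, not the service's actual code; the real extractors (PyMuPDF/pdfplumber, python-docx, python-pptx, pandas, Claude Vision) are named in the steps below.
+
+```python
+import hashlib
+from pathlib import Path
+
+# Illustrative mapping from file extension to an extraction strategy label.
+EXTRACTION_STRATEGY = {
+    ".pdf": "pdf-text",        # PyMuPDF / pdfplumber
+    ".docx": "word-text",      # python-docx
+    ".pptx": "slide-text",     # python-pptx
+    ".csv": "table-text",      # pandas
+    ".xlsx": "table-text",     # pandas
+    ".png": "vision-caption",  # Claude Vision describes the image
+    ".jpg": "vision-caption",
+}
+
+def file_fingerprint(path: Path) -> str:
+    """Stage 1 deduplication: hash the raw bytes so an identical re-upload is detected."""
+    return hashlib.sha256(path.read_bytes()).hexdigest()
+
+def route_to_extractor(path: Path) -> str:
+    """Stage 2 routing: choose an extraction strategy by extension; unsupported types are rejected."""
+    strategy = EXTRACTION_STRATEGY.get(path.suffix.lower())
+    if strategy is None:
+        raise ValueError(f"Unsupported file type: {path.suffix}")
+    return strategy
+```
+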
+Goal: Convert all documents into plain text that can be analyzed +What Happens: + +PROCESSING QUEUE: +└─> System picks next document from queue + +Step 2.1: IDENTIFY FILE TYPE +├─> Read: document.type +└─> Route to appropriate extractor + +Step 2.2a: IF PDF +├─> Use: PyMuPDF library +├─> Process: Read each page +├─> Extract: Text content +└─> Output: Plain text string + +Step 2.2b: IF DOCX (Word) +├─> Use: python-docx library +├─> Process: Read paragraphs, tables +├─> Extract: Text content +└─> Output: Plain text string + +Step 2.2c: IF PPTX (PowerPoint) +├─> Use: python-pptx library +├─> Process: Read each slide +├─> Extract: Title, content, notes +└─> Output: Plain text string + +Step 2.2d: IF CSV/XLSX (Spreadsheet) +├─> Use: pandas library +├─> Process: Read rows and columns +├─> Convert: To text representation +└─> Output: Structured text + +Step 2.2e: IF IMAGE (PNG, JPG) +├─> Use: Claude Vision API (AI model) +├─> Process: Analyze image content +├─> Extract: Description of diagram/chart +└─> Output: Text description + +Step 2.3: TEXT CLEANING +├─> Remove: Extra whitespace +├─> Fix: Encoding issues +├─> Preserve: Important structure +└─> Output: Clean text + +Step 2.4: STORE EXTRACTED TEXT +├─> Save: To database +├─> Link: To original document +└─> Update status: "text_extracted" + +Example: +Input (PDF file): +[Binary PDF data - cannot be read directly] +Output (Extracted Text): + +"Project Alpha - Q3 Meeting Minutes +Date: August 15, 2024 + +Discussion: +Due to budget constraints, we decided to postpone +the machine learning features. This will impact +our December launch timeline. + +Action Items: +- Revise project roadmap +- Notify stakeholders +- Adjust resource allocation" + +Why This Stage? + +Different formats need different tools - One size doesn't fit all +Extract only text - Remove formatting, images (except for image docs) +Standardize - All docs become plain text for next stage +Images are special - They need AI (Claude Vision) to understand + +STAGE 3: CAUSAL RELATIONSHIP EXTRACTION ⭐ (CRITICAL!) +Theory: Why This Stage Exists +Problem: Having text is not enough. We need to understand WHY things happened. +Example: + +Just knowing "ML features postponed" is not useful +Knowing "Budget cut → ML features postponed → Timeline delayed" is MUCH more useful + +Goal: Extract cause-and-effect relationships from text +What Is A Causal Relationship? 
+A causal relationship has three parts: +CAUSE → EFFECT + +Example 1: +Cause: "Budget reduced by 30%" +Effect: "ML features postponed" + +Example 2: +Cause: "John Smith left the company" +Effect: "Sarah Chen became lead developer" + +Example 3: +Cause: "User feedback showed confusion" +Effect: "We redesigned the onboarding flow" + +How We Extract Them: + +INPUT: Extracted text from document + +Step 3.1: BASIC NLP DETECTION (SpaCy) +├─> Look for: Causal keywords +│ Examples: "because", "due to", "as a result", +│ "led to", "caused", "therefore" +├─> Find: Sentences containing these patterns +└─> Output: Potential causal relationships (low confidence) + +Step 3.2: AI-POWERED EXTRACTION (Claude API) ⭐ +├─> Send: Full document text to Claude AI +├─> Ask Claude: "Find ALL causal relationships in this text" +├─> Claude analyzes: +│ • Explicit relationships ("because X, therefore Y") +│ • Implicit relationships (strongly implied) +│ • Context and background +│ • Who/what is involved +├─> Claude returns: Structured list of relationships +└─> Output: High-quality causal relationships (high confidence) + +Step 3.3: STRUCTURE THE OUTPUT +For each relationship, extract: +├─> Cause: What triggered this? +├─> Effect: What was the result? +├─> Context: Additional background +├─> Entities: Who/what is involved? (people, teams, projects) +├─> Confidence: How certain are we? (0.0 to 1.0) +├─> Source: Which document and sentence? +└─> Date: When did this happen? + +Step 3.4: STORE RELATIONSHIPS +├─> Save: To temporary database table +└─> Link: To source document + +Example: Claude's Analysis + +Input Text: + +"In the Q3 review meeting, the CFO announced a 30% +budget reduction due to decreased market demand +As a result, the engineering team decided to +postpone machine learning features for Project Alpha. +This means our December launch will be delayed +until March 2025." + + +Claude's Output: + +[ + { + "cause": "Market demand decreased", + "effect": "CFO reduced budget by 30%", + "context": "Q3 financial review", + "entities": ["CFO", "Finance Team"], + "confidence": 0.95, + "source_sentence": "30% budget reduction due to decreased market demand", + "date": "Q3 2024" + }, + { + "cause": "Budget reduced by 30%", + "effect": "Machine learning features postponed", + "context": "Project Alpha roadmap adjustment", + "entities": ["Engineering Team", "Project Alpha", "ML Team"], + "confidence": 0.92, + "source_sentence": "decided to postpone machine learning features", + "date": "Q3 2024" + }, + { + "cause": "ML features postponed", + "effect": "Launch delayed from December to March", + "context": "Timeline impact", + "entities": ["Project Alpha"], + "confidence": 0.90, + "source_sentence": "December launch will be delayed until March 2025", + "date": "2024-2025" + } +] ``` -Ensure Neo4j is reachable and Anthropic credentials are exported before -starting the service. +### **Why Use Both NLP AND Claude?** +| Method | Pros | Cons | Use Case | +|--------|------|------|----------| +| **NLP (SpaCy)** | Fast, cheap, runs locally | Misses implicit relationships, lower accuracy | Quick first pass, simple docs | +| **Claude AI** | Understands context, finds implicit relationships, high accuracy | Costs money, requires API | Complex docs, deep analysis | + +**Strategy:** Use NLP first for quick scan, then Claude for deep analysis. 
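+To make the Claude side of this concrete, here is a minimal sketch of a Stage 3 extraction call using the Anthropic Python SDK. The prompt wording, token limit, and the `extract_causal_relations` helper are assumptions for illustration, not the service's exact implementation; the model name mirrors the `CLAUDE_MODEL` default set in the Dockerfile.
+
+```python
+import json
+import os
+
+from anthropic import Anthropic
+
+EXTRACTION_PROMPT = (
+    "Identify every explicit or strongly implied cause-effect pair in the text below.\n"
+    "Return a JSON list of objects with keys: cause, effect, confidence (0-1), entities, source_sentence.\n"
+    "Return [] if none are found.\n\nText:\n"
+)
+
+def extract_causal_relations(text: str, model: str = "claude-3-5-haiku-latest") -> list[dict]:
+    """Stage 3 sketch: send one document's extracted text to Claude and parse the JSON list it returns."""
+    client = Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])
+    message = client.messages.create(
+        model=model,
+        max_tokens=4000,
+        temperature=0.0,
+        system="You extract causal (cause -> effect) relations with high precision.",
+        messages=[{"role": "user", "content": EXTRACTION_PROMPT + text}],
+    )
+    raw = "".join(block.text for block in message.content if hasattr(block, "text"))
+    try:
+        relations = json.loads(raw)
+    except json.JSONDecodeError:
+        return []  # a production version would strip markdown fences from the reply and retry
+    return [r for r in relations if isinstance(r, dict) and r.get("cause") and r.get("effect")]
+```
+
+A production version would also split long documents into chunks before sending them and add retry handling for transient API errors.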
+ +### **Why This Stage Is Critical:** + +Without causal extraction, you just have a pile of facts: +- ❌ "Budget was cut" +- ❌ "ML features postponed" +- ❌ "Timeline changed" + +With causal extraction, you understand the story: +- ✅ Market demand dropped → Budget cut → ML postponed → Timeline delayed + +This is **the heart of your system** - it's what makes it intelligent. + +--- + +## **STAGE 4: ENTITY RESOLUTION** 🤖 + +### **Theory: Why This Stage Exists** + +**Problem:** Same people/things are mentioned differently across documents. + +**Examples:** +- "John Smith", "John", "J. Smith", "Smith" → Same person +- "Project Alpha", "Alpha", "The Alpha Project" → Same project +- "ML Team", "Machine Learning Team", "AI Team" → Same team (maybe) + +**Goal:** Identify that these different mentions refer to the same entity. + +### **What Happens:** +``` +INPUT: All causal relationships from all documents + +Step 4.1: COLLECT ALL ENTITIES +├─> Scan: All causal relationships +├─> Extract: Every entity mentioned +└─> Result: List of entity mentions + ["John", "John Smith", "J. Smith", "Sarah", "S. Chen", + "Project Alpha", "Alpha", "ML Team", ...] + +Step 4.2: GROUP BY ENTITY TYPE +├─> People: ["John", "John Smith", "Sarah", ...] +├─> Projects: ["Project Alpha", "Alpha", ...] +├─> Teams: ["ML Team", "AI Team", ...] +└─> Organizations: ["Finance Dept", "Engineering", ...] + +Step 4.3: AI-POWERED RESOLUTION (Claude API) ⭐ +├─> Send: All entity mentions to Claude +├─> Ask Claude: "Which mentions refer to the same real-world entity?" +├─> Claude analyzes: +│ • Name similarities +│ • Context clues +│ • Role descriptions +│ • Co-occurrence patterns +└─> Claude returns: Grouped entities + +Step 4.4: CREATE CANONICAL NAMES +├─> Choose: Best name for each entity +├─> Example: "John Smith" becomes canonical for ["John", "J. Smith"] +└─> Store: Mapping table +``` + +### **Example:** + +**Input (mentions across all docs):** +``` +Document 1: "John led the meeting" +Document 2: "J. Smith approved the budget" +Document 3: "John Smith will present next week" +Document 4: "Smith suggested the new approach" + +Claude's Resolution: + +{ + "entities": { + "John Smith": { + "canonical_name": "John Smith", + "mentions": ["John", "J. Smith", "John Smith", "Smith"], + "type": "Person", + "role": "Project Lead", + "confidence": 0.95 + } + } +} +``` + +### **Why This Matters:** + +Without entity resolution: +- ❌ System thinks "John" and "John Smith" are different people +- ❌ Can't track someone's involvement across documents +- ❌ Relationships are fragmented + +With entity resolution: +- ✅ System knows they're the same person +- ✅ Can see full picture of someone's involvement +- ✅ Relationships are connected + +--- + +## **STAGE 5: KNOWLEDGE GRAPH CONSTRUCTION** 📊 + +### **Theory: Why This Stage Exists** + +**Problem:** We have hundreds of causal relationships. How do we organize them? How do we find connections? + +**Solution:** Build a **graph** - a network of nodes (things) and edges (relationships). 
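+Before nodes are created for that graph, the Stage 4 resolution step above decides which mentions collapse into a single entity. A minimal, hypothetical sketch of that call (the prompt text and the `resolve_entities` helper are illustrative, not the service's actual code):
+
+```python
+import json
+import os
+
+from anthropic import Anthropic
+
+def resolve_entities(mentions: list[str], model: str = "claude-3-5-haiku-latest") -> dict:
+    """Stage 4 sketch: ask Claude which mentions refer to the same real-world entity."""
+    client = Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])
+    prompt = (
+        "Group these entity mentions by the real-world entity they refer to. "
+        'Return JSON shaped like {"entities": {"<canonical_name>": '
+        '{"mentions": [...], "type": "Person|Team|Project"}}}.\n\nMentions:\n'
+        + "\n".join(f"- {m}" for m in sorted(set(mentions)))
+    )
+    message = client.messages.create(
+        model=model,
+        max_tokens=2000,
+        temperature=0.0,
+        messages=[{"role": "user", "content": prompt}],
+    )
+    raw = "".join(block.text for block in message.content if hasattr(block, "text"))
+    try:
+        return json.loads(raw)
+    except json.JSONDecodeError:
+        return {"entities": {}}
+```
+
+The canonical names returned here become the keys for the entity nodes created in Stage 5.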
+ +### **What Is A Knowledge Graph?** + +Think of it like a map: +- **Nodes** = Places (events, people, projects) +- **Edges** = Roads (relationships between them) +``` +Example Graph: + + (Budget Cut) + │ + │ CAUSES + ▼ + (ML Postponed) + │ + │ CAUSES + ▼ + (Timeline Delayed) + │ + │ AFFECTS + ▼ + (Project Alpha) + │ + │ INVOLVES + ▼ + (Engineering Team) +``` + +### **What Happens:** +``` +INPUT: Causal relationships + Resolved entities + +Step 5.1: CREATE EVENT NODES +For each causal relationship: +├─> Create Node: Cause event +├─> Create Node: Effect event +└─> Properties: text, date, confidence + +Example: +Node1: {type: "Event", text: "Budget reduced by 30%"} +Node2: {type: "Event", text: "ML features postponed"} + +Step 5.2: CREATE ENTITY NODES +For each resolved entity: +├─> Create Node: Entity +└─> Properties: name, type, role + +Example: +Node3: {type: "Person", name: "John Smith", role: "Lead"} +Node4: {type: "Project", name: "Project Alpha"} + +Step 5.3: CREATE DOCUMENT NODES +For each source document: +└─> Create Node: Document + Properties: filename, date, type + +Example: +Node5: {type: "Document", name: "Q3_meeting.pdf"} + +Step 5.4: CREATE RELATIONSHIPS (Edges) +├─> CAUSES: Event1 → Event2 +├─> INVOLVED_IN: Person → Event +├─> MENTIONS: Document → Entity +├─> AFFECTS: Event → Project +└─> Properties: confidence, source, date + +Example Relationships: +(Budget Cut) -[CAUSES]-> (ML Postponed) +(John Smith) -[INVOLVED_IN]-> (Budget Cut) +(Q3_meeting.pdf) -[MENTIONS]-> (John Smith) + +Step 5.5: STORE IN NEO4J +├─> Connect: To Neo4j database +├─> Create: All nodes +├─> Create: All relationships +└─> Index: For fast querying +``` + +### **Visual Example:** + +**Before (Just Text):** +``` +"Budget cut → ML postponed" +"ML postponed → Timeline delayed" +"John Smith involved in budget decision" +``` + +**After (Knowledge Graph):** +``` + (John Smith) + │ + │ INVOLVED_IN + ▼ + (Budget Cut) ──MENTIONED_IN──> (Q3_meeting.pdf) + │ + │ CAUSES + ▼ + (ML Postponed) ──AFFECTS──> (Project Alpha) + │ + │ CAUSES + ▼ + (Timeline Delayed) ──INVOLVES──> (Engineering Team) +``` + +### **Why Use A Graph?** + +| Question | Without Graph | With Graph | +|----------|---------------|------------| +| "Why was ML postponed?" | Search all docs manually | Follow CAUSES edge backwards | +| "What did budget cut affect?" | Re-read everything | Follow CAUSES edges forward | +| "What is John involved in?" | Search his name everywhere | Follow INVOLVED_IN edges | +| "How are events connected?" | Hard to see | Visual path through graph | + +**Key Benefit:** The graph shows **HOW** everything connects, not just WHAT exists. 
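+Here is a minimal sketch of the Stage 5 upsert using the official `neo4j` Python driver. The labels, property names, and connection details are illustrative (earlier versions of this service wrote `Concept` nodes joined by `CAUSES` edges); the point is that `MERGE` keeps the write idempotent, so re-processing a document does not duplicate nodes or edges.
+
+```python
+from neo4j import GraphDatabase
+
+MERGE_CAUSE_EFFECT = """
+MERGE (c:Event {text: $cause})
+MERGE (e:Event {text: $effect})
+MERGE (c)-[r:CAUSES]->(e)
+SET r.confidence = $confidence,
+    r.source_file = $source_file,
+    r.job_id = $job_id
+"""
+
+def write_relations(relations: list[dict], job_id: str,
+                    uri: str = "bolt://neo4j:7687",
+                    auth: tuple = ("neo4j", "password")) -> None:
+    """Stage 5 sketch: upsert each cause/effect pair as Event nodes joined by a CAUSES edge."""
+    driver = GraphDatabase.driver(uri, auth=auth)
+    with driver.session() as session:
+        for rel in relations:
+            session.run(
+                MERGE_CAUSE_EFFECT,
+                cause=rel["cause"],
+                effect=rel["effect"],
+                confidence=rel.get("confidence", 0.0),
+                source_file=rel.get("source_file"),
+                job_id=job_id,
+            )
+    driver.close()
+```
+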
+ +--- + +## **STAGE 6: GRAPH TO VECTOR DATABASE** 🔄 + +### **Theory: Why This Stage Exists** + +**Problem:** +- Neo4j is great for finding relationships ("What caused X?") +- But it's NOT good for semantic search ("Find docs about machine learning") + +**Solution:** We need BOTH: +- **Neo4j** = Find causal chains and connections +- **Qdrant** = Find relevant content by meaning + +### **Why We Need Both:** + +**Neo4j (Graph Database):** +``` +Good for: "Show me the chain of events that led to timeline delay" +Answer: Budget Cut → ML Postponed → Timeline Delayed +``` + +**Qdrant (Vector Database):** +``` +Good for: "Find all content related to machine learning" +Answer: [50 relevant chunks from across all documents] +``` + +### **What Happens:** +``` +INPUT: Complete Knowledge Graph in Neo4j + +Step 6.1: EXTRACT CAUSAL CHAINS +├─> Query Neo4j: "Find all causal paths" +│ Example: MATCH (a)-[:CAUSES*1..3]->(b) +├─> Get: Sequences of connected events +└─> Result: List of causal chains + +Example chains: +1. Market demand ↓ → Budget cut → ML postponed +2. John left → Sarah promoted → Team restructured +3. User feedback → Design change → Timeline adjusted + +Step 6.2: CONVERT TO NARRATIVE TEXT +Take each chain and write it as a story: + +Before: [Node1] → [Node2] → [Node3] + +After: "Due to decreased market demand, the CFO +reduced the budget by 30%. This led to the +postponement of machine learning features, which +ultimately delayed the December launch to March." + +WHY? Because we need text to create embeddings! + +Step 6.3: ENRICH WITH CONTEXT +Add information from the graph: +├─> Who was involved? +├─> When did it happen? +├─> Which documents mention this? +├─> What projects were affected? +└─> How confident are we? + +Enriched text: +"[CAUSAL CHAIN] +Due to decreased market demand, the CFO reduced +the budget by 30%. This led to ML postponement. + +[METADATA] +Date: Q3 2024 +Involved: CFO, Engineering Team, Project Alpha +Sources: Q3_meeting.pdf, budget_report.xlsx +Confidence: 0.92" + +Step 6.4: CREATE EMBEDDINGS +├─> Use: OpenAI Embedding API +├─> Input: Enriched text +├─> Output: Vector (1536 numbers) +│ Example: [0.123, -0.456, 0.789, ...] +└─> This vector represents the "meaning" of the text + +Step 6.5: STORE IN QDRANT +For each enriched chunk: +├─> Vector: The embedding +├─> Payload: The original text + all metadata +│ { +│ "text": "enriched narrative", +│ "type": "causal_chain", +│ "entities": ["CFO", "Project Alpha"], +│ "sources": ["Q3_meeting.pdf"], +│ "confidence": 0.92, +│ "graph_path": "Node1->Node2->Node3" +│ } +└─> Store: In Qdrant collection +``` + +### **What Are Embeddings?** + +Think of embeddings as **coordinates in meaning-space**: +``` +Text: "machine learning features" +Embedding: [0.2, 0.8, 0.1, -0.3, ...] ← 1536 numbers + +Text: "AI capabilities" +Embedding: [0.19, 0.82, 0.09, -0.29, ...] ← Similar numbers! + +Text: "budget reporting" +Embedding: [-0.6, 0.1, 0.9, 0.4, ...] ← Very different numbers +``` + +Similar meanings → Similar vectors → Qdrant finds them together! + +### **Example Flow:** + +**From Neo4j:** +``` +Chain: (Budget Cut) → (ML Postponed) → (Timeline Delayed) +``` + +**Convert to Text:** +``` +"Budget reduced by 30% → ML features postponed → +December launch delayed to March" +``` + +**Enrich:** +``` +"[Causal Chain] Budget reduced by 30% led to ML +features being postponed, which delayed the December +launch to March 2025. 
+ +Involved: CFO, Engineering Team, Project Alpha +Sources: Q3_meeting.pdf, roadmap.pptx +Confidence: 0.91 +Date: August-September 2024" +``` + +**Create Embedding:** +``` +[0.234, -0.567, 0.891, 0.123, ...] ← 1536 numbers + +Store in Qdrant: + +{ + "id": "chain_001", + "vector": [0.234, -0.567, ...], + "payload": { + "text": "enriched narrative...", + "type": "causal_chain", + "entities": ["CFO", "Engineering Team"], + "sources": ["Q3_meeting.pdf"], + "confidence": 0.91 + } +} +``` + +### **Why This Stage?** + +Now we have the **best of both worlds**: + +| Need | Use | +|------|-----| +| "Find content about machine learning" | Qdrant semantic search | +| "Show me the causal chain" | Neo4j graph traversal | +| "Why did timeline delay?" | Start with Qdrant, then Neo4j for details | +| "Generate comprehensive report" | Pull from BOTH | + +--- + +## **STAGE 7: REPORT GENERATION** 📝 (FINAL STAGE) + +### **Theory: Why This Stage Exists** + +**Goal:** Take everything we've learned from 100+ documents and create ONE comprehensive, readable report. + +### **What Happens:** +``` +USER ACTION: +└─> User clicks "Generate Onboarding Report" + +Step 7.1: DEFINE REPORT REQUIREMENTS +What should the report include? +├─> Project overview +├─> Key decisions and WHY they were made +├─> Important people and their roles +├─> Timeline of events +├─> Current status +└─> Next steps + +Step 7.2: SEMANTIC SEARCH (Qdrant) +Query 1: "project overview goals objectives" +├─> Qdrant returns: Top 20 relevant chunks +└─> Covers: High-level project information + +Query 2: "timeline milestones dates schedule" +├─> Qdrant returns: Top 15 relevant chunks +└─> Covers: Timeline information + +Query 3: "decisions architecture technical" +├─> Qdrant returns: Top 15 relevant chunks +└─> Covers: Technical decisions + +Total: ~50 most relevant chunks from Qdrant + +Step 7.3: GRAPH TRAVERSAL (Neo4j) +Query 1: Get critical causal chains +├─> MATCH (a)-[:CAUSES*2..4]->(b) +├─> WHERE confidence > 0.8 +└─> Returns: Top 20 important decision chains + +Query 2: Get key entities +├─> MATCH (e:Entity)-[:INVOLVED_IN]->(events) +├─> Count events per entity +└─> Returns: Most involved people/teams/projects + +Query 3: Get recent timeline +├─> MATCH (e:Event) WHERE e.date > '2024-01-01' +├─> Order by date +└─> Returns: Chronological event list + +Step 7.4: AGGREGATE CONTEXT +Combine everything: +├─> 50 semantic chunks from Qdrant +├─> 20 causal chains from Neo4j +├─> Key entities and their profiles +├─> Timeline of events +└─> Metadata about sources + +Total Context Size: ~30,000-50,000 tokens + +Step 7.5: PREPARE PROMPT FOR CLAUDE +Structure the prompt: +┌─────────────────────────────────────┐ +│ SYSTEM: You are an expert technical │ +│ writer creating an onboarding report│ +│ │ +│ USER: Based on these 100+ documents,│ +│ create a comprehensive report. │ +│ │ +│ # SEMANTIC CONTEXT: │ +│ [50 chunks from Qdrant] │ +│ │ +│ # CAUSAL CHAINS: │ +│ [20 decision chains from Neo4j] │ +│ │ +│ # KEY ENTITIES: │ +│ [People, teams, projects] │ +│ │ +│ # TIMELINE: │ +│ [Chronological events] │ +│ │ +│ Generate report with sections: │ +│ 1. Executive Summary │ +│ 2. Project Overview │ +│ 3. Key Decisions (with WHY) │ +│ 4. Timeline │ +│ 5. Current Status │ +│ 6. 
Next Steps │ +└─────────────────────────────────────┘ + +Step 7.6: CALL CLAUDE API ⭐ +├─> Send: Complete prompt to Claude +├─> Claude processes: +│ • Reads all context +│ • Identifies key themes +│ • Synthesizes information +│ • Creates narrative structure +│ • Explains causal relationships +│ • Writes clear, coherent report +└─> Returns: Markdown-formatted report + +Step 7.7: POST-PROCESS REPORT +├─> Add: Table of contents +├─> Add: Citations to source documents +├─> Add: Confidence indicators +├─> Format: Headings, bullet points, emphasis +└─> Result: Final Markdown report + +Step 7.8: CONVERT TO PDF +├─> Use: Markdown-to-PDF library +├─> Add: Styling and formatting +├─> Add: Page numbers, headers +└─> Result: Professional PDF report + +Step 7.9: DELIVER TO USER +├─> Save: PDF to storage +├─> Generate: Download link +└─> Show: Success message with download button + + +## **🔄 COMPLETE DATA FLOW SUMMARY** +``` +Documents (100+) + ↓ +[Extract Text] → Plain Text + ↓ +[Claude: Causal Extraction] → Relationships List + ↓ +[Claude: Entity Resolution] → Resolved Entities + ↓ +[Build Graph] → Neo4j Knowledge Graph + ↓ +[Convert + Enrich] → Narrative Chunks + ↓ +[Create Embeddings] → Vectors + ↓ +[Store] → Qdrant Vector DB + ↓ +[User Request] → "Generate Report" + ↓ +[Query Qdrant] → Relevant Chunks + + +[Query Neo4j] → Causal Chains + ↓ +[Claude: Synthesis] → Final Report + ↓ +[Convert] → PDF + ↓ +[Deliver] → User Downloads Report +``` \ No newline at end of file diff --git a/services/multi-document-upload-service/REBUILD_INSTRUCTIONS.md b/services/multi-document-upload-service/REBUILD_INSTRUCTIONS.md deleted file mode 100644 index 5b84c8c..0000000 --- a/services/multi-document-upload-service/REBUILD_INSTRUCTIONS.md +++ /dev/null @@ -1,152 +0,0 @@ -# Rebuild Instructions - Multi-Document Upload Service - -## Issue: Empty Graph in Neo4j - -**Problem**: Query returns "(no changes, no records)" because the job completed with 0 relations. - -**Root Cause**: PDF extraction failed due to missing dependencies (`unstructured[pdf]`). - -## Fixes Applied - -1. ✅ Added PDF dependencies (`unstructured[pdf]`, `unstructured[docx]`, etc.) -2. ✅ Added fallback extractors (pdfplumber, python-docx, python-pptx) -3. ✅ Improved error handling and logging -4. ✅ Fixed Neo4j query syntax -5. ✅ Better status messages - -## Rebuild Steps - -### Step 1: Rebuild the Service - -```bash -cd /home/tech4biz/Desktop/prakash/codenuk/backend_new1/codenuk_backend_mine - -# Stop the service -docker-compose stop multi-document-upload-service - -# Rebuild with new dependencies -docker-compose build --no-cache multi-document-upload-service - -# Start the service -docker-compose up -d multi-document-upload-service - -# Check logs to verify it's starting correctly -docker-compose logs -f multi-document-upload-service -``` - -### Step 2: Verify Dependencies - -```bash -# Check if unstructured[pdf] is installed -docker-compose exec multi-document-upload-service pip list | grep unstructured - -# You should see: -# unstructured -# unstructured-pdf -# unstructured-docx -# etc. -``` - -### Step 3: Test the Service - -```bash -# Check health endpoint -curl http://localhost:8024/health - -# Should return: -# { -# "status": "ok", -# "claude_model": "claude-3-5-haiku-latest", -# ... -# } -``` - -### Step 4: Re-upload Documents - -1. Open frontend: `http://localhost:3001/project-builder` -2. Go to Step 1: Project Type -3. Find "Upload Documents for Knowledge Graph" section -4. Upload a PDF or other document -5. Wait for processing to complete -6. 
Check status - should show relation count > 0 - -### Step 5: Verify in Neo4j - -Run these queries in Neo4j Browser (`http://localhost:7474`): - -```cypher -// Check if any nodes exist -MATCH (n) -RETURN count(n) as node_count - -// Check for CAUSES relationships -MATCH (n:Concept)-[r:CAUSES]->(m:Concept) -RETURN n.name as cause, - m.name as effect, - r.confidence as confidence, - r.job_id as job_id -LIMIT 50 -``` - -## Expected Results - -After rebuilding and re-uploading: - -1. **PDF extraction succeeds** ✅ -2. **Text is extracted** ✅ -3. **Relations are extracted** ✅ -4. **Relations are written to Neo4j** ✅ -5. **Query returns results** ✅ - -## Troubleshooting - -If you still see 0 relations: - -1. **Check service logs**: - ```bash - docker-compose logs multi-document-upload-service | tail -50 - ``` - -2. **Check extraction logs**: - ```bash - docker-compose logs multi-document-upload-service | grep -i "extract\|pdf" - ``` - -3. **Check Claude analysis**: - ```bash - docker-compose logs multi-document-upload-service | grep -i "claude\|analyze\|relation" - ``` - -4. **Check Neo4j connection**: - ```bash - docker-compose logs multi-document-upload-service | grep -i "neo4j\|graph\|write" - ``` - -5. **Verify document has causal language**: - - Not all documents contain causal relationships - - Try uploading a document with clear cause-effect statements - - Example: "Smoking causes lung cancer" - -## Quick Test - -Test with a simple text file: - -1. Create a test file `test_causal.txt`: - ``` - Smoking cigarettes causes lung cancer. - Heavy rain causes flooding. - Exercise improves health. - ``` - -2. Upload it via the frontend -3. Check Neo4j for relationships -4. Should see 3 causal relationships - -## Next Steps - -1. Rebuild the service -2. Re-upload documents -3. Check Neo4j for relationships -4. If still no results, check service logs -5. Verify the document contains causal language - diff --git a/services/multi-document-upload-service/TESTING_GUIDE.md b/services/multi-document-upload-service/TESTING_GUIDE.md deleted file mode 100644 index cfd7294..0000000 --- a/services/multi-document-upload-service/TESTING_GUIDE.md +++ /dev/null @@ -1,300 +0,0 @@ -# Multi-Document Upload Service - Frontend Testing Guide - -## Prerequisites - -1. **Backend Services Running**: - ```bash - cd /home/tech4biz/Desktop/prakash/codenuk/backend_new1/codenuk_backend_mine - docker-compose up -d - ``` - -2. **Verify Services are Running**: - - API Gateway: `http://localhost:8000/health` - - Multi-Document Upload Service: `http://localhost:8024/health` - - Neo4j: `http://localhost:7474` (Browser interface) - - Frontend: `http://localhost:3001` (or your frontend port) - -3. **Check Service Health**: - ```bash - # Check API Gateway - curl http://localhost:8000/health - - # Check Multi-Document Upload Service directly - curl http://localhost:8024/health - - # Check via API Gateway proxy - curl http://localhost:8000/api/multi-docs/health - ``` - -## Frontend Testing Steps - -### Step 1: Navigate to Project Builder - -1. Open your browser and go to: `http://localhost:3001` (or your frontend URL) -2. Log in if required -3. Click on **"Project Builder"** in the navigation - -### Step 2: Go to Multi Docs Upload Step - -1. In the Project Builder, you should see the workflow steps: - - **Step 1**: Project Type - - **Step 2**: Features - - **Step 3**: Multi Docs Upload ← **This is the new step** - - **Step 4**: Business Context - - **Step 5**: Generate - - **Step 6**: Architecture - -2. 
Complete Steps 1 and 2 (Project Type and Features selection) -3. You will automatically be taken to **Step 3: Multi Docs Upload** - -### Step 3: Upload Documents - -1. **Click on the upload area** or **drag and drop files** -2. **Select multiple files** (you can mix different formats): - - PDF files (`.pdf`) - - Word documents (`.doc`, `.docx`) - - PowerPoint (`.ppt`, `.pptx`) - - Excel files (`.xls`, `.xlsx`) - - JSON files (`.json`) - - XML files (`.xml`) - - Markdown files (`.md`) - - Images (`.png`, `.jpg`, `.jpeg`) - will use OCR - - Audio files (`.mp3`, `.wav`) - will be transcribed - - Video files (`.mp4`, `.avi`) - will be transcribed - -3. **View selected files**: You should see a list of all selected files with: - - File icon - - File name - - Remove button for each file - -4. **Click "Start Upload"** button - -### Step 4: Monitor Upload Progress - -After clicking "Start Upload", you should see: - -1. **Upload Status**: - - Button shows "Uploading..." with spinner - - Progress bar appears - - Stage messages appear: - - "Job received" - - "Saving files" - - "Extracting document content" - - "Calling Claude for causal relations" - - "Writing to Neo4j knowledge graph" - - "Completed" - -2. **Progress Indicators**: - - Progress percentage (0-100%) - - Status message showing current stage - - Processed files count vs total files count - -3. **Polling**: The frontend automatically polls the job status every 4 seconds - -### Step 5: Verify Results - -Once the job is completed: - -1. **Check Neo4j Graph**: - - Open Neo4j Browser: `http://localhost:7474` - - Login with: - - Username: `neo4j` - - Password: `password` - - Run Cypher query to see the graph: - ```cypher - MATCH (n)-[r:CAUSES]->(m) - RETURN n, r, m - LIMIT 50 - ``` - -2. **Check Job Status via API**: - ```bash - # Replace {job_id} with the actual job ID from the frontend - curl http://localhost:8000/api/multi-docs/jobs/{job_id} - ``` - -3. **Get Graph Summary**: - ```bash - curl http://localhost:8000/api/multi-docs/jobs/{job_id}/graph - ``` - -## Testing Different Scenarios - -### Scenario 1: Single PDF File -- Upload one PDF file -- Verify it processes correctly -- Check Neo4j for causal relationships - -### Scenario 2: Multiple Mixed Format Files -- Upload 3-5 files of different formats (PDF, DOCX, JSON, image) -- Verify all files are processed -- Check that progress updates correctly - -### Scenario 3: Large Files -- Upload a large PDF (10+ MB) -- Verify it handles large files correctly -- Check processing time - -### Scenario 4: Error Handling -- Try uploading an unsupported file type -- Verify error message appears -- Check that the error is displayed clearly - -### Scenario 5: Skip Option -- Upload files -- Click "Skip" button before completion -- Verify you can proceed to the next step -- Job continues processing in the background - -## Browser Developer Tools - -### Check Network Requests - -1. **Open Developer Tools** (F12) -2. **Go to Network tab** -3. **Filter by "multi-docs"** -4. **Monitor requests**: - - `POST /api/multi-docs/jobs` - Upload files - - `GET /api/multi-docs/jobs/{job_id}` - Poll job status - - `GET /api/multi-docs/jobs/{job_id}/graph` - Get graph summary - -### Check Console Logs - -1. **Open Console tab** -2. 
**Look for**: - - Upload progress logs - - Job status updates - - Any error messages - -### Check Response Data - -Verify the API responses: - -```javascript -// Upload response should be: -{ - "job_id": "uuid-here", - "stage": "received", - "total_files": 3, - "created_at": "2024-01-01T00:00:00Z" -} - -// Status response should be: -{ - "job_id": "uuid-here", - "stage": "extracting", - "status_message": "Extracting document content", - "total_files": 3, - "processed_files": 1, - "error": null, - "created_at": "2024-01-01T00:00:00Z", - "updated_at": "2024-01-01T00:01:00Z", - "files": [...] -} -``` - -## Troubleshooting - -### Issue: Upload fails with 502 Bad Gateway -**Solution**: -- Check if multi-document-upload-service is running: - ```bash - docker-compose ps multi-document-upload-service - ``` -- Check service logs: - ```bash - docker-compose logs multi-document-upload-service - ``` - -### Issue: Upload fails with 413 Request Entity Too Large -**Solution**: -- Check file sizes (max 500MB total per job) -- Reduce number of files or file sizes -- Check API Gateway body size limits - -### Issue: Status polling stops working -**Solution**: -- Check browser console for errors -- Verify job ID is correct -- Check if job completed or failed -- Check network tab for failed requests - -### Issue: No causal relationships found -**Solution**: -- Check Claude API key is configured correctly -- Check service logs for Claude API errors -- Verify documents contain causal language -- Check Neo4j connection - -### Issue: Frontend shows "Failed" status -**Solution**: -- Check the error message in the frontend -- Check backend service logs: - ```bash - docker-compose logs -f multi-document-upload-service - ``` -- Verify all dependencies are running (Neo4j, Redis, Postgres) - -## Expected Behavior - -### Successful Flow: -1. ✅ Files upload successfully -2. ✅ Job ID is returned -3. ✅ Status polling starts automatically -4. ✅ Progress updates every 4 seconds -5. ✅ Stage changes are displayed -6. ✅ Progress bar updates -7. ✅ Job completes successfully -8. ✅ Frontend automatically proceeds to next step -9. ✅ Neo4j contains causal relationships - -### Error Flow: -1. ✅ Error message is displayed clearly -2. ✅ User can retry upload -3. ✅ User can skip and proceed -4. ✅ Error details are logged in console - -## API Endpoints Reference - -### Upload Files -```bash -POST /api/multi-docs/jobs -Content-Type: multipart/form-data - -Form Data: -- files: File[] (multiple files) -- job_name: string (optional) -``` - -### Get Job Status -```bash -GET /api/multi-docs/jobs/{job_id} -``` - -### Get Graph Summary -```bash -GET /api/multi-docs/jobs/{job_id}/graph -``` - -### Health Check -```bash -GET /api/multi-docs/health -``` - -## Next Steps After Testing - -1. **Verify Neo4j Graph**: Check that causal relationships are stored correctly -2. **Check Storage**: Verify files are stored in the persistent volume -3. **Monitor Performance**: Check processing times for different file types -4. **Test Error Scenarios**: Verify error handling works correctly -5. **Test Large Batches**: Upload 50+ files to test scalability - -## Support - -If you encounter issues: -1. Check service logs: `docker-compose logs multi-document-upload-service` -2. Check API Gateway logs: `docker-compose logs api-gateway` -3. Check Neo4j logs: `docker-compose logs neo4j` -4. Verify all environment variables are set correctly -5. 
Check network connectivity between services - diff --git a/services/multi-document-upload-service/requirements.txt b/services/multi-document-upload-service/requirements.txt index 00a9795..5e86a49 100644 --- a/services/multi-document-upload-service/requirements.txt +++ b/services/multi-document-upload-service/requirements.txt @@ -8,10 +8,6 @@ pydantic-settings>=2.2.1 aiofiles>=23.2.1 tenacity>=8.2.3 python-dotenv>=1.0.1 -unstructured[pdf]>=0.15.0 -unstructured[docx]>=0.15.0 -unstructured[pptx]>=0.15.0 -unstructured[xlsx]>=0.15.0 pdfplumber>=0.11.0 python-docx>=1.1.0 python-pptx>=0.6.23 @@ -30,5 +26,13 @@ beautifulsoup4>=4.12.3 lxml>=5.2.1 sqlalchemy>=2.0.25 httpx>=0.27.0 -tiktoken>=0.7.0 +dowhy>=0.11.0 +qdrant-client>=1.7.0 +sentence-transformers>=2.2.0 +numpy>=1.24.0 +scipy>=1.11.0 +networkx>=3.1 +spacy>=3.7.0 +markdown>=3.5.0 +weasyprint>=60.0 diff --git a/services/multi-document-upload-service/src/multi_document_upload_service/claude_client.py b/services/multi-document-upload-service/src/multi_document_upload_service/claude_client.py deleted file mode 100644 index cc2e6df..0000000 --- a/services/multi-document-upload-service/src/multi_document_upload_service/claude_client.py +++ /dev/null @@ -1,328 +0,0 @@ -from __future__ import annotations - -import base64 -import json -import logging -import re -from pathlib import Path -from typing import Iterable, List - -from anthropic import Anthropic, BadRequestError -from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential, RetryCallState - -from .models import CausalRelation - -logger = logging.getLogger(__name__) - - -def is_billing_error(exception: Exception) -> bool: - """Check if the exception is a billing/credit related error that shouldn't be retried.""" - if isinstance(exception, BadRequestError): - error_message = str(exception).lower() - billing_keywords = ["credit", "balance", "too low", "billing", "upgrade", "purchase credits"] - return any(keyword in error_message for keyword in billing_keywords) - return False - - -def should_retry_exception(retry_state: RetryCallState) -> bool: - """Custom retry condition that excludes billing errors.""" - exception = retry_state.outcome.exception() - if exception is None: - return False - # Don't retry billing errors - they won't be resolved by retrying - if is_billing_error(exception): - return False - # Retry other exceptions - return True - - -CLAUDE_PROMPT_TEMPLATE = """You are an expert analyst extracting causal relationships from documents. - -Given the following text chunk, identify all explicit or strongly implied cause and effect pairs. -Return JSON with the schema: -[ - { - "cause": "", - "effect": "", - "confidence": 0-1 float, - "explanation": "", - "source_snippet": "" - } -] - -Only include items when the causal direction is clear. -If none are found, return an empty list []. - -Text chunk: -``` -<<>> -```""" - -IMAGE_PROMPT_TEMPLATE = """You are an expert analyst extracting causal relationships from images, diagrams, and visual content. - -Analyze this image/diagram for causal relationships. Look for: -- Architecture flows (A → B → C) -- Dependency relationships -- Cause-effect chains in diagrams -- Process flows -- System interactions -- Data flows -- Sequential relationships -- Visual connections between components - -Return JSON with the schema: -[ - { - "cause": "", - "effect": "", - "confidence": 0-1 float, - "explanation": "", - "source_snippet": "" - } -] - -Only include items when the causal direction is clear from the visual structure. 
-If none are found, return an empty list [].""" - - -class ClaudeCausalExtractor: - def __init__(self, api_key: str, model: str, max_output_tokens: int = 4000): - self.client = Anthropic(api_key=api_key) - self.model = model - self.max_output_tokens = max_output_tokens - - @retry( - retry=should_retry_exception, - wait=wait_exponential(multiplier=1, min=1, max=10), - stop=stop_after_attempt(3), - reraise=True, - ) - def analyze_chunk(self, chunk: str, source_file_id: str) -> List[CausalRelation]: - logger.debug("Analyzing chunk with Claude model %s", self.model) - - # Validate chunk is not empty and is readable text - if not chunk or not chunk.strip(): - logger.warning("Empty or whitespace-only chunk, skipping") - return [] - - # Check if chunk contains mostly readable text (not binary data) - # Simple heuristic: if >50% of characters are non-printable or control chars, skip it - printable_chars = sum(1 for c in chunk if c.isprintable() or c.isspace()) - if len(chunk) > 100 and printable_chars / len(chunk) < 0.5: - logger.warning("Chunk appears to contain binary data, skipping analysis") - return [] - - # Use string replacement with a unique placeholder to avoid KeyError with braces in content - # This prevents Python's .format() from interpreting braces in the chunk text as format placeholders - prompt_text = CLAUDE_PROMPT_TEMPLATE.replace("<<>>", chunk) - - try: - message = self.client.messages.create( - model=self.model, - max_tokens=self.max_output_tokens, - temperature=0.0, - system="You extract causal (cause→effect) relations with high precision.", - messages=[ - { - "role": "user", - "content": [{"type": "text", "text": prompt_text}], - } - ], - ) - except BadRequestError as e: - # Check if it's a billing error - if is_billing_error(e): - error_msg = ( - "Anthropic API credit balance is too low. " - "Please go to Plans & Billing to upgrade or purchase credits. " - f"Error: {str(e)}" - ) - logger.error(error_msg) - raise RuntimeError(error_msg) from e - # Re-raise other BadRequestErrors - raise - - content_blocks = message.content or [] - raw_text = "".join(block.text for block in content_blocks if hasattr(block, "text")) # type: ignore[attr-defined] - if not raw_text: - return [] - - # Try to extract JSON from markdown code blocks if present - json_text = raw_text.strip() - - # Look for JSON in markdown code blocks (```json ... 
```) - json_match = re.search(r'```(?:json)?\s*(\[.*?\])\s*```', json_text, re.DOTALL) - if json_match: - json_text = json_match.group(1) - else: - # Look for JSON array/object at the start or end - json_match = re.search(r'(\[.*?\]|{.*?})', json_text, re.DOTALL) - if json_match: - json_text = json_match.group(1) - - try: - data = json.loads(json_text) - if not isinstance(data, list): - logger.warning("Claude response is not a list: %s", type(data)) - return [] - - relations: List[CausalRelation] = [] - for item in data: - if not isinstance(item, dict): - continue - cause = item.get("cause", "").strip() - effect = item.get("effect", "").strip() - if not cause or not effect: - continue # Skip invalid relations - - relations.append( - CausalRelation( - cause=cause, - effect=effect, - confidence=float(item.get("confidence", 0.0)), - explanation=item.get("explanation"), - source_file_id=source_file_id, - source_snippet=item.get("source_snippet"), - metadata={"model": self.model}, - ) - ) - logger.info("Extracted %d relations from Claude response", len(relations)) - return relations - except json.JSONDecodeError as e: - logger.warning("Failed to parse Claude response as JSON: %s. Raw text: %s", e, raw_text[:200]) - return [] - - def analyze(self, chunks: Iterable[str], source_file_id: str) -> List[CausalRelation]: - relations: List[CausalRelation] = [] - for chunk in chunks: - relations.extend(self.analyze_chunk(chunk, source_file_id=source_file_id)) - return relations - - @retry( - retry=should_retry_exception, - wait=wait_exponential(multiplier=1, min=1, max=10), - stop=stop_after_attempt(3), - reraise=True, - ) - def analyze_image(self, image_path: Path, source_file_id: str) -> List[CausalRelation]: - """ - Analyze an image using Claude Vision API to extract causal relationships. - Sends image directly to Claude (no OCR). - """ - logger.info("Analyzing image with Claude Vision: %s", image_path.name) - - try: - # Read and encode image as base64 - with open(image_path, "rb") as image_file: - image_data = image_file.read() - - # Determine media type - suffix = image_path.suffix.lower() - media_type_map = { - ".png": "image/png", - ".jpg": "image/jpeg", - ".jpeg": "image/jpeg", - ".gif": "image/gif", - ".webp": "image/webp", - } - media_type = media_type_map.get(suffix, "image/png") - - # Encode to base64 - base64_image = base64.b64encode(image_data).decode("utf-8") - - # Prepare content for Claude Vision API - content = [ - { - "type": "image", - "source": { - "type": "base64", - "media_type": media_type, - "data": base64_image, - }, - }, - { - "type": "text", - "text": IMAGE_PROMPT_TEMPLATE, - }, - ] - - # Call Claude Vision API - try: - message = self.client.messages.create( - model=self.model, # Claude models support vision - max_tokens=self.max_output_tokens, - temperature=0.0, - system="You extract causal (cause→effect) relations from visual content with high precision.", - messages=[ - { - "role": "user", - "content": content, - } - ], - ) - except BadRequestError as e: - # Check if it's a billing error - if is_billing_error(e): - error_msg = ( - "Anthropic API credit balance is too low. " - "Please go to Plans & Billing to upgrade or purchase credits. 
" - f"Error: {str(e)}" - ) - logger.error(error_msg) - raise RuntimeError(error_msg) from e - # Re-raise other BadRequestErrors - raise - - # Parse response - content_blocks = message.content or [] - raw_text = "".join(block.text for block in content_blocks if hasattr(block, "text")) # type: ignore[attr-defined] - if not raw_text: - logger.warning("No text response from Claude Vision for image %s", image_path.name) - return [] - - # Extract JSON from response - json_text = raw_text.strip() - json_match = re.search(r'```(?:json)?\s*(\[.*?\])\s*```', json_text, re.DOTALL) - if json_match: - json_text = json_match.group(1) - else: - json_match = re.search(r'(\[.*?\]|{.*?})', json_text, re.DOTALL) - if json_match: - json_text = json_match.group(1) - - try: - data = json.loads(json_text) - if not isinstance(data, list): - logger.warning("Claude Vision response is not a list: %s", type(data)) - return [] - - relations: List[CausalRelation] = [] - for item in data: - if not isinstance(item, dict): - continue - cause = item.get("cause", "").strip() - effect = item.get("effect", "").strip() - if not cause or not effect: - continue - - relations.append( - CausalRelation( - cause=cause, - effect=effect, - confidence=float(item.get("confidence", 0.0)), - explanation=item.get("explanation"), - source_file_id=source_file_id, - source_snippet=item.get("source_snippet") or f"Image: {image_path.name}", - metadata={"model": self.model, "content_type": "image", "image_path": str(image_path)}, - ) - ) - logger.info("Extracted %d relations from image %s", len(relations), image_path.name) - return relations - except json.JSONDecodeError as e: - logger.warning("Failed to parse Claude Vision response as JSON: %s. Raw text: %s", e, raw_text[:200]) - return [] - - except Exception as exc: - logger.exception("Failed to analyze image %s: %s", image_path, exc) - return [] - diff --git a/services/multi-document-upload-service/src/multi_document_upload_service/config.py b/services/multi-document-upload-service/src/multi_document_upload_service/config.py index 54c4b07..5d67e98 100644 --- a/services/multi-document-upload-service/src/multi_document_upload_service/config.py +++ b/services/multi-document-upload-service/src/multi_document_upload_service/config.py @@ -20,7 +20,7 @@ class Settings(BaseSettings): model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore") anthropic_api_key: str | None = Field(default=None, validation_alias="ANTHROPIC_API_KEY") - claude_model: str = Field(default=os.getenv("MULTI_DOC_CLAUDE_MODEL", "claude-3-5-sonnet-20241022")) + claude_model: str = Field(default=os.getenv("MULTI_DOC_CLAUDE_MODEL", os.getenv("CLAUDE_MODEL", "claude-3-5-haiku-latest"))) claude_max_input_tokens: int = Field(default=200_000) claude_max_output_tokens: int = Field(default=16_000) @@ -37,6 +37,27 @@ class Settings(BaseSettings): job_retention_days: int = Field(default=30) + # Qwen2.5-VL API configuration + qwen_api_key: str | None = Field(default=None, validation_alias="QWEN_API_KEY") + qwen_api_url: str = Field(default=os.getenv("QWEN_API_URL", "https://api.example.com/v1/chat/completions")) + qwen_model: str = Field(default=os.getenv("QWEN_MODEL", "qwen2.5-vl")) + + # DoWhy configuration + dowhy_enabled: bool = Field(default=True) + dowhy_confidence_threshold: float = Field(default=0.05) + + # Embedding configuration + embedding_model: str = Field(default="sentence-transformers/all-MiniLM-L6-v2") + embedding_dimension: int = Field(default=384) + + # Qdrant configuration + 
qdrant_url: str = Field(default=os.getenv("QDRANT_URL", "http://localhost:6333")) + qdrant_collection_name: str = Field(default="kg_embeddings") + qdrant_vector_size: int = Field(default=384) + + # Report generation configuration + report_format: str = Field(default="markdown") + def ensure_storage_dirs(self) -> None: (self.storage_root / "jobs").mkdir(parents=True, exist_ok=True) (self.storage_root / "uploads").mkdir(parents=True, exist_ok=True) diff --git a/services/multi-document-upload-service/src/multi_document_upload_service/extractors/auto.py b/services/multi-document-upload-service/src/multi_document_upload_service/extractors/auto.py deleted file mode 100644 index fb87e18..0000000 --- a/services/multi-document-upload-service/src/multi_document_upload_service/extractors/auto.py +++ /dev/null @@ -1,168 +0,0 @@ -from __future__ import annotations - -import logging -from pathlib import Path -from typing import List - -logger = logging.getLogger(__name__) - -# Try to import unstructured, but fall back to alternatives if not available -try: - from unstructured.partition.auto import partition - HAS_UNSTRUCTURED = True -except ImportError: - HAS_UNSTRUCTURED = False - logger.warning("unstructured not available, will use fallback extractors") - -# Fallback extractors -try: - import pdfplumber - HAS_PDFPLUMBER = True -except ImportError: - HAS_PDFPLUMBER = False - -try: - from docx import Document as DocxDocument - HAS_DOCX = True -except ImportError: - HAS_DOCX = False - -try: - from pptx import Presentation - HAS_PPTX = True -except ImportError: - HAS_PPTX = False - -# Image processing libraries -try: - from PIL import Image - import pytesseract - HAS_OCR = True -except ImportError: - HAS_OCR = False - logger.warning("OCR libraries not available, image extraction will be limited") - - -def extract_text(path: Path) -> str: - """ - Extract text from a file using multiple strategies. - Falls back through: unstructured -> format-specific -> plain text read. - """ - suffix = path.suffix.lower() - - # Validate PDF file before processing - if suffix == ".pdf": - # Quick validation: check if file starts with PDF magic bytes - try: - with path.open("rb") as f: - header = f.read(4) - if header != b"%PDF": - raise ValueError( - f"File {path.name} does not appear to be a valid PDF. " - f"PDF files must start with '%PDF' magic bytes. " - f"Got: {header[:20] if len(header) > 0 else 'empty file'}" - ) - except Exception as exc: - if isinstance(exc, ValueError): - raise - logger.warning("Could not validate PDF header: %s", exc) - - # Image files - return empty text (will be processed directly with Claude Vision) - # We skip OCR and send images directly to Claude Vision API - if suffix in {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}: - logger.info("Image file detected: %s. 
Will be processed directly with Claude Vision (no OCR)", path.name) - # Return empty string - images will be handled separately in pipeline - return "" - - # Plain text files - direct read - if suffix in {".txt", ".md", ".json", ".xml", ".html", ".csv"}: - try: - return path.read_text(encoding="utf-8", errors="ignore") - except Exception as exc: - logger.warning("Failed to read %s as text: %s", path, exc) - raise - - # Try unstructured first (if available) - if HAS_UNSTRUCTURED: - try: - elements = partition(filename=str(path)) - lines: List[str] = [] - for element in elements: - text = getattr(element, "text", None) - if text: - lines.append(text.strip()) - if lines: - logger.info("Extracted %d lines using unstructured", len(lines)) - return "\n".join(lines) - except Exception as exc: - logger.warning("unstructured extraction failed for %s: %s", path, exc) - # Continue to fallback methods - - # Fallback: PDF with pdfplumber - if suffix == ".pdf" and HAS_PDFPLUMBER: - try: - with pdfplumber.open(path) as pdf: - text_parts = [] - for page in pdf.pages: - page_text = page.extract_text() - if page_text: - text_parts.append(page_text) - if text_parts: - logger.info("Extracted PDF using pdfplumber") - return "\n".join(text_parts) - except Exception as exc: - logger.warning("pdfplumber extraction failed for %s: %s", path, exc) - - # Fallback: DOCX - if suffix == ".docx" and HAS_DOCX: - try: - doc = DocxDocument(path) - paragraphs = [p.text for p in doc.paragraphs if p.text.strip()] - if paragraphs: - logger.info("Extracted DOCX using python-docx") - return "\n".join(paragraphs) - except Exception as exc: - logger.warning("python-docx extraction failed for %s: %s", path, exc) - - # Fallback: PPTX - if suffix in {".pptx", ".ppt"} and HAS_PPTX: - try: - prs = Presentation(path) - text_parts = [] - for slide in prs.slides: - for shape in slide.shapes: - if hasattr(shape, "text") and shape.text: - text_parts.append(shape.text.strip()) - if text_parts: - logger.info("Extracted PPTX using python-pptx") - return "\n".join(text_parts) - except Exception as exc: - logger.warning("python-pptx extraction failed for %s: %s", path, exc) - - # Last resort: try to read as text anyway, but validate it's readable - try: - content = path.read_text(encoding="utf-8", errors="ignore") - if content.strip(): - # Check if content is actually readable text (not binary data) - # Simple heuristic: if >30% of characters are printable, consider it text - printable_chars = sum(1 for c in content if c.isprintable() or c.isspace()) - total_chars = len(content) - - if total_chars > 0 and printable_chars / total_chars > 0.3: - logger.warning("Read %s as plain text (may contain binary data)", path) - return content - else: - logger.error("Content from %s appears to be binary data, cannot extract text", path) - raise ValueError(f"File {path} appears to be binary or corrupted. Cannot extract readable text.") - except Exception as exc: - if isinstance(exc, ValueError): - raise - logger.warning("Failed to read %s as text: %s", path, exc) - - # If all else fails, raise an error - raise ValueError( - f"Could not extract text from {path}. " - f"File type may not be supported, file may be corrupted, or dependencies are missing. 
" - f"Supported formats: PDF, DOCX, PPTX, XLSX, TXT, MD, JSON, XML, HTML, CSV, PNG, JPG, JPEG (with OCR)" - ) - diff --git a/services/multi-document-upload-service/src/multi_document_upload_service/extractors/pymupdf_extractor.py b/services/multi-document-upload-service/src/multi_document_upload_service/extractors/pymupdf_extractor.py new file mode 100644 index 0000000..94ee5e8 --- /dev/null +++ b/services/multi-document-upload-service/src/multi_document_upload_service/extractors/pymupdf_extractor.py @@ -0,0 +1,320 @@ +from __future__ import annotations + +import logging +import re +from dataclasses import dataclass +from pathlib import Path +from typing import List, Optional + +logger = logging.getLogger(__name__) + +try: + import fitz # PyMuPDF + HAS_PYMUPDF = True +except ImportError: + HAS_PYMUPDF = False + logger.warning("PyMuPDF not available") + +try: + from docx import Document as DocxDocument + HAS_DOCX = True +except ImportError: + HAS_DOCX = False + logger.warning("python-docx not available") + +try: + from pptx import Presentation + HAS_PPTX = True +except ImportError: + HAS_PPTX = False + logger.warning("python-pptx not available") + +try: + import pandas as pd + HAS_PANDAS = True +except ImportError: + HAS_PANDAS = False + logger.warning("pandas not available") + + +@dataclass +class ExtractedText: + """Structured text extraction with context.""" + text: str + page_number: int + metadata: dict + context: Optional[str] = None # Surrounding context + + +def extract_text_with_context(path: Path) -> List[ExtractedText]: + """ + Extract text from PDF using PyMuPDF with page-level context. + Returns structured text with metadata. + """ + if not HAS_PYMUPDF: + raise ImportError("PyMuPDF is required for text extraction") + + if not path.exists(): + raise FileNotFoundError(f"File not found: {path}") + + if path.suffix.lower() != ".pdf": + # For non-PDF files, fall back to simple text reading + try: + text = path.read_text(encoding="utf-8", errors="ignore") + return [ExtractedText( + text=text, + page_number=1, + metadata={"file_type": path.suffix, "filename": path.name}, + context=None + )] + except Exception as exc: + logger.warning("Failed to read %s as text: %s", path, exc) + raise + + extracted_pages: List[ExtractedText] = [] + + try: + doc = fitz.open(path) + + for page_num in range(len(doc)): + page = doc[page_num] + + # Extract text + text = page.get_text() + + # Extract metadata + metadata = { + "page_number": page_num + 1, + "page_count": len(doc), + "filename": path.name, + "file_type": "pdf", + "page_rect": { + "width": page.rect.width, + "height": page.rect.height + } + } + + # Extract context (surrounding pages for better understanding) + context = None + if page_num > 0: + prev_page = doc[page_num - 1] + prev_text = prev_page.get_text()[:500] # Last 500 chars of previous page + context = f"Previous page context: {prev_text}" + + if text.strip(): + extracted_pages.append(ExtractedText( + text=text, + page_number=page_num + 1, + metadata=metadata, + context=context + )) + + doc.close() + logger.info("Extracted text from %d pages in %s", len(extracted_pages), path.name) + return extracted_pages + + except Exception as exc: + logger.exception("Failed to extract text from PDF %s: %s", path, exc) + raise + + +def extract_text_from_docx(path: Path) -> str: + """ + Extract text from DOCX file using python-docx. + Reads paragraphs and tables as per README Step 2.2b. 
+ """ + if not HAS_DOCX: + raise ImportError("python-docx is required for DOCX extraction") + + try: + doc = DocxDocument(path) + text_parts = [] + + # Extract paragraphs + for paragraph in doc.paragraphs: + if paragraph.text.strip(): + text_parts.append(paragraph.text.strip()) + + # Extract tables + for table in doc.tables: + table_text = [] + for row in table.rows: + row_text = [] + for cell in row.cells: + if cell.text.strip(): + row_text.append(cell.text.strip()) + if row_text: + table_text.append(" | ".join(row_text)) + if table_text: + text_parts.append("\n".join(table_text)) + + result = "\n\n".join(text_parts) + logger.info("Extracted %d characters from DOCX %s", len(result), path.name) + return result + except Exception as exc: + logger.exception("Failed to extract text from DOCX %s: %s", path, exc) + raise + + +def extract_text_from_pptx(path: Path) -> str: + """ + Extract text from PPTX file using python-pptx. + Reads slides, titles, and notes as per README Step 2.2c. + """ + if not HAS_PPTX: + raise ImportError("python-pptx is required for PPTX extraction") + + try: + prs = Presentation(path) + text_parts = [] + + for slide_num, slide in enumerate(prs.slides, 1): + slide_text = [] + + # Extract slide title + if slide.shapes.title and slide.shapes.title.text: + slide_text.append(f"Slide {slide_num} Title: {slide.shapes.title.text.strip()}") + + # Extract content from shapes + for shape in slide.shapes: + if hasattr(shape, "text") and shape.text.strip(): + # Skip title (already extracted) + if not (slide.shapes.title and shape == slide.shapes.title): + slide_text.append(shape.text.strip()) + + # Extract notes (if available) + if hasattr(slide, "notes_slide") and slide.notes_slide: + notes_text = "" + for shape in slide.notes_slide.shapes: + if hasattr(shape, "text") and shape.text.strip(): + notes_text += shape.text.strip() + " " + if notes_text.strip(): + slide_text.append(f"Notes: {notes_text.strip()}") + + if slide_text: + text_parts.append("\n".join(slide_text)) + + result = "\n\n".join(text_parts) + logger.info("Extracted %d characters from PPTX %s (%d slides)", + len(result), path.name, len(prs.slides)) + return result + except Exception as exc: + logger.exception("Failed to extract text from PPTX %s: %s", path, exc) + raise + + +def extract_text_from_spreadsheet(path: Path) -> str: + """ + Extract text from CSV/XLSX file using pandas. + Reads rows and columns, converts to text representation as per README Step 2.2d. + """ + if not HAS_PANDAS: + raise ImportError("pandas is required for spreadsheet extraction") + + try: + suffix = path.suffix.lower() + text_parts = [] + + if suffix == ".csv": + df = pd.read_csv(path, encoding="utf-8", errors="ignore") + elif suffix in {".xlsx", ".xls"}: + # Read first sheet by default + df = pd.read_excel(path, engine="openpyxl" if suffix == ".xlsx" else None) + else: + raise ValueError(f"Unsupported spreadsheet format: {suffix}") + + # Convert DataFrame to text representation + # Add column headers + headers = " | ".join(str(col) for col in df.columns) + text_parts.append(f"Columns: {headers}") + + # Add rows (limit to first 1000 rows to avoid huge output) + max_rows = min(1000, len(df)) + for idx, row in df.head(max_rows).iterrows(): + row_values = " | ".join(str(val) if pd.notna(val) else "" for val in row) + text_parts.append(f"Row {idx + 1}: {row_values}") + + if len(df) > max_rows: + text_parts.append(f"... 
({len(df) - max_rows} more rows)") + + result = "\n".join(text_parts) + logger.info("Extracted %d characters from spreadsheet %s (%d rows)", + len(result), path.name, len(df)) + return result + except Exception as exc: + logger.exception("Failed to extract text from spreadsheet %s: %s", path, exc) + raise + + +def clean_text(text: str) -> str: + """ + Clean extracted text as per README Step 2.3. + - Remove extra whitespace + - Fix encoding issues + - Preserve important structure + """ + if not text: + return "" + + # Fix encoding issues (remove non-printable characters except newlines and tabs) + cleaned = "".join(char for char in text if char.isprintable() or char in "\n\t\r") + + # Remove extra whitespace (but preserve paragraph breaks) + # Replace multiple spaces with single space + cleaned = re.sub(r'[ \t]+', ' ', cleaned) + + # Normalize line breaks (preserve double newlines for paragraphs) + cleaned = re.sub(r'\r\n', '\n', cleaned) # Windows line breaks + cleaned = re.sub(r'\r', '\n', cleaned) # Old Mac line breaks + + # Preserve paragraph structure (double newlines) + # But remove excessive blank lines (more than 2 consecutive) + cleaned = re.sub(r'\n{3,}', '\n\n', cleaned) + + # Remove leading/trailing whitespace from each line + lines = [line.strip() for line in cleaned.split('\n')] + cleaned = '\n'.join(lines) + + # Remove leading/trailing whitespace overall + cleaned = cleaned.strip() + + return cleaned + + +def extract_all_text(path: Path) -> str: + """ + Extract all text from a file based on type (as per README Step 2). + Routes to appropriate extractor: PDF, DOCX, PPTX, CSV/XLSX, or plain text. + """ + suffix = path.suffix.lower() + + # Step 2.2a: PDF + if suffix == ".pdf" and HAS_PYMUPDF: + extracted_pages = extract_text_with_context(path) + text = "\n\n".join([page.text for page in extracted_pages]) + + # Step 2.2b: DOCX (Word) + elif suffix == ".docx" and HAS_DOCX: + text = extract_text_from_docx(path) + + # Step 2.2c: PPTX (PowerPoint) + elif suffix in {".pptx", ".ppt"} and HAS_PPTX: + text = extract_text_from_pptx(path) + + # Step 2.2d: CSV/XLSX (Spreadsheet) + elif suffix in {".csv", ".xlsx", ".xls"} and HAS_PANDAS: + text = extract_text_from_spreadsheet(path) + + # Fallback: Plain text files + else: + try: + text = path.read_text(encoding="utf-8", errors="ignore") + except Exception as exc: + logger.warning("Failed to read %s as text: %s", path, exc) + raise + + # Step 2.3: TEXT CLEANING + text = clean_text(text) + + return text + diff --git a/services/multi-document-upload-service/src/multi_document_upload_service/extractors/qwen_vision.py b/services/multi-document-upload-service/src/multi_document_upload_service/extractors/qwen_vision.py new file mode 100644 index 0000000..356683e --- /dev/null +++ b/services/multi-document-upload-service/src/multi_document_upload_service/extractors/qwen_vision.py @@ -0,0 +1,153 @@ +from __future__ import annotations + +import base64 +import json +import logging +from pathlib import Path +from typing import Dict, List, Optional + +import httpx + +from ..config import get_settings + +logger = logging.getLogger(__name__) + + +class QwenVisionClient: + """Client for Qwen2.5-VL API to extract relationships from diagrams and ERDs.""" + + def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None, model: Optional[str] = None): + settings = get_settings() + self.api_key = api_key or settings.qwen_api_key + self.api_url = api_url or settings.qwen_api_url + self.model = model or settings.qwen_model + + if not 
self.api_key: + logger.warning("Qwen API key not configured") + + def extract_relationships_from_image(self, image_path: Path, source_file_id: str) -> List[Dict]: + """ + Extract relationships (entities, connections, flows) from an image using Qwen2.5-VL. + Returns list of extracted relationships. + """ + if not self.api_key: + logger.warning("Qwen API key not configured, skipping image analysis") + return [] + + try: + # Read and encode image + with open(image_path, "rb") as img_file: + image_data = img_file.read() + + base64_image = base64.b64encode(image_data).decode("utf-8") + + # Determine media type + suffix = image_path.suffix.lower() + media_type_map = { + ".png": "image/png", + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".gif": "image/gif", + ".webp": "image/webp", + } + media_type = media_type_map.get(suffix, "image/png") + + # Prepare prompt for relationship extraction + prompt = """Analyze this diagram/ERD/image and extract all relationships, entities, and connections. + +Extract: +1. Entities (boxes, nodes, components) +2. Relationships between entities (arrows, connections, flows) +3. Data flows and dependencies +4. Process flows +5. Architecture patterns + +Return JSON with this structure: +[ + { + "entity1": "name of first entity", + "entity2": "name of second entity", + "relationship_type": "causes|depends_on|flows_to|contains|uses", + "description": "description of the relationship", + "confidence": 0.0-1.0 + } +] + +Focus on cause-effect relationships, dependencies, and flows.""" + + # Prepare API request + payload = { + "model": self.model, + "messages": [ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": f"data:{media_type};base64,{base64_image}" + } + }, + { + "type": "text", + "text": prompt + } + ] + } + ], + "max_tokens": 4000, + "temperature": 0.0 + } + + headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json" + } + + # Make API call + with httpx.Client(timeout=60.0) as client: + response = client.post(self.api_url, json=payload, headers=headers) + response.raise_for_status() + result = response.json() + + # Parse response + content = result.get("choices", [{}])[0].get("message", {}).get("content", "") + + if not content: + logger.warning("Empty response from Qwen API for image %s", image_path.name) + return [] + + # Extract JSON from response + json_text = content.strip() + + # Try to find JSON in markdown code blocks + if "```json" in json_text: + json_text = json_text.split("```json")[1].split("```")[0].strip() + elif "```" in json_text: + json_text = json_text.split("```")[1].split("```")[0].strip() + + # Parse JSON + try: + relationships = json.loads(json_text) + if not isinstance(relationships, list): + relationships = [relationships] + + # Add source metadata + for rel in relationships: + rel["source_file_id"] = source_file_id + rel["source_image"] = str(image_path.name) + rel["extraction_method"] = "qwen2.5-vl" + + logger.info("Extracted %d relationships from image %s using Qwen2.5-VL", + len(relationships), image_path.name) + return relationships + + except json.JSONDecodeError as e: + logger.warning("Failed to parse Qwen response as JSON: %s. 
Content: %s", + e, content[:200]) + return [] + + except Exception as exc: + logger.exception("Failed to extract relationships from image %s: %s", image_path, exc) + return [] + diff --git a/services/multi-document-upload-service/src/multi_document_upload_service/main.py b/services/multi-document-upload-service/src/multi_document_upload_service/main.py index 5d8bd45..d0c014d 100644 --- a/services/multi-document-upload-service/src/multi_document_upload_service/main.py +++ b/services/multi-document-upload-service/src/multi_document_upload_service/main.py @@ -2,15 +2,16 @@ from __future__ import annotations import logging from dataclasses import dataclass +from pathlib import Path from typing import List, Optional from fastapi import BackgroundTasks, Depends, FastAPI, File, Form, HTTPException, UploadFile from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import FileResponse -from .claude_client import ClaudeCausalExtractor from .config import Settings, get_settings from .jobs import JobStore -from .models import CreateJobResponse, JobGraphSummary, JobStage, JobStatusResponse +from .models import CreateJobResponse, JobGraphSummary, JobStage, JobStatusResponse, ProjectReport from .processors.graph_writer import GraphWriter from .storage import StorageManager from .workflows.pipeline import JobPipeline @@ -20,8 +21,8 @@ logging.basicConfig(level=logging.INFO) app = FastAPI( title="Multi Document Upload Service", - version="0.1.0", - description="Processes multi-format documents to build causal knowledge graphs using Claude.", + version="0.2.0", + description="Processes multi-format documents to build knowledge graphs and generate beginner-friendly onboarding reports.", ) @@ -40,7 +41,6 @@ class ServiceContainer: storage: StorageManager job_store: JobStore graph_writer: GraphWriter - claude_extractor: ClaudeCausalExtractor pipeline: JobPipeline @@ -51,29 +51,24 @@ def get_container() -> ServiceContainer: global _container if _container is None: settings = get_settings() - if not settings.anthropic_api_key: - raise HTTPException(status_code=500, detail="ANTHROPIC_API_KEY is not configured") + # Anthropic API key is only needed for report generation, not required at startup + # if not settings.anthropic_api_key: + # raise HTTPException(status_code=500, detail="ANTHROPIC_API_KEY is not configured") storage = StorageManager(settings.storage_root) job_store = JobStore(settings.storage_root) graph_writer = GraphWriter(settings.neo4j_uri, settings.neo4j_user, settings.neo4j_password) - claude_extractor = ClaudeCausalExtractor( - api_key=settings.anthropic_api_key, - model=settings.claude_model, - max_output_tokens=min(settings.claude_max_output_tokens, 4000), - ) + pipeline = JobPipeline( job_store=job_store, storage=storage, graph_writer=graph_writer, - claude_extractor=claude_extractor, ) _container = ServiceContainer( settings=settings, storage=storage, job_store=job_store, graph_writer=graph_writer, - claude_extractor=claude_extractor, pipeline=pipeline, ) return _container @@ -170,14 +165,86 @@ async def get_job_graph(job_id: str, container: ServiceContainer = Depends(get_d ) +@app.get("/jobs/{job_id}/report", response_model=ProjectReport) +async def get_job_report(job_id: str, container: ServiceContainer = Depends(get_dependencies)) -> ProjectReport: + """Get the generated beginner-friendly onboarding report.""" + job_store = container.job_store + if not job_store.exists(job_id): + raise HTTPException(status_code=404, detail="Job not found") + job = job_store.get(job_id) + 
if job.stage != JobStage.COMPLETED: + raise HTTPException( + status_code=409, + detail="Report not ready yet. Job is still processing." + ) + if not job.report: + # Check if there was an error during report generation + error_msg = "Report not found. " + if job.error: + # Check if error is specifically about report generation + if "report generation" in job.error.lower() or "claude" in job.error.lower(): + error_msg = job.error + else: + error_msg += f"Error during generation: {job.error}" + else: + error_msg += "Report generation may have failed (check logs for details)." + raise HTTPException( + status_code=404, + detail=error_msg + ) + return job.report + + +@app.get("/jobs/{job_id}/report/pdf") +async def get_job_report_pdf(job_id: str, container: ServiceContainer = Depends(get_dependencies)): + """Download the PDF version of the onboarding report (as per README Step 7.9).""" + job_store = container.job_store + if not job_store.exists(job_id): + raise HTTPException(status_code=404, detail="Job not found") + job = job_store.get(job_id) + if job.stage != JobStage.COMPLETED: + raise HTTPException( + status_code=409, + detail="Report not ready yet. Job is still processing." + ) + if not job.report: + raise HTTPException( + status_code=404, + detail="Report not found. Job may have completed without generating report." + ) + + # Get PDF path from report metadata + pdf_path_str = job.report.metadata.get("pdf_path") + if not pdf_path_str: + raise HTTPException( + status_code=404, + detail="PDF not available. Report may have been generated without PDF conversion." + ) + + pdf_path = Path(pdf_path_str) + if not pdf_path.exists(): + raise HTTPException( + status_code=404, + detail="PDF file not found on server." + ) + + return FileResponse( + path=pdf_path, + media_type="application/pdf", + filename=f"onboarding_report_{job_id}.pdf" + ) + + @app.get("/health") async def healthcheck(container: ServiceContainer = Depends(get_dependencies)): settings = container.settings return { "status": "ok", "claude_model": settings.claude_model, - "max_input_tokens_per_min": settings.claude_max_input_tokens, - "max_output_tokens_per_min": settings.claude_max_output_tokens, + "qwen_model": settings.qwen_model, + "embedding_model": settings.embedding_model, + "qdrant_url": settings.qdrant_url, + "dowhy_enabled": settings.dowhy_enabled, } diff --git a/services/multi-document-upload-service/src/multi_document_upload_service/models.py b/services/multi-document-upload-service/src/multi_document_upload_service/models.py index e55e9b1..a71df2d 100644 --- a/services/multi-document-upload-service/src/multi_document_upload_service/models.py +++ b/services/multi-document-upload-service/src/multi_document_upload_service/models.py @@ -10,9 +10,10 @@ from pydantic import BaseModel, Field class JobStage(str, Enum): RECEIVED = "received" SAVING_FILES = "saving_files" - EXTRACTING = "extracting" - ANALYZING = "analyzing" - BUILDING_GRAPH = "building_graph" + EXTRACTING = "extracting" # PyMuPDF + Qwen2.5-VL + BUILDING_GRAPH = "building_graph" # DoWhy + Neo4j + INDEXING_VECTORS = "indexing_vectors" # Qdrant + GENERATING_REPORT = "generating_report" # Claude onboarding doc COMPLETED = "completed" FAILED = "failed" @@ -34,6 +35,7 @@ class CausalRelation(BaseModel): explanation: Optional[str] = None source_file_id: Optional[str] = None source_snippet: Optional[str] = None + relationship_type: str = Field(default="CAUSES") # DEPENDS_ON, USES, IMPLEMENTS, etc. 
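+    # Used as the Neo4j relationship label: the graph writer uppercases this value
+    # and strips anything outside [A-Z0-9_], falling back to CAUSES when empty.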
metadata: Dict[str, Any] = Field(default_factory=dict) @@ -46,6 +48,7 @@ class JobRecord(BaseModel): total_files: int = 0 processed_files: int = 0 relations: List[CausalRelation] = Field(default_factory=list) + report: Optional[ProjectReport] = None # Generated onboarding report created_at: datetime = Field(default_factory=datetime.utcnow) updated_at: datetime = Field(default_factory=datetime.utcnow) error: str | None = None @@ -82,3 +85,15 @@ class JobGraphSummary(BaseModel): edge_count: int generated_at: datetime + +class ProjectReport(BaseModel): + """Beginner-friendly onboarding report generated from project documents.""" + job_id: str + title: str = "Project Onboarding Guide" + content: str # Markdown content + sections: Dict[str, str] = Field(default_factory=dict) # Section name -> content + key_concepts: List[str] = Field(default_factory=list) # Important concepts covered + total_pages: int = 0 # Estimated pages + generated_at: datetime = Field(default_factory=datetime.utcnow) + metadata: Dict[str, Any] = Field(default_factory=dict) + diff --git a/services/multi-document-upload-service/src/multi_document_upload_service/processors/chunker.py b/services/multi-document-upload-service/src/multi_document_upload_service/processors/chunker.py deleted file mode 100644 index 89f914e..0000000 --- a/services/multi-document-upload-service/src/multi_document_upload_service/processors/chunker.py +++ /dev/null @@ -1,24 +0,0 @@ -from __future__ import annotations - -from typing import Iterable, List - -import tiktoken - - -class TextChunker: - def __init__(self, model_name: str, token_target: int = 800, overlap: int = 200): - self.encoder = tiktoken.encoding_for_model("gpt-4o") if "claude" not in model_name else tiktoken.get_encoding("cl100k_base") - self.token_target = token_target - self.overlap = overlap - - def chunk(self, text: str) -> Iterable[str]: - tokens = self.encoder.encode(text) - step = max(self.token_target - self.overlap, 1) - chunks: List[str] = [] - for start in range(0, len(tokens), step): - end = min(start + self.token_target, len(tokens)) - chunk_tokens = tokens[start:end] - chunk_text = self.encoder.decode(chunk_tokens) - chunks.append(chunk_text) - return chunks - diff --git a/services/multi-document-upload-service/src/multi_document_upload_service/processors/dowhy_analyzer.py b/services/multi-document-upload-service/src/multi_document_upload_service/processors/dowhy_analyzer.py new file mode 100644 index 0000000..088b957 --- /dev/null +++ b/services/multi-document-upload-service/src/multi_document_upload_service/processors/dowhy_analyzer.py @@ -0,0 +1,187 @@ +from __future__ import annotations + +import logging +from typing import List, Optional + +import pandas as pd + +from ..config import get_settings +from ..models import CausalRelation + +logger = logging.getLogger(__name__) + +try: + import dowhy + from dowhy import CausalModel + HAS_DOWHY = True +except ImportError: + HAS_DOWHY = False + logger.warning("DoWhy not available") + + +class DoWhyAnalyzer: + """Validate causal relationships using DoWhy Structural Causal Models.""" + + def __init__(self, confidence_threshold: Optional[float] = None): + if not HAS_DOWHY: + raise ImportError("DoWhy is required for causal analysis") + + settings = get_settings() + self.confidence_threshold = confidence_threshold or settings.dowhy_confidence_threshold + self.enabled = settings.dowhy_enabled + + def validate_relationships( + self, + relationships: List[CausalRelation], + text_data: Optional[str] = None + ) -> 
List[CausalRelation]: + """ + Validate causal relationships using DoWhy SCM. + Filters out relationships that don't pass validation. + """ + if not self.enabled: + logger.info("DoWhy validation is disabled, returning all relationships") + return relationships + + if not relationships: + return [] + + validated: List[CausalRelation] = [] + + # Group relationships by cause to build SCM + cause_groups = {} + for rel in relationships: + cause = rel.cause + if cause not in cause_groups: + cause_groups[cause] = [] + cause_groups[cause].append(rel) + + # Validate each group + for cause, effects in cause_groups.items(): + for rel in effects: + try: + is_valid = self._validate_single_relationship(rel, relationships, text_data) + if is_valid: + # Update confidence with validation score + rel.confidence = min(rel.confidence + 0.1, 0.95) # Boost validated relationships + rel.metadata["dowhy_validated"] = True + validated.append(rel) + else: + logger.debug("DoWhy validation failed for: %s -> %s", rel.cause, rel.effect) + except Exception as exc: + logger.warning("DoWhy validation error for %s -> %s: %s", + rel.cause, rel.effect, exc) + # If validation fails, keep the relationship but mark it + rel.metadata["dowhy_validated"] = False + rel.metadata["dowhy_error"] = str(exc) + validated.append(rel) # Keep it but with lower confidence + + logger.info("DoWhy validated %d/%d relationships", len(validated), len(relationships)) + return validated + + def _validate_single_relationship( + self, + relationship: CausalRelation, + all_relationships: List[CausalRelation], + text_data: Optional[str] = None + ) -> bool: + """ + Validate a single relationship using DoWhy. + Returns True if relationship is valid, False otherwise. + """ + try: + # Build a simple causal graph from relationships + # Extract unique variables (causes and effects) + variables = set() + for rel in all_relationships: + variables.add(rel.cause) + variables.add(rel.effect) + + # Create a simple dataset for DoWhy + # Since we don't have actual data, we'll use a heuristic approach + # based on relationship frequency and structure + + # Check if there's a path from cause to effect in the graph + has_path = self._check_causal_path( + relationship.cause, + relationship.effect, + all_relationships + ) + + if not has_path: + return False + + # Additional validation: check for confounders + # If there are many relationships involving both cause and effect, + # it's more likely to be valid + related_count = sum( + 1 for rel in all_relationships + if rel.cause == relationship.cause or rel.effect == relationship.effect + ) + + # If there are multiple relationships involving these concepts, + # it's more likely to be a valid causal relationship + if related_count >= 2: + return True + + # For single relationships, use confidence threshold + return relationship.confidence >= 0.6 + + except Exception as exc: + logger.warning("DoWhy validation error: %s", exc) + return False + + def _check_causal_path( + self, + cause: str, + effect: str, + relationships: List[CausalRelation], + max_depth: int = 3 + ) -> bool: + """Check if there's a causal path from cause to effect.""" + if max_depth == 0: + return False + + # Direct relationship + for rel in relationships: + if rel.cause == cause and rel.effect == effect: + return True + + # Indirect relationship (transitive) + for rel in relationships: + if rel.cause == cause: + # Check if rel.effect leads to the target effect + if self._check_causal_path(rel.effect, effect, relationships, max_depth - 1): + return True + + 
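+        # No direct edge and no transitive path found within the remaining depth budget.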
return False + + def build_scm_from_relationships( + self, + relationships: List[CausalRelation] + ) -> Optional[CausalModel]: + """ + Build a Structural Causal Model from relationships. + This is a simplified version for text-based causal inference. + """ + if not relationships: + return None + + try: + # Extract all unique variables + variables = set() + for rel in relationships: + variables.add(rel.cause) + variables.add(rel.effect) + + # Create a simple adjacency matrix representation + # This is a heuristic approach since we don't have actual data + + # For now, return None as building a full SCM requires actual data + # The validation uses graph-based heuristics instead + return None + + except Exception as exc: + logger.warning("Failed to build SCM: %s", exc) + return None + diff --git a/services/multi-document-upload-service/src/multi_document_upload_service/processors/embedder.py b/services/multi-document-upload-service/src/multi_document_upload_service/processors/embedder.py new file mode 100644 index 0000000..9ee0860 --- /dev/null +++ b/services/multi-document-upload-service/src/multi_document_upload_service/processors/embedder.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +import logging +from typing import List + +from ..config import get_settings + +logger = logging.getLogger(__name__) + +try: + from sentence_transformers import SentenceTransformer + HAS_SENTENCE_TRANSFORMERS = True +except ImportError: + HAS_SENTENCE_TRANSFORMERS = False + logger.warning("sentence-transformers not available") + + +class Embedder: + """Generate embeddings using sentence-transformers.""" + + def __init__(self, model_name: str | None = None): + if not HAS_SENTENCE_TRANSFORMERS: + raise ImportError("sentence-transformers is required for embeddings") + + settings = get_settings() + self.model_name = model_name or settings.embedding_model + + logger.info("Loading embedding model: %s", self.model_name) + try: + self.model = SentenceTransformer(self.model_name) + self.dimension = self.model.get_sentence_embedding_dimension() + logger.info("Loaded embedding model with dimension: %d", self.dimension) + except Exception as exc: + logger.exception("Failed to load embedding model %s: %s", self.model_name, exc) + raise + + def embed_text(self, text: str) -> List[float]: + """Generate embedding for a single text.""" + if not text or not text.strip(): + # Return zero vector for empty text + return [0.0] * self.dimension + + try: + embedding = self.model.encode(text, normalize_embeddings=True) + return embedding.tolist() + except Exception as exc: + logger.warning("Failed to embed text: %s", exc) + return [0.0] * self.dimension + + def embed_batch(self, texts: List[str], batch_size: int = 32) -> List[List[float]]: + """Generate embeddings for a batch of texts.""" + if not texts: + return [] + + try: + embeddings = self.model.encode( + texts, + batch_size=batch_size, + normalize_embeddings=True, + show_progress_bar=False + ) + return embeddings.tolist() + except Exception as exc: + logger.warning("Failed to embed batch: %s", exc) + return [[0.0] * self.dimension] * len(texts) + + def embed_relation(self, cause: str, effect: str, explanation: str | None = None) -> List[float]: + """Generate embedding for a cause-effect relationship.""" + # Combine cause, effect, and explanation into a single text + parts = [cause, "causes", effect] + if explanation: + parts.append(explanation) + + text = " ".join(parts) + return self.embed_text(text) + + def embed_concept(self, concept_name: str, description: str | 
None = None) -> List[float]: + """Generate embedding for a concept/node.""" + if description: + text = f"{concept_name}: {description}" + else: + text = concept_name + + return self.embed_text(text) + diff --git a/services/multi-document-upload-service/src/multi_document_upload_service/processors/entity_resolver.py b/services/multi-document-upload-service/src/multi_document_upload_service/processors/entity_resolver.py new file mode 100644 index 0000000..61361a7 --- /dev/null +++ b/services/multi-document-upload-service/src/multi_document_upload_service/processors/entity_resolver.py @@ -0,0 +1,253 @@ +from __future__ import annotations + +import json +import logging +import re +from typing import Dict, List, Set + +from anthropic import Anthropic, BadRequestError + +from ..config import get_settings +from ..models import CausalRelation + +logger = logging.getLogger(__name__) + + +class EntityResolver: + """ + Resolve entity mentions using Claude AI as per README Stage 4. + Identifies that different mentions refer to the same entity. + """ + + def __init__(self): + settings = get_settings() + self.api_key = settings.anthropic_api_key + self.model = settings.claude_model + self.max_output_tokens = settings.claude_max_output_tokens + + if not self.api_key: + logger.warning("ANTHROPIC_API_KEY not set - Entity resolution will be skipped") + self.client = None + else: + try: + self.client = Anthropic(api_key=self.api_key) + logger.info("EntityResolver initialized with Claude AI") + except Exception as e: + logger.warning("Failed to initialize Claude AI for entity resolution: %s", e) + self.client = None + + def resolve_entities(self, relations: List[CausalRelation]) -> Dict[str, Dict]: + """ + Resolve entity mentions across all documents as per README Step 4. 
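+        Resolution is best-effort: when no Claude client is configured this method
+        returns an empty mapping and downstream relations are left unchanged.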
+ + Step 4.1: Collect all entities + Step 4.2: Group by entity type + Step 4.3: AI-powered resolution (Claude API) + Step 4.4: Create canonical names + + Returns mapping: canonical_name -> {mentions, type, role, confidence} + """ + if not self.client: + logger.info("Entity resolution skipped (Claude AI not available)") + return {} + + if not relations: + return {} + + # Step 4.1: COLLECT ALL ENTITIES + all_mentions: Set[str] = set() + for rel in relations: + all_mentions.add(rel.cause.strip()) + all_mentions.add(rel.effect.strip()) + + if not all_mentions: + return {} + + logger.info("Collecting %d entity mentions for resolution", len(all_mentions)) + + # Step 4.2: GROUP BY ENTITY TYPE (simple heuristic) + people_mentions = [] + project_mentions = [] + team_mentions = [] + other_mentions = [] + + for mention in all_mentions: + mention_lower = mention.lower() + if any(word in mention_lower for word in ["team", "department", "group", "division"]): + team_mentions.append(mention) + elif any(word in mention_lower for word in ["project", "system", "application", "platform"]): + project_mentions.append(mention) + elif len(mention.split()) <= 3 and not any(char.isdigit() for char in mention): + # Likely a person name (short, no numbers) + people_mentions.append(mention) + else: + other_mentions.append(mention) + + # Step 4.3: AI-POWERED RESOLUTION (Claude API) + resolved_entities = {} + + # Resolve people + if people_mentions: + people_resolved = self._resolve_with_claude(people_mentions, "Person") + resolved_entities.update(people_resolved) + + # Resolve projects + if project_mentions: + projects_resolved = self._resolve_with_claude(project_mentions, "Project") + resolved_entities.update(projects_resolved) + + # Resolve teams + if team_mentions: + teams_resolved = self._resolve_with_claude(team_mentions, "Team") + resolved_entities.update(teams_resolved) + + # Resolve others + if other_mentions: + others_resolved = self._resolve_with_claude(other_mentions, "Entity") + resolved_entities.update(others_resolved) + + logger.info("Resolved %d entities from %d mentions", len(resolved_entities), len(all_mentions)) + + return resolved_entities + + def _resolve_with_claude(self, mentions: List[str], entity_type: str) -> Dict[str, Dict]: + """Use Claude AI to resolve entity mentions.""" + if not self.client or not mentions: + return {} + + try: + system_prompt = """You are an expert at entity resolution. Your task is to identify which mentions refer to the same real-world entity. + +Analyze the given list of entity mentions and group them by the actual entity they refer to. + +Return a JSON object where: +- Key: Canonical name (best/most complete name) +- Value: Object with: + - "mentions": List of all mentions that refer to this entity + - "type": Entity type (Person, Project, Team, etc.) + - "role": Role or description (if applicable) + - "confidence": Confidence score (0.0 to 1.0) + +Example: +{ + "John Smith": { + "mentions": ["John", "J. 
Smith", "John Smith", "Smith"], + "type": "Person", + "role": "Project Lead", + "confidence": 0.95 + }, + "Project Alpha": { + "mentions": ["Project Alpha", "Alpha", "The Alpha Project"], + "type": "Project", + "role": null, + "confidence": 0.90 + } +} + +Be thorough and group all related mentions together.""" + + user_prompt = f"""Analyze these {entity_type} entity mentions and resolve which ones refer to the same entity: + +{json.dumps(mentions, indent=2)} + +Return a JSON object mapping canonical names to their resolved mentions.""" + + message = self.client.messages.create( + model=self.model, + max_tokens=self.max_output_tokens, + temperature=0.2, # Lower temperature for more consistent resolution + system=system_prompt, + messages=[{"role": "user", "content": user_prompt}] + ) + + response_text = "".join( + block.text for block in message.content + if hasattr(block, "text") + ) + + if not response_text: + logger.warning("Empty response from Claude for entity resolution") + return {} + + # Parse JSON response + try: + json_match = re.search(r'\{.*\}', response_text, re.DOTALL) + if json_match: + json_text = json_match.group(0) + else: + json_text = response_text + + resolved = json.loads(json_text) + + # Validate and structure the response + result = {} + for canonical_name, entity_data in resolved.items(): + if isinstance(entity_data, dict): + result[canonical_name] = { + "mentions": entity_data.get("mentions", [canonical_name]), + "type": entity_data.get("type", entity_type), + "role": entity_data.get("role"), + "confidence": float(entity_data.get("confidence", 0.85)) + } + else: + # Fallback if structure is different + result[canonical_name] = { + "mentions": [canonical_name] if isinstance(entity_data, str) else entity_data, + "type": entity_type, + "role": None, + "confidence": 0.8 + } + + return result + + except json.JSONDecodeError as e: + logger.warning("Failed to parse Claude response as JSON: %s. Response: %s", + e, response_text[:500]) + return {} + + except BadRequestError as e: + logger.warning("Claude API error during entity resolution: %s", e) + return {} + except Exception as e: + logger.warning("Entity resolution failed: %s", e) + return {} + + def apply_resolution_to_relations( + self, + relations: List[CausalRelation], + resolved_entities: Dict[str, Dict] + ) -> List[CausalRelation]: + """ + Apply entity resolution to relationships. + Replace mentions with canonical names. 
+ """ + if not resolved_entities: + return relations + + # Create reverse mapping: mention -> canonical_name + mention_to_canonical: Dict[str, str] = {} + for canonical_name, entity_data in resolved_entities.items(): + mentions = entity_data.get("mentions", []) + for mention in mentions: + mention_to_canonical[mention.lower()] = canonical_name + + # Update relations with canonical names + updated_relations = [] + for rel in relations: + # Resolve cause + cause_lower = rel.cause.strip().lower() + if cause_lower in mention_to_canonical: + rel.cause = mention_to_canonical[cause_lower] + + # Resolve effect + effect_lower = rel.effect.strip().lower() + if effect_lower in mention_to_canonical: + rel.effect = mention_to_canonical[effect_lower] + + # Store resolution info in metadata + rel.metadata["entity_resolved"] = True + updated_relations.append(rel) + + logger.info("Applied entity resolution to %d relationships", len(updated_relations)) + return updated_relations + diff --git a/services/multi-document-upload-service/src/multi_document_upload_service/processors/graph_writer.py b/services/multi-document-upload-service/src/multi_document_upload_service/processors/graph_writer.py index aadd5bc..ca49ab2 100644 --- a/services/multi-document-upload-service/src/multi_document_upload_service/processors/graph_writer.py +++ b/services/multi-document-upload-service/src/multi_document_upload_service/processors/graph_writer.py @@ -1,38 +1,65 @@ from __future__ import annotations +import json import logging -from typing import Iterable +import re +from typing import Dict, Iterable, List, Optional +from anthropic import Anthropic, BadRequestError from neo4j import GraphDatabase, Transaction +from ..config import get_settings from ..models import CausalRelation logger = logging.getLogger(__name__) -MERGE_QUERY = """ -MERGE (cause:Concept {name: $cause}) -ON CREATE SET cause.created_at = timestamp(), cause.lastSeen = timestamp() -ON MATCH SET cause.lastSeen = timestamp() -MERGE (effect:Concept {name: $effect}) -ON CREATE SET effect.created_at = timestamp(), effect.lastSeen = timestamp() -ON MATCH SET effect.lastSeen = timestamp() -MERGE (cause)-[r:CAUSES]->(effect) -ON CREATE SET r.confidence = $confidence, - r.explanation = $explanation, - r.source_file_id = $source_file_id, - r.source_snippet = $source_snippet, - r.job_id = $job_id, - r.model = $model, - r.created_at = timestamp(), - r.updated_at = timestamp() -ON MATCH SET r.confidence = $confidence, - r.explanation = $explanation, - r.source_file_id = $source_file_id, - r.source_snippet = $source_snippet, - r.job_id = $job_id, - r.model = $model, - r.updated_at = timestamp() +# Query to create Document node +CREATE_DOCUMENT_QUERY = """ +MERGE (doc:Document {filename: $filename}) +ON CREATE SET doc.uploaded_at = timestamp(), + doc.file_path = $file_path, + doc.job_id = $job_id, + doc.created_at = timestamp() +ON MATCH SET doc.lastSeen = timestamp() +""" + +# Query to create Entity nodes and relationship with dynamic type +CREATE_ENTITY_RELATIONSHIP_QUERY = """ +MERGE (source:Entity:Concept {name: $source}) +ON CREATE SET source.created_at = timestamp(), + source.lastSeen = timestamp(), + source.type = COALESCE($source_type, 'Entity') +ON MATCH SET source.lastSeen = timestamp() + +MERGE (target:Entity:Concept {name: $target}) +ON CREATE SET target.created_at = timestamp(), + target.lastSeen = timestamp(), + target.type = COALESCE($target_type, 'Entity') +ON MATCH SET target.lastSeen = timestamp() + +WITH source, target +CALL apoc.merge.relationship( + 
source, + $rel_type, + {confidence: $confidence, + explanation: $explanation, + source_file_id: $source_file_id, + source_snippet: $source_snippet, + job_id: $job_id, + model: $model, + created_at: timestamp(), + updated_at: timestamp()}, + {confidence: $confidence, + explanation: $explanation, + source_file_id: $source_file_id, + source_snippet: $source_snippet, + job_id: $job_id, + model: $model, + updated_at: timestamp()}, + target +) YIELD rel +RETURN rel """ @@ -42,12 +69,42 @@ class GraphWriter: def close(self) -> None: self._driver.close() + + def write_documents(self, job_id: str, files: Iterable) -> None: + """Create Document nodes for uploaded files.""" + files_list = list(files) + if not files_list: + return + + logger.info("Creating %d document nodes for job %s", len(files_list), job_id) + + with self._driver.session() as session: + def _write_docs(tx: Transaction) -> None: + for file_record in files_list: + try: + tx.run( + CREATE_DOCUMENT_QUERY, + filename=file_record.filename, + file_path=file_record.stored_path, + job_id=job_id + ) + logger.debug("Created document node: %s", file_record.filename) + except Exception as exc: + logger.warning("Failed to create document node for %s: %s", file_record.filename, exc) + + session.execute_write(_write_docs) + logger.info("Created document nodes for job %s", job_id) - def write_relations(self, job_id: str, relations: Iterable[CausalRelation]) -> None: + def write_relations(self, job_id: str, relations: Iterable[CausalRelation], files: Iterable = None) -> None: + """Write entities and relationships to Neo4j with multiple relationship types.""" relations_list = list(relations) if not relations_list: logger.warning("No relations to write for job %s", job_id) return + + # Create document nodes if files provided + if files: + self.write_documents(job_id, files) logger.info("Writing %d relations to Neo4j for job %s", len(relations_list), job_id) @@ -58,11 +115,70 @@ class GraphWriter: if not relation.cause or not relation.effect: logger.warning("Skipping relation with empty cause or effect: %s -> %s", relation.cause, relation.effect) continue + + # Get relationship type (default to CAUSES for backward compatibility) + rel_type = getattr(relation, 'relationship_type', None) or "CAUSES" + + # Sanitize relationship type (only allow alphanumeric and underscores) + rel_type = re.sub(r'[^A-Z0-9_]', '', rel_type.upper()) + if not rel_type: + rel_type = "CAUSES" + + # Infer entity types from names (simple heuristic) + source_type = self._infer_entity_type(relation.cause) + target_type = self._infer_entity_type(relation.effect) + try: + # Create source entity + tx.run(""" + MERGE (source:Entity:Concept {name: $source}) + ON CREATE SET source.created_at = timestamp(), + source.lastSeen = timestamp(), + source.type = $source_type + ON MATCH SET source.lastSeen = timestamp() + """, + source=relation.cause.strip(), + source_type=source_type + ) + + # Create target entity + tx.run(""" + MERGE (target:Entity:Concept {name: $target}) + ON CREATE SET target.created_at = timestamp(), + target.lastSeen = timestamp(), + target.type = $target_type + ON MATCH SET target.lastSeen = timestamp() + """, + target=relation.effect.strip(), + target_type=target_type + ) + + # Create relationship with dynamic type (sanitized) + query = f""" + MATCH (source:Entity {{name: $source}}) + MATCH (target:Entity {{name: $target}}) + MERGE (source)-[r:{rel_type}]->(target) + ON CREATE SET r.confidence = $confidence, + r.explanation = $explanation, + r.source_file_id = 
$source_file_id, + r.source_snippet = $source_snippet, + r.job_id = $job_id, + r.model = $model, + r.created_at = timestamp(), + r.updated_at = timestamp() + ON MATCH SET r.confidence = $confidence, + r.explanation = $explanation, + r.source_file_id = $source_file_id, + r.source_snippet = $source_snippet, + r.job_id = $job_id, + r.model = $model, + r.updated_at = timestamp() + """ + result = tx.run( - MERGE_QUERY, - cause=relation.cause.strip(), - effect=relation.effect.strip(), + query, + source=relation.cause.strip(), + target=relation.effect.strip(), confidence=float(relation.confidence) if relation.confidence else 0.0, explanation=relation.explanation or "", source_file_id=relation.source_file_id or "", @@ -70,12 +186,145 @@ class GraphWriter: job_id=job_id, model=relation.metadata.get("model") or "", ) + + # Link entities to documents if source_file_id is a filename + if relation.source_file_id and relation.source_file_id != "combined_text": + link_query = f""" + MATCH (entity:Entity {{name: $entity_name}}) + MATCH (doc:Document {{filename: $filename}}) + MERGE (entity)-[:EXTRACTED_FROM]->(doc) + """ + try: + tx.run(link_query, entity_name=relation.cause.strip(), filename=relation.source_file_id) + tx.run(link_query, entity_name=relation.effect.strip(), filename=relation.source_file_id) + except: + pass # Ignore if document doesn't exist + count += 1 - logger.debug("Wrote relation: %s -> %s (confidence: %s)", relation.cause, relation.effect, relation.confidence) + logger.debug("Wrote relation: %s -[%s]-> %s (confidence: %s)", + relation.cause, rel_type, relation.effect, relation.confidence) except Exception as exc: logger.exception("Failed to write relation %s -> %s: %s", relation.cause, relation.effect, exc) logger.info("Successfully wrote %d/%d relations to Neo4j", count, len(relations_list)) session.execute_write(_write) - logger.info("Persisted causal relations for job %s", job_id) + logger.info("Persisted relations for job %s", job_id) + + def _infer_entity_type(self, entity_name: str) -> str: + """Infer entity type from name (simple heuristic).""" + name_lower = entity_name.lower() + + # Technology patterns + if any(tech in name_lower for tech in ['react', 'node', 'python', 'java', 'postgres', 'mysql', 'redis', 'mongodb', 'docker', 'kubernetes']): + return "Technology" + + # Service patterns + if any(word in name_lower for word in ['service', 'api', 'gateway', 'auth', 'payment', 'notification']): + return "Service" + + # Component patterns + if any(word in name_lower for word in ['component', 'module', 'system', 'application', 'platform']): + return "Component" + + # Process patterns + if any(word in name_lower for word in ['flow', 'process', 'workflow', 'pipeline', 'procedure']): + return "Process" + + # Default + return "Entity" + + def query_causal_chains( + self, + job_id: str, + min_length: int = 2, + max_length: int = 4, + min_confidence: float = 0.8, + limit: int = 20 + ) -> List[Dict]: + """ + Query Neo4j for causal chains as per README Step 7.3. + Returns sequences of connected events. 
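# (Editor's aside, not part of the patch.) Plain Cypher MERGE cannot take a relationship
# type as a query parameter (the apoc.merge.relationship constant above is one workaround),
# so write_relations interpolates rel_type into the query string; that is why the regex
# sanitization shown above matters. A standalone equivalent of that sanitization:
import re

def sanitize_rel_type(rel_type: str, default: str = "CAUSES") -> str:
    # Keep only A-Z, 0-9 and underscores so string interpolation stays safe.
    cleaned = re.sub(r'[^A-Z0-9_]', '', (rel_type or "").upper())
    return cleaned or default

assert sanitize_rel_type("leads_to") == "LEADS_TO"
assert sanitize_rel_type("DROP DATABASE; --") == "DROPDATABASE"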
+ """ + # Query for causal chains - match any relationship type + query = f""" + MATCH path = (start:Entity)-[r*{min_length}..{max_length}]->(end:Entity) + WHERE ALL(rel in relationships(path) WHERE rel.job_id = $job_id AND rel.confidence >= $min_confidence) + WITH path, + [node in nodes(path) | node.name] as chain, + [rel in relationships(path) | rel.confidence] as confidences, + [rel in relationships(path) | type(rel)] as rel_types, + [rel in relationships(path) | rel.explanation] as explanations + RETURN chain, confidences, rel_types, explanations + ORDER BY reduce(conf = 0.0, c in confidences | conf + c) DESC + LIMIT $limit + """ + + try: + with self._driver.session() as session: + result = session.run( + query, + job_id=job_id, + min_confidence=min_confidence, + limit=limit + ) + + chains = [] + for record in result: + chain = record["chain"] + confidences = record["confidences"] + rel_types = record["rel_types"] + explanations = record["explanations"] + + # Calculate average confidence + avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0 + + chains.append({ + "chain": chain, + "confidences": confidences, + "rel_types": rel_types, + "explanations": explanations, + "avg_confidence": avg_confidence, + "length": len(chain) - 1 + }) + + logger.info("Found %d causal chains for job %s", len(chains), job_id) + return chains + except Exception as exc: + logger.exception("Failed to query causal chains: %s", exc) + return [] + + def query_key_entities(self, job_id: str, limit: int = 20) -> List[Dict]: + """ + Query Neo4j for key entities (most involved) as per README Step 7.3. + """ + query = """ + MATCH (e:Entity)-[r]->(target) + WHERE r.job_id = $job_id + WITH e, count(r) as relation_count, collect(DISTINCT type(r)) as rel_types + RETURN e.name as name, + e.type as type, + relation_count, + rel_types + ORDER BY relation_count DESC + LIMIT $limit + """ + + try: + with self._driver.session() as session: + result = session.run(query, job_id=job_id, limit=limit) + + entities = [] + for record in result: + entities.append({ + "name": record["name"], + "type": record.get("type", "Entity"), + "relation_count": record["relation_count"], + "relation_types": record["rel_types"] + }) + + logger.info("Found %d key entities for job %s", len(entities), job_id) + return entities + except Exception as exc: + logger.exception("Failed to query key entities: %s", exc) + return [] diff --git a/services/multi-document-upload-service/src/multi_document_upload_service/processors/relationship_extractor.py b/services/multi-document-upload-service/src/multi_document_upload_service/processors/relationship_extractor.py new file mode 100644 index 0000000..40cdc90 --- /dev/null +++ b/services/multi-document-upload-service/src/multi_document_upload_service/processors/relationship_extractor.py @@ -0,0 +1,625 @@ +from __future__ import annotations + +import json +import logging +import re +from typing import Dict, List, Optional + +from anthropic import Anthropic, BadRequestError + +from ..config import get_settings +from ..models import CausalRelation + +logger = logging.getLogger(__name__) + +# Try to import SpaCy +try: + import spacy + from spacy.lang.en import English + HAS_SPACY = True +except ImportError: + HAS_SPACY = False + logger.warning("spacy not available - NLP detection will be skipped") + + +class RelationshipExtractor: + """Extract potential cause-effect relationships from text using NLP (SpaCy) + Claude AI.""" + + # Causal keywords for NLP detection (Step 3.1) + CAUSAL_KEYWORDS = [ 
+ "because", "due to", "as a result", "led to", "caused", "therefore", + "consequently", "hence", "thus", "so", "since", "owing to", + "resulted in", "brought about", "gave rise to", "triggered", + "provoked", "induced", "generated", "produced", "created" + ] + + # Common cause-effect patterns (expanded for architecture/technical documents) + CAUSE_EFFECT_PATTERNS = [ + # Direct causal patterns + (r"(\w+(?:\s+\w+){0,15})\s+causes?\s+(\w+(?:\s+\w+){0,15})", "causes"), + (r"(\w+(?:\s+\w+){0,15})\s+leads?\s+to\s+(\w+(?:\s+\w+){0,15})", "leads_to"), + (r"(\w+(?:\s+\w+){0,15})\s+results?\s+in\s+(\w+(?:\s+\w+){0,15})", "results_in"), + (r"(\w+(?:\s+\w+){0,15})\s+triggers?\s+(\w+(?:\s+\w+){0,15})", "triggers"), + (r"(\w+(?:\s+\w+){0,15})\s+produces?\s+(\w+(?:\s+\w+){0,15})", "produces"), + (r"(\w+(?:\s+\w+){0,15})\s+enables?\s+(\w+(?:\s+\w+){0,15})", "enables"), + (r"(\w+(?:\s+\w+){0,15})\s+allows?\s+(\w+(?:\s+\w+){0,15})", "allows"), + (r"(\w+(?:\s+\w+){0,15})\s+facilitates?\s+(\w+(?:\s+\w+){0,15})", "facilitates"), + + # Dependency patterns + (r"(\w+(?:\s+\w+){0,15})\s+depends?\s+on\s+(\w+(?:\s+\w+){0,15})", "depends_on"), + (r"(\w+(?:\s+\w+){0,15})\s+requires?\s+(\w+(?:\s+\w+){0,15})", "requires"), + (r"(\w+(?:\s+\w+){0,15})\s+needs?\s+(\w+(?:\s+\w+){0,15})", "needs"), + (r"(\w+(?:\s+\w+){0,15})\s+relies?\s+on\s+(\w+(?:\s+\w+){0,15})", "relies_on"), + (r"(\w+(?:\s+\w+){0,15})\s+uses?\s+(\w+(?:\s+\w+){0,15})", "uses"), + (r"(\w+(?:\s+\w+){0,15})\s+utilizes?\s+(\w+(?:\s+\w+){0,15})", "utilizes"), + (r"(\w+(?:\s+\w+){0,15})\s+leverages?\s+(\w+(?:\s+\w+){0,15})", "leverages"), + + # Architectural/System patterns + (r"(\w+(?:\s+\w+){0,15})\s+connects?\s+to\s+(\w+(?:\s+\w+){0,15})", "connects_to"), + (r"(\w+(?:\s+\w+){0,15})\s+communicates?\s+with\s+(\w+(?:\s+\w+){0,15})", "communicates_with"), + (r"(\w+(?:\s+\w+){0,15})\s+interacts?\s+with\s+(\w+(?:\s+\w+){0,15})", "interacts_with"), + (r"(\w+(?:\s+\w+){0,15})\s+integrates?\s+with\s+(\w+(?:\s+\w+){0,15})", "integrates_with"), + (r"(\w+(?:\s+\w+){0,15})\s+provides?\s+(\w+(?:\s+\w+){0,15})", "provides"), + (r"(\w+(?:\s+\w+){0,15})\s+supports?\s+(\w+(?:\s+\w+){0,15})", "supports"), + (r"(\w+(?:\s+\w+){0,15})\s+handles?\s+(\w+(?:\s+\w+){0,15})", "handles"), + (r"(\w+(?:\s+\w+){0,15})\s+manages?\s+(\w+(?:\s+\w+){0,15})", "manages"), + (r"(\w+(?:\s+\w+){0,15})\s+controls?\s+(\w+(?:\s+\w+){0,15})", "controls"), + (r"(\w+(?:\s+\w+){0,15})\s+processes?\s+(\w+(?:\s+\w+){0,15})", "processes"), + (r"(\w+(?:\s+\w+){0,15})\s+generates?\s+(\w+(?:\s+\w+){0,15})", "generates"), + (r"(\w+(?:\s+\w+){0,15})\s+creates?\s+(\w+(?:\s+\w+){0,15})", "creates"), + (r"(\w+(?:\s+\w+){0,15})\s+implements?\s+(\w+(?:\s+\w+){0,15})", "implements"), + (r"(\w+(?:\s+\w+){0,15})\s+delivers?\s+(\w+(?:\s+\w+){0,15})", "delivers"), + + # Flow patterns + (r"(\w+(?:\s+\w+){0,15})\s+flows?\s+to\s+(\w+(?:\s+\w+){0,15})", "flows_to"), + (r"(\w+(?:\s+\w+){0,15})\s+sends?\s+to\s+(\w+(?:\s+\w+){0,15})", "sends_to"), + (r"(\w+(?:\s+\w+){0,15})\s+transmits?\s+to\s+(\w+(?:\s+\w+){0,15})", "transmits_to"), + (r"(\w+(?:\s+\w+){0,15})\s+receives?\s+from\s+(\w+(?:\s+\w+){0,15})", "receives_from"), + + # Conditional patterns + (r"if\s+(\w+(?:\s+\w+){0,15}),\s+then\s+(\w+(?:\s+\w+){0,15})", "if_then"), + (r"when\s+(\w+(?:\s+\w+){0,15}),\s+(\w+(?:\s+\w+){0,15})\s+occurs?", "when_then"), + (r"(\w+(?:\s+\w+){0,15})\s+implies?\s+(\w+(?:\s+\w+){0,15})", "implies"), + (r"(\w+(?:\s+\w+){0,15})\s+ensures?\s+(\w+(?:\s+\w+){0,15})", "ensures"), + + # Sequential patterns + 
(r"(\w+(?:\s+\w+){0,15})\s+follows?\s+(\w+(?:\s+\w+){0,15})", "follows"), + (r"(\w+(?:\s+\w+){0,15})\s+comes?\s+after\s+(\w+(?:\s+\w+){0,15})", "comes_after"), + (r"first\s+(\w+(?:\s+\w+){0,15}),\s+then\s+(\w+(?:\s+\w+){0,15})", "first_then"), + (r"(\w+(?:\s+\w+){0,15})\s+precedes?\s+(\w+(?:\s+\w+){0,15})", "precedes"), + + # Containment patterns + (r"(\w+(?:\s+\w+){0,15})\s+contains?\s+(\w+(?:\s+\w+){0,15})", "contains"), + (r"(\w+(?:\s+\w+){0,15})\s+includes?\s+(\w+(?:\s+\w+){0,15})", "includes"), + (r"(\w+(?:\s+\w+){0,15})\s+consists?\s+of\s+(\w+(?:\s+\w+){0,15})", "consists_of"), + + # Influence patterns + (r"(\w+(?:\s+\w+){0,15})\s+affects?\s+(\w+(?:\s+\w+){0,15})", "affects"), + (r"(\w+(?:\s+\w+){0,15})\s+impacts?\s+(\w+(?:\s+\w+){0,15})", "impacts"), + (r"(\w+(?:\s+\w+){0,15})\s+influences?\s+(\w+(?:\s+\w+){0,15})", "influences"), + ] + + def __init__(self): + """Initialize NLP and Claude AI components.""" + settings = get_settings() + + # Initialize SpaCy NLP model (Step 3.1) + self.nlp = None + if HAS_SPACY: + try: + # Try to load English model, fallback to blank if not available + try: + self.nlp = spacy.load("en_core_web_sm") + except OSError: + logger.warning("en_core_web_sm model not found, using blank English model") + self.nlp = English() + self.nlp.add_pipe("sentencizer") + logger.info("SpaCy NLP model loaded") + except Exception as e: + logger.warning("Failed to load SpaCy model: %s", e) + self.nlp = None + + # Initialize Claude AI client (Step 3.2) + self.claude_client = None + self.claude_model = settings.claude_model + self.claude_max_input_tokens = settings.claude_max_input_tokens + self.claude_max_output_tokens = settings.claude_max_output_tokens + + if settings.anthropic_api_key: + try: + self.claude_client = Anthropic(api_key=settings.anthropic_api_key) + logger.info("Claude AI client initialized") + except Exception as e: + logger.warning("Failed to initialize Claude AI client: %s", e) + else: + logger.warning("ANTHROPIC_API_KEY not set - Claude AI extraction will be skipped") + + def extract_from_text(self, text: str, source_file_id: str) -> List[CausalRelation]: + """ + Extract cause-effect relationships using NLP (SpaCy) + Claude AI. + Implements Step 3.1 (NLP Detection) and Step 3.2 (Claude AI Extraction). 
+ """ + if not text or not text.strip(): + return [] + + all_relationships: List[CausalRelation] = [] + + # Step 3.1: BASIC NLP DETECTION (SpaCy) + nlp_relationships = self._extract_with_nlp(text, source_file_id) + all_relationships.extend(nlp_relationships) + logger.info("NLP (SpaCy) extracted %d candidate relationships (low confidence)", + len(nlp_relationships)) + + # Step 3.2: AI-POWERED EXTRACTION (Claude API) + if self.claude_client: + claude_relationships = self._extract_with_claude(text, source_file_id) + all_relationships.extend(claude_relationships) + logger.info("Claude AI extracted %d relationships (high confidence)", + len(claude_relationships)) + else: + logger.info("Claude AI extraction skipped (API key not configured)") + + # Also run pattern matching as fallback + pattern_relationships = self._extract_with_patterns(text, source_file_id) + all_relationships.extend(pattern_relationships) + logger.info("Pattern matching extracted %d relationships", len(pattern_relationships)) + + # Deduplicate relationships + seen = set() + unique_relationships = [] + for rel in all_relationships: + key = (rel.cause.lower().strip(), rel.effect.lower().strip()) + if key not in seen: + seen.add(key) + unique_relationships.append(rel) + + logger.info("Total unique relationships extracted: %d (from %d total)", + len(unique_relationships), len(all_relationships)) + return unique_relationships + + def _extract_with_nlp(self, text: str, source_file_id: str) -> List[CausalRelation]: + """ + Step 3.1: Basic NLP Detection using SpaCy. + Look for causal keywords and find sentences containing these patterns. + Returns potential causal relationships (low confidence). + """ + if not self.nlp: + return [] + + relationships: List[CausalRelation] = [] + + try: + # Process text with SpaCy + doc = self.nlp(text) + + # Find sentences containing causal keywords + for sent in doc.sents: + sent_text = sent.text.strip() + if len(sent_text) < 10: + continue + + # Check if sentence contains causal keywords + sent_lower = sent_text.lower() + has_causal_keyword = any(keyword in sent_lower for keyword in self.CAUSAL_KEYWORDS) + + if has_causal_keyword: + # Try to extract cause-effect using dependency parsing + cause = None + effect = None + + # Look for causal conjunctions + for token in sent: + if token.text.lower() in ["because", "due", "since", "as"]: + # Find the clause after the causal conjunction + if token.dep_ in ["mark", "prep"]: + # Try to extract cause and effect + cause_span = None + effect_span = None + + # Simple heuristic: text before "because/due to" is effect, after is cause + if "because" in sent_lower or "since" in sent_lower: + parts = re.split(r'\b(because|since)\b', sent_text, flags=re.IGNORECASE) + if len(parts) >= 3: + effect = parts[0].strip() + cause = parts[2].strip() + elif "due to" in sent_lower: + parts = re.split(r'\bdue to\b', sent_text, flags=re.IGNORECASE) + if len(parts) >= 2: + effect = parts[0].strip() + cause = parts[1].strip() + + if cause and effect: + # Clean up cause and effect + cause = re.sub(r'^[,\s]+|[,\s]+$', '', cause) + effect = re.sub(r'^[,\s]+|[,\s]+$', '', effect) + + if len(cause) >= 3 and len(effect) >= 3: + relationships.append(CausalRelation( + cause=cause, + effect=effect, + confidence=0.5, # Low confidence for NLP + explanation=f"Extracted using NLP (SpaCy) - found causal keyword", + source_file_id=source_file_id, + source_snippet=sent_text[:200], + relationship_type="CAUSES", + metadata={ + "extraction_method": "spacy_nlp", + "sentence": sent_text + } + )) + 
except Exception as e: + logger.warning("NLP extraction failed: %s", e) + + return relationships + + def _extract_with_claude(self, text: str, source_file_id: str) -> List[CausalRelation]: + """ + Step 3.2: AI-Powered Extraction using Claude API. + Send full document text to Claude AI and ask it to find ALL causal relationships. + Returns high-quality causal relationships (high confidence). + """ + if not self.claude_client: + return [] + + relationships: List[CausalRelation] = [] + + try: + # Prepare prompt for Claude + system_prompt = """You are an expert at analyzing text and extracting cause-effect relationships. +Your task is to identify ALL causal relationships in the given text, including both explicit and implicit ones. + +For each causal relationship, extract: +- Cause: What triggered or led to this? +- Effect: What was the result or outcome? +- Context: Additional background information +- Entities: Who or what is involved (people, teams, projects, systems) +- Confidence: How certain are you? (0.0 to 1.0) +- Source sentence: The sentence or passage where this relationship was found +- Date: When did this happen (if mentioned) + +Return the results as a JSON array of objects with this structure: +[ + { + "cause": "string", + "effect": "string", + "context": "string (optional)", + "entities": ["string"], + "confidence": 0.0-1.0, + "source_sentence": "string", + "date": "string (optional)" + } +] + +Focus on: +- Explicit relationships ("because X, therefore Y") +- Implicit relationships (strongly implied cause-effect) +- Technical and architectural dependencies +- Business decisions and their impacts +- Process flows and sequences""" + + # Truncate text to fit within token limits (rough estimate: 1 token ≈ 4 characters) + max_chars = (self.claude_max_input_tokens - 1000) * 4 + truncated_text = text[:max_chars] if len(text) > max_chars else text + + user_prompt = f"""Analyze the following text and extract ALL causal relationships. + +Text: +{truncated_text} + +Return a JSON array of causal relationships. 
Be thorough and find both explicit and implicit relationships.""" + + # Call Claude API + message = self.claude_client.messages.create( + model=self.claude_model, + max_tokens=self.claude_max_output_tokens, + temperature=0.3, # Lower temperature for more focused extraction + system=system_prompt, + messages=[ + { + "role": "user", + "content": user_prompt + } + ] + ) + + # Extract response text + content_blocks = message.content or [] + response_text = "".join( + block.text for block in content_blocks + if hasattr(block, "text") + ) + + if not response_text: + logger.warning("Empty response from Claude AI") + return [] + + # Parse JSON response + try: + # Try to extract JSON from response (might have markdown code blocks) + json_match = re.search(r'\[.*\]', response_text, re.DOTALL) + if json_match: + json_text = json_match.group(0) + else: + json_text = response_text + + claude_results = json.loads(json_text) + + # Convert Claude results to CausalRelation objects + for result in claude_results: + cause = result.get("cause", "").strip() + effect = result.get("effect", "").strip() + context = result.get("context", "") + entities = result.get("entities", []) + confidence = float(result.get("confidence", 0.85)) + source_sentence = result.get("source_sentence", "") + date = result.get("date", "") + + if not cause or not effect: + continue + + # Map to Neo4j relationship type (default to CAUSES) + relationship_type = "CAUSES" + + explanation = context or f"Extracted by Claude AI" + if entities: + explanation += f" (Entities: {', '.join(entities)})" + + relationships.append(CausalRelation( + cause=cause, + effect=effect, + confidence=min(confidence, 0.95), # Cap at 0.95 + explanation=explanation, + source_file_id=source_file_id, + source_snippet=source_sentence[:200] if source_sentence else "", + relationship_type=relationship_type, + metadata={ + "extraction_method": "claude_ai", + "context": context, + "entities": entities, + "date": date, + "source_sentence": source_sentence + } + )) + + logger.info("Claude AI successfully extracted %d relationships", len(relationships)) + + except json.JSONDecodeError as e: + logger.warning("Failed to parse Claude AI response as JSON: %s. Response: %s", + e, response_text[:500]) + except Exception as e: + logger.warning("Error processing Claude AI response: %s", e) + + except BadRequestError as e: + logger.warning("Claude API error: %s", e) + except Exception as e: + logger.warning("Claude AI extraction failed: %s", e) + + return relationships + + def _extract_with_patterns(self, text: str, source_file_id: str) -> List[CausalRelation]: + """ + Fallback: Pattern-based extraction (original method). + Returns candidate relationships for DoWhy validation. 
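# (Editor's sketch, not part of the patch.) The response parsing above assumes the model
# may wrap the JSON array in prose or a code fence, so the outermost [...] span is pulled
# out before json.loads, and a parse failure degrades to an empty result instead of raising.
import json
import re

def parse_relationship_array(response_text: str) -> list:
    match = re.search(r'\[.*\]', response_text, re.DOTALL)
    candidate = match.group(0) if match else response_text
    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        return []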
+ """ + if not text or not text.strip(): + return [] + + relationships: List[CausalRelation] = [] + seen = set() # Avoid duplicates + + # Normalize text + text = re.sub(r'\s+', ' ', text) + sentences = re.split(r'[.!?]\s+', text) + + for sentence in sentences: + sentence = sentence.strip() + if len(sentence) < 10: # Skip very short sentences + continue + + for pattern, rel_type in self.CAUSE_EFFECT_PATTERNS: + matches = re.finditer(pattern, sentence, re.IGNORECASE) + + for match in matches: + cause = match.group(1).strip() + effect = match.group(2).strip() + + # Filter out very short or very long phrases (increased limit for technical terms) + if len(cause) < 3 or len(cause) > 150: + continue + if len(effect) < 3 or len(effect) > 150: + continue + + # Skip common false positives + if cause.lower() in ["this", "that", "it", "they", "we"]: + continue + if effect.lower() in ["this", "that", "it", "they", "we"]: + continue + + # Create unique key + key = (cause.lower(), effect.lower()) + if key in seen: + continue + seen.add(key) + + # Calculate confidence based on pattern type + confidence = self._calculate_confidence(rel_type, sentence) + + # Map pattern type to Neo4j relationship type (uppercase with underscores) + neo4j_rel_type = self._map_to_neo4j_relationship_type(rel_type) + + relationships.append(CausalRelation( + cause=cause, + effect=effect, + confidence=confidence, + explanation=f"Extracted from text using pattern: {rel_type}", + source_file_id=source_file_id, + source_snippet=sentence[:200], # First 200 chars + relationship_type=neo4j_rel_type, + metadata={ + "extraction_method": "pattern_matching", + "pattern_type": rel_type, + "sentence": sentence + } + )) + + logger.info("Extracted %d candidate relationships from text (source: %s)", + len(relationships), source_file_id) + return relationships + + def _calculate_confidence(self, rel_type: str, sentence: str) -> float: + """Calculate confidence score based on pattern type and sentence quality.""" + base_confidence = { + "causes": 0.8, + "leads_to": 0.75, + "results_in": 0.75, + "triggers": 0.7, + "produces": 0.7, + "depends_on": 0.65, + "requires": 0.65, + "needs": 0.6, + "if_then": 0.8, + "when_then": 0.75, + "implies": 0.7, + "follows": 0.6, + "comes_after": 0.6, + "first_then": 0.7, + "enables": 0.7, + "allows": 0.65, + "facilitates": 0.65, + "relies_on": 0.65, + "uses": 0.6, + "utilizes": 0.6, + "leverages": 0.6, + "connects_to": 0.7, + "communicates_with": 0.7, + "interacts_with": 0.7, + "integrates_with": 0.7, + "provides": 0.7, + "supports": 0.7, + "handles": 0.65, + "manages": 0.65, + "controls": 0.65, + "processes": 0.65, + "generates": 0.7, + "creates": 0.7, + "implements": 0.7, + "delivers": 0.7, + "flows_to": 0.7, + "sends_to": 0.7, + "transmits_to": 0.7, + "receives_from": 0.7, + "ensures": 0.75, + "precedes": 0.6, + "contains": 0.6, + "includes": 0.6, + "consists_of": 0.6, + "affects": 0.65, + "impacts": 0.65, + "influences": 0.65, + }.get(rel_type, 0.5) + + # Adjust based on sentence length (longer sentences might be more descriptive) + if len(sentence) > 50: + base_confidence += 0.05 + + return min(base_confidence, 0.95) + + def _map_to_neo4j_relationship_type(self, pattern_type: str) -> str: + """Map pattern type to Neo4j relationship type (uppercase with underscores).""" + # Map lowercase pattern types to Neo4j relationship types + mapping = { + "causes": "CAUSES", + "leads_to": "LEADS_TO", + "results_in": "RESULTS_IN", + "triggers": "TRIGGERS", + "produces": "PRODUCES", + "depends_on": "DEPENDS_ON", + 
"requires": "REQUIRES", + "needs": "NEEDS", + "relies_on": "RELIES_ON", + "uses": "USES", + "utilizes": "UTILIZES", + "leverages": "LEVERAGES", + "connects_to": "CONNECTS_TO", + "communicates_with": "COMMUNICATES_WITH", + "interacts_with": "INTERACTS_WITH", + "integrates_with": "INTEGRATES_WITH", + "provides": "PROVIDES", + "supports": "SUPPORTS", + "handles": "HANDLES", + "manages": "MANAGES", + "controls": "CONTROLS", + "processes": "PROCESSES", + "generates": "GENERATES", + "creates": "CREATES", + "implements": "IMPLEMENTS", + "delivers": "DELIVERS", + "flows_to": "FLOWS_TO", + "sends_to": "SENDS_TO", + "transmits_to": "TRANSMITS_TO", + "receives_from": "RECEIVES_FROM", + "if_then": "IF_THEN", + "when_then": "WHEN_THEN", + "implies": "IMPLIES", + "ensures": "ENSURES", + "follows": "FOLLOWS", + "comes_after": "COMES_AFTER", + "first_then": "FIRST_THEN", + "precedes": "PRECEDES", + "contains": "CONTAINS", + "includes": "INCLUDES", + "consists_of": "CONSISTS_OF", + "affects": "AFFECTS", + "impacts": "IMPACTS", + "influences": "INFLUENCES", + "enables": "ENABLES", + "allows": "ALLOWS", + "facilitates": "FACILITATES", + } + return mapping.get(pattern_type, "CAUSES") # Default to CAUSES if not found + + def extract_from_qwen_results(self, qwen_results: List[Dict], source_file_id: str) -> List[CausalRelation]: + """Convert Qwen2.5-VL extraction results to CausalRelation objects.""" + relationships: List[CausalRelation] = [] + + for result in qwen_results: + entity1 = result.get("entity1", "").strip() + entity2 = result.get("entity2", "").strip() + rel_type = result.get("relationship_type", "").strip() + description = result.get("description", "").strip() + confidence = float(result.get("confidence", 0.7)) + + if not entity1 or not entity2: + continue + + # Map relationship type to cause-effect + # For most types, entity1 is cause, entity2 is effect + cause = entity1 + effect = entity2 + + # Some relationship types might need reversal + if rel_type in ["depends_on", "requires", "needs"]: + # If A depends on B, then B is the cause, A is the effect + cause, effect = effect, cause + + # Map Qwen relationship type to Neo4j format + neo4j_rel_type = self._map_to_neo4j_relationship_type(rel_type.lower().replace("-", "_")) + + relationships.append(CausalRelation( + cause=cause, + effect=effect, + confidence=confidence, + explanation=description or f"Extracted from diagram: {rel_type}", + source_file_id=source_file_id, + source_snippet=description, + relationship_type=neo4j_rel_type, + metadata={ + "extraction_method": "qwen2.5-vl", + "relationship_type": rel_type, + "original_entity1": entity1, + "original_entity2": entity2 + } + )) + + return relationships + diff --git a/services/multi-document-upload-service/src/multi_document_upload_service/processors/report_generator.py b/services/multi-document-upload-service/src/multi_document_upload_service/processors/report_generator.py new file mode 100644 index 0000000..592dbc1 --- /dev/null +++ b/services/multi-document-upload-service/src/multi_document_upload_service/processors/report_generator.py @@ -0,0 +1,570 @@ +from __future__ import annotations + +import json +import logging +from datetime import datetime +from pathlib import Path +from typing import Dict, List, Optional, Set + +from anthropic import Anthropic, BadRequestError + +from ..config import get_settings +from ..models import CausalRelation, ProjectReport + +logger = logging.getLogger(__name__) + +# Try to import PDF generation libraries +try: + import markdown + from markdown.extensions 
import codehilite, fenced_code, tables + HAS_MARKDOWN = True +except ImportError: + HAS_MARKDOWN = False + logger.warning("markdown library not available - PDF conversion will be limited") + +try: + from weasyprint import HTML, CSS + from weasyprint.text.fonts import FontConfiguration + HAS_WEASYPRINT = True +except ImportError: + HAS_WEASYPRINT = False + logger.warning("weasyprint not available - PDF conversion will be skipped") + + +class ReportGenerator: + """Generate beginner-friendly onboarding reports from knowledge graph.""" + + def __init__(self, api_key: str | None = None, model: str | None = None): + settings = get_settings() + self.api_key = api_key or settings.anthropic_api_key + self.model = model or settings.claude_model + self.max_output_tokens = settings.claude_max_output_tokens + + if not self.api_key: + raise ValueError("Anthropic API key is required for report generation") + + self.client = Anthropic(api_key=self.api_key) + + def generate_onboarding_report( + self, + job_id: str, + relations: List[CausalRelation], + vector_store, + embedder, + graph_writer=None, + kg_summary: Dict | None = None + ) -> ProjectReport: + """ + Generate a beginner-friendly onboarding report from the knowledge graph. + """ + logger.info("Generating onboarding report for job %s", job_id) + + # Step 1: Analyze KG structure + key_concepts = self._analyze_kg_structure(relations) + + # Step 2: Semantic search for different topics + overview_content = self._search_topic( + "project overview main purpose goals objectives", + vector_store, embedder, job_id, top_k=10 + ) + + concepts_content = self._search_topic( + "core concepts definitions key terms important ideas", + vector_store, embedder, job_id, top_k=15 + ) + + processes_content = self._search_topic( + "how system works processes flows procedures steps", + vector_store, embedder, job_id, top_k=15 + ) + + relationships_content = self._search_topic( + "cause effect dependencies relationships connections", + vector_store, embedder, job_id, top_k=20 + ) + + components_content = self._search_topic( + "components modules systems parts architecture", + vector_store, embedder, job_id, top_k=15 + ) + + # Step 3: Query Neo4j for causal chains (as per README Step 7.3) + causal_chains = [] + key_entities = [] + if graph_writer: + try: + # Query 1: Get critical causal chains + causal_chains = graph_writer.query_causal_chains( + job_id=job_id, + min_length=2, + max_length=4, + min_confidence=0.8, + limit=20 + ) + logger.info("Retrieved %d causal chains from Neo4j", len(causal_chains)) + + # Query 2: Get key entities + key_entities = graph_writer.query_key_entities(job_id=job_id, limit=20) + logger.info("Retrieved %d key entities from Neo4j", len(key_entities)) + except Exception as neo4j_exc: + logger.warning("Failed to query Neo4j: %s", neo4j_exc) + + # Step 4: Organize content hierarchically + organized_content = self._organize_content( + key_concepts, + overview_content, + concepts_content, + processes_content, + relationships_content, + components_content, + causal_chains, + key_entities + ) + + # Step 5: Generate report with Claude + report_content = self._claude_generate_report( + job_id=job_id, + relations=relations, + organized_content=organized_content, + kg_summary=kg_summary or {} + ) + + # Step 6: Parse sections + sections = self._parse_sections(report_content) + + # Step 7: Convert to PDF (as per README Step 7.8) + pdf_path = None + if HAS_WEASYPRINT and HAS_MARKDOWN: + try: + pdf_path = self._convert_to_pdf(report_content, job_id) + 
logger.info("Generated PDF report: %s", pdf_path) + except Exception as pdf_exc: + logger.warning("PDF conversion failed: %s", pdf_exc) + + # Estimate pages (rough: ~500 words per page) + word_count = len(report_content.split()) + estimated_pages = max(1, word_count // 500) + + return ProjectReport( + job_id=job_id, + title="Project Onboarding Guide", + content=report_content, + sections=sections, + key_concepts=list(key_concepts)[:20], # Top 20 concepts + total_pages=estimated_pages, + generated_at=datetime.utcnow(), + metadata={ + "total_relations": len(relations), + "total_concepts": len(key_concepts), + "causal_chains_count": len(causal_chains), + "key_entities_count": len(key_entities), + "model": self.model, + "pdf_path": str(pdf_path) if pdf_path else None + } + ) + + def _analyze_kg_structure(self, relations: List[CausalRelation]) -> Set[str]: + """Identify key concepts from the knowledge graph.""" + concepts = set() + + for rel in relations: + concepts.add(rel.cause) + concepts.add(rel.effect) + + # Identify high-degree nodes (concepts involved in many relationships) + cause_counts: Dict[str, int] = {} + effect_counts: Dict[str, int] = {} + + for rel in relations: + cause_counts[rel.cause] = cause_counts.get(rel.cause, 0) + 1 + effect_counts[rel.effect] = effect_counts.get(rel.effect, 0) + 1 + + # Key concepts are those with high degree (appear in many relationships) + all_counts = {**cause_counts, **effect_counts} + threshold = max(1, len(relations) // 10) # Top 10% most connected + + key_concepts = { + concept for concept, count in all_counts.items() + if count >= threshold + } + + # If threshold is too high, use top N concepts + if len(key_concepts) < 5: + sorted_concepts = sorted(all_counts.items(), key=lambda x: x[1], reverse=True) + key_concepts = {concept for concept, _ in sorted_concepts[:20]} + + logger.info("Identified %d key concepts from %d relationships", + len(key_concepts), len(relations)) + return key_concepts + + def _search_topic( + self, + query: str, + vector_store, + embedder, + job_id: str, + top_k: int = 10 + ) -> List[Dict]: + """Search for content related to a topic.""" + try: + results = vector_store.search_by_text( + query_text=query, + embedder=embedder, + job_id=job_id, + top_k=top_k + ) + return results + except Exception as exc: + logger.warning("Search failed for topic '%s': %s", query, exc) + return [] + + def _organize_content( + self, + key_concepts: Set[str], + overview_content: List[Dict], + concepts_content: List[Dict], + processes_content: List[Dict], + relationships_content: List[Dict], + components_content: List[Dict], + causal_chains: List[Dict] = None, + key_entities: List[Dict] = None + ) -> Dict: + """Organize retrieved content into a structured format.""" + return { + "key_concepts": list(key_concepts), + "overview": [r.get("payload", {}) for r in overview_content], + "concepts": [r.get("payload", {}) for r in concepts_content], + "processes": [r.get("payload", {}) for r in processes_content], + "relationships": [r.get("payload", {}) for r in relationships_content], + "components": [r.get("payload", {}) for r in components_content], + "causal_chains": causal_chains or [], + "key_entities": key_entities or [], + } + + def _claude_generate_report( + self, + job_id: str, + relations: List[CausalRelation], + organized_content: Dict, + kg_summary: Dict + ) -> str: + """Generate report using Claude AI.""" + + # Build KG summary text + kg_summary_text = self._build_kg_summary(relations, organized_content) + + # Build system prompt + 
system_prompt = """You are an expert technical writer specializing in creating beginner-friendly onboarding documentation for new team members. + +Your goal is to explain complex project information in simple, clear language that anyone can understand, even without technical background. + +Guidelines: +- Use simple, clear language - avoid jargon or explain it when necessary +- Use examples and analogies to make concepts relatable +- Structure information logically (basics first, then advanced) +- Make it engaging and easy to follow +- Cover all important aspects comprehensively +- Write in a friendly, welcoming tone +- Use headings, bullet points, and clear sections +- Explain "why" not just "what" + +Generate a comprehensive onboarding document that helps a new team member understand the entire project.""" + + # Format causal chains from Neo4j + causal_chains_text = self._format_causal_chains(organized_content.get('causal_chains', [])) + key_entities_text = self._format_key_entities(organized_content.get('key_entities', [])) + + # Build user prompt + user_prompt = f"""Generate a comprehensive, beginner-friendly onboarding document for this project. + +KNOWLEDGE GRAPH SUMMARY: +{kg_summary_text} + +IMPORTANT RELATIONSHIPS: +{self._format_relationships(relations[:50])} # Top 50 relationships + +CAUSAL CHAINS (from Knowledge Graph): +{causal_chains_text} + +KEY ENTITIES (from Knowledge Graph): +{key_entities_text} + +KEY CONCEPTS: +{', '.join(organized_content.get('key_concepts', [])[:30])} + +REQUIRED SECTIONS: +1. Project Overview + - What is this project about? + - Main purpose and goals + - Key stakeholders or users + +2. Core Concepts (Explained Simply) + - Explain each important concept in simple terms + - Why each concept matters + - How concepts relate to each other + +3. How Things Work Together + - System flow (simple explanation) + - Key processes and workflows + - Dependencies explained simply + +4. Important Relationships + - Cause → Effect relationships (explained in plain language) + - "When X happens, Y occurs because..." + - Visual flow if possible (describe it) + +5. Key Components + - Main modules/systems/components + - What each does (beginner-friendly) + - How they interact + +6. Getting Started + - Where to start learning + - What to understand first + - Recommended learning path + +7. Common Questions + - FAQ based on the knowledge graph + - Answers in simple terms + +Generate the complete onboarding document in Markdown format. Make it comprehensive, beginner-friendly, and easy to follow.""" + + try: + message = self.client.messages.create( + model=self.model, + max_tokens=self.max_output_tokens, + temperature=0.3, # Slightly creative but focused + system=system_prompt, + messages=[ + { + "role": "user", + "content": user_prompt + } + ] + ) + + content_blocks = message.content or [] + report_text = "".join( + block.text for block in content_blocks + if hasattr(block, "text") + ) + + if not report_text: + logger.warning("Empty report generated") + return "# Project Onboarding Guide\n\nNo content available." + + logger.info("Generated onboarding report (%d characters)", len(report_text)) + return report_text + + except BadRequestError as e: + # Handle API credit/authentication errors gracefully + error_msg = str(e) + if "credit balance" in error_msg.lower() or "too low" in error_msg.lower(): + logger.error("Claude API credit balance too low. Cannot generate report.") + raise ValueError("Claude API credit balance is too low. 
Please add credits to your Anthropic account to generate reports.") + elif "invalid_request_error" in error_msg.lower(): + logger.error("Claude API invalid request: %s", error_msg) + raise ValueError(f"Claude API request failed: {error_msg}") + else: + raise + except Exception as e: + logger.exception("Failed to generate report: %s", e) + raise + + def _build_kg_summary( + self, + relations: List[CausalRelation], + organized_content: Dict + ) -> str: + """Build a text summary of the knowledge graph.""" + summary_parts = [ + f"Total Relationships: {len(relations)}", + f"Total Concepts: {len(organized_content.get('key_concepts', []))}", + "", + "Top Relationships:", + ] + + # Show top relationships by confidence + top_relations = sorted(relations, key=lambda r: r.confidence, reverse=True)[:20] + for i, rel in enumerate(top_relations, 1): + summary_parts.append( + f"{i}. {rel.cause} → {rel.effect} " + f"(confidence: {rel.confidence:.2f})" + ) + + return "\n".join(summary_parts) + + def _format_relationships(self, relations: List[CausalRelation]) -> str: + """Format relationships for the prompt.""" + if not relations: + return "No relationships found." + + lines = [] + for rel in relations[:50]: # Limit to 50 + line = f"- {rel.cause} → {rel.effect}" + if rel.explanation: + line += f" ({rel.explanation[:100]})" + lines.append(line) + + return "\n".join(lines) + + def _parse_sections(self, content: str) -> Dict[str, str]: + """Parse markdown content into sections.""" + sections = {} + current_section = None + current_content = [] + + lines = content.split('\n') + + for line in lines: + # Check if it's a heading (starts with #) + if line.strip().startswith('#'): + # Save previous section + if current_section: + sections[current_section] = '\n'.join(current_content).strip() + + # Start new section + current_section = line.strip().lstrip('#').strip() + current_content = [line] + else: + if current_section: + current_content.append(line) + else: + # Content before first heading + if 'introduction' not in sections: + sections['introduction'] = line + else: + sections['introduction'] += '\n' + line + + # Save last section + if current_section: + sections[current_section] = '\n'.join(current_content).strip() + + return sections + + def _format_causal_chains(self, causal_chains: List[Dict]) -> str: + """Format causal chains from Neo4j for the prompt.""" + if not causal_chains: + return "No causal chains found in knowledge graph." + + lines = [] + for i, chain_data in enumerate(causal_chains[:20], 1): # Top 20 chains + chain = chain_data.get("chain", []) + avg_confidence = chain_data.get("avg_confidence", 0.0) + + if len(chain) >= 2: + chain_text = " → ".join(chain) + lines.append(f"{i}. {chain_text} (confidence: {avg_confidence:.2f})") + + return "\n".join(lines) if lines else "No causal chains found." + + def _format_key_entities(self, key_entities: List[Dict]) -> str: + """Format key entities from Neo4j for the prompt.""" + if not key_entities: + return "No key entities found in knowledge graph." + + lines = [] + for entity in key_entities[:20]: # Top 20 entities + name = entity.get("name", "") + entity_type = entity.get("type", "Entity") + relation_count = entity.get("relation_count", 0) + lines.append(f"- {name} ({entity_type}): involved in {relation_count} relationships") + + return "\n".join(lines) if lines else "No key entities found." + + def _convert_to_pdf(self, markdown_content: str, job_id: str) -> Optional[Path]: + """ + Convert Markdown report to PDF as per README Step 7.8. 
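# (Editor's illustration, not part of the patch; sample markdown invented.) Given a
# ReportGenerator instance, the heading-based split in _parse_sections above keys each
# section by its heading title and keeps the heading line inside the section body:
sections = report_generator._parse_sections(
    "# Overview\nWhat the project does.\n## Getting Started\nRead the architecture doc first."
)
# {"Overview": "# Overview\nWhat the project does.",
#  "Getting Started": "## Getting Started\nRead the architecture doc first."}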
+ Uses markdown + weasyprint for PDF generation. + """ + if not HAS_MARKDOWN or not HAS_WEASYPRINT: + return None + + try: + # Convert Markdown to HTML + html_content = markdown.markdown( + markdown_content, + extensions=['codehilite', 'fenced_code', 'tables'] + ) + + # Add CSS styling + css_style = """ + @page { + size: A4; + margin: 2cm; + } + body { + font-family: 'Georgia', serif; + line-height: 1.6; + color: #333; + } + h1, h2, h3, h4 { + color: #2c3e50; + margin-top: 1.5em; + margin-bottom: 0.5em; + } + h1 { font-size: 2em; border-bottom: 2px solid #3498db; padding-bottom: 0.3em; } + h2 { font-size: 1.5em; border-bottom: 1px solid #95a5a6; padding-bottom: 0.2em; } + h3 { font-size: 1.2em; } + code { + background-color: #f4f4f4; + padding: 2px 4px; + border-radius: 3px; + font-family: 'Courier New', monospace; + } + pre { + background-color: #f4f4f4; + padding: 1em; + border-radius: 5px; + overflow-x: auto; + } + table { + border-collapse: collapse; + width: 100%; + margin: 1em 0; + } + th, td { + border: 1px solid #ddd; + padding: 8px; + text-align: left; + } + th { + background-color: #3498db; + color: white; + } + """ + + # Create full HTML document + full_html = f""" + + + + + Project Onboarding Guide + + + {html_content} + + + """ + + # Generate PDF + settings = get_settings() + storage_root = Path(settings.storage_root) + reports_dir = storage_root / "reports" + reports_dir.mkdir(parents=True, exist_ok=True) + + pdf_path = reports_dir / f"report_{job_id}.pdf" + + HTML(string=full_html).write_pdf( + pdf_path, + stylesheets=[CSS(string=css_style)] + ) + + logger.info("PDF report generated: %s", pdf_path) + return pdf_path + + except Exception as exc: + logger.exception("Failed to convert Markdown to PDF: %s", exc) + return None + diff --git a/services/multi-document-upload-service/src/multi_document_upload_service/processors/vector_store.py b/services/multi-document-upload-service/src/multi_document_upload_service/processors/vector_store.py new file mode 100644 index 0000000..a00a9ff --- /dev/null +++ b/services/multi-document-upload-service/src/multi_document_upload_service/processors/vector_store.py @@ -0,0 +1,269 @@ +from __future__ import annotations + +import logging +from typing import Dict, List, Optional +from uuid import uuid4 + +from ..config import get_settings +from ..models import CausalRelation + +logger = logging.getLogger(__name__) + +try: + from qdrant_client import QdrantClient + from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue + HAS_QDRANT = True +except ImportError: + HAS_QDRANT = False + logger.warning("qdrant-client not available") + + +class VectorStore: + """Qdrant vector database client for storing KG embeddings.""" + + def __init__( + self, + url: str | None = None, + collection_name: str | None = None, + vector_size: int | None = None + ): + if not HAS_QDRANT: + raise ImportError("qdrant-client is required for vector storage") + + settings = get_settings() + self.url = url or settings.qdrant_url + self.collection_name = collection_name or settings.qdrant_collection_name + self.vector_size = vector_size or settings.qdrant_vector_size + + logger.info("Connecting to Qdrant at %s", self.url) + try: + self.client = QdrantClient(url=self.url) + logger.info("Connected to Qdrant") + except Exception as exc: + logger.exception("Failed to connect to Qdrant: %s", exc) + raise + + # Ensure collection exists + self._ensure_collection() + + def _ensure_collection(self) -> None: + """Create collection if it 
doesn't exist.""" + try: + collections = self.client.get_collections() + collection_names = [col.name for col in collections.collections] + + if self.collection_name not in collection_names: + logger.info("Creating Qdrant collection: %s", self.collection_name) + try: + self.client.create_collection( + collection_name=self.collection_name, + vectors_config=VectorParams( + size=self.vector_size, + distance=Distance.COSINE + ) + ) + logger.info("Created collection: %s", self.collection_name) + except Exception as create_exc: + # Collection might have been created by another instance + if "already exists" in str(create_exc).lower() or "409" in str(create_exc): + logger.info("Collection %s already exists (created by another instance)", self.collection_name) + else: + raise + else: + logger.debug("Collection %s already exists", self.collection_name) + except Exception as exc: + logger.exception("Failed to ensure collection: %s", exc) + raise + + def store_relation( + self, + relation: CausalRelation, + embedding: List[float], + job_id: str + ) -> str: + """Store a relationship embedding in Qdrant.""" + point_id = str(uuid4()) + + payload = { + "job_id": job_id, + "cause": relation.cause, + "effect": relation.effect, + "confidence": relation.confidence, + "source_file_id": relation.source_file_id or "", + "source_snippet": relation.source_snippet or "", + "explanation": relation.explanation or "", + } + + point = PointStruct( + id=point_id, + vector=embedding, + payload=payload + ) + + try: + self.client.upsert( + collection_name=self.collection_name, + points=[point] + ) + logger.debug("Stored relation embedding: %s -> %s", relation.cause, relation.effect) + return point_id + except Exception as exc: + logger.warning("Failed to store relation: %s", exc) + return "" + + def store_concept( + self, + concept_name: str, + embedding: List[float], + job_id: str, + description: str | None = None + ) -> str: + """Store a concept/node embedding in Qdrant.""" + point_id = str(uuid4()) + + payload = { + "job_id": job_id, + "concept_name": concept_name, + "description": description or "", + "type": "concept" + } + + point = PointStruct( + id=point_id, + vector=embedding, + payload=payload + ) + + try: + self.client.upsert( + collection_name=self.collection_name, + points=[point] + ) + logger.debug("Stored concept embedding: %s", concept_name) + return point_id + except Exception as exc: + logger.warning("Failed to store concept: %s", exc) + return "" + + def search( + self, + query_embedding: List[float], + job_id: str | None = None, + top_k: int = 10, + score_threshold: float = 0.5 + ) -> List[Dict]: + """Search for similar vectors in Qdrant.""" + try: + # Build filter if job_id is provided + query_filter = None + if job_id: + query_filter = Filter( + must=[ + FieldCondition( + key="job_id", + match=MatchValue(value=job_id) + ) + ] + ) + + # Use the collections API for search + # Check if client has search method (newer versions) or use query_points (older) + if hasattr(self.client, 'search'): + results = self.client.search( + collection_name=self.collection_name, + query_vector=query_embedding, + query_filter=query_filter, + limit=top_k, + score_threshold=score_threshold + ) + elif hasattr(self.client, 'query_points'): + # Fallback for older API + results = self.client.query_points( + collection_name=self.collection_name, + query=query_embedding, + query_filter=query_filter, + top=top_k, + score_threshold=score_threshold + ) + else: + # Try using the collection directly + collection = 
self.client.get_collection(self.collection_name) + if hasattr(collection, 'search'): + results = collection.search( + query_vector=query_embedding, + query_filter=query_filter, + limit=top_k, + score_threshold=score_threshold + ) + else: + logger.error("QdrantClient does not have search or query_points method") + return [] + + # Convert to list of dicts + search_results = [] + for result in results: + search_results.append({ + "id": str(result.id), + "score": result.score, + "payload": result.payload + }) + + return search_results + + except Exception as exc: + logger.warning("Vector search failed: %s", exc) + import traceback + logger.debug("Search error traceback: %s", traceback.format_exc()) + return [] + + def search_by_text( + self, + query_text: str, + embedder, + job_id: str | None = None, + top_k: int = 10 + ) -> List[Dict]: + """Search using text query (embeds it first).""" + query_embedding = embedder.embed_text(query_text) + return self.search(query_embedding, job_id=job_id, top_k=top_k) + + def delete_job_vectors(self, job_id: str) -> int: + """Delete all vectors for a specific job.""" + try: + # Qdrant doesn't have a direct delete by filter, so we need to: + # 1. Search for all points with job_id + # 2. Delete them by ID + + # This is a simplified version - in production, you might want + # to use scroll API for large datasets + query_filter = Filter( + must=[ + FieldCondition( + key="job_id", + match=MatchValue(value=job_id) + ) + ] + ) + + # Scroll to get all points + points, _ = self.client.scroll( + collection_name=self.collection_name, + scroll_filter=query_filter, + limit=10000 # Adjust based on expected size + ) + + if points: + point_ids = [str(point.id) for point in points] + self.client.delete( + collection_name=self.collection_name, + points_selector=point_ids + ) + logger.info("Deleted %d vectors for job %s", len(point_ids), job_id) + return len(point_ids) + + return 0 + + except Exception as exc: + logger.warning("Failed to delete job vectors: %s", exc) + return 0 + diff --git a/services/multi-document-upload-service/src/multi_document_upload_service/workflows/pipeline.py b/services/multi-document-upload-service/src/multi_document_upload_service/workflows/pipeline.py index 9987f03..5417282 100644 --- a/services/multi-document-upload-service/src/multi_document_upload_service/workflows/pipeline.py +++ b/services/multi-document-upload-service/src/multi_document_upload_service/workflows/pipeline.py @@ -4,14 +4,19 @@ import logging from pathlib import Path from typing import Iterable, List -from ..claude_client import ClaudeCausalExtractor from ..config import get_settings -from ..extractors.auto import extract_text from ..extractors.image_extractor import extract_images_from_file +from ..extractors.pymupdf_extractor import extract_all_text, extract_text_with_context +from ..extractors.qwen_vision import QwenVisionClient from ..jobs import JobStore from ..models import CausalRelation, JobStage -from ..processors.chunker import TextChunker +from ..processors.dowhy_analyzer import DoWhyAnalyzer +from ..processors.embedder import Embedder +from ..processors.entity_resolver import EntityResolver from ..processors.graph_writer import GraphWriter +from ..processors.relationship_extractor import RelationshipExtractor +from ..processors.report_generator import ReportGenerator +from ..processors.vector_store import VectorStore from ..storage import StorageManager logger = logging.getLogger(__name__) @@ -23,31 +28,60 @@ class JobPipeline: job_store: JobStore, storage: 
StorageManager, graph_writer: GraphWriter, - claude_extractor: ClaudeCausalExtractor, ): self.job_store = job_store self.storage = storage self.graph_writer = graph_writer - self.claude_extractor = claude_extractor + settings = get_settings() - self.chunker = TextChunker( - model_name=settings.claude_model, - token_target=settings.chunk_token_target, - overlap=settings.chunk_token_overlap, - ) + + # Initialize extractors + self.qwen_client = QwenVisionClient() # Only for images/diagrams + self.relationship_extractor = RelationshipExtractor() # NLP (SpaCy) + Claude AI for text (as per README) + self.entity_resolver = EntityResolver() # Claude AI entity resolution (as per README Stage 4) + + # Initialize processors + try: + self.dowhy_analyzer = DoWhyAnalyzer() if settings.dowhy_enabled else None + except Exception as e: + logger.warning("DoWhy not available: %s", e) + self.dowhy_analyzer = None + + try: + self.embedder = Embedder() + self.vector_store = VectorStore() + except Exception as e: + logger.warning("Vector store not available: %s", e) + self.embedder = None + self.vector_store = None + + try: + self.report_generator = ReportGenerator() + except Exception as e: + logger.warning("Report generator not available: %s", e) + self.report_generator = None def process_job(self, job_id: str, saved_files: Iterable[str]) -> None: job = self.job_store.get(job_id) logger.info("Processing job %s with %d files", job_id, job.total_files) - relations: List[CausalRelation] = [] + all_text_content: List[str] = [] + all_relations: List[CausalRelation] = [] try: - self.job_store.update(job_id, stage=JobStage.EXTRACTING, status_message="Extracting content") + # ============================================================ + # STEP 1: CONTENT EXTRACTION (PyMuPDF + Qwen2.5-VL) + # ============================================================ + self.job_store.update( + job_id, + stage=JobStage.EXTRACTING, + status_message="Extracting content from documents" + ) + for count, file_path in enumerate(saved_files, start=1): file_path_obj = Path(file_path) file_record = next((f for f in job.files if f.stored_path == file_path), None) - logger.info("Processing %s", file_path_obj.name) + logger.info("Processing %s (%d/%d)", file_path_obj.name, count, job.total_files) source_file_id = file_record.id if file_record else file_path_obj.name suffix = file_path_obj.suffix.lower() @@ -55,27 +89,36 @@ class JobPipeline: is_direct_image = suffix in {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"} try: - # Extract text from document (if not a direct image) + # Step 2.1: IDENTIFY FILE TYPE and route to appropriate extractor + # Step 2.2: Extract text based on file type (as per README) text = "" if not is_direct_image: try: - text = extract_text(file_path_obj) + # extract_all_text() handles routing: + # - PDF → PyMuPDF (Step 2.2a) + # - DOCX → python-docx (Step 2.2b) + # - PPTX → python-pptx (Step 2.2c) + # - CSV/XLSX → pandas (Step 2.2d) + # - Text files → direct read + # Also performs Step 2.3: Text cleaning + text = extract_all_text(file_path_obj) - # Process text if available if text and text.strip(): - # Validate text is readable + # Validate text is readable (basic check) printable_chars = sum(1 for c in text if c.isprintable() or c.isspace()) total_chars = len(text) if total_chars > 100 and printable_chars / total_chars < 0.3: - logger.warning("Text from %s appears to be binary, skipping text processing", file_path_obj.name) + logger.warning("Text from %s appears to be binary, skipping", file_path_obj.name) 
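# (Editor's aside, not part of the patch.) The guard below treats extracted text as
# unusable when fewer than 30% of its characters are printable; as a standalone check:
def looks_binary(text: str) -> bool:
    printable = sum(1 for c in text if c.isprintable() or c.isspace())
    return len(text) > 100 and printable / len(text) < 0.3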
text = "" else: + # Step 2.4: STORE EXTRACTED TEXT + all_text_content.append(text) extracted_path = self.storage.stage_extracted_content(job_id, file_path_obj.name, text) if file_record: file_record.extracted_path = str(extracted_path) - logger.info("Successfully extracted %d characters from %s", len(text), file_path_obj.name) + logger.info("Extracted %d characters from %s", len(text), file_path_obj.name) except Exception as text_exc: - logger.warning("Text extraction failed for %s: %s. Will continue with image extraction if available.", file_path_obj.name, text_exc) + logger.warning("Text extraction failed for %s: %s", file_path_obj.name, text_exc) text = "" # Extract images from documents (PDF, DOCX, PPTX) @@ -93,72 +136,216 @@ class JobPipeline: extracted_images = [file_path_obj] logger.info("Direct image upload detected: %s", file_path_obj.name) - except Exception as exc: # noqa: BLE001 + # Process images with Qwen2.5-VL + if extracted_images: + for image_path in extracted_images: + try: + qwen_results = self.qwen_client.extract_relationships_from_image( + image_path, source_file_id + ) + if qwen_results: + # Convert Qwen results to CausalRelation objects + qwen_relations = self.relationship_extractor.extract_from_qwen_results( + qwen_results, source_file_id + ) + all_relations.extend(qwen_relations) + logger.info("Extracted %d relations from image %s using Qwen2.5-VL", + len(qwen_relations), image_path.name) + except Exception as img_exc: + logger.warning("Failed to analyze image %s with Qwen: %s", image_path, img_exc) + + except Exception as exc: logger.exception("Extraction failed for %s", file_path_obj) if file_record: file_record.error = str(exc) continue - + self.job_store.update( job_id, files=job.files, processed_files=count, - status_message=f"Analyzing causal relations ({count}/{job.total_files})", - stage=JobStage.ANALYZING, + status_message=f"Extracting content ({count}/{job.total_files})", ) - # Process text content - if text and text.strip(): - chunks = self.chunker.chunk(text) - text_relations = self.claude_extractor.analyze(chunks, source_file_id=source_file_id) - relations.extend(text_relations) - logger.info("Extracted %d relations from text in %s", len(text_relations), file_path_obj.name) - - # Process images (extracted from documents or direct uploads) - if extracted_images: - for image_path in extracted_images: - try: - image_relations = self.claude_extractor.analyze_image(image_path, source_file_id=source_file_id) - relations.extend(image_relations) - logger.info("Extracted %d relations from image %s", len(image_relations), image_path.name) - except Exception as img_exc: - logger.warning("Failed to analyze image %s: %s", image_path, img_exc) - # Continue with other images - elif not text or not text.strip(): - # No text and no images - file might be empty or unsupported - logger.warning("File %s has no extractable text or images", file_path_obj.name) - if file_record: - file_record.error = "No extractable content found (no text or images)" + # ============================================================ + # STEP 2: RELATIONSHIP EXTRACTION (NLP + Claude AI as per README) + # ============================================================ + logger.info("Extracting relationships from text content using NLP (SpaCy) + Claude AI") + combined_text = "\n\n".join(all_text_content) + + if combined_text.strip(): + # Extract relationships using NLP (Step 3.1) + Claude AI (Step 3.2) + # This implements the flow described in README.md + text_relations = 
self.relationship_extractor.extract_from_text( + combined_text, + source_file_id="combined_text" + ) + all_relations.extend(text_relations) + logger.info("NLP + Claude AI extracted %d relationships from text", len(text_relations)) - # Write relations to Neo4j if any were found - if relations: - self.job_store.update(job_id, status_message="Writing to knowledge graph", stage=JobStage.BUILDING_GRAPH) - try: - self.graph_writer.write_relations(job_id, relations) - logger.info("Wrote %d relations to Neo4j for job %s", len(relations), job_id) - status_message = f"Completed with {len(relations)} causal relationship(s) written to Neo4j" - except Exception as graph_exc: - logger.exception("Failed to write relations to Neo4j for job %s: %s", job_id, graph_exc) - status_message = f"Completed with {len(relations)} relations extracted, but failed to write to Neo4j: {graph_exc}" - else: - logger.warning("Job %s completed with 0 relations - no causal relationships found", job_id) - # Check if any files failed to extract - failed_files = [f for f in job.files if f.error] - if failed_files: - status_message = f"Completed but {len(failed_files)} file(s) failed to extract. No relations found." + # ============================================================ + # STEP 3: ENTITY RESOLUTION (Claude AI as per README Stage 4) + # ============================================================ + if all_relations and self.entity_resolver.client: + logger.info("Resolving entities using Claude AI") + resolved_entities = self.entity_resolver.resolve_entities(all_relations) + if resolved_entities: + # Apply resolution to relationships + all_relations = self.entity_resolver.apply_resolution_to_relations( + all_relations, resolved_entities + ) + logger.info("Entity resolution completed: %d canonical entities", len(resolved_entities)) else: - status_message = "Completed but no causal relationships were found in the documents." 
+ logger.info("Entity resolution returned no results") + else: + if not self.entity_resolver.client: + logger.info("Entity resolution skipped (Claude AI not available)") - # Final update + # ============================================================ + # STEP 4: DOWHY VALIDATION + # ============================================================ + if self.dowhy_analyzer and all_relations: + self.job_store.update( + job_id, + status_message="Validating relationships with DoWhy", + stage=JobStage.BUILDING_GRAPH + ) + logger.info("Validating %d relationships with DoWhy", len(all_relations)) + validated_relations = self.dowhy_analyzer.validate_relationships( + all_relations, + text_data=combined_text + ) + all_relations = validated_relations + logger.info("DoWhy validated %d relationships", len(all_relations)) + else: + if not self.dowhy_analyzer: + logger.info("DoWhy validation skipped (not available)") + self.job_store.update( + job_id, + status_message="Building knowledge graph", + stage=JobStage.BUILDING_GRAPH + ) + + # ============================================================ + # STEP 5: WRITE TO NEO4J (Documents, Entities, Relationships) + # ============================================================ + if all_relations: + try: + # Write documents, entities, and relationships with types + self.graph_writer.write_relations(job_id, all_relations, files=job.files) + logger.info("Wrote %d relations to Neo4j for job %s", len(all_relations), job_id) + except Exception as graph_exc: + logger.exception("Failed to write relations to Neo4j: %s", graph_exc) + raise + + # ============================================================ + # STEP 6: VECTOR DATABASE INDEXING (Qdrant) + # ============================================================ + if self.vector_store and self.embedder and all_relations: + self.job_store.update( + job_id, + status_message="Indexing knowledge graph in vector database", + stage=JobStage.INDEXING_VECTORS + ) + logger.info("Indexing %d relationships in Qdrant", len(all_relations)) + + indexed_count = 0 + for relation in all_relations: + try: + # Generate embedding for the relationship + embedding = self.embedder.embed_relation( + relation.cause, + relation.effect, + relation.explanation + ) + + # Store in Qdrant + self.vector_store.store_relation(relation, embedding, job_id) + indexed_count += 1 + except Exception as e: + logger.warning("Failed to index relation %s -> %s: %s", + relation.cause, relation.effect, e) + + # Also index concepts (nodes) + concepts = set() + for rel in all_relations: + concepts.add(rel.cause) + concepts.add(rel.effect) + + for concept in concepts: + try: + embedding = self.embedder.embed_concept(concept) + self.vector_store.store_concept(concept, embedding, job_id) + except Exception as e: + logger.warning("Failed to index concept %s: %s", concept, e) + + logger.info("Indexed %d relationships and %d concepts in Qdrant", + indexed_count, len(concepts)) + + # ============================================================ + # STEP 7: GENERATE ONBOARDING REPORT + # ============================================================ + if self.report_generator and self.vector_store and self.embedder: + self.job_store.update( + job_id, + status_message="Generating beginner-friendly onboarding report", + stage=JobStage.GENERATING_REPORT + ) + logger.info("Generating onboarding report for job %s", job_id) + + try: + kg_summary = { + "total_relations": len(all_relations), + "total_files": job.total_files, + "processed_files": job.processed_files + } + + report = 
self.report_generator.generate_onboarding_report( + job_id=job_id, + relations=all_relations, + vector_store=self.vector_store, + embedder=self.embedder, + graph_writer=self.graph_writer, # Pass graph_writer for Neo4j queries + kg_summary=kg_summary + ) + + logger.info("Generated onboarding report: %d sections, %d pages", + len(report.sections), report.total_pages) + + except Exception as report_exc: + logger.exception("Failed to generate report: %s", report_exc) + report = None + # Store report generation error in job metadata + report_error_msg = str(report_exc) + if "credit balance" in report_error_msg.lower() or "too low" in report_error_msg.lower(): + report_error_msg = "Report generation failed: Claude API credit balance is too low. Please add credits to your Anthropic account." + self.job_store.update( + job_id, + error=f"Report generation failed: {report_error_msg}" + ) + else: + logger.warning("Report generation skipped (components not available)") + report = None + + # ============================================================ + # FINAL UPDATE + # ============================================================ + status_message = f"Completed successfully" + if all_relations: + status_message += f" with {len(all_relations)} relationships" + if report: + status_message += f" and generated onboarding report" + self.job_store.update( job_id, stage=JobStage.COMPLETED, status_message=status_message, - relations=relations, + relations=all_relations, + report=report, processed_files=job.total_files, ) - logger.info("Job %s completed with %d relations", job_id, len(relations)) - except Exception as exc: # noqa: BLE001 + logger.info("Job %s completed successfully", job_id) + + except Exception as exc: logger.exception("Job %s failed: %s", job_id, exc) self.job_store.mark_error(job_id, f"Pipeline failed: {exc}") -
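
A minimal usage sketch of the new Qdrant-backed retrieval path added above. It only calls methods defined in this patch (search_by_text, delete_job_vectors) and the no-argument Embedder()/VectorStore() constructors used in JobPipeline.__init__; the absolute import paths and the job id are assumptions for illustration, not part of the change itself.

    # Hedged sketch: exercises the vector store outside the pipeline.
    # Import paths and "example-job-id" are hypothetical.
    from multi_document_upload_service.processors.embedder import Embedder
    from multi_document_upload_service.processors.vector_store import VectorStore

    embedder = Embedder()
    store = VectorStore()

    # search_by_text() embeds the query via embedder.embed_text() and delegates to
    # search(), which returns dicts with "id", "score", and "payload" keys.
    hits = store.search_by_text(
        "what drives customer churn",
        embedder,
        job_id="example-job-id",
        top_k=5,
    )
    for hit in hits:
        print(hit["score"], hit["id"], hit["payload"])

    # Clean-up path: removes every vector indexed for a job via scroll + delete.
    deleted = store.delete_job_vectors("example-job-id")
    print(f"deleted {deleted} vectors")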