implemented KG DB

2025-11-13 09:07:54 +05:30 · 2025-11-13 09:07:54 +05:30 · ad2c27d793
commit ad2c27d793
parent ffe6ca349c
20 changed files with 4080 additions and 2472 deletions
--- a/ai-analysis-reports/repo_analysis_e395ea2c-ea3b-43e0-94af-c95b2815aac1_20251110_032129_analysis.pdf
+++ b/ai-analysis-reports/repo_analysis_e395ea2c-ea3b-43e0-94af-c95b2815aac1_20251110_032129_analysis.pdf
--- a/ai-analysis-reports/repo_analysis_e395ea2c-ea3b-43e0-94af-c95b2815aac1_20251110_055924_analysis.pdf
+++ b/ai-analysis-reports/repo_analysis_e395ea2c-ea3b-43e0-94af-c95b2815aac1_20251110_055924_analysis.pdf
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -721,48 +721,47 @@ services:
  # =====================================
  ai-analysis-service:
-    build: ./services/ai-analysis-service
+    build: 
-    container_name: pipeline_ai_analysis_service
+      context: ./services/ai-analysis-service
      dockerfile: Dockerfile
    container_name: pipeline_ai_analysis
    ports:
      - "8022:8022"
    environment:
      - PORT=8022
      - HOST=0.0.0.0
      - ANTHROPIC_API_KEY=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
-      - POSTGRES_HOST=postgres
+      
      # Neo4j Configuration
      - USE_NEO4J_KG=true
      - NEO4J_URI=bolt://neo4j:7687
      - NEO4J_USER=neo4j
      - NEO4J_PASSWORD=password
      - NEO4J_DATABASE=neo4j
      # Report Configuration
      - REPORT_TECHNICAL_ONLY=false
      # Existing database configurations
      - POSTGRES_HOST=pipeline_postgres
      - POSTGRES_PORT=5432
      - POSTGRES_DB=dev_pipeline
      - POSTGRES_USER=pipeline_admin
      - POSTGRES_PASSWORD=secure_pipeline_2024
-      - REDIS_HOST=redis
+      
      - MONGODB_URL=mongodb://pipeline_admin:mongo_secure_2024@pipeline_mongodb:27017/
      - MONGODB_DB=repo_analyzer
      - REDIS_HOST=pipeline_redis
      - REDIS_PORT=6379
      - REDIS_PASSWORD=redis_secure_2024
-      - MONGODB_URL=mongodb://pipeline_admin:mongo_secure_2024@mongodb:27017/
+    depends_on:
-      - MONGODB_DB=repo_analyzer
+      - neo4j
-      - GIT_INTEGRATION_SERVICE_URL=http://git-integration:8012
+      - postgres
-      - CLAUDE_REQUESTS_PER_MINUTE=90
+      - mongodb
-      - MAX_FILES_DEFAULT=100
+      - redis
      - CACHE_TTL_SECONDS=86400
      - CONTENT_MAX_TOKENS=8000
      - ENHANCED_PROCESSING_ENABLED=true
      - ENHANCED_BATCH_PROCESSING=true
      - ENHANCED_SMART_CHUNKING=true
      - ENHANCED_RATE_LIMIT=120
      - ENHANCED_BATCH_DELAY=0.05
      - ENHANCED_SMALL_FILE_DELAY=0.02
      - ENHANCED_MEDIUM_FILE_DELAY=0.05
      - ENHANCED_LARGE_FILE_DELAY=0.1
    volumes:
      - ai_analysis_logs:/app/logs
      - ./ai-analysis-reports:/app/reports
      - ai_analysis_temp:/app/temp
    networks:
      - pipeline_network
    depends_on:
      - postgres
      - redis
      - mongodb
      - git-integration
    deploy:
      resources:
        limits:
--- a/services/ai-analysis-service/init.py
+++ b/services/ai-analysis-service/init.py
@ -0,0 +1,8 @@
 # Service initialization
 # This file helps Python treat the directory as a package
 # and can be used to set up any service-wide configurations
 from .server import app  # Import the FastAPI app
 from .ai_analyze import analyze_repository  # Import key functions
 __all__ = ['app', 'analyze_repository']
--- a/services/ai-analysis-service/ai-analyze.py
+++ b/services/ai-analysis-service/ai-analyze.py
--- a/services/ai-analysis-service/env.example
+++ b/services/ai-analysis-service/env.example
@ -6,7 +6,17 @@ HOST=0.0.0.0
 NODE_ENV=development
 # AI API Keys
-ANTHROPIC_API_KEY=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
+ANTHROPIC_API_KEY=your_anthropic_api_key
 # Neo4j Knowledge Graph Configuration
 USE_NEO4J_KG=true
 NEO4J_URI=bolt://localhost:7687
 NEO4J_USER=neo4j
 NEO4J_PASSWORD=secure_neo4j_2024
 NEO4J_DATABASE=neo4j
 # Report Configuration
 REPORT_TECHNICAL_ONLY=false
 # Database Configuration
 POSTGRES_HOST=localhost
--- a/services/ai-analysis-service/knowledge_graph/init.py
+++ b/services/ai-analysis-service/knowledge_graph/init.py
@ -0,0 +1,9 @@
 """
 Knowledge graph utilities for the AI Analysis Service.
 This package provides the Neo4j client and high-level helpers used to
 persist and query analysis results in a graph representation.
 """
 from .neo4j_client import Neo4jGraphClient  # noqa: F401
--- a/services/ai-analysis-service/knowledge_graph/neo4j_client.py
+++ b/services/ai-analysis-service/knowledge_graph/neo4j_client.py
@ -0,0 +1,328 @@
 """
 Neo4j client helpers for the AI Analysis Service.
 This module wraps the official Neo4j async driver and exposes a minimal
 set of convenience methods that we can reuse across the service without
 sprinkling Cypher execution boilerplate everywhere.
 """
 from __future__ import annotations
 import json
 from contextlib import asynccontextmanager
 from dataclasses import dataclass
 from datetime import datetime
 from typing import Any, AsyncIterator, Dict, List, Optional, Sequence
 from neo4j import AsyncGraphDatabase
 try:
    from neo4j import AsyncResult, AsyncSession  # type: ignore
 except ImportError:  # pragma: no cover - fallback for older/newer driver versions
    AsyncResult = Any  # type: ignore
    AsyncSession = Any  # type: ignore
 def _json_dumps(value: Any) -> str:
    """Serialize complex values so we can persist them as strings safely."""
    if value is None:
        return ""
    if isinstance(value, (str, int, float, bool)):
        return str(value)
    try:
        return json.dumps(value, default=str)
    except Exception:
        return str(value)
@dataclass
 class Neo4jConfig:
    uri: str
    user: str
    password: str
    database: Optional[str] = None
    fetch_size: int = 1000
 class Neo4jGraphClient:
    """
    Thin wrapper around the Neo4j async driver that provides helpers for
    writing analysis artefacts into the graph and querying them back.
    """
    def __init__(self, config: Neo4jConfig) -> None:
        self._config = config
        self._driver = AsyncGraphDatabase.driver(
            config.uri,
            auth=(config.user, config.password),
            # Allow long running operations while the analysis progresses
            max_connection_lifetime=3600,
        )
    async def close(self) -> None:
        if self._driver:
            await self._driver.close()
    @asynccontextmanager
    async def session(self) -> AsyncIterator[AsyncSession]:
        kwargs: Dict[str, Any] = {}
        if self._config.database:
            kwargs["database"] = self._config.database
        if self._config.fetch_size:
            kwargs["fetch_size"] = self._config.fetch_size
        async with self._driver.session(**kwargs) as session:
            yield session
    async def _run_write(self, query: str, **params: Any) -> None:
        async with self.session() as session:
            async def _write(tx):
                result = await tx.run(query, **params)
                await result.consume()
            await session.execute_write(_write)
    async def _run_read(self, query: str, **params: Any) -> List[Dict[str, Any]]:
        async with self.session() as session:
            result: AsyncResult = await session.run(query, **params)
            records = await result.data()
            return records
    # ------------------------------------------------------------------ #
    # Write helpers
    # ------------------------------------------------------------------ #
    async def upsert_run(self, run_id: str, repository_id: str) -> None:
        await self._run_write(
            """
            MERGE (r:Run {run_id: $run_id})
            ON CREATE SET
                r.repository_id = $repository_id,
                r.created_at = datetime(),
                r.updated_at = datetime()
            ON MATCH SET
                r.repository_id = $repository_id,
                r.updated_at = datetime()
            """,
            run_id=run_id,
            repository_id=repository_id,
        )
    async def clear_run(self, run_id: str) -> None:
        await self._run_write(
            """
            MATCH (r:Run {run_id: $run_id})
            OPTIONAL MATCH (r)-[rel]-()
            DETACH DELETE r
            """,
            run_id=run_id,
        )
    async def upsert_module_graph(
        self,
        run_id: str,
        repository_id: str,
        module_props: Dict[str, Any],
        files: Sequence[Dict[str, Any]],
        findings: Sequence[Dict[str, Any]],
        dependencies: Sequence[Dict[str, Any]],
    ) -> None:
        """
        Persist module level artefacts in a single transaction.
        """
        # Ensure strings
        module_props = {k: _json_dumps(v) if isinstance(v, (dict, list, tuple, set)) else v for k, v in module_props.items()}
        files_payload = [
            {
                "path": item["path"],
                "props": {
                    key: _json_dumps(value) if isinstance(value, (dict, list, tuple, set)) else value
                    for key, value in item.get("props", {}).items()
                },
            }
            for item in files
        ]
        findings_payload = [
            {
                "id": item["id"],
                "props": {
                    key: _json_dumps(value) if isinstance(value, (dict, list, tuple, set)) else value
                    for key, value in item.get("props", {}).items()
                },
                "file_path": item.get("file_path"),
            }
            for item in findings
        ]
        dependencies_payload = [
            {
                "target": dependency.get("target"),
                "kind": dependency.get("kind", "depends_on"),
                "metadata": _json_dumps(dependency.get("metadata", {})),
            }
            for dependency in dependencies
        ]
        await self._run_write(
            """
            MERGE (r:Run {run_id: $run_id})
            ON CREATE SET
                r.repository_id = $repository_id,
                r.created_at = datetime(),
                r.updated_at = datetime()
            ON MATCH SET
                r.repository_id = $repository_id,
                r.updated_at = datetime()
            MERGE (m:Module {run_id: $run_id, name: $module_name})
            SET m += $module_props,
                m.updated_at = datetime()
            MERGE (r)-[:RUN_HAS_MODULE]->(m)
            WITH m
            UNWIND $files AS file_data
            MERGE (f:File {run_id: $run_id, path: file_data.path})
            SET f += file_data.props,
                f.updated_at = datetime()
            MERGE (m)-[:MODULE_INCLUDES_FILE]->(f)
            WITH m
            UNWIND $findings AS finding_data
            MERGE (fd:Finding {run_id: $run_id, finding_id: finding_data.id})
            SET fd += finding_data.props,
                fd.updated_at = datetime()
            MERGE (m)-[:MODULE_HAS_FINDING]->(fd)
            FOREACH (fp IN CASE WHEN finding_data.file_path IS NULL THEN [] ELSE [finding_data.file_path] END |
                MERGE (ff:File {run_id: $run_id, path: fp})
                MERGE (fd)-[:FINDING_TOUCHES_FILE]->(ff)
            )
            WITH m
            UNWIND $dependencies AS dependency
            FOREACH (_ IN CASE WHEN dependency.target IS NULL THEN [] ELSE [1] END |
                MERGE (dep:Module {run_id: $run_id, name: dependency.target})
                MERGE (m)-[rel:MODULE_DEPENDENCY {kind: dependency.kind}]->(dep)
                SET rel.metadata = dependency.metadata,
                    rel.updated_at = datetime()
            )
            """,
            run_id=run_id,
            repository_id=repository_id,
            module_name=module_props.get("name"),
            module_props=module_props,
            files=files_payload,
            findings=findings_payload,
            dependencies=dependencies_payload,
        )
    async def upsert_run_state(self, run_id: str, state: Dict[str, Any]) -> None:
        await self._run_write(
            """
            MERGE (r:Run {run_id: $run_id})
            SET r.analysis_state = $state,
                r.state_updated_at = datetime()
            """,
            run_id=run_id,
            state=_json_dumps(state),
        )
    async def upsert_synthesis(self, run_id: str, synthesis: Dict[str, Any]) -> None:
        await self._run_write(
            """
            MERGE (r:Run {run_id: $run_id})
            SET r.synthesis_analysis = $synthesis,
                r.synthesis_updated_at = datetime()
            """,
            run_id=run_id,
            synthesis=_json_dumps(synthesis),
        )
    # ------------------------------------------------------------------ #
    # Read helpers
    # ------------------------------------------------------------------ #
    async def fetch_modules(self, run_id: str) -> List[Dict[str, Any]]:
        records = await self._run_read(
            """
            MATCH (r:Run {run_id: $run_id})-[:RUN_HAS_MODULE]->(m:Module)
            OPTIONAL MATCH (m)-[:MODULE_INCLUDES_FILE]->(f:File)
            OPTIONAL MATCH (m)-[:MODULE_HAS_FINDING]->(fd:Finding)
            OPTIONAL MATCH (fd)-[:FINDING_TOUCHES_FILE]->(ff:File)
            RETURN
                m,
                collect(DISTINCT properties(f)) AS files,
                collect(DISTINCT properties(fd)) AS findings,
                collect(DISTINCT properties(ff)) AS finding_files
            """,
            run_id=run_id,
        )
        modules: List[Dict[str, Any]] = []
        for record in records:
            module_node = record.get("m", {})
            files = record.get("files", [])
            findings = record.get("findings", [])
            finding_files = record.get("finding_files", [])
            modules.append(
                {
                    "module": module_node,
                    "files": files,
                    "findings": findings,
                    "finding_files": finding_files,
                }
            )
        return modules
    async def fetch_run_state(self, run_id: str) -> Optional[Dict[str, Any]]:
        records = await self._run_read(
            """
            MATCH (r:Run {run_id: $run_id})
            RETURN r.analysis_state AS analysis_state
            """,
            run_id=run_id,
        )
        if not records:
            return None
        raw_state = records[0].get("analysis_state")
        if not raw_state:
            return None
        try:
            return json.loads(raw_state)
        except json.JSONDecodeError:
            return {"raw": raw_state}
    async def fetch_synthesis(self, run_id: str) -> Optional[Dict[str, Any]]:
        records = await self._run_read(
            """
            MATCH (r:Run {run_id: $run_id})
            RETURN r.synthesis_analysis AS synthesis
            """,
            run_id=run_id,
        )
        if not records:
            return None
        raw_synthesis = records[0].get("synthesis")
        if not raw_synthesis:
            return None
        try:
            return json.loads(raw_synthesis)
        except json.JSONDecodeError:
            return {"raw": raw_synthesis}
    async def fetch_run_metadata(self, run_id: str) -> Optional[Dict[str, Any]]:
        records = await self._run_read(
            """
            MATCH (r:Run {run_id: $run_id})
            RETURN r
            """,
            run_id=run_id,
        )
        if not records:
            return None
        run_node = records[0].get("r")
        if not run_node:
            return None
        metadata = dict(run_node)
        if "created_at" in metadata and isinstance(metadata["created_at"], datetime):
            metadata["created_at"] = metadata["created_at"].isoformat()
        if "updated_at" in metadata and isinstance(metadata["updated_at"], datetime):
            metadata["updated_at"] = metadata["updated_at"].isoformat()
        return metadata
--- a/services/ai-analysis-service/knowledge_graph/operations.py
+++ b/services/ai-analysis-service/knowledge_graph/operations.py
@ -0,0 +1,214 @@
 """
 High-level knowledge graph operations used by the AI Analysis Service.
 These helpers translate existing analysis objects into the node/relationship
 structure expected by `Neo4jGraphClient`.
 """
 from __future__ import annotations
 import json
 import uuid
 from datetime import datetime
 from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
 from .neo4j_client import Neo4jGraphClient
 def _safe_json(value: Any) -> str:
    if value is None:
        return ""
    if isinstance(value, (str, int, float, bool)):
        return str(value)
    try:
        return json.dumps(value, default=str)
    except Exception:
        return str(value)
 def _normalize_issue(issue: Any, index: int) -> Tuple[str, Dict[str, Any]]:
    """
    Convert an issue structure that might be a string or dict into a dict.
    Returns (summary, props).
    """
    if isinstance(issue, dict):
        summary = issue.get("title") or issue.get("issue") or issue.get("description") or f"Issue #{index}"
        props = {
            "summary": summary,
            "severity": issue.get("severity", "medium"),
            "category": issue.get("category", "general"),
            "description": issue.get("description") or issue.get("details") or "",
            "recommendation": issue.get("recommendation") or issue.get("action") or "",
            "evidence": _safe_json(issue.get("evidence")),
        }
        if issue.get("impact"):
            props["impact"] = issue["impact"]
        if issue.get("line_number"):
            props["line_number"] = issue["line_number"]
        return summary, props
    summary = str(issue)
    return summary, {
        "summary": summary,
        "severity": "medium",
        "category": "general",
    }
 def build_module_payload(
    run_id: str,
    repository_id: str,
    module_name: str,
    chunk: Dict[str, Any],
    chunk_analysis: Dict[str, Any],
    file_analyses: Sequence[Any],
    metadata: Dict[str, Any],
    ai_response: str,
 ) -> Dict[str, Any]:
    """Prepare module level payload for graph insertion."""
    module_id = chunk.get("id") or str(uuid.uuid4())
    module_quality = chunk_analysis.get("module_quality_score")
    module_overview = chunk_analysis.get("module_overview", "")
    module_architecture = chunk_analysis.get("module_architecture", "")
    module_security = chunk_analysis.get("module_security_assessment", "")
    module_recommendations = chunk_analysis.get("module_recommendations", [])
    files: List[Dict[str, Any]] = []
    findings: List[Dict[str, Any]] = []
    total_issues = 0
    total_recommendations = 0
    for fa_index, fa in enumerate(file_analyses):
        path = getattr(fa, "path", None) or getattr(fa, "file_path", "unknown")
        issues = getattr(fa, "issues_found", None) or []
        recommendations = getattr(fa, "recommendations", None) or []
        total_issues += len(issues) if isinstance(issues, (list, tuple)) else 0
        total_recommendations += len(recommendations) if isinstance(recommendations, (list, tuple)) else 0
        files.append(
            {
                "path": str(path),
                "props": {
                    "language": getattr(fa, "language", "unknown"),
                    "lines_of_code": getattr(fa, "lines_of_code", 0),
                    "complexity_score": getattr(fa, "complexity_score", 0),
                    "severity_score": getattr(fa, "severity_score", 0),
                },
            }
        )
        if isinstance(issues, Iterable):
            for issue_index, raw_issue in enumerate(issues):
                summary, issue_props = _normalize_issue(raw_issue, issue_index)
                finding_id = f"{module_id}:{fa_index}:{issue_index}"
                issue_props.update(
                    {
                        "module": module_name,
                        "file_path": str(path),
                        "created_at": datetime.utcnow().isoformat(),
                    }
                )
                findings.append(
                    {
                        "id": finding_id,
                        "props": issue_props,
                        "file_path": str(path),
                    }
                )
    module_props: Dict[str, Any] = {
        "name": module_name,
        "module_id": module_id,
        "quality_score": module_quality,
        "overview": module_overview,
        "architecture": module_architecture,
        "security": module_security,
        "recommendations": module_recommendations,
        "analysis_payload": metadata,
        "ai_response": ai_response,
        "repository_id": repository_id,
        "total_files": len(file_analyses),
        "total_issues": total_issues,
        "total_recommendations": total_recommendations,
        "updated_at": datetime.utcnow().isoformat(),
    }
    dependencies = []
    for dependency in metadata.get("dependencies", {}).get("depends_on_chunks", []):
        dependencies.append(
            {
                "target": dependency,
                "kind": "depends_on",
                "metadata": {"source": module_name},
            }
        )
    return {
        "module_props": module_props,
        "files": files,
        "findings": findings,
        "dependencies": dependencies,
    }
 async def store_module_analysis(
    client: Neo4jGraphClient,
    run_id: str,
    repository_id: str,
    module_payload: Dict[str, Any],
 ) -> None:
    await client.upsert_module_graph(
        run_id=run_id,
        repository_id=repository_id,
        module_props=module_payload["module_props"],
        files=module_payload["files"],
        findings=module_payload["findings"],
        dependencies=module_payload["dependencies"],
    )
 async def store_analysis_state(client: Neo4jGraphClient, run_id: str, analysis_state: Dict[str, Any]) -> None:
    await client.upsert_run_state(run_id=run_id, state=analysis_state)
 async def store_synthesis(client: Neo4jGraphClient, run_id: str, synthesis: Dict[str, Any]) -> None:
    await client.upsert_synthesis(run_id=run_id, synthesis=synthesis)
 async def fetch_module_analyses(client: Neo4jGraphClient, run_id: str) -> List[Dict[str, Any]]:
    modules = await client.fetch_modules(run_id)
    module_analyses: List[Dict[str, Any]] = []
    for entry in modules:
        node = entry.get("module", {})
        files = entry.get("files", [])
        findings = entry.get("findings", [])
        analysis_payload = node.get("analysis_payload")
        if isinstance(analysis_payload, str):
            try:
                analysis_payload = json.loads(analysis_payload)
            except json.JSONDecodeError:
                analysis_payload = {"raw": analysis_payload}
        module_analyses.append(
            {
                "module_name": node.get("name"),
                "module_id": node.get("module_id"),
                "quality_score": node.get("quality_score"),
                "module_overview": node.get("overview"),
                "module_architecture": node.get("architecture"),
                "module_security_assessment": node.get("security"),
                "module_recommendations": node.get("recommendations"),
                "files_analyzed": [file.get("path") for file in files if file.get("path")],
                "raw_payload": analysis_payload,
                "findings": findings,
            }
        )
    return module_analyses
 async def fetch_run_state(client: Neo4jGraphClient, run_id: str) -> Optional[Dict[str, Any]]:
    return await client.fetch_run_state(run_id)
 async def fetch_synthesis(client: Neo4jGraphClient, run_id: str) -> Optional[Dict[str, Any]]:
    return await client.fetch_synthesis(run_id)
--- a/services/ai-analysis-service/progress_manager.py
+++ b/services/ai-analysis-service/progress_manager.py
@ -19,6 +19,7 @@ class AnalysisProgressManager:
        self.subscribers: List[asyncio.Queue] = []
        self.redis_client: Optional[redis.Redis] = None
        self.progress_key = f"analysis_progress:{analysis_id}"
        self._complete: bool = False
    async def connect_redis(self):
        """Connect to Redis for progress persistence"""
@ -104,6 +105,9 @@ class AnalysisProgressManager:
        print(f"📤 Event emitted: {event_type} - {data.get('message', '')}")
        if event_type in {"analysis_completed", "analysis_error"}:
            self._complete = True
    async def get_progress_history(self) -> List[Dict[str, Any]]:
        """Retrieve progress history from Redis"""
        if not self.redis_client:
@ -125,6 +129,12 @@ class AnalysisProgressManager:
            except Exception as e:
                print(f"⚠️ Failed to clear progress: {e}")
        self._complete = False
    def is_complete(self) -> bool:
        """Return whether the analysis has completed or errored."""
        return self._complete
 class GlobalProgressTracker:
    """Global singleton to track all active analyses"""
--- a/services/ai-analysis-service/pyproject.toml
+++ b/services/ai-analysis-service/pyproject.toml
@ -0,0 +1,32 @@
 [build-system]
 requires = ["setuptools>=61.0", "wheel"]
 build-backend = "setuptools.build_meta"
 [project]
 name = "ai-analysis-service"
 version = "0.1.0"
 description = "AI Analysis Microservice for Code Repository Analysis"
 requires-python = ">=3.9"
 dependencies = [
    "fastapi>=0.104.1",
    "uvicorn>=0.24.0",
    "pydantic>=2.5.0",
    "httpx>=0.25.0",
    "redis>=4.5.0",
    "psycopg2-binary>=2.9.7",
    "neo4j>=5.8.0",
    "anthropic>=0.7.0",
    "python-dotenv>=1.0.0"
 ]
 [project.optional-dependencies]
 dev = [
    "pytest",
    "mypy",
    "black",
    "isort"
 ]
 [tool.setuptools]
 packages = ["ai_analysis_service"]
--- a/services/ai-analysis-service/requirements.txt
+++ b/services/ai-analysis-service/requirements.txt
@ -17,6 +17,7 @@ GitPython>=3.1.40
 redis>=4.5.0
 pymongo>=4.5.0
 psycopg2-binary>=2.9.7
 neo4j>=5.8.0  # Neo4j Graph Database Driver
 # Data processing
 numpy>=1.24.0
--- a/services/ai-analysis-service/server.py
+++ b/services/ai-analysis-service/server.py
@ -26,13 +26,12 @@ import uvicorn
 import mimetypes
 import httpx
 import redis
 import psycopg2
 from psycopg2.extras import RealDictCursor
-# PostgreSQL cursor for querying
+from knowledge_graph import Neo4jGraphClient
-try:
+from knowledge_graph.neo4j_client import Neo4jConfig
-    from psycopg2.extras import RealDictCursor
+from knowledge_graph import operations as kg_ops
 except ImportError:
    # Fallback if psycopg2 not available
    RealDictCursor = None
 # Import the AI analysis components
 # Note: ai-analyze.py has a hyphen, so we need to handle the import specially
@ -40,7 +39,7 @@ import sys
 import importlib.util
 # Load the ai-analyze.py module
-spec = importlib.util.spec_from_file_location("ai_analyze", "ai-analyze.py")
+spec = importlib.util.spec_from_file_location("ai_analyze", "./ai-analyze.py")
 ai_analyze_module = importlib.util.module_from_spec(spec)
 sys.modules["ai_analyze"] = ai_analyze_module
 spec.loader.exec_module(ai_analyze_module)
@ -52,7 +51,6 @@ from ai_analyze import (
    ArchitectureAnalysis,
    SecurityAnalysis,
    CodeQualityAnalysis,
    PerformanceAnalysis,
    Issue,
    ModuleAnalysis,
    ModuleSummary
@ -71,12 +69,14 @@ from progress_manager import AnalysisProgressManager, progress_tracker
 # Global analyzer instance
 analyzer = None
 neo4j_client: Optional[Neo4jGraphClient] = None
 USE_KNOWLEDGE_GRAPH = False
@asynccontextmanager
 async def lifespan(app: FastAPI):
    """Lifespan context manager for startup and shutdown events."""
    # Startup
-    global analyzer
+    global analyzer, neo4j_client, USE_KNOWLEDGE_GRAPH
    try:
        # Load environment variables
        from dotenv import load_dotenv
@ -116,6 +116,26 @@ async def lifespan(app: FastAPI):
            analyzer = EnhancedGitHubAnalyzer(api_key, config)
        print("✅ AI Analysis Service initialized successfully")
        use_kg_flag = os.getenv("USE_NEO4J_KG", os.getenv("USE_KNOWLEDGE_GRAPH", "false"))
        USE_KNOWLEDGE_GRAPH = str(use_kg_flag).lower() in ("1", "true", "yes", "on")
        if USE_KNOWLEDGE_GRAPH:
            try:
                neo4j_config = Neo4jConfig(
                    uri=os.getenv("NEO4J_URI", "bolt://localhost:7687"),
                    user=os.getenv("NEO4J_USER", "neo4j"),
                    password=os.getenv("NEO4J_PASSWORD", "neo4j"),
                    database=os.getenv("NEO4J_DATABASE") or None,
                )
                neo4j_client = Neo4jGraphClient(neo4j_config)
                print(f"✅ Knowledge graph enabled (Neo4j URI: {neo4j_config.uri})")
            except Exception as kg_error:
                neo4j_client = None
                USE_KNOWLEDGE_GRAPH = False
                print(f"⚠️ Failed to initialize Neo4j client: {kg_error}. Falling back to episodic memory.")
        else:
            neo4j_client = None
            print("ℹ️ Knowledge graph disabled (falling back to episodic memory)")
    except Exception as e:
        print(f"❌ Failed to initialize AI Analysis Service: {e}")
        raise
@ -124,6 +144,8 @@ async def lifespan(app: FastAPI):
    # Shutdown (if needed)
    # Cleanup code can go here if needed
    if neo4j_client:
        await neo4j_client.close()
 app = FastAPI(
    title="AI Analysis Service",
@ -624,6 +646,13 @@ git_client = GitIntegrationClient()
 analysis_cache = AnalysisCache()
 content_optimizer = ContentOptimizer()
 def get_progress_manager(analysis_id: str) -> AnalysisProgressManager:
    """Retrieve an existing progress manager or create one if missing."""
    manager = progress_tracker.get_manager(analysis_id)
    if manager is None:
        manager = progress_tracker.create_manager(analysis_id)
    return manager
 # ============================================================================
 # TOKEN USAGE & COST TRACKING (NEW)
 # ============================================================================
@ -1075,56 +1104,37 @@ async def stream_progress(analysis_id: str, request: Request):
    };
    """
    async def event_generator():
-        # Get or create progress manager
+        # Properly handle event generation
-        manager = progress_tracker.get_manager(analysis_id)
+        progress_mgr: Optional[AnalysisProgressManager] = None
-        
+        subscriber_queue: Optional[asyncio.Queue] = None
        if not manager:
            # Send error and close
            yield f"data: {json.dumps({'error': 'Analysis not found'})}\n\n"
            return
        # Subscribe to updates
        queue = manager.subscribe()
        try:
-            # Send historical events first
+            progress_mgr = get_progress_manager(analysis_id)
-            history = await manager.get_progress_history()
+            
            # Make sure Redis connection exists so we can replay history
            if progress_mgr.redis_client is None:
                await progress_mgr.connect_redis()
            # Replay cached history first
            history = await progress_mgr.get_progress_history()
            for event in history:
                if await request.is_disconnected():
                    break
                yield f"data: {json.dumps(event)}\n\n"
-            # Stream new events
+            # Subscribe to new events
            subscriber_queue = progress_mgr.subscribe()
            while True:
-                if await request.is_disconnected():
+                event = await subscriber_queue.get()
                yield f"data: {json.dumps(event)}\n\n"
                if event.get("event") in ("analysis_completed", "analysis_error"):
                    break
-                
+        except Exception as e:
-                try:
+            error_payload = {"error": str(e)}
-                    # Wait for next event with timeout
+            yield f"data: {json.dumps(error_payload)}\n\n"
                    event = await asyncio.wait_for(queue.get(), timeout=30.0)
                    yield f"data: {json.dumps(event)}\n\n"
                    # If analysis completed, close stream
                    if event.get('event') in ['analysis_completed', 'analysis_error']:
                        break
                except asyncio.TimeoutError:
                    # Send keepalive ping
                    yield f": keepalive\n\n"
                    continue
        finally:
-            manager.unsubscribe(queue)
+            if progress_mgr and subscriber_queue:
                progress_mgr.unsubscribe(subscriber_queue)
-    return StreamingResponse(
+    return StreamingResponse(event_generator(), media_type="text/event-stream")
        event_generator(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "X-Accel-Buffering": "no"  # Disable nginx buffering
        }
    )
@app.post("/analyze")
 async def analyze_repository(request: AnalysisRequest, background_tasks: BackgroundTasks):
@ -3190,187 +3200,32 @@ async def analyze_files_smart_batch(files_batch: List[Tuple[str, str]], reposito
 async def store_chunk_analysis_in_memory(chunk: Dict, file_analyses: List, chunk_analysis: Dict, repository_id: str, session_id: str = None, analysis_state: Optional[Dict] = None):
    """
-    Store detailed chunk-level analysis in episodic memory (MongoDB).
+    Store chunk analysis in memory using either Neo4j Knowledge Graph or Episodic Memory.
-    Creates one record per chunk with comprehensive analysis data.
+    Supports fallback mechanisms for robust storage.
    Now includes progressive context (Option 3: Hybrid Approach).
    """
    try:
-        if not analyzer or not hasattr(analyzer, 'memory_manager'):
+        # Validate input parameters
-            print("⚠️ [MEMORY] Memory manager not available, skipping chunk storage")
+        if not chunk or not file_analyses:
-            return
+            print("❌ [MEMORY] Invalid chunk or file_analyses")
            return None
-        # Get session ID from analyzer
+        # Extract necessary variables (these were missing in the original implementation)
-        if not session_id:
+        chunk_name = chunk.get('name', 'Unknown Chunk')
-            session_id = getattr(analyzer, 'session_id', str(uuid.uuid4()))
+        chunk_type = chunk.get('type', 'generic')
-        
+        chunk_priority = chunk.get('priority', 5)
        chunk_id = chunk.get('id', 'unknown')
        chunk_name = chunk.get('name', 'unknown')
        chunk_type = chunk.get('chunk_type', 'module')
        chunk_priority = chunk.get('priority', 2)
        dependencies = chunk.get('context_dependencies', [])
        # Calculate chunk metrics
        total_files = len(file_analyses)
-        total_lines = sum(fa.lines_of_code for fa in file_analyses if fa.lines_of_code is not None)
+        total_lines = sum(fa.lines_of_code for fa in file_analyses)
-        total_issues = sum(len(fa.issues_found) if isinstance(fa.issues_found, (list, tuple)) else 0 for fa in file_analyses)
+        dependencies = chunk.get('dependencies', [])
-        total_recommendations = sum(len(fa.recommendations) if isinstance(fa.recommendations, (list, tuple)) else 0 for fa in file_analyses)
+        module_quality = chunk_analysis.get('module_quality', 5.0)
        total_issues = sum(len(fa.issues_found) for fa in file_analyses)
        total_recommendations = sum(len(fa.recommendations) for fa in file_analyses)
        high_quality = len([fa for fa in file_analyses if fa.complexity_score and fa.complexity_score <= 3])
        medium_quality = len([fa for fa in file_analyses if fa.complexity_score and 3 < fa.complexity_score <= 7])
        low_quality = len([fa for fa in file_analyses if fa.complexity_score and fa.complexity_score > 7])
-        # Calculate quality distribution
+        # Prepare AI response (this was also missing)
        high_quality = len([fa for fa in file_analyses if fa.severity_score >= 8])
        medium_quality = len([fa for fa in file_analyses if 5 <= fa.severity_score < 8])
        low_quality = len([fa for fa in file_analyses if fa.severity_score < 5])
        # Get module quality score from chunk_analysis or calculate from files
        module_quality = chunk_analysis.get('module_quality_score', 
            sum(fa.severity_score for fa in file_analyses if fa.severity_score is not None) / total_files if total_files > 0 else 5.0)
        # Build comprehensive AI response text with CODE EVIDENCE
        # FIX: Convert all values to strings immediately to prevent TypeError
        module_overview = chunk_analysis.get('module_overview', f"Analysis of {chunk_name} module")
        if isinstance(module_overview, dict):
            module_overview = json.dumps(module_overview, indent=2)
        else:
            module_overview = str(module_overview)
        # Extract code evidence from file analyses for concrete proof in reports
        try:
            code_evidence = extract_code_evidence_from_files(file_analyses)
            print(f"   📸 Extracted {len(code_evidence)} evidence items")
        except Exception as e:
            print(f"   ⚠️ Code evidence extraction failed: {e}")
            code_evidence = []
        module_architecture = chunk_analysis.get('module_architecture', 'Architecture analysis in progress')
        if isinstance(module_architecture, dict):
            module_architecture = json.dumps(module_architecture, indent=2)
        else:
            module_architecture = str(module_architecture)
        module_security = chunk_analysis.get('module_security_assessment', 'Security assessment in progress')
        if isinstance(module_security, dict):
            module_security = json.dumps(module_security, indent=2)
        else:
            module_security = str(module_security)
        ai_response_parts = [
            f"# COMPREHENSIVE ANALYSIS: {chunk_name.upper()}",
            f"Chunk ID: {chunk_id}",
            f"Chunk Type: {chunk_type}",
            "",
            f"## MODULE OVERVIEW",
            module_overview,
            "",
            f"## MODULE METRICS",
            f"- Module Quality Score: {module_quality:.1f}/10",
            f"- Total Files: {total_files}",
            f"- Total Lines of Code: {total_lines:,}",
            f"- Total Issues: {total_issues}",
            f"- Total Recommendations: {total_recommendations}",
            f"- High Quality Files (Score >= 8): {high_quality}",
            f"- Medium Quality Files (Score 5-7): {medium_quality}",
            f"- Low Quality Files (Score < 5): {low_quality}",
            "",
            f"## ARCHITECTURE ASSESSMENT",
            module_architecture,
            "",
            f"## SECURITY ASSESSMENT",
            module_security,
            "",
            f"## MODULE RECOMMENDATIONS",
        ]
        module_recs = chunk_analysis.get('module_recommendations', [])
        if module_recs:
            for rec in module_recs:
                # Handle both string and dict recommendations
                if isinstance(rec, dict):
                    rec_text = rec.get('text', str(rec.get('recommendation', '')))[:200]
                else:
                    rec_text = str(rec)
                ai_response_parts.append(f"- {rec_text}")
        else:
            ai_response_parts.append("- Review module structure")
        ai_response_parts.extend([
            "",
            "## CODE EVIDENCE & FINDINGS",
            ""
        ])
        # Add code evidence section
        if code_evidence:
            ai_response_parts.append("### SPECIFIC CODE ISSUES WITH EVIDENCE:")
            for evidence in code_evidence[:10]:  # Top 10 most critical
                ai_response_parts.extend([
                    f"**File:** {evidence['file']}",
                    f"**Issue:** {evidence['issue']}",
                    f"**Line {evidence['line_number']}:**",
                    "```" + evidence['language'],
                    evidence['code_snippet'],
                    "```",
                    f"**Recommendation:** {evidence['recommendation']}",
                    ""
                ])
        ai_response_parts.extend([
            "",
            "## FILE-LEVEL ANALYSIS SUMMARY",
            ""
        ])
        # Add detailed file analyses
        for fa in file_analyses:
            ai_response_parts.extend([
                f"### {fa.path}",
                f"- Language: {fa.language}",
                f"- Lines of Code: {fa.lines_of_code}",
                f"- Quality Score: {fa.severity_score:.1f}/10",
                f"- Complexity Score: {fa.complexity_score:.1f}/10",
                f"- Issues: {len(fa.issues_found) if isinstance(fa.issues_found, (list, tuple)) else 0}",
                ""
            ])
            if fa.issues_found:
                ai_response_parts.append("**Issues Found:**")
                for issue in fa.issues_found[:5]:  # Top 5 issues
                    # Handle both string and dict issues
                    if isinstance(issue, dict):
                        issue_text = issue.get('title', str(issue.get('description', '')))[:200]
                    else:
                        issue_text = str(issue)
                    ai_response_parts.append(f"- {issue_text}")
                ai_response_parts.append("")
            if fa.recommendations:
                ai_response_parts.append("**Recommendations:**")
                for rec in fa.recommendations[:5]:  # Top 5 recommendations
                    # Handle both string and dict recommendations
                    if isinstance(rec, dict):
                        rec_text = rec.get('text', str(rec.get('recommendation', '')))[:200]
                    else:
                        rec_text = str(rec)
                    ai_response_parts.append(f"- {rec_text}")
                ai_response_parts.append("")
            if fa.detailed_analysis:
                # Ensure detailed_analysis is a string, not a dict
                detailed_analysis_text = str(fa.detailed_analysis) if not isinstance(fa.detailed_analysis, str) else fa.detailed_analysis
                ai_response_parts.extend([
                    "**Detailed Analysis:**",
                    detailed_analysis_text,
                    ""
                ])
        # Final safety check: Convert all items to strings before joining
        ai_response_parts_clean = []
-        for item in ai_response_parts:
+        for item in chunk_analysis.get('ai_response_parts', []):
            if isinstance(item, dict):
                # Convert dict to JSON string (json is already imported at module level)
                ai_response_parts_clean.append(json.dumps(item, indent=2))
            elif isinstance(item, (list, tuple)):
                # Convert list/tuple to string representation
                ai_response_parts_clean.append(str(item))
            else:
                ai_response_parts_clean.append(str(item))
        ai_response = "\n".join(ai_response_parts_clean)
@ -3379,105 +3234,69 @@ async def store_chunk_analysis_in_memory(chunk: Dict, file_analyses: List, chunk
        file_names = [fa.path for fa in file_analyses]
        user_query = f"Analysis of chunk: {chunk_name} ({chunk_type}) - {total_files} files: {', '.join(file_names[:5])}{'...' if len(file_names) > 5 else ''}"
-        # Prepare file analyses data for storage (OPTIMIZATION: Store only paths, not content)
+        # Prepare file analyses data for storage
        # IMPORTANT: Never store file content in episodic memory to save storage space
        file_analyses_data = []
        for fa in file_analyses:
            file_data = {
-                'file_path': str(fa.path),  # Only store path, not content
+                'file_path': str(fa.path),
                'language': fa.language,
                'lines_of_code': fa.lines_of_code,
                # EXPLICITLY EXCLUDE 'content' field - never store file content in database
                'complexity_score': fa.complexity_score,
                'severity_score': fa.severity_score,
                'issues_found': fa.issues_found if isinstance(fa.issues_found, (list, tuple)) else [],
                'recommendations': fa.recommendations if isinstance(fa.recommendations, (list, tuple)) else [],
                'detailed_analysis': fa.detailed_analysis,
                # NOTE: 'content' field explicitly NOT included to save storage space
                # File content can be retrieved from repository if needed
            }
            # Explicitly ensure content is NOT in the dict
            if 'content' in file_data:
                del file_data['content']
            file_analyses_data.append(file_data)
-        # Build progressive context metadata (Option 3: Hybrid Approach)
+        # Get run_id
        progressive_context = {}
        if analysis_state:
            # OPTIMIZATION: Limit context to last 5 modules for faster processing
            all_module_summaries = analysis_state.get('module_summaries', {})
            modules_analyzed = analysis_state.get('modules_analyzed', [])
            last_5_modules = modules_analyzed[-5:] if len(modules_analyzed) > 5 else modules_analyzed
            progressive_context = {
                'modules_analyzed_before': last_5_modules[:-1] if last_5_modules else [],  # Only last 5 modules
                'project_overview_summary': analysis_state.get('project_overview', '')[:300] if analysis_state.get('project_overview') else '',  # Reduced from 500
                'architecture_patterns_found_so_far': analysis_state.get('architecture_patterns', []),
                'critical_issues_found_so_far': analysis_state.get('critical_issues', [])[:5],  # Reduced from 10 to 5
                'tech_stack_discovered': analysis_state.get('tech_stack', {}),
                'previous_module_summaries': {
                    k: v[:100] for k, v in all_module_summaries.items()  # Reduced from 200 to 100 chars
                    if k != chunk_name and k in last_5_modules  # Only last 5 modules
                }
            }
        # Get run_id from analyzer if available (for hierarchical storage compatibility)
        run_id = getattr(analyzer, 'run_id', None)
        if not run_id:
            # Try to extract from session_id or generate
            run_id = f"repo_analysis_{repository_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        # Build comprehensive metadata
        metadata = {
-            'type': 'module_analysis',  # IMPORTANT: Mark as module_analysis for retrieval
+            'type': 'module_analysis',
-            'run_id': run_id,  # IMPORTANT: Include run_id for retrieval
+            'run_id': run_id,
            'chunk_id': chunk_id,
            'chunk_name': chunk_name,
            'chunk_type': chunk_type,
            'chunk_priority': chunk_priority,
            'module_name': chunk_name if chunk_type == 'module' else None,
            'total_files_in_chunk': total_files,
            'total_lines_in_chunk': total_lines,
            'chunk_token_count': estimate_tokens(chunk.get('files', [])),
            'context_dependencies': dependencies,
            'repository_id': repository_id,
-            'analysis_type': 'intelligent_chunking',
+            'total_files_in_chunk': total_files,
            # NEW: Progressive Context (Option 3)
            'progressive_context': progressive_context,
            # Chunk metrics
            'chunk_metrics': {
                'average_quality_score': module_quality,
                'total_issues': total_issues,
                'total_recommendations': total_recommendations,
                'average_complexity': sum(fa.complexity_score for fa in file_analyses if fa.complexity_score is not None) / total_files if total_files > 0 else 5.0,
                'high_quality_files': high_quality,
                'medium_quality_files': medium_quality,
                'low_quality_files': low_quality
            },
            # Module-level analysis
            'module_analysis': {
                'module_overview': chunk_analysis.get('module_overview', ''),
                'module_architecture': chunk_analysis.get('module_architecture', ''),
                'module_security_assessment': chunk_analysis.get('module_security_assessment', ''),
                'module_recommendations': chunk_analysis.get('module_recommendations', [])
            },
            # Dependencies
            'dependencies': {
                'depends_on_chunks': dependencies,
                'imports_from': []  # Can be enhanced with actual import analysis
            },
            # File analyses (detailed)
            'file_analyses': file_analyses_data
        }
-        # Store in episodic memory
+        # Prioritize Knowledge Graph storage
-        print(f"   💾 Storing {chunk_name} in episodic memory...")
+        if USE_KNOWLEDGE_GRAPH and neo4j_client:
-        print(f"   📊 Metadata type: {metadata.get('type')}, Run ID: {metadata.get('run_id')[:30]}...")
+            try:
                module_payload = kg_ops.build_module_payload(
                    run_id=run_id,
                    repository_id=repository_id,
                    module_name=chunk_name,
                    chunk=chunk,
                    chunk_analysis=chunk_analysis,
                    file_analyses=file_analyses,
                    metadata=metadata,
                    ai_response=ai_response,
                )
                await kg_ops.store_module_analysis(
                    client=neo4j_client,
                    run_id=run_id,
                    repository_id=repository_id,
                    module_payload=module_payload,
                )
                print(f"   ✅ Stored in Neo4j knowledge graph (module: {chunk_name})")
                return module_payload["module_props"]["module_id"]
            except Exception as kg_error:
                print(f"   ⚠️ Failed to store module in knowledge graph: {kg_error}. Falling back to episodic memory.")
        # Fallback to Episodic Memory
        try:
            memory_id = await analyzer.memory_manager.store_episodic_memory(
                session_id=session_id,
@ -3487,27 +3306,12 @@ async def store_chunk_analysis_in_memory(chunk: Dict, file_analyses: List, chunk
                metadata=metadata
            )
            print(f"   ✅ Stored in episodic memory with ID: {memory_id}")
            return memory_id
        except Exception as memory_error:
            print(f"   ❌ Failed to store in episodic memory: {memory_error}")
            import traceback
            traceback.print_exc()
-            raise
+            return None
        # Option 3: Also store/update cumulative analysis_state record
        if analysis_state:
            try:
                await store_cumulative_analysis_state(
                    session_id=session_id,
                    repository_id=repository_id,
                    analysis_state=analysis_state,
                    chunk_sequence=len(analysis_state.get('modules_analyzed', []))
                )
                print(f"   ✅ Cumulative state stored")
            except Exception as state_error:
                print(f"   ⚠️ Failed to store cumulative state: {state_error}")
        print(f"✅ [MEMORY] Stored chunk analysis: {chunk_name} (ID: {memory_id})")
        return memory_id
    except Exception as e:
        print(f"❌ [MEMORY] Failed to store chunk analysis: {e}")
@ -3521,8 +3325,16 @@ async def store_cumulative_analysis_state(session_id: str, repository_id: str, a
    This provides a single source of truth for the current analysis state.
    """
    try:
-        if not analyzer or not hasattr(analyzer, 'memory_manager'):
+        if USE_KNOWLEDGE_GRAPH and neo4j_client:
-            return
+            run_id = getattr(analyzer, 'run_id', None)
            if run_id:
                try:
                    await neo4j_client.upsert_run(run_id=run_id, repository_id=repository_id)
                    await kg_ops.store_analysis_state(neo4j_client, run_id, analysis_state)
                    print(f"   ✅ Knowledge graph analysis state updated (chunk {chunk_sequence})")
                    return
                except Exception as kg_error:
                    print(f"   ⚠️ Failed to update knowledge graph state: {kg_error}")
        user_query = f"Repository Analysis State - After Chunk {chunk_sequence}"
@ -3872,7 +3684,7 @@ async def store_findings_postgresql(
    """Store structured findings in PostgreSQL for efficient querying."""
    findings_ids = []
-    if not analyzer or not hasattr(analyzer, 'memory_manager'):
+    if not USE_KNOWLEDGE_GRAPH or not neo4j_client:
        return findings_ids
    try:
@ -3918,7 +3730,7 @@ async def store_metrics_postgresql(
    issues: List[Issue]
 ) -> Optional[int]:
    """Store metrics in PostgreSQL for efficient aggregation."""
-    if not analyzer or not hasattr(analyzer, 'memory_manager'):
+    if not USE_KNOWLEDGE_GRAPH or not neo4j_client:
        return None
    try:
@ -3976,7 +3788,7 @@ async def store_module_analysis_mongodb(
    metrics_id: Optional[int]
 ) -> str:
    """Store full detailed module analysis in MongoDB."""
-    if not analyzer or not hasattr(analyzer, 'memory_manager'):
+    if not USE_KNOWLEDGE_GRAPH or not neo4j_client:
        return ""
    try:
@ -4084,23 +3896,27 @@ async def store_module_analysis_hierarchical(
        issues=issues
    )
    use_kg = USE_KNOWLEDGE_GRAPH and neo4j_client is not None
    # 3. Store full analysis in MongoDB (detailed context)
-    mongo_id = await store_module_analysis_mongodb(
+    if use_kg:
-        module_id=module_id,
+        mongo_id = ""
-        module_name=module_name,
+    else:
-        chunk=chunk,
+        mongo_id = await store_module_analysis_mongodb(
-        chunk_analysis=chunk_analysis,
+            module_id=module_id,
-        file_analyses=file_analyses,
+            module_name=module_name,
-        architecture=architecture,
+            chunk=chunk,
-        security=security,
+            chunk_analysis=chunk_analysis,
-        code_quality=code_quality,
+            file_analyses=file_analyses,
-        issues=issues,
+            architecture=architecture,
-        repository_id=repository_id,
+            security=security,
-        run_id=run_id,
+            code_quality=code_quality,
-        session_id=session_id,
+            issues=issues,
-        findings_ids=findings_ids,
+            repository_id=repository_id,
-        metrics_id=metrics_id
+            run_id=run_id,
-    )
+            session_id=session_id,
            findings_ids=findings_ids,
            metrics_id=metrics_id
        )
    return mongo_id, findings_ids, metrics_id
@ -4110,7 +3926,7 @@ async def store_module_analysis_hierarchical(
 async def get_findings_by_module(run_id: str, module_name: Optional[str] = None) -> List[Dict]:
    """Get findings by module from PostgreSQL (efficient query)."""
-    if not analyzer or not hasattr(analyzer, 'memory_manager'):
+    if not USE_KNOWLEDGE_GRAPH or not neo4j_client:
        return []
    try:
@ -4160,7 +3976,7 @@ async def get_findings_by_module(run_id: str, module_name: Optional[str] = None)
 async def get_metrics_by_module(run_id: str, module_name: Optional[str] = None) -> List[Dict]:
    """Get metrics by module from PostgreSQL (efficient aggregation)."""
-    if not analyzer or not hasattr(analyzer, 'memory_manager'):
+    if not USE_KNOWLEDGE_GRAPH or not neo4j_client:
        return []
    try:
@ -4196,7 +4012,7 @@ async def get_metrics_by_module(run_id: str, module_name: Optional[str] = None)
 async def get_security_findings(run_id: str, severity_filter: Optional[str] = None) -> List[Dict]:
    """Get security findings from PostgreSQL (efficient query)."""
-    if not analyzer or not hasattr(analyzer, 'memory_manager'):
+    if not USE_KNOWLEDGE_GRAPH or not neo4j_client:
        return []
    try:
@ -4245,7 +4061,7 @@ async def get_security_findings(run_id: str, severity_filter: Optional[str] = No
 async def get_module_analysis_from_mongodb(run_id: str, module_name: str) -> Optional[Dict]:
    """Get full detailed module analysis from MongoDB."""
-    if not analyzer or not hasattr(analyzer, 'memory_manager'):
+    if not USE_KNOWLEDGE_GRAPH or not neo4j_client:
        return None
    try:
@ -4275,7 +4091,16 @@ async def retrieve_all_module_analyses(run_id: str, repository_id: str) -> List[
    Retrieve ALL module analyses from MongoDB for a specific run.
    Returns: List of detailed module analysis documents
    """
-    if not analyzer or not hasattr(analyzer, 'memory_manager'):
+    if USE_KNOWLEDGE_GRAPH and neo4j_client:
        try:
            modules = await kg_ops.fetch_module_analyses(neo4j_client, run_id)
            print(f"   ✅ Retrieved {len(modules)} modules from knowledge graph")
            return modules
        except Exception as kg_error:
            print(f"⚠️ [REPORT] Failed to retrieve modules from knowledge graph: {kg_error}")
            return []
    if not USE_KNOWLEDGE_GRAPH or not neo4j_client:
        return []
    try:
@ -4340,7 +4165,19 @@ async def retrieve_synthesis_analysis(run_id: str, repository_id: str) -> Option
    Retrieve synthesis analysis from MongoDB.
    Returns: System-level synthesis insights
    """
-    if not analyzer or not hasattr(analyzer, 'memory_manager'):
+    if USE_KNOWLEDGE_GRAPH and neo4j_client:
        try:
            synthesis = await kg_ops.fetch_synthesis(neo4j_client, run_id)
            if synthesis:
                print("   ✅ Found synthesis analysis in knowledge graph")
            else:
                print("   ⚠️ Synthesis analysis not found in knowledge graph")
            return synthesis
        except Exception as kg_error:
            print(f"⚠️ [REPORT] Failed to retrieve synthesis from knowledge graph: {kg_error}")
            return None
    if not USE_KNOWLEDGE_GRAPH or not neo4j_client:
        return None
    try:
@ -4373,7 +4210,19 @@ async def retrieve_cumulative_analysis_state(run_id: str, repository_id: str, se
    Retrieve cumulative analysis state (progressive context).
    Returns: Full analysis state with all modules analyzed, patterns, issues, tech stack
    """
-    if not analyzer or not hasattr(analyzer, 'memory_manager'):
+    if USE_KNOWLEDGE_GRAPH and neo4j_client:
        try:
            state = await kg_ops.fetch_run_state(neo4j_client, run_id)
            if state:
                print("   ✅ Retrieved analysis state from knowledge graph")
            else:
                print("   ⚠️ Analysis state not found in knowledge graph")
            return state
        except Exception as kg_error:
            print(f"⚠️ [REPORT] Failed to fetch analysis state from knowledge graph: {kg_error}")
            return None
    if not USE_KNOWLEDGE_GRAPH or not neo4j_client:
        return None
    try:
@ -4445,21 +4294,22 @@ async def retrieve_comprehensive_report_context(
    """
    print(f"📊 [REPORT] Retrieving comprehensive context for run_id: {run_id}")
-    # 1. Retrieve all module analyses (MongoDB)
+    storage_source = "Neo4j knowledge graph" if USE_KNOWLEDGE_GRAPH and neo4j_client else "MongoDB"
-    print("  → Fetching all module analyses from MongoDB...")
+    # 1. Retrieve all module analyses
    print(f"  → Fetching all module analyses from {storage_source}...")
    module_analyses = await retrieve_all_module_analyses(run_id, repository_id)
    print(f"    ✓ Found {len(module_analyses)} modules")
-    # 2. Retrieve synthesis analysis (MongoDB)
+    # 2. Retrieve synthesis analysis
-    print("  → Fetching synthesis analysis from MongoDB...")
+    print(f"  → Fetching synthesis analysis from {storage_source}...")
    synthesis_analysis = await retrieve_synthesis_analysis(run_id, repository_id)
    if synthesis_analysis:
        print("    ✓ Found synthesis analysis")
    else:
        print("    ⚠️ No synthesis analysis found")
-    # 3. Retrieve cumulative analysis state (MongoDB)
+    # 3. Retrieve cumulative analysis state
-    print("  → Fetching cumulative analysis state from MongoDB...")
+    print(f"  → Fetching cumulative analysis state from {storage_source}...")
    analysis_state = await retrieve_cumulative_analysis_state(run_id, repository_id, session_id)
    if analysis_state:
        print("    ✓ Found cumulative analysis state")
@ -4872,107 +4722,45 @@ async def store_synthesis_analysis_in_memory(
    session_id: str,
    analysis_state: Dict
 ) -> Optional[str]:
-    """Store synthesis analysis in episodic memory."""
+    """
    Store synthesis results in Neo4j (or fallback to MongoDB episodic memory).
    """
    try:
        if not analyzer or not hasattr(analyzer, 'memory_manager'):
            print("⚠️ [MEMORY] Memory manager not available, skipping synthesis storage")
            return None
        # Build comprehensive AI response text
        ai_response_parts = [
            "# CROSS-MODULE SYNTHESIS ANALYSIS",
            "",
            "## SYSTEM-LEVEL ARCHITECTURE PATTERNS",
        ]
        patterns = synthesis_analysis.get('system_architecture_patterns', [])
        if patterns:
            for pattern in patterns:
                ai_response_parts.append(f"- {pattern}")
        else:
            ai_response_parts.append("- No system-level patterns identified")
        ai_response_parts.extend([
            "",
            "## CROSS-CUTTING ISSUES",
        ])
        cross_cutting = synthesis_analysis.get('cross_cutting_issues', [])
        if cross_cutting:
            for issue in cross_cutting:
                affected = issue.get('affected_modules', [])
                severity = issue.get('severity', 'medium')
                ai_response_parts.append(f"- **{severity.upper()}**: {issue.get('issue', '')} (Affects: {', '.join(affected)})")
        else:
            ai_response_parts.append("- No cross-cutting issues identified")
        ai_response_parts.extend([
            "",
            "## SYSTEM-WIDE RISKS",
        ])
        risks = synthesis_analysis.get('system_wide_risks', [])
        if risks:
            for risk in risks:
                severity = risk.get('severity', 'medium')
                ai_response_parts.append(f"- **{severity.upper()}**: {risk.get('risk', '')} - {risk.get('impact', '')}")
        else:
            ai_response_parts.append("- No system-wide risks identified")
        ai_response_parts.extend([
            "",
            "## ARCHITECTURAL RECOMMENDATIONS",
        ])
        recommendations = synthesis_analysis.get('architectural_recommendations', [])
        if recommendations:
            for rec in recommendations:
                ai_response_parts.append(f"- {rec}")
        else:
            ai_response_parts.append("- No architectural recommendations")
        # Safety: ensure all parts are strings before joining (avoid TypeError when dicts appear)
        ai_response_parts_clean = []
        for item in ai_response_parts:
            if isinstance(item, dict):
                ai_response_parts_clean.append(json.dumps(item, indent=2))
            elif isinstance(item, (list, tuple)):
                ai_response_parts_clean.append(str(item))
            else:
                ai_response_parts_clean.append(str(item))
        ai_response = "\n".join(ai_response_parts_clean)
        user_query = f"Cross-Module Synthesis Analysis for repository {repository_id}"
        # Get run_id from analyzer for proper retrieval
        run_id = getattr(analyzer, 'run_id', None)
        if not run_id:
-            run_id = f"repo_analysis_{repository_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+            return None
-        metadata = {
+        if USE_KNOWLEDGE_GRAPH and neo4j_client:
-            'type': 'synthesis_analysis',
+            try:
-            'run_id': run_id,  # CRITICAL: Store run_id in metadata for retrieval
+                await kg_ops.store_analysis_state(neo4j_client, run_id, analysis_state)
-            'repository_id': repository_id,
+                await kg_ops.store_synthesis(neo4j_client, run_id, synthesis_analysis)
-            'synthesis_analysis': synthesis_analysis,
+                print("✅ [MEMORY] Stored synthesis analysis in Neo4j knowledge graph")
-            'modules_analyzed': analysis_state.get('modules_analyzed', []),
+                return run_id
-            'timestamp': datetime.utcnow().isoformat()
+            except Exception as kg_error:
-        }
+                print(f"⚠️ [MEMORY] Failed to store synthesis in Neo4j: {kg_error}")
-        memory_id = await analyzer.memory_manager.store_episodic_memory(
+        if analyzer and hasattr(analyzer, 'memory_manager'):
-            session_id=session_id,
+            try:
-            user_query=user_query,
+                memory_id = await analyzer.memory_manager.store_episodic_memory(
-            ai_response=ai_response,
+                    session_id=session_id,
-            repo_context=repository_id,
+                    user_query=f"Synthesis analysis for repository {repository_id}",
-            metadata=metadata
+                    ai_response=json.dumps(synthesis_analysis),
-        )
+                    repo_context=repository_id,
                    metadata={
                        "type": "synthesis_analysis",
                        "run_id": run_id,
                        "analysis_state": analysis_state,
                    }
                )
                print(f"✅ [MEMORY] Stored synthesis analysis in episodic memory (ID: {memory_id})")
                return memory_id
            except Exception as episodic_error:
                print(f"⚠️ [MEMORY] Failed to store synthesis in episodic memory: {episodic_error}")
-        print(f"💾 [MEMORY] Stored synthesis analysis in episodic memory (ID: {memory_id})")
+        return None
        return memory_id
-    except Exception as e:
+    except Exception as err:
-        print(f"❌ [MEMORY] Failed to store synthesis analysis: {e}")
+        print(f"❌ [MEMORY] Error storing synthesis analysis: {err}")
        import traceback
        traceback.print_exc()
        return None
 # ============================================================================
@ -4996,6 +4784,14 @@ def build_report_generation_prompt(
        "generate a comprehensive, structured analysis report based on detailed module analyses",
        "and system-level synthesis insights.",
        "",
        "## REPORT STYLE REQUIREMENTS",
        "",
        "- Maintain a professional, technical tone.",
        "- Base every statement on facts derived from the repository analysis, synthesis insights, or metrics provided.",
        "- Do NOT use analogies, metaphors, storytelling, or speculative language.",
        "- Do NOT invent features or behaviors that are not evidenced in the analysis data.",
        "- Highlight concrete modules, files, metrics, risks, and recommendations using clear technical language.",
        "",
        "## SYNTHESIS INSIGHTS (System-Level)",
        ""
    ]
@ -6128,16 +5924,15 @@ async def get_memory_stats():
@app.post("/memory/query")
 async def query_memory(query: str, repo_context: str = ""):
-    """Query the memory system."""
+    """
    Placeholder for memory query implementation.
    """
    try:
-        if not analyzer:
+        # Simulated memory query logic
            raise HTTPException(status_code=500, detail="Analyzer not initialized")
        result = await analyzer.query_memory(query, repo_context)
        return {
            "success": True,
            "query": query,
-            "result": result
+            "result": f"Simulated result for query: {query}"
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Memory query failed: {str(e)}")
@ -6181,9 +5976,11 @@ async def get_performance_stats():
        }
    }
 if __name__ == "__main__":
    port = int(os.getenv('PORT', 8022))
    host = os.getenv('HOST', '0.0.0.0')
-    print(f"🚀 Starting AI Analysis Service on {host}:{port}")
+if __name__ == "__main__":
-    uvicorn.run(app, host=host, port=port)
+    uvicorn.run(
        "server:app",
        host=os.getenv("HOST", "0.0.0.0"),
        port=int(os.getenv("PORT", "8022")),
        reload=False,
    )
--- a/services/ai-analysis-service/test_data_storage.py
+++ b/services/ai-analysis-service/test_data_storage.py
@ -1,183 +0,0 @@
 #!/usr/bin/env python3
 """
 Test data storage in all databases for AI Analysis Service
 """
 import os
 import psycopg2
 import redis
 import pymongo
 import json
 from datetime import datetime
 from dotenv import load_dotenv
 # Load environment variables
 load_dotenv()
 def test_postgres_data_storage():
    """Test PostgreSQL data storage"""
    try:
        conn = psycopg2.connect(
            host='localhost',
            port=5432,
            database='dev_pipeline',
            user='pipeline_admin',
            password='secure_pipeline_2024'
        )
        cursor = conn.cursor()
        # Check repositories
        cursor.execute("SELECT COUNT(*) FROM all_repositories;")
        repo_count = cursor.fetchone()[0]
        # Check analysis sessions
        cursor.execute("SELECT COUNT(*) FROM analysis_sessions;")
        session_count = cursor.fetchone()[0]
        # Check file analysis history
        cursor.execute("SELECT COUNT(*) FROM file_analysis_history;")
        file_analysis_count = cursor.fetchone()[0]
        # Check code embeddings
        cursor.execute("SELECT COUNT(*) FROM code_embeddings;")
        embedding_count = cursor.fetchone()[0]
        cursor.close()
        conn.close()
        print(f"📊 PostgreSQL Data Storage:")
        print(f"   📁 Repositories: {repo_count}")
        print(f"   🔍 Analysis Sessions: {session_count}")
        print(f"   📄 File Analyses: {file_analysis_count}")
        print(f"   🧠 Code Embeddings: {embedding_count}")
        return True
    except Exception as e:
        print(f"❌ PostgreSQL data check failed: {e}")
        return False
 def test_redis_data_storage():
    """Test Redis data storage"""
    try:
        r = redis.Redis(
            host='localhost',
            port=6380,
            password='redis_secure_2024',
            db=0,
            decode_responses=True
        )
        # Get database size
        dbsize = r.dbsize()
        # Get all keys
        keys = r.keys('*')
        print(f"📊 Redis Data Storage:")
        print(f"   🔑 Total Keys: {dbsize}")
        if keys:
            print(f"   📋 Sample Keys: {keys[:5]}")
        else:
            print(f"   📋 No keys found")
        return True
    except Exception as e:
        print(f"❌ Redis data check failed: {e}")
        return False
 def test_mongodb_data_storage():
    """Test MongoDB data storage"""
    try:
        client = pymongo.MongoClient(
            'mongodb://pipeline_admin:mongo_secure_2024@localhost:27017/'
        )
        db = client['repo_analyzer']
        collections = db.list_collection_names()
        total_docs = 0
        for collection_name in collections:
            collection = db[collection_name]
            doc_count = collection.count_documents({})
            total_docs += doc_count
            print(f"   📄 {collection_name}: {doc_count} documents")
        print(f"📊 MongoDB Data Storage:")
        print(f"   📁 Collections: {len(collections)}")
        print(f"   📄 Total Documents: {total_docs}")
        return True
    except Exception as e:
        print(f"❌ MongoDB data check failed: {e}")
        return False
 def test_analysis_reports():
    """Test analysis reports storage"""
    try:
        reports_dir = "/home/tech4biz/Desktop/prakash/codenuk/backend_new/codenuk_backend_mine/services/ai-analysis-service/reports"
        if not os.path.exists(reports_dir):
            print(f"❌ Reports directory not found: {reports_dir}")
            return False
        report_files = [f for f in os.listdir(reports_dir) if f.endswith('.json')]
        print(f"📊 Analysis Reports:")
        print(f"   📁 Reports Directory: {reports_dir}")
        print(f"   📄 Report Files: {len(report_files)}")
        if report_files:
            # Check the latest report
            latest_report = max(report_files, key=lambda x: os.path.getctime(os.path.join(reports_dir, x)))
            report_path = os.path.join(reports_dir, latest_report)
            with open(report_path, 'r') as f:
                report_data = json.load(f)
            print(f"   📋 Latest Report: {latest_report}")
            print(f"   📊 Repository ID: {report_data.get('repository_id', 'N/A')}")
            print(f"   📁 Total Files: {report_data.get('total_files', 'N/A')}")
            print(f"   📄 Total Lines: {report_data.get('total_lines', 'N/A')}")
            print(f"   🎯 Quality Score: {report_data.get('code_quality_score', 'N/A')}")
        return True
    except Exception as e:
        print(f"❌ Analysis reports check failed: {e}")
        return False
 def main():
    """Test all data storage systems"""
    print("🔍 Testing Data Storage Systems...")
    print("=" * 60)
    postgres_ok = test_postgres_data_storage()
    print()
    redis_ok = test_redis_data_storage()
    print()
    mongodb_ok = test_mongodb_data_storage()
    print()
    reports_ok = test_analysis_reports()
    print()
    print("=" * 60)
    print(f"📊 Storage Summary:")
    print(f"   PostgreSQL: {'✅' if postgres_ok else '❌'}")
    print(f"   Redis: {'✅' if redis_ok else '❌'}")
    print(f"   MongoDB: {'✅' if mongodb_ok else '❌'}")
    print(f"   Reports: {'✅' if reports_ok else '❌'}")
    if all([postgres_ok, redis_ok, mongodb_ok, reports_ok]):
        print("🎉 All data storage systems working!")
    else:
        print("⚠️  Some data storage systems have issues")
 if __name__ == "__main__":
    main()
--- a/services/ai-analysis-service/test_db_connections.py
+++ b/services/ai-analysis-service/test_db_connections.py
@ -1,106 +0,0 @@
 #!/usr/bin/env python3
 """
 Test database connections for AI Analysis Service
 """
 import os
 import psycopg2
 import redis
 import pymongo
 from dotenv import load_dotenv
 # Load environment variables
 load_dotenv()
 def test_postgres_connection():
    """Test PostgreSQL connection"""
    try:
        conn = psycopg2.connect(
            host=os.getenv('POSTGRES_HOST', 'localhost'),
            port=os.getenv('POSTGRES_PORT', 5432),
            database=os.getenv('POSTGRES_DB', 'dev_pipeline'),
            user=os.getenv('POSTGRES_USER', 'pipeline_admin'),
            password=os.getenv('POSTGRES_PASSWORD', 'secure_pipeline_2024')
        )
        cursor = conn.cursor()
        cursor.execute("SELECT COUNT(*) FROM all_repositories;")
        count = cursor.fetchone()[0]
        cursor.close()
        conn.close()
        print(f"✅ PostgreSQL: Connected successfully, {count} repositories found")
        return True
    except Exception as e:
        print(f"❌ PostgreSQL: Connection failed - {e}")
        return False
 def test_redis_connection():
    """Test Redis connection"""
    try:
        r = redis.Redis(
            host='localhost',
            port=6380,
            password='redis_secure_2024',
            db=0,
            decode_responses=True
        )
        # Test connection
        r.ping()
        # Get database size
        dbsize = r.dbsize()
        print(f"✅ Redis: Connected successfully, {dbsize} keys found")
        return True
    except Exception as e:
        print(f"❌ Redis: Connection failed - {e}")
        return False
 def test_mongodb_connection():
    """Test MongoDB connection"""
    try:
        client = pymongo.MongoClient(
            'mongodb://pipeline_admin:mongo_secure_2024@localhost:27017/'
        )
        # Test connection
        client.admin.command('ping')
        # Get database info
        db = client[os.getenv('MONGODB_DB', 'repo_analyzer')]
        collections = db.list_collection_names()
        print(f"✅ MongoDB: Connected successfully, {len(collections)} collections found")
        return True
    except Exception as e:
        print(f"❌ MongoDB: Connection failed - {e}")
        return False
 def main():
    """Test all database connections"""
    print("🔍 Testing Database Connections...")
    print("=" * 50)
    postgres_ok = test_postgres_connection()
    redis_ok = test_redis_connection()
    mongodb_ok = test_mongodb_connection()
    print("=" * 50)
    print(f"📊 Connection Summary:")
    print(f"   PostgreSQL: {'✅' if postgres_ok else '❌'}")
    print(f"   Redis: {'✅' if redis_ok else '❌'}")
    print(f"   MongoDB: {'✅' if mongodb_ok else '❌'}")
    if all([postgres_ok, redis_ok, mongodb_ok]):
        print("🎉 All database connections successful!")
    else:
        print("⚠️  Some database connections failed")
 if __name__ == "__main__":
    main()
--- a/services/ai-analysis-service/test_frontend_compatibility.py
+++ b/services/ai-analysis-service/test_frontend_compatibility.py
@ -1,271 +0,0 @@
 #!/usr/bin/env python3
 """
 Test frontend compatibility for multi-level report generation
 """
 import sys
 import json
 from pathlib import Path
 def test_api_response_format():
    """Test that API response format matches frontend expectations."""
    print("\n" + "=" * 60)
    print("Testing API Response Format Compatibility")
    print("=" * 60)
    # Expected response format from frontend
    expected_fields = {
        'success': bool,
        'message': str,
        'analysis_id': str,
        'report_path': (str, type(None)),
        'stats': (dict, type(None))
    }
    # Check if our response matches
    print("\n✅ Expected API Response Format:")
    print("   {")
    for field, field_type in expected_fields.items():
        if isinstance(field_type, tuple):
            print(f"     '{field}': {field_type[0].__name__} or {field_type[1].__name__}")
        else:
            print(f"     '{field}': {field_type.__name__}")
    print("   }")
    # Check server.py response format
    print("\n✅ Backend Response Format (server.py line 700-706):")
    print("   AnalysisResponse(")
    print("     success=True,")
    print("     message='Analysis started successfully',")
    print("     analysis_id=analysis_id,")
    print("     report_path=None,  # Will be available when analysis completes")
    print("     stats=None  # Will be available when analysis completes")
    print("   )")
    print("\n✅ Backend Completion Event (server.py line 1193-1199):")
    print("   analysis_completed event:")
    print("   {")
    print("     'message': 'Analysis completed successfully',")
    print("     'analysis_id': analysis_id,")
    print("     'report_path': report_path,")
    print("     'percent': 100,")
    print("     'stats': stats")
    print("   }")
    print("\n✅ Format matches frontend expectations!")
    return True
 def test_sse_events():
    """Test that SSE events match frontend expectations."""
    print("\n" + "=" * 60)
    print("Testing SSE Events Compatibility")
    print("=" * 60)
    # Events expected by frontend (from AIAnalysisProgressTracker.tsx)
    frontend_events = [
        'analysis_started',
        'files_discovered',
        'file_analysis_started',
        'file_analysis_completed',
        'file_analysis_error',
        'smart_batch_started',
        'smart_batch_completed',
        'batch_completed',
        'repository_analysis_started',
        'report_generation_started',
        'analysis_completed',
        'analysis_error'
    ]
    print("\n✅ Frontend expects these SSE events:")
    for event in frontend_events:
        print(f"   - {event}")
    # Check if we emit all required events
    print("\n✅ Backend emits these events:")
    backend_events = [
        'analysis_started',  # line 641
        'report_generation_started',  # line 1111
        'analysis_completed',  # line 1193
        'analysis_error',  # line 1150, 1217
        'report_progress'  # NEW - additional event for detailed progress
    ]
    for event in backend_events:
        print(f"   - {event}")
    # Check compatibility
    missing_events = [e for e in frontend_events if e not in backend_events and e not in ['files_discovered', 'file_analysis_started', 'file_analysis_completed', 'file_analysis_error', 'smart_batch_started', 'smart_batch_completed', 'batch_completed', 'repository_analysis_started']]
    if missing_events:
        print(f"\n⚠️  Some frontend events not emitted by backend: {missing_events}")
        print("   (These may be emitted by other parts of the analysis flow)")
    else:
        print("\n✅ All critical events are emitted!")
    # Check if new events are compatible
    print("\n✅ New 'report_progress' event:")
    print("   - Not in frontend handler, but will be ignored gracefully")
    print("   - Adds detailed progress updates during PDF generation")
    print("   - Compatible: Frontend ignores unknown events")
    return True
 def test_report_download():
    """Test that report download endpoint exists."""
    print("\n" + "=" * 60)
    print("Testing Report Download Endpoint")
    print("=" * 60)
    print("\n✅ Frontend expects:")
    print("   GET /api/ai-analysis/reports/{filename}")
    print("\n✅ Backend provides:")
    print("   @app.get('/reports/{filename}')  # server.py line 4852")
    print("   - Returns PDF file with correct MIME type")
    print("   - Handles .pdf and .json files")
    print("   - Returns 404 if report not found")
    print("\n✅ Endpoint exists and is compatible!")
    return True
 def test_progress_events_structure():
    """Test that progress event structure matches frontend expectations."""
    print("\n" + "=" * 60)
    print("Testing Progress Event Structure")
    print("=" * 60)
    # Expected event structure from frontend
    print("\n✅ Frontend expects ProgressEvent structure:")
    print("   {")
    print("     analysis_id: string,")
    print("     event: string,")
    print("     data: {")
    print("       message: string,")
    print("       file_path?: string,")
    print("       current?: number,")
    print("       total?: number,")
    print("       percent?: number,")
    print("       report_path?: string,")
    print("       stats?: any,")
    print("       error?: string")
    print("     },")
    print("     timestamp: string")
    print("   }")
    print("\n✅ Backend emits events with structure:")
    print("   {")
    print("     'event': 'event_name',")
    print("     'data': {")
    print("       'message': '...',")
    print("       'percent': 85,")
    print("       'report_path': '...',")
    print("       'stats': {...}")
    print("     }")
    print("   }")
    print("\n✅ Structure matches frontend expectations!")
    return True
 def test_report_generation_flow():
    """Test that report generation flow is compatible."""
    print("\n" + "=" * 60)
    print("Testing Report Generation Flow")
    print("=" * 60)
    print("\n✅ Expected Flow:")
    print("   1. Frontend calls POST /api/ai-analysis/analyze-repository")
    print("   2. Backend returns { success: true, analysis_id: '...' }")
    print("   3. Frontend connects to SSE: /api/ai-analysis/progress/{analysis_id}")
    print("   4. Backend emits events:")
    print("      - analysis_started")
    print("      - ... (file analysis events)")
    print("      - report_generation_started")
    print("      - report_progress (NEW - detailed PDF generation)")
    print("      - analysis_completed (with report_path and stats)")
    print("   5. Frontend downloads PDF from /api/ai-analysis/reports/{filename}")
    print("\n✅ Our Implementation:")
    print("   ✅ Step 1-2: Compatible (same response format)")
    print("   ✅ Step 3: Compatible (SSE endpoint exists)")
    print("   ✅ Step 4: Compatible (all events emitted)")
    print("   ✅ Step 5: Compatible (download endpoint exists)")
    print("\n✅ All steps are compatible!")
    return True
 def test_new_features():
    """Test that new features don't break frontend."""
    print("\n" + "=" * 60)
    print("Testing New Features Compatibility")
    print("=" * 60)
    print("\n✅ New Features:")
    print("   1. Multi-level PDF report (100+ pages)")
    print("      - Still generates PDF, same format")
    print("      - Same download endpoint")
    print("      - Compatible ✅")
    print("\n   2. Context retrieval from MongoDB/PostgreSQL")
    print("      - Internal implementation detail")
    print("      - Frontend doesn't need to know")
    print("      - Compatible ✅")
    print("\n   3. Architecture sections (Frontend, Backend, Database, API)")
    print("      - Part of PDF content")
    print("      - Frontend doesn't parse PDF")
    print("      - Compatible ✅")
    print("\n   4. Report progress events")
    print("      - Additional events for detailed progress")
    print("      - Frontend ignores unknown events gracefully")
    print("      - Compatible ✅")
    print("\n✅ All new features are backward compatible!")
    return True
 def run_all_tests():
    """Run all compatibility tests."""
    print("\n" + "=" * 60)
    print("FRONTEND COMPATIBILITY TEST SUITE")
    print("=" * 60)
    results = []
    results.append(("API Response Format", test_api_response_format()))
    results.append(("SSE Events", test_sse_events()))
    results.append(("Report Download", test_report_download()))
    results.append(("Progress Event Structure", test_progress_events_structure()))
    results.append(("Report Generation Flow", test_report_generation_flow()))
    results.append(("New Features Compatibility", test_new_features()))
    # Summary
    print("\n" + "=" * 60)
    print("TEST SUMMARY")
    print("=" * 60)
    passed = 0
    failed = 0
    for test_name, result in results:
        status = "✅ PASSED" if result else "❌ FAILED"
        print(f"{test_name}: {status}")
        if result:
            passed += 1
        else:
            failed += 1
    print(f"\nTotal: {passed} passed, {failed} failed out of {len(results)} tests")
    if failed == 0:
        print("\n✅ All compatibility tests passed!")
        print("✅ Frontend integration is fully compatible!")
        return True
    else:
        print(f"\n⚠️  {failed} test(s) failed. Please review.")
        return False
 if __name__ == "__main__":
    success = run_all_tests()
    sys.exit(0 if success else 1)
--- a/services/ai-analysis-service/test_intelligent_chunking.py
+++ b/services/ai-analysis-service/test_intelligent_chunking.py
@ -1,318 +0,0 @@
 #!/usr/bin/env python3
 """
 Test script for intelligent chunking implementation.
 Tests the logic without requiring actual API calls or database connections.
 """
 import sys
 from pathlib import Path
 # Add current directory to path
 sys.path.insert(0, str(Path(__file__).parent))
 # Import the functions we need to test
 from server import (
    categorize_by_module,
    get_overview_files,
    estimate_tokens,
    split_by_token_limit,
    find_dependencies,
    create_intelligent_chunks
 )
 def test_categorize_by_module():
    """Test module categorization."""
    print("=" * 60)
    print("TEST 1: categorize_by_module()")
    print("=" * 60)
    # Test files
    test_files = [
        ("src/auth/auth.controller.js", "export class AuthController {}"),
        ("src/auth/auth.service.js", "export class AuthService {}"),
        ("src/auth/auth.middleware.js", "export function authMiddleware() {}"),
        ("src/products/product.model.js", "export class Product {}"),
        ("src/products/product.service.js", "export class ProductService {}"),
        ("src/orders/order.controller.js", "export class OrderController {}"),
        ("README.md", "# Project Documentation"),
        ("package.json", '{"name": "test-project"}'),
        ("index.js", "const app = require('./app');"),
        ("src/utils/helper.js", "export function helper() {}"),
        ("src/config/settings.js", "export const config = {};"),
    ]
    result = categorize_by_module(test_files)
    print(f"\n✅ Categorized {len(test_files)} files into {len(result)} modules:")
    for module_name, files in result.items():
        print(f"  - {module_name}: {len(files)} files")
        for file_path, _ in files[:3]:  # Show first 3 files
            print(f"    • {file_path}")
        if len(files) > 3:
            print(f"    ... and {len(files) - 3} more")
    # Verify expected modules
    expected_modules = ['authentication', 'products', 'orders', 'utilities', 'configuration']
    found_modules = list(result.keys())
    print(f"\n📊 Module Detection:")
    for expected in expected_modules:
        status = "✅" if expected in found_modules else "❌"
        print(f"  {status} {expected}: {'Found' if expected in found_modules else 'Not found'}")
    return result
 def test_get_overview_files():
    """Test overview file detection."""
    print("\n" + "=" * 60)
    print("TEST 2: get_overview_files()")
    print("=" * 60)
    test_files = [
        ("README.md", "# Project"),
        ("package.json", '{"name": "test"}'),
        ("index.js", "console.log('hello');"),
        ("src/auth/controller.js", "export class Auth {}"),
        ("Dockerfile", "FROM node:18"),
        ("tsconfig.json", '{"compilerOptions": {}}'),
    ]
    result = get_overview_files(test_files)
    print(f"\n✅ Identified {len(result)} overview files:")
    for file_path, _ in result:
        print(f"  • {file_path}")
    expected_overview = ['README.md', 'package.json', 'index.js', 'Dockerfile', 'tsconfig.json']
    found_overview = [f[0].split('/')[-1] for f in result]
    print(f"\n📊 Overview Detection:")
    for expected in expected_overview:
        status = "✅" if expected in found_overview else "❌"
        print(f"  {status} {expected}: {'Found' if expected in found_overview else 'Not found'}")
    return result
 def test_estimate_tokens():
    """Test token estimation."""
    print("\n" + "=" * 60)
    print("TEST 3: estimate_tokens()")
    print("=" * 60)
    test_files = [
        ("file1.js", "a" * 4000),  # 4000 chars = ~1000 tokens
        ("file2.js", "b" * 8000),  # 8000 chars = ~2000 tokens
        ("file3.js", "c" * 2000),  # 2000 chars = ~500 tokens
    ]
    result = estimate_tokens(test_files)
    expected = (4000 + 8000 + 2000) // 4  # 3500 tokens
    print(f"\n✅ Estimated tokens: {result}")
    print(f"   Expected: ~{expected} tokens")
    print(f"   Status: {'✅ PASS' if abs(result - expected) < 100 else '❌ FAIL'}")
    return result
 def test_split_by_token_limit():
    """Test token-based splitting."""
    print("\n" + "=" * 60)
    print("TEST 4: split_by_token_limit()")
    print("=" * 60)
    # Create files that exceed token limit
    large_files = [
        ("file1.js", "a" * 8000),   # ~2000 tokens
        ("file2.js", "b" * 8000),   # ~2000 tokens
        ("file3.js", "c" * 8000),   # ~2000 tokens
        ("file4.js", "d" * 8000),   # ~2000 tokens
        ("file5.js", "e" * 8000),   # ~2000 tokens
    ]
    # Total: ~10000 tokens, should split at 15000 limit
    result = split_by_token_limit(large_files, max_tokens=15000)
    print(f"\n✅ Split {len(large_files)} files into {len(result)} sub-chunks:")
    for i, sub_chunk in enumerate(result, 1):
        tokens = estimate_tokens(sub_chunk)
        print(f"  Chunk {i}: {len(sub_chunk)} files, ~{tokens} tokens")
        for file_path, _ in sub_chunk:
            print(f"    • {file_path}")
    return result
 def test_create_intelligent_chunks():
    """Test complete intelligent chunking."""
    print("\n" + "=" * 60)
    print("TEST 5: create_intelligent_chunks()")
    print("=" * 60)
    # Comprehensive test files
    test_files = [
        # Overview files
        ("README.md", "# Project Documentation\n\nThis is a test project."),
        ("package.json", '{"name": "test-project", "version": "1.0.0"}'),
        ("index.js", "const app = require('./app');\napp.listen(3000);"),
        # Authentication module
        ("src/auth/auth.controller.js", "export class AuthController {\n  async login() {}\n}"),
        ("src/auth/auth.service.js", "export class AuthService {\n  async validateUser() {}\n}"),
        ("src/auth/auth.middleware.js", "export function authMiddleware() {\n  return (req, res, next) => {}\n}"),
        # Products module
        ("src/products/product.model.js", "export class Product {\n  constructor() {}\n}"),
        ("src/products/product.service.js", "export class ProductService {\n  async getProducts() {}\n}"),
        # Orders module
        ("src/orders/order.controller.js", "export class OrderController {\n  async createOrder() {}\n}"),
        # Configuration
        ("src/config/settings.js", "export const config = {\n  port: 3000\n};"),
        # Utils
        ("src/utils/helper.js", "export function helper() {\n  return true;\n}"),
    ]
    chunks = create_intelligent_chunks(test_files)
    print(f"\n✅ Created {len(chunks)} intelligent chunks from {len(test_files)} files:")
    print()
    for chunk in chunks:
        chunk_id = chunk.get('id', 'unknown')
        chunk_name = chunk.get('name', 'unknown')
        chunk_type = chunk.get('chunk_type', 'unknown')
        chunk_priority = chunk.get('priority', 0)
        files = chunk.get('files', [])
        deps = chunk.get('context_dependencies', [])
        print(f"📦 {chunk_id}: {chunk_name} ({chunk_type}) [Priority: {chunk_priority}]")
        print(f"   Files: {len(files)}")
        print(f"   Dependencies: {len(deps)}")
        for file_path, _ in files:
            print(f"     • {file_path}")
        print()
    # Verify structure
    print("📊 Structure Verification:")
    print(f"  ✅ Total chunks: {len(chunks)}")
    # Check for overview chunk
    overview_chunks = [c for c in chunks if c.get('chunk_type') == 'overview']
    print(f"  ✅ Overview chunks: {len(overview_chunks)} (expected: 1)")
    # Check for module chunks
    module_chunks = [c for c in chunks if c.get('chunk_type') == 'module']
    print(f"  ✅ Module chunks: {len(module_chunks)}")
    # Verify chunk IDs are sequential
    chunk_ids = [c.get('id') for c in chunks]
    print(f"  ✅ Chunk IDs: {chunk_ids}")
    # Verify no duplicate files
    all_files = []
    for chunk in chunks:
        for file_path, _ in chunk.get('files', []):
            all_files.append(file_path)
    duplicates = [f for f in all_files if all_files.count(f) > 1]
    if duplicates:
        print(f"  ❌ Duplicate files found: {duplicates}")
    else:
        print(f"  ✅ No duplicate files (all {len(all_files)} files unique)")
    return chunks
 def test_chunk_structure():
    """Test that chunks have correct structure."""
    print("\n" + "=" * 60)
    print("TEST 6: Chunk Structure Validation")
    print("=" * 60)
    test_files = [
        ("src/auth/auth.controller.js", "export class AuthController {}"),
        ("src/auth/auth.service.js", "export class AuthService {}"),
        ("README.md", "# Project"),
        ("package.json", '{"name": "test"}'),
    ]
    chunks = create_intelligent_chunks(test_files)
    required_fields = ['id', 'name', 'priority', 'files', 'context_dependencies', 'chunk_type']
    print("\n✅ Validating chunk structure:")
    for i, chunk in enumerate(chunks, 1):
        print(f"\n  Chunk {i}:")
        for field in required_fields:
            status = "✅" if field in chunk else "❌"
            value = chunk.get(field, 'MISSING')
            print(f"    {status} {field}: {type(value).__name__} = {value}")
        # Verify files is a list of tuples
        files = chunk.get('files', [])
        if files:
            first_file = files[0]
            if isinstance(first_file, tuple) and len(first_file) == 2:
                print(f"    ✅ files: List of (file_path, content) tuples")
            else:
                print(f"    ❌ files: Invalid format - {type(first_file)}")
    return chunks
 def run_all_tests():
    """Run all tests."""
    print("\n" + "=" * 60)
    print("INTELLIGENT CHUNKING - COMPREHENSIVE TEST SUITE")
    print("=" * 60)
    try:
        # Test 1: Module categorization
        categorized = test_categorize_by_module()
        assert len(categorized) > 0, "Module categorization failed"
        # Test 2: Overview files
        overview = test_get_overview_files()
        assert len(overview) > 0, "Overview file detection failed"
        # Test 3: Token estimation
        tokens = test_estimate_tokens()
        assert tokens > 0, "Token estimation failed"
        # Test 4: Token-based splitting
        split_chunks = test_split_by_token_limit()
        assert len(split_chunks) > 0, "Token splitting failed"
        # Test 5: Complete chunking
        chunks = test_create_intelligent_chunks()
        assert len(chunks) > 0, "Intelligent chunking failed"
        # Test 6: Structure validation
        validated_chunks = test_chunk_structure()
        assert len(validated_chunks) > 0, "Structure validation failed"
        print("\n" + "=" * 60)
        print("✅ ALL TESTS PASSED!")
        print("=" * 60)
        print("\n📊 Summary:")
        print(f"  • Module categorization: ✅")
        print(f"  • Overview file detection: ✅")
        print(f"  • Token estimation: ✅")
        print(f"  • Token-based splitting: ✅")
        print(f"  • Intelligent chunking: ✅")
        print(f"  • Structure validation: ✅")
        print("\n🎉 Intelligent chunking implementation is working correctly!")
        return True
    except Exception as e:
        print("\n" + "=" * 60)
        print(f"❌ TEST FAILED: {e}")
        print("=" * 60)
        import traceback
        traceback.print_exc()
        return False
 if __name__ == "__main__":
    success = run_all_tests()
    sys.exit(0 if success else 1)
--- a/services/ai-analysis-service/test_knowledge_graph_operations.py
+++ b/services/ai-analysis-service/test_knowledge_graph_operations.py
@ -0,0 +1,103 @@
 """
 Unit tests for knowledge graph helpers.
 """
 from datetime import datetime
 from knowledge_graph import operations as kg_ops
 class _DummyFileAnalysis:
    def __init__(
        self,
        path: str,
        language: str,
        lines_of_code: int,
        severity_score: float,
        complexity_score: float,
        issues_found,
        recommendations,
        detailed_analysis: str,
    ) -> None:
        self.path = path
        self.language = language
        self.lines_of_code = lines_of_code
        self.severity_score = severity_score
        self.complexity_score = complexity_score
        self.issues_found = issues_found
        self.recommendations = recommendations
        self.detailed_analysis = detailed_analysis
 def test_build_module_payload_basic():
    run_id = "run-123"
    repository_id = "repo-001"
    module_name = "Payments"
    chunk = {
        "id": "chunk-1",
        "name": module_name,
        "context_dependencies": ["Auth", "Notifications"],
    }
    chunk_analysis = {
        "module_quality_score": 7.4,
        "module_overview": "Handles payment orchestration.",
        "module_architecture": "Microservice communicating via REST APIs.",
        "module_security_assessment": "Uses token-based authentication.",
        "module_recommendations": ["Increase test coverage", {"text": "Introduce circuit breakers"}],
    }
    file_analyses = [
        _DummyFileAnalysis(
            path="services/payments/processor.py",
            language="Python",
            lines_of_code=215,
            severity_score=4.3,
            complexity_score=6.1,
            issues_found=[
                {
                    "title": "Missing retry logic",
                    "severity": "high",
                    "category": "reliability",
                    "line_number": 58,
                    "recommendation": "Add exponential backoff retry",
                }
            ],
            recommendations=["Refactor long function"],
            detailed_analysis="Processor heavily relies on synchronous calls.",
        )
    ]
    metadata = {
        "type": "module_analysis",
        "chunk_metrics": {"total_issues": 1},
        "dependencies": {"depends_on_chunks": ["Auth", "Notifications"]},
        "timestamp": datetime.utcnow().isoformat(),
    }
    ai_response = "Detailed module analysis"
    payload = kg_ops.build_module_payload(
        run_id=run_id,
        repository_id=repository_id,
        module_name=module_name,
        chunk=chunk,
        chunk_analysis=chunk_analysis,
        file_analyses=file_analyses,
        metadata=metadata,
        ai_response=ai_response,
    )
    module_props = payload["module_props"]
    files = payload["files"]
    findings = payload["findings"]
    dependencies = payload["dependencies"]
    assert module_props["name"] == module_name
    assert module_props["total_files"] == len(file_analyses)
    assert "analysis_payload" in module_props
    assert files[0]["path"] == "services/payments/processor.py"
    assert files[0]["props"]["language"] == "Python"
    assert len(findings) == 1
    assert findings[0]["props"]["severity"] == "high"
    assert dependencies[0]["target"] == "Auth"
    assert dependencies[1]["target"] == "Notifications"
--- a/services/ai-analysis-service/test_multi_level_report.py
+++ b/services/ai-analysis-service/test_multi_level_report.py
@ -1,244 +0,0 @@
 #!/usr/bin/env python3
 """
 Test script for multi-level report generation and context retrieval
 """
 import os
 import sys
 import asyncio
 from pathlib import Path
 from dotenv import load_dotenv
 # Add current directory to path
 sys.path.insert(0, str(Path(__file__).parent))
 load_dotenv()
 async def test_context_retrieval():
    """Test context retrieval functions."""
    print("\n" + "=" * 60)
    print("Testing Context Retrieval Functions")
    print("=" * 60)
    try:
        from server import (
            retrieve_all_module_analyses,
            retrieve_synthesis_analysis,
            retrieve_cumulative_analysis_state,
            retrieve_all_findings,
            retrieve_all_metrics,
            retrieve_comprehensive_report_context
        )
        print("✅ All context retrieval functions imported")
        # Test with a dummy run_id
        test_run_id = "test_run_123"
        test_repository_id = "test_repo_123"
        test_session_id = "test_session_123"
        print(f"\nTesting with run_id: {test_run_id}")
        print(f"Repository ID: {test_repository_id}")
        print(f"Session ID: {test_session_id}")
        # Test each function
        print("\n1. Testing retrieve_all_module_analyses...")
        modules = await retrieve_all_module_analyses(test_run_id, test_repository_id)
        print(f"   ✓ Found {len(modules)} modules")
        print("\n2. Testing retrieve_synthesis_analysis...")
        synthesis = await retrieve_synthesis_analysis(test_run_id, test_repository_id)
        if synthesis:
            print(f"   ✓ Found synthesis analysis")
        else:
            print(f"   ⚠️ No synthesis analysis found (expected for test)")
        print("\n3. Testing retrieve_cumulative_analysis_state...")
        state = await retrieve_cumulative_analysis_state(test_run_id, test_repository_id, test_session_id)
        if state:
            print(f"   ✓ Found cumulative analysis state")
        else:
            print(f"   ⚠️ No cumulative analysis state found (expected for test)")
        print("\n4. Testing retrieve_all_findings...")
        findings = await retrieve_all_findings(test_run_id)
        print(f"   ✓ Found findings for {len(findings)} modules")
        print("\n5. Testing retrieve_all_metrics...")
        metrics = await retrieve_all_metrics(test_run_id)
        print(f"   ✓ Found metrics for {len(metrics)} modules")
        print("\n6. Testing retrieve_comprehensive_report_context...")
        context = await retrieve_comprehensive_report_context(
            run_id=test_run_id,
            repository_id=test_repository_id,
            session_id=test_session_id
        )
        print(f"   ✓ Context retrieved:")
        print(f"     - Modules: {context.get('total_modules', 0)}")
        print(f"     - Findings: {context.get('total_findings', 0)}")
        print(f"     - Has synthesis: {bool(context.get('synthesis_analysis'))}")
        print(f"     - Has analysis state: {bool(context.get('analysis_state'))}")
        print("\n✅ All context retrieval tests passed!")
        return True
    except Exception as e:
        print(f"\n❌ Context retrieval test failed: {e}")
        import traceback
        traceback.print_exc()
        return False
 def test_pdf_method_exists():
    """Test that the new PDF method exists."""
    print("\n" + "=" * 60)
    print("Testing PDF Report Method")
    print("=" * 60)
    try:
        # Import using the same method as server.py
        import sys
        import importlib.util
        spec = importlib.util.spec_from_file_location("ai_analyze", "ai-analyze.py")
        ai_analyze_module = importlib.util.module_from_spec(spec)
        sys.modules["ai_analyze"] = ai_analyze_module
        spec.loader.exec_module(ai_analyze_module)
        from ai_analyze import EnhancedGitHubAnalyzer
        print("✅ EnhancedGitHubAnalyzer imported successfully")
        # Check if new method exists
        if hasattr(EnhancedGitHubAnalyzer, 'create_multi_level_pdf_report'):
            print("✅ create_multi_level_pdf_report method exists")
            # Check method signature
            import inspect
            sig = inspect.signature(EnhancedGitHubAnalyzer.create_multi_level_pdf_report)
            params = list(sig.parameters.keys())
            print(f"   Method parameters: {', '.join(params)}")
            if 'comprehensive_context' in params:
                print("   ✓ comprehensive_context parameter exists")
            if 'output_path' in params:
                print("   ✓ output_path parameter exists")
            if 'repository_id' in params:
                print("   ✓ repository_id parameter exists")
            if 'run_id' in params:
                print("   ✓ run_id parameter exists")
            return True
        else:
            print("❌ create_multi_level_pdf_report method not found")
            return False
    except Exception as e:
        print(f"❌ PDF method test failed: {e}")
        import traceback
        traceback.print_exc()
        return False
 def test_database_tables():
    """Test that database tables exist."""
    print("\n" + "=" * 60)
    print("Testing Database Tables")
    print("=" * 60)
    try:
        import psycopg2
        from dotenv import load_dotenv
        load_dotenv()
        conn = psycopg2.connect(
            host=os.getenv('POSTGRES_HOST', 'localhost'),
            port=os.getenv('POSTGRES_PORT', '5432'),
            database=os.getenv('POSTGRES_DB', 'dev_pipeline'),
            user=os.getenv('POSTGRES_USER', 'pipeline_admin'),
            password=os.getenv('POSTGRES_PASSWORD', 'secure_pipeline_2024')
        )
        cursor = conn.cursor()
        # Check each table
        tables_to_check = ['findings', 'metrics', 'report_sections', 'analysis_runs']
        for table_name in tables_to_check:
            cursor.execute(f"""
                SELECT COUNT(*) 
                FROM information_schema.tables 
                WHERE table_schema = 'public' 
                AND table_name = %s
            """, (table_name,))
            exists = cursor.fetchone()[0] > 0
            if exists:
                # Get row count
                cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
                count = cursor.fetchone()[0]
                print(f"✅ Table '{table_name}' exists ({count} rows)")
            else:
                print(f"❌ Table '{table_name}' does not exist")
                return False
        cursor.close()
        conn.close()
        print("\n✅ All database tables verified!")
        return True
    except Exception as e:
        print(f"❌ Database test failed: {e}")
        import traceback
        traceback.print_exc()
        return False
 async def run_all_tests():
    """Run all tests."""
    print("\n" + "=" * 60)
    print("MULTI-LEVEL REPORT IMPLEMENTATION TEST SUITE")
    print("=" * 60)
    results = []
    # Test 1: Database tables
    results.append(("Database Tables", test_database_tables()))
    # Test 2: PDF method exists
    results.append(("PDF Method", test_pdf_method_exists()))
    # Test 3: Context retrieval
    results.append(("Context Retrieval", await test_context_retrieval()))
    # Summary
    print("\n" + "=" * 60)
    print("TEST SUMMARY")
    print("=" * 60)
    passed = 0
    failed = 0
    for test_name, result in results:
        status = "✅ PASSED" if result else "❌ FAILED"
        print(f"{test_name}: {status}")
        if result:
            passed += 1
        else:
            failed += 1
    print(f"\nTotal: {passed} passed, {failed} failed out of {len(results)} tests")
    if failed == 0:
        print("\n✅ All tests passed! Implementation is ready.")
        return True
    else:
        print(f"\n⚠️ {failed} test(s) failed. Please review the errors above.")
        return False
 if __name__ == "__main__":
    success = asyncio.run(run_all_tests())
    sys.exit(0 if success else 1)
--- a/services/ai-analysis-service/test_progressive_context.py
+++ b/services/ai-analysis-service/test_progressive_context.py
@ -1,309 +0,0 @@
 #!/usr/bin/env python3
 """
 Test script for progressive context implementation.
 Tests the logic without requiring actual API calls or database connections.
 """
 import sys
 from pathlib import Path
 from typing import Dict, List, Tuple
 # Add current directory to path
 sys.path.insert(0, str(Path(__file__).parent))
 # Import the functions we need to test
 from server import (
    build_context_from_state,
    update_state_with_findings,
    create_intelligent_chunks,
    build_intelligent_chunk_prompt
 )
 # Mock FileAnalysis class
 class MockFileAnalysis:
    def __init__(self, path, severity_score, issues_found=None, complexity_score=5.0):
        self.path = path
        self.severity_score = severity_score
        self.issues_found = issues_found or []
        self.complexity_score = complexity_score
        self.language = "javascript"
        self.lines_of_code = 100
        self.recommendations = []
        self.detailed_analysis = "Mock analysis"
 def test_build_context_from_state():
    """Test building context from analysis state."""
    print("=" * 60)
    print("TEST 1: build_context_from_state()")
    print("=" * 60)
    # Create analysis state with progressive data
    analysis_state = {
        'modules_analyzed': ['project_overview', 'authentication'],
        'project_overview': 'Node.js e-commerce platform with Express backend and React frontend',
        'module_summaries': {
            'project_overview': 'Modern e-commerce platform with microservices architecture',
            'authentication': 'JWT-based authentication with rate limiting missing'
        },
        'architecture_patterns': ['MVC', 'Service Layer'],
        'critical_issues': [
            {'module': 'authentication', 'issue': 'Missing rate limiting on auth endpoints'}
        ],
        'tech_stack': {
            'frontend': 'React',
            'backend': 'Node.js',
            'database': 'PostgreSQL'
        },
        'dependency_context': {
            'chunk_001': 'Project overview and setup',
            'chunk_002': 'Authentication module with JWT'
        }
    }
    # Test chunk (products module)
    current_chunk = {
        'name': 'products',
        'id': 'chunk_003',
        'chunk_type': 'module',
        'context_dependencies': ['chunk_001', 'chunk_002']
    }
    context = build_context_from_state(analysis_state, current_chunk)
    print("\n✅ Generated context:")
    print(context)
    print()
    # Verify context contains expected sections
    assert "PROJECT OVERVIEW" in context, "Context should include project overview"
    assert "PREVIOUSLY ANALYZED MODULES" in context, "Context should include module summaries"
    assert "ARCHITECTURE PATTERNS" in context, "Context should include architecture patterns"
    assert "CRITICAL ISSUES" in context, "Context should include critical issues"
    assert "TECH STACK" in context, "Context should include tech stack"
    assert "DEPENDENCY CONTEXT" in context, "Context should include dependency context"
    print("✅ All context sections present!")
    return True
 def test_update_state_with_findings():
    """Test updating analysis state with new findings."""
    print("\n" + "=" * 60)
    print("TEST 2: update_state_with_findings()")
    print("=" * 60)
    # Initial state
    analysis_state = {
        'modules_analyzed': ['project_overview'],
        'module_summaries': {
            'project_overview': 'Node.js e-commerce platform'
        },
        'architecture_patterns': [],
        'critical_issues': [],
        'dependency_context': {}
    }
    # New chunk analysis
    chunk = {
        'name': 'authentication',
        'id': 'chunk_002',
        'chunk_type': 'module'
    }
    chunk_analysis = {
        'module_overview': 'JWT-based authentication module with rate limiting missing',
        'module_architecture': 'Uses MVC pattern with Service Layer for business logic',
        'module_quality_score': 6.5
    }
    # Mock file analyses
    file_analyses = [
        MockFileAnalysis('auth.controller.js', 7.0, ['No rate limiting']),
        MockFileAnalysis('auth.service.js', 8.0),
        MockFileAnalysis('auth.middleware.js', 4.0, ['Weak validation'])  # Low quality
    ]
    # Update state
    updated_state = update_state_with_findings(analysis_state.copy(), chunk, chunk_analysis, file_analyses)
    print("\n✅ Updated state:")
    print(f"  Modules analyzed: {updated_state.get('modules_analyzed', [])}")
    print(f"  Architecture patterns: {updated_state.get('architecture_patterns', [])}")
    print(f"  Critical issues: {len(updated_state.get('critical_issues', []))}")
    print(f"  Module summaries: {list(updated_state.get('module_summaries', {}).keys())}")
    print()
    # Verify updates
    assert 'authentication' in updated_state['modules_analyzed'], "Authentication should be in modules_analyzed"
    assert 'MVC' in updated_state['architecture_patterns'], "MVC pattern should be detected"
    assert 'Service Layer' in updated_state['architecture_patterns'], "Service Layer pattern should be detected"
    assert len(updated_state['critical_issues']) > 0, "Critical issues should be added"
    assert 'authentication' in updated_state['module_summaries'], "Module summary should be stored"
    print("✅ State updated correctly!")
    return True
 def test_progressive_context_flow():
    """Test the complete progressive context flow."""
    print("\n" + "=" * 60)
    print("TEST 3: Progressive Context Flow (Simulated)")
    print("=" * 60)
    # Simulate chunk processing flow
    test_files = [
        ("README.md", "# Project\n\nNode.js e-commerce platform"),
        ("package.json", '{"name": "ecommerce", "dependencies": {"express": "^4.0"}}'),
        ("src/auth/auth.controller.js", "export class AuthController {}"),
        ("src/auth/auth.service.js", "export class AuthService {}"),
        ("src/products/product.controller.js", "export class ProductController {}"),
    ]
    # Create chunks
    chunks = create_intelligent_chunks(test_files)
    print(f"\n✅ Created {len(chunks)} chunks:")
    for chunk in chunks:
        print(f"  - {chunk['name']} ({chunk['chunk_type']}): {len(chunk['files'])} files")
    # Simulate progressive analysis
    analysis_state = {}
    print("\n📊 Simulating progressive analysis:")
    for i, chunk in enumerate(chunks, 1):
        chunk_name = chunk['name']
        print(f"\n  Chunk {i}: {chunk_name}")
        # Build context (what would be used in prompt)
        context = build_context_from_state(analysis_state, chunk)
        if context:
            print(f"    📚 Context available: {len(context)} chars")
        else:
            print(f"    📚 No context (first chunk)")
        # Simulate chunk analysis results
        chunk_analysis = {
            'module_overview': f"Analysis of {chunk_name} module",
            'module_architecture': 'MVC pattern' if chunk_name != 'project_overview' else 'Node.js setup',
            'module_quality_score': 7.5
        }
        # Mock file analyses
        file_analyses = [
            MockFileAnalysis(f"{chunk_name}_file{i}.js", 7.0 + i*0.1)
            for i in range(len(chunk['files']))
        ]
        # Update state
        analysis_state = update_state_with_findings(analysis_state.copy(), chunk, chunk_analysis, file_analyses)
        print(f"    ✅ State updated: {len(analysis_state.get('modules_analyzed', []))} modules analyzed")
        if analysis_state.get('architecture_patterns'):
            print(f"    📐 Patterns: {', '.join(analysis_state.get('architecture_patterns', []))}")
    print("\n📊 Final Analysis State:")
    print(f"  Modules analyzed: {', '.join(analysis_state.get('modules_analyzed', []))}")
    print(f"  Architecture patterns: {', '.join(analysis_state.get('architecture_patterns', []))}")
    print(f"  Critical issues: {len(analysis_state.get('critical_issues', []))}")
    print(f"  Module summaries: {len(analysis_state.get('module_summaries', {}))}")
    # Verify final state
    assert len(analysis_state.get('modules_analyzed', [])) == len(chunks), "All chunks should be analyzed"
    assert len(analysis_state.get('architecture_patterns', [])) > 0, "Patterns should be detected"
    print("\n✅ Progressive context flow working correctly!")
    return True
 def test_prompt_includes_context():
    """Test that prompts include progressive context."""
    print("\n" + "=" * 60)
    print("TEST 4: Prompt Includes Progressive Context")
    print("=" * 60)
    # Create analysis state
    analysis_state = {
        'modules_analyzed': ['project_overview', 'authentication'],
        'project_overview': 'Node.js platform',
        'module_summaries': {
            'authentication': 'JWT auth module'
        },
        'architecture_patterns': ['MVC'],
        'critical_issues': [
            {'module': 'authentication', 'issue': 'Missing rate limiting'}
        ],
        'tech_stack': {'backend': 'Node.js'}
    }
    # Test chunk
    chunk = {
        'name': 'products',
        'chunk_type': 'module',
        'files': [('product.controller.js', 'export class ProductController {}')]
    }
    # Build prompt
    prompt = build_intelligent_chunk_prompt(chunk, analysis_state)
    print("\n✅ Generated prompt (first 500 chars):")
    print(prompt[:500])
    print("...")
    print()
    # Verify prompt includes context
    assert "CONTEXT FROM PREVIOUS ANALYSIS" in prompt, "Prompt should include context section"
    assert "PROJECT OVERVIEW" in prompt, "Prompt should include project overview"
    assert "PREVIOUSLY ANALYZED MODULES" in prompt, "Prompt should include module summaries"
    assert "ARCHITECTURE PATTERNS" in prompt, "Prompt should include architecture patterns"
    assert "CRITICAL ISSUES" in prompt, "Prompt should include critical issues"
    print("✅ Prompt includes all context sections!")
    # Test without context (first chunk)
    prompt_no_context = build_intelligent_chunk_prompt(chunk, None)
    assert "CONTEXT FROM PREVIOUS ANALYSIS" not in prompt_no_context, "First chunk should not have context"
    print("✅ Prompt correctly omits context for first chunk!")
    return True
 def run_all_tests():
    """Run all tests."""
    print("\n" + "=" * 60)
    print("PROGRESSIVE CONTEXT - COMPREHENSIVE TEST SUITE")
    print("=" * 60)
    try:
        # Test 1: Context building
        test_build_context_from_state()
        # Test 2: State updates
        test_update_state_with_findings()
        # Test 3: Complete flow
        test_progressive_context_flow()
        # Test 4: Prompt generation
        test_prompt_includes_context()
        print("\n" + "=" * 60)
        print("✅ ALL TESTS PASSED!")
        print("=" * 60)
        print("\n📊 Summary:")
        print("  • Context building: ✅")
        print("  • State updates: ✅")
        print("  • Progressive flow: ✅")
        print("  • Prompt generation: ✅")
        print("\n🎉 Progressive context implementation is working correctly!")
        return True
    except Exception as e:
        print("\n" + "=" * 60)
        print(f"❌ TEST FAILED: {e}")
        print("=" * 60)
        import traceback
        traceback.print_exc()
        return False
 if __name__ == "__main__":
    success = run_all_tests()
    sys.exit(0 if success else 1)