# CP_Assessment_engine/scripts/comprehensive_quality_check.py

"""
Comprehensive Quality Check - 100% Verification
Checks completion, data quality, schema accuracy, and completeness
"""
import pandas as pd
from pathlib import Path
import sys
import io

# Fix Windows console encoding so the emoji status markers used in the report
# can be printed. reconfigure() changes the encoding of the existing stream in
# place, so its buffering mode is kept; wrapping sys.stdout.buffer in a second
# TextIOWrapper is only used as a fallback on interpreters without it.
if sys.platform == 'win32':
    if hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8')
    elif hasattr(sys.stdout, 'buffer'):
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# Project layout, resolved relative to this script (two levels up).
BASE_DIR = Path(__file__).resolve().parent.parent
OUTPUT_DIR = BASE_DIR / "output" / "full_run"
DATA_DIR = BASE_DIR / "data"
QUESTIONS_FILE = DATA_DIR / "AllQuestions.xlsx"

# Expected counts
EXPECTED_ADOLESCENTS = 1507
EXPECTED_ADULTS = 1493
EXPECTED_DOMAINS = 5
EXPECTED_COGNITION_TESTS = 12

def load_questions():
    """Read the question bank and count the questions per domain and age group.

    Returns:
        A ``(counts, bank)`` tuple. ``counts`` maps ``"<domain>_<age-group>"``
        to the number of rows with that combination and ``bank`` is the full
        table read from ``QUESTIONS_FILE``. If anything goes wrong while
        reading or counting, a warning is printed and ``({}, empty DataFrame)``
        is returned instead.
    """
    try:
        bank = pd.read_excel(QUESTIONS_FILE, engine='openpyxl')
        counts = {}
        for domain in bank['domain'].unique():
            # Age-group values of the rows that belong to this domain.
            ages = bank.loc[bank['domain'] == domain, 'age-group']
            for age_group in ages.unique():
                counts[f"{domain}_{age_group}"] = int((ages == age_group).sum())
    except Exception as e:
        print(f"⚠️ Error loading questions: {e}")
        return {}, pd.DataFrame()
    return counts, bank

def check_file_completeness(file_path, expected_rows, domain_name, age_group):
    """Check that a result workbook exists and is structurally complete.

    The checks run in this order and the first failure is reported:
    the file exists, its row count equals ``expected_rows``, it has a
    ``Student CPID`` or ``Participant`` column, that column has no NaN,
    and at least 95% of all cells are non-null.

    Args:
        file_path: Location of the ``.xlsx`` file, as ``str`` or ``Path``.
        expected_rows: Exact number of data rows the sheet must contain.
        domain_name: Not used by the checks; kept for call-site compatibility.
        age_group: Not used by the checks; kept for call-site compatibility.

    Returns:
        ``(passed, message)``. ``passed`` is ``False`` for every failed check
        (including low density, whose message carries a warning marker) and
        for any exception raised while reading or inspecting the file.
    """
    file_path = Path(file_path)  # accept plain string paths as well
    if not file_path.exists():
        return False, f"❌ MISSING: {file_path.name}"

    try:
        df = pd.read_excel(file_path, engine='openpyxl')
        actual_rows = len(df)

        if actual_rows != expected_rows:
            return False, f"❌ ROW COUNT MISMATCH: Expected {expected_rows}, got {actual_rows}"

        # 'Student CPID' takes precedence when both ID columns are present.
        id_col = next(
            (name for name in ('Student CPID', 'Participant') if name in df.columns),
            None,
        )
        if id_col is None:
            return False, "❌ MISSING ID COLUMN: No Student CPID or Participant column"

        nan_count = df[id_col].isna().sum()
        if nan_count > 0:
            return False, f"❌ {nan_count} NaN values in ID column"

        # Data density: percentage of non-null cells over the whole sheet.
        total_cells = len(df) * len(df.columns)
        null_cells = df.isnull().sum().sum()
        if total_cells == 0:
            # A sheet without data rows has no cell that could be null; report
            # it as fully dense instead of failing on a division by zero.
            density = 100.0
        else:
            density = ((total_cells - null_cells) / total_cells) * 100

        if density < 95:
            return False, f"⚠️ LOW DATA DENSITY: {density:.2f}% (expected >95%)"

        return True, f"✅ {actual_rows} rows, {density:.2f}% density"
    except Exception as e:
        # Unreadable or malformed workbooks are reported, not raised, so the
        # caller can continue with the remaining files.
        return False, f"❌ ERROR: {str(e)}"

def check_question_completeness(file_path, domain_name, age_group, questions_df):
    """Check that a workbook has exactly one column per expected question.

    "Answered" here means that a column named after the question code exists;
    the cell values themselves are not inspected.

    Args:
        file_path: Location of the ``.xlsx`` result file.
        domain_name: Value to match in the ``domain`` column of the bank.
        age_group: Value to match in the ``age-group`` column of the bank.
        questions_df: Question bank with ``domain``, ``age-group`` and
            ``code`` columns.

    Returns:
        ``(passed, message)``. The check fails when the bank defines no
        question for the domain/age group, when expected codes have no
        column, when non-metadata columns are not expected codes, or when
        any exception is raised along the way.
    """
    try:
        # Expected question codes for this domain/age; blank codes are ignored
        # so they do not turn into a bogus "nan" question.
        in_scope = questions_df[
            (questions_df['domain'] == domain_name) &
            (questions_df['age-group'] == age_group)
        ]
        expected_q_codes = set(in_scope['code'].dropna().astype(str))

        # Validate the bank before touching the workbook: with nothing to
        # compare against, the check would otherwise pass vacuously.
        if not expected_q_codes:
            return False, f"❌ NO QUESTIONS DEFINED for {domain_name} ({age_group})"

        df = pd.read_excel(file_path, engine='openpyxl')

        # Question columns are all columns minus metadata. Labels are compared
        # as strings because the expected codes are strings too.
        metadata_cols = {'Student CPID', 'Participant', 'Name', 'Age', 'Gender', 'Age Category'}
        answered_q_codes = {str(col) for col in df.columns} - metadata_cols

        missing = expected_q_codes - answered_q_codes
        extra = answered_q_codes - expected_q_codes

        if missing:
            return False, f"❌ MISSING QUESTIONS: {len(missing)} questions not answered"
        if extra:
            return False, f"⚠️ EXTRA QUESTIONS: {len(extra)} unexpected columns"

        return True, f"✅ All {len(expected_q_codes)} questions answered"
    except Exception as e:
        return False, f"❌ ERROR checking questions: {str(e)}"

# Domains checked for every cohort. The workbook name is the domain name with
# spaces replaced by underscores plus the age group,
# e.g. "Emotional_Intelligence_14-17.xlsx".
_DOMAIN_NAMES = (
    'Personality',
    'Grit',
    'Emotional Intelligence',
    'Vocational Interest',
    'Learning Strategies',
)


def _check_domain_files(cohort, questions_df):
    """Check and report the domain workbooks of one cohort; True if all passed."""
    cohort_ok = True
    for domain in _DOMAIN_NAMES:
        file_name = f"{domain.replace(' ', '_')}_{cohort['age_group']}.xlsx"
        file_path = OUTPUT_DIR / cohort['folder'] / "5_domain" / file_name
        passed, msg = check_file_completeness(
            file_path, cohort['expected_rows'], domain, cohort['age_group']
        )
        print(f"  {domain:30} {msg}")
        if not passed:
            cohort_ok = False
            continue  # the question check needs a structurally valid file

        if questions_df.empty:
            continue  # no question bank to compare against
        q_passed, q_msg = check_question_completeness(
            file_path, domain, cohort['age_group'], questions_df
        )
        print(f"    {q_msg}")
        if not q_passed:
            cohort_ok = False
    return cohort_ok


def _check_cognition_file(cohort, test):
    """Check and report one cognition workbook; a missing file counts as skipped."""
    file_path = (
        OUTPUT_DIR / cohort['folder'] / "cognition" / f"{test}_{cohort['age_group']}.xlsx"
    )
    if not file_path.exists():
        print(f"  {cohort['label']:10} {test:35} ⏭️ SKIPPED (not generated)")
        return True
    passed, msg = check_file_completeness(
        file_path, cohort['expected_rows'], test, cohort['age_group']
    )
    print(f"  {cohort['label']:10} {test:35} {msg}")
    return passed


def main():
    """Run all quality checks, print a report and return the overall verdict.

    For each cohort (adolescents, adults) the five domain workbooks are
    checked for structural completeness and, when the question bank could be
    loaded, for question completeness. Afterwards every cognition workbook
    that exists is checked; cognition workbooks that were not generated are
    reported as skipped and do not fail the run.

    Returns:
        True if no executed check failed, False otherwise.
    """
    print("=" * 80)
    print("🔍 COMPREHENSIVE QUALITY CHECK - 100% VERIFICATION")
    print("=" * 80)
    print()

    # Only the raw question table is needed; the per-domain counts are not.
    _, questions_df = load_questions()
    if questions_df.empty:
        # Make the reduced coverage visible instead of skipping silently.
        print("⚠️ Question bank unavailable - question completeness checks will be skipped")
        print()

    # Each cohort carries its own age group, so no file name depends on a
    # variable left over from an earlier loop. 'adolescense' (sic) is the
    # folder name the output paths use.
    cohorts = (
        {'heading': 'ADOLESCENTS', 'label': 'Adolescent', 'folder': 'adolescense',
         'age_group': '14-17', 'expected_rows': EXPECTED_ADOLESCENTS},
        {'heading': 'ADULTS', 'label': 'Adult', 'folder': 'adults',
         'age_group': '18-23', 'expected_rows': EXPECTED_ADULTS},
    )

    all_passed = True

    # Domain workbooks, one section per cohort
    for cohort in cohorts:
        print(f"📊 {cohort['heading']} ({cohort['age_group']}) - 5 DOMAINS")
        print("-" * 80)
        if not _check_domain_files(cohort, questions_df):
            all_passed = False
        print()

    # Cognition workbooks, both cohorts listed per test
    print("🧠 COGNITION TESTS")
    print("-" * 80)
    cognition_tests = [
        'Cognitive_Flexibility_Test', 'Color_Stroop_Task',
        'Problem_Solving_Test_MRO', 'Problem_Solving_Test_MR',
        'Problem_Solving_Test_NPS', 'Problem_Solving_Test_SBDM',
        'Reasoning_Tasks_AR', 'Reasoning_Tasks_DR', 'Reasoning_Tasks_NR',
        'Response_Inhibition_Task', 'Sternberg_Working_Memory_Task',
        'Visual_Paired_Associates_Test'
    ]

    for test in cognition_tests:
        for cohort in cohorts:
            if not _check_cognition_file(cohort, test):
                all_passed = False

    print()
    print("=" * 80)

    # Summary
    if all_passed:
        print("✅ ALL CHECKS PASSED - 100% COMPLETE AND ACCURATE")
    else:
        print("❌ SOME CHECKS FAILED - REVIEW REQUIRED")

    print("=" * 80)

    # Totals: domain files are a fixed number, cognition files are counted on
    # disk because not every test has to be generated.
    total_domain_files = len(_DOMAIN_NAMES) * len(cohorts)
    cognition_file_count = sum(
        1
        for cohort in cohorts
        for _ in (OUTPUT_DIR / cohort['folder'] / 'cognition').glob('*.xlsx')
    )

    print()
    print("📈 SUMMARY STATISTICS")
    print("-" * 80)
    print(f"Total Domain Files: {total_domain_files}")
    print(f"Total Cognition Files: {cognition_file_count}")
    print(f"Adolescent Students: {EXPECTED_ADOLESCENTS}")
    print(f"Adult Students: {EXPECTED_ADULTS}")
    print(f"Total Students: {EXPECTED_ADOLESCENTS + EXPECTED_ADULTS}")

    return all_passed

if __name__ == "__main__":
    # Exit status 0 when every check passed, 1 when at least one failed.
    if main():
        sys.exit(0)
    sys.exit(1)