# CP_Assessment_engine/scripts/audit_tool.py

import pandas as pd
from pathlib import Path
import sys
import io

# Force UTF-8 for output: replace sys.stdout with a new text wrapper around
# the same underlying binary buffer, encoding as UTF-8. The report lines
# printed by this script contain emoji, which a non-UTF-8 console encoding
# may fail to encode.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# Add root to sys.path: `root` is the directory two levels above this file
# (the parent of the folder that contains this script). It is appended so the
# `import config` below can be resolved -- presumably config.py lives in that
# directory; verify against the repository layout.
root = Path(__file__).resolve().parent.parent
sys.path.append(str(root))

import config

def audit_missing_only():
    base_dir = Path(r'C:\work\CP_Automation\Simulated_Assessment_Engine\output\dry_run')
    expected_domains = [
        'Learning_Strategies_{age}.xlsx',
        'Personality_{age}.xlsx',
        'Emotional_Intelligence_{age}.xlsx',
        'Vocational_Interest_{age}.xlsx',
        'Grit_{age}.xlsx'
    ]
    cognition_tests = config.COGNITION_TESTS

    issues = []

    for age_label, age_suffix in [('adolescense', '14-17'), ('adults', '18-23')]:
        # Survey
        domain_dir = base_dir / age_label / "5_domain"
        for d_tmpl in expected_domains:
            f_name = d_tmpl.format(age=age_suffix)
            f_path = domain_dir / f_name
            check_issue(f_path, age_label, "Survey", f_name, issues)

        # Cognition
        cog_dir = base_dir / age_label / "cognition"
        for c_test in cognition_tests:
            f_name = config.COGNITION_FILE_NAMES.get(c_test, f'{c_test}_{age_suffix}.xlsx').replace('{age}', age_suffix)
            f_path = cog_dir / f_name
            check_issue(f_path, age_label, "Cognition", c_test, issues)

    if not issues:
        print("🎉 NO ISSUES FOUND! 100% PERFECT.")
    else:
        print(f"❌ FOUND {len(issues)} ISSUES:")
        for iss in issues:
            print(f"  - {iss}")

def check_issue(path, age, category, name, issues):
    """Inspect one result workbook and record any problem in ``issues``.

    Args:
        path: ``Path`` of the Excel file to inspect.
        age: Age-group label used as the first field of the issue message.
        category: ``"Survey"`` selects the per-answer check; any other value
            gets the generic NaN check.
        name: File or test name used as the third field of the issue message.
        issues: List that issue strings are appended to (mutated in place).

    Only the first data row of the sheet is examined. Any exception raised
    while reading or checking the file is recorded as an ``ERROR`` issue
    instead of propagating.
    """
    if not path.exists():
        issues.append(f"{age} | {category} | {name}: MISSING")
        return

    try:
        frame = pd.read_excel(path)
        if len(frame.index) == 0:
            issues.append(f"{age} | {category} | {name}: EMPTY ROWS")
            return

        # Both checks look at the first row only (one student).
        first_row = frame.iloc[0]

        if category == "Survey":
            # Every column except 'Participant' is treated as an answer.
            answer_cols = [col for col in frame.columns if col != 'Participant']
            n_missing = first_row[answer_cols].isna().sum()
            if n_missing > 0:
                issues.append(f"{age} | {category} | {name}: {n_missing}/{len(answer_cols)} answers missing")
        elif first_row.isna().sum() > 0:
            issues.append(f"{age} | {category} | {name}: contains NaNs")

    except Exception as err:
        issues.append(f"{age} | {category} | {name}: ERROR {err}")

# Run the audit only when this file is executed as a script, not on import.
if __name__ == "__main__":
    audit_missing_only()