CP_Assessment_engine/scripts/investigate_persona_issues.py

"""
Deep investigation of merged_personas.xlsx issues
"""
import pandas as pd
from pathlib import Path
import sys
import io

if sys.platform == 'win32':
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

BASE_DIR = Path(__file__).resolve().parent.parent

def investigate():
    df = pd.read_excel(BASE_DIR / "data" / "merged_personas.xlsx", engine='openpyxl')

    print("=" * 80)
    print("🔍 DEEP INVESTIGATION: merged_personas.xlsx Issues")
    print("=" * 80)

    # Check Current Grade/Class vs Class_DB
    print("\n1. GRADE/CLASS COLUMN ANALYSIS:")
    print("-" * 80)

    if 'Current Grade/Class' in df.columns and 'Class_DB' in df.columns:
        print("   Comparing 'Current Grade/Class' vs 'Class_DB':")

        # Check if they match
        matches = (df['Current Grade/Class'].astype(str) == df['Class_DB'].astype(str)).sum()
        total = len(df)
        mismatches = total - matches

        print(f"   Matching rows: {matches}/{total}")
        print(f"   Mismatches: {mismatches}")

        if mismatches > 0:
            print(f"   ⚠️ MISMATCH FOUND - Showing sample mismatches:")
            mismatched = df[df['Current Grade/Class'].astype(str) != df['Class_DB'].astype(str)]
            for idx, row in mismatched.head(5).iterrows():
                print(f"      Row {idx}: '{row['Current Grade/Class']}' vs '{row['Class_DB']}'")
        else:
            print(f"   ✅ Columns match perfectly - 'Class_DB' is redundant")

    # Check Section vs Section_DB
    print("\n2. SECTION COLUMN ANALYSIS:")
    print("-" * 80)

    if 'Section' in df.columns and 'Section_DB' in df.columns:
        matches = (df['Section'].astype(str) == df['Section_DB'].astype(str)).sum()
        total = len(df)
        mismatches = total - matches

        print(f"   Matching rows: {matches}/{total}")
        print(f"   Mismatches: {mismatches}")

        if mismatches > 0:
            print(f"   ⚠️ MISMATCH FOUND")
        else:
            print(f"   ✅ Columns match perfectly - 'Section_DB' is redundant")

    # Check Nationality and Native State
    print("\n3. NATIONALITY/NATIVE STATE ANALYSIS:")
    print("-" * 80)

    if 'Nationality' in df.columns:
        unique_nationality = df['Nationality'].nunique()
        print(f"   Nationality unique values: {unique_nationality}")
        if unique_nationality == 1:
            print(f"   ⚠️ All students have same nationality: {df['Nationality'].iloc[0]}")
            print(f"   ⚠️ This may be intentional but could be flagged by client")

    if 'Native State' in df.columns:
        unique_state = df['Native State'].nunique()
        print(f"   Native State unique values: {unique_state}")
        if unique_state == 1:
            print(f"   ⚠️ All students from same state: {df['Native State'].iloc[0]}")
            print(f"   ⚠️ This may be intentional but could be flagged by client")

    # Check for other potential issues
    print("\n4. OTHER POTENTIAL ISSUES:")
    print("-" * 80)

    # Check for empty columns
    empty_cols = []
    for col in df.columns:
        non_null = df[col].notna().sum()
        if non_null == 0:
            empty_cols.append(col)

    if empty_cols:
        print(f"   ⚠️ EMPTY COLUMNS: {empty_cols}")
    else:
        print(f"   ✅ No completely empty columns")

    # Check for columns with mostly empty values
    mostly_empty = []
    for col in df.columns:
        non_null_pct = (df[col].notna().sum() / len(df)) * 100
        if non_null_pct < 10 and non_null_pct > 0:
            mostly_empty.append((col, non_null_pct))

    if mostly_empty:
        print(f"   ⚠️ MOSTLY EMPTY COLUMNS (<10% filled):")
        for col, pct in mostly_empty:
            print(f"      {col}: {pct:.1f}% filled")

    # Recommendations
    print("\n" + "=" * 80)
    print("💡 RECOMMENDATIONS:")
    print("=" * 80)

    recommendations = []

    if 'Class_DB' in df.columns and 'Current Grade/Class' in df.columns:
        if (df['Current Grade/Class'].astype(str) == df['Class_DB'].astype(str)).all():
            recommendations.append("Remove 'Class_DB' column (duplicate of 'Current Grade/Class')")

    if 'Section_DB' in df.columns and 'Section' in df.columns:
        if (df['Section'].astype(str) == df['Section_DB'].astype(str)).all():
            recommendations.append("Remove 'Section_DB' column (duplicate of 'Section')")

    if 'Nationality' in df.columns and df['Nationality'].nunique() == 1:
        recommendations.append("Review 'Nationality' column - all students have same value (may be intentional)")

    if 'Native State' in df.columns and df['Native State'].nunique() == 1:
        recommendations.append("Review 'Native State' column - all students from same state (may be intentional)")

    if recommendations:
        for i, rec in enumerate(recommendations, 1):
            print(f"   {i}. {rec}")
    else:
        print("   ✅ No critical issues requiring action")

    print("=" * 80)

if __name__ == "__main__":
    investigate()