CP_Assessment_engine/scripts/cleanup_merged_personas.py

"""
Clean up merged_personas.xlsx for client delivery
Removes redundant columns and ensures data quality
"""
import pandas as pd
from pathlib import Path
import sys
import io

if sys.platform == 'win32':
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

BASE_DIR = Path(__file__).resolve().parent.parent

def cleanup_merged_personas():
    """Clean up merged_personas.xlsx for client delivery"""
    print("=" * 80)
    print("🧹 CLEANING UP: merged_personas.xlsx for Client Delivery")
    print("=" * 80)

    file_path = BASE_DIR / "data" / "merged_personas.xlsx"
    backup_path = BASE_DIR / "data" / "merged_personas_backup.xlsx"

    if not file_path.exists():
        print("❌ FILE NOT FOUND")
        return False

    # Create backup
    print("\n📦 Creating backup...")
    df_original = pd.read_excel(file_path, engine='openpyxl')
    df_original.to_excel(backup_path, index=False)
    print(f"   ✅ Backup created: {backup_path.name}")

    # Load data
    df = df_original.copy()

    print(f"\n📊 Original file: {len(df)} rows, {len(df.columns)} columns")

    # Columns to remove (redundant/DB-derived)
    columns_to_remove = []

    # Remove Class_DB if it matches Current Grade/Class
    if 'Class_DB' in df.columns and 'Current Grade/Class' in df.columns:
        if (df['Class_DB'].astype(str) == df['Current Grade/Class'].astype(str)).all():
            columns_to_remove.append('Class_DB')
            print(f"   🗑️ Removing 'Class_DB' (duplicate of 'Current Grade/Class')")

    # Remove Section_DB if it matches Section
    if 'Section_DB' in df.columns and 'Section' in df.columns:
        if (df['Section_DB'].astype(str) == df['Section'].astype(str)).all():
            columns_to_remove.append('Section_DB')
            print(f"   🗑️ Removing 'Section_DB' (duplicate of 'Section')")

    # Remove SchoolCode_DB if School Code exists
    if 'SchoolCode_DB' in df.columns and 'School Code' in df.columns:
        if (df['SchoolCode_DB'].astype(str) == df['School Code'].astype(str)).all():
            columns_to_remove.append('SchoolCode_DB')
            print(f"   🗑️ Removing 'SchoolCode_DB' (duplicate of 'School Code')")

    # Remove SchoolName_DB if School Name exists
    if 'SchoolName_DB' in df.columns and 'School Name' in df.columns:
        if (df['SchoolName_DB'].astype(str) == df['School Name'].astype(str)).all():
            columns_to_remove.append('SchoolName_DB')
            print(f"   🗑️ Removing 'SchoolName_DB' (duplicate of 'School Name')")

    # Remove columns
    if columns_to_remove:
        df = df.drop(columns=columns_to_remove)
        print(f"\n   ✅ Removed {len(columns_to_remove)} redundant columns")
    else:
        print(f"\n   ℹ️ No redundant columns found to remove")

    # Final validation
    print(f"\n📊 Cleaned file: {len(df)} rows, {len(df.columns)} columns")

    # Verify critical columns still present
    critical_cols = ['StudentCPID', 'First Name', 'Last Name', 'Age', 'Age Category']
    missing = [c for c in critical_cols if c not in df.columns]
    if missing:
        print(f"   ❌ ERROR: Removed critical columns: {missing}")
        return False

    # Save cleaned file
    print(f"\n💾 Saving cleaned file...")
    df.to_excel(file_path, index=False)
    print(f"   ✅ Cleaned file saved")

    print(f"\n" + "=" * 80)
    print(f"✅ CLEANUP COMPLETE")
    print(f"   Removed: {len(columns_to_remove)} redundant columns")
    print(f"   Final columns: {len(df.columns)}")
    print(f"   Backup saved: {backup_path.name}")
    print("=" * 80)

    return True

if __name__ == "__main__":
    success = cleanup_merged_personas()
    sys.exit(0 if success else 1)