CP_Assessment_engine/scripts/cleanup_merged_personas.py
2026-02-10 12:59:40 +05:30

100 lines
3.7 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
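Clean up merged_personas.xlsx for client delivery.

Removes redundant DB-derived columns (those that merely duplicate a primary
column) and verifies that the critical identity columns are still present
before the cleaned workbook is saved.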
"""
import pandas as pd
from pathlib import Path
import sys
import io
if sys.platform == 'win32':
    # Windows consoles frequently default to a legacy code page that cannot
    # encode the emoji printed by this script, so force UTF-8 output.
    # Prefer reconfiguring the existing stream in place: wrapping
    # sys.stdout.buffer in a second TextIOWrapper leaves two wrappers owning
    # the same underlying buffer and fails outright when stdout has been
    # replaced by an object without a ``buffer`` attribute (or is None).
    if hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8')
    elif hasattr(sys.stdout, 'buffer'):
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
# Project root: the directory one level above the folder containing this script.
_SCRIPT_DIR = Path(__file__).resolve().parent
BASE_DIR = _SCRIPT_DIR.parent
def cleanup_merged_personas(file_path=None, backup_path=None) -> bool:
    """Clean up merged_personas.xlsx for client delivery.

    Makes a backup copy of the workbook, drops every ``*_DB`` column whose
    values (compared as strings) are identical to its primary counterpart,
    checks that the critical identity columns are present, and then
    overwrites the workbook with the cleaned data. Progress is printed to
    stdout.

    Args:
        file_path: Workbook to clean. Defaults to
            ``BASE_DIR / "data" / "merged_personas.xlsx"``.
        backup_path: Destination of the backup copy. Defaults to a sibling of
            ``file_path`` named ``<stem>_backup<suffix>`` (i.e.
            ``merged_personas_backup.xlsx`` for the default workbook).

    Returns:
        True when the cleaned workbook was saved; False when the workbook
        does not exist or a critical column is absent (nothing is
        overwritten in either case, although the backup may already exist).
    """
    # Function-scope import keeps this block self-contained.
    import shutil

    print("=" * 80)
    print("🧹 CLEANING UP: merged_personas.xlsx for Client Delivery")
    print("=" * 80)

    if file_path is None:
        file_path = BASE_DIR / "data" / "merged_personas.xlsx"
    else:
        file_path = Path(file_path)
    if backup_path is None:
        backup_path = file_path.with_name(
            f"{file_path.stem}_backup{file_path.suffix}"
        )
    else:
        backup_path = Path(backup_path)

    if not file_path.exists():
        print("❌ FILE NOT FOUND")
        return False

    # Create backup as a byte-for-byte copy. Re-serialising the DataFrame
    # would silently drop additional sheets and cell formatting, so the
    # "backup" would not actually allow restoring the original workbook.
    print("\n📦 Creating backup...")
    shutil.copy2(file_path, backup_path)
    print(f" ✅ Backup created: {backup_path.name}")

    # Load data
    df = pd.read_excel(file_path, engine='openpyxl')
    print(f"\n📊 Original file: {len(df)} rows, {len(df.columns)} columns")

    # (DB-derived column, primary column it may duplicate)
    duplicate_pairs = (
        ('Class_DB', 'Current Grade/Class'),
        ('Section_DB', 'Section'),
        ('SchoolCode_DB', 'School Code'),
        ('SchoolName_DB', 'School Name'),
    )

    # A DB column is redundant only when BOTH columns exist and every row
    # matches after casting to str (so NaN == NaN, but 5 != 5.0).
    columns_to_remove = []
    for db_col, primary_col in duplicate_pairs:
        if db_col not in df.columns or primary_col not in df.columns:
            continue
        if (df[db_col].astype(str) == df[primary_col].astype(str)).all():
            columns_to_remove.append(db_col)
            print(f" 🗑️ Removing '{db_col}' (duplicate of '{primary_col}')")

    # Remove columns
    if columns_to_remove:
        df = df.drop(columns=columns_to_remove)
        print(f"\n ✅ Removed {len(columns_to_remove)} redundant columns")
    else:
        print("\n No redundant columns found to remove")

    # Final validation
    print(f"\n📊 Cleaned file: {len(df)} rows, {len(df.columns)} columns")

    # Verify critical columns are present before overwriting the workbook
    critical_cols = ['StudentCPID', 'First Name', 'Last Name', 'Age', 'Age Category']
    missing = [c for c in critical_cols if c not in df.columns]
    if missing:
        print(f" ❌ ERROR: Removed critical columns: {missing}")
        return False

    # Save cleaned file
    print("\n💾 Saving cleaned file...")
    df.to_excel(file_path, index=False)
    print(" ✅ Cleaned file saved")

    print("\n" + "=" * 80)
    print("✅ CLEANUP COMPLETE")
    print(f" Removed: {len(columns_to_remove)} redundant columns")
    print(f" Final columns: {len(df.columns)}")
    print(f" Backup saved: {backup_path.name}")
    print("=" * 80)
    return True
if __name__ == "__main__":
    # Translate the boolean outcome into a process exit status:
    # 0 when the cleanup succeeded, 1 otherwise.
    exit_code = 0 if cleanup_merged_personas() else 1
    sys.exit(exit_code)