""" Clean up merged_personas.xlsx for client delivery Removes redundant columns and ensures data quality """ import pandas as pd from pathlib import Path import sys import io if sys.platform == 'win32': sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') BASE_DIR = Path(__file__).resolve().parent.parent def cleanup_merged_personas(): """Clean up merged_personas.xlsx for client delivery""" print("=" * 80) print("๐Ÿงน CLEANING UP: merged_personas.xlsx for Client Delivery") print("=" * 80) file_path = BASE_DIR / "data" / "merged_personas.xlsx" backup_path = BASE_DIR / "data" / "merged_personas_backup.xlsx" if not file_path.exists(): print("โŒ FILE NOT FOUND") return False # Create backup print("\n๐Ÿ“ฆ Creating backup...") df_original = pd.read_excel(file_path, engine='openpyxl') df_original.to_excel(backup_path, index=False) print(f" โœ… Backup created: {backup_path.name}") # Load data df = df_original.copy() print(f"\n๐Ÿ“Š Original file: {len(df)} rows, {len(df.columns)} columns") # Columns to remove (redundant/DB-derived) columns_to_remove = [] # Remove Class_DB if it matches Current Grade/Class if 'Class_DB' in df.columns and 'Current Grade/Class' in df.columns: if (df['Class_DB'].astype(str) == df['Current Grade/Class'].astype(str)).all(): columns_to_remove.append('Class_DB') print(f" ๐Ÿ—‘๏ธ Removing 'Class_DB' (duplicate of 'Current Grade/Class')") # Remove Section_DB if it matches Section if 'Section_DB' in df.columns and 'Section' in df.columns: if (df['Section_DB'].astype(str) == df['Section'].astype(str)).all(): columns_to_remove.append('Section_DB') print(f" ๐Ÿ—‘๏ธ Removing 'Section_DB' (duplicate of 'Section')") # Remove SchoolCode_DB if School Code exists if 'SchoolCode_DB' in df.columns and 'School Code' in df.columns: if (df['SchoolCode_DB'].astype(str) == df['School Code'].astype(str)).all(): columns_to_remove.append('SchoolCode_DB') print(f" ๐Ÿ—‘๏ธ Removing 'SchoolCode_DB' (duplicate of 'School Code')") # Remove SchoolName_DB if School Name exists if 'SchoolName_DB' in df.columns and 'School Name' in df.columns: if (df['SchoolName_DB'].astype(str) == df['School Name'].astype(str)).all(): columns_to_remove.append('SchoolName_DB') print(f" ๐Ÿ—‘๏ธ Removing 'SchoolName_DB' (duplicate of 'School Name')") # Remove columns if columns_to_remove: df = df.drop(columns=columns_to_remove) print(f"\n โœ… Removed {len(columns_to_remove)} redundant columns") else: print(f"\n โ„น๏ธ No redundant columns found to remove") # Final validation print(f"\n๐Ÿ“Š Cleaned file: {len(df)} rows, {len(df.columns)} columns") # Verify critical columns still present critical_cols = ['StudentCPID', 'First Name', 'Last Name', 'Age', 'Age Category'] missing = [c for c in critical_cols if c not in df.columns] if missing: print(f" โŒ ERROR: Removed critical columns: {missing}") return False # Save cleaned file print(f"\n๐Ÿ’พ Saving cleaned file...") df.to_excel(file_path, index=False) print(f" โœ… Cleaned file saved") print(f"\n" + "=" * 80) print(f"โœ… CLEANUP COMPLETE") print(f" Removed: {len(columns_to_remove)} redundant columns") print(f" Final columns: {len(df.columns)}") print(f" Backup saved: {backup_path.name}") print("=" * 80) return True if __name__ == "__main__": success = cleanup_merged_personas() sys.exit(0 if success else 1)