100 lines
3.7 KiB
Python
100 lines
3.7 KiB
Python
"""
|
||
Clean up merged_personas.xlsx for client delivery
|
||
Removes redundant columns and ensures data quality
|
||
"""
|
||
import pandas as pd
|
||
from pathlib import Path
|
||
import sys
|
||
import io
|
||
|
||
if sys.platform == 'win32':
|
||
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
|
||
|
||
BASE_DIR = Path(__file__).resolve().parent.parent
|
||
|
||
def cleanup_merged_personas():
|
||
"""Clean up merged_personas.xlsx for client delivery"""
|
||
print("=" * 80)
|
||
print("🧹 CLEANING UP: merged_personas.xlsx for Client Delivery")
|
||
print("=" * 80)
|
||
|
||
file_path = BASE_DIR / "data" / "merged_personas.xlsx"
|
||
backup_path = BASE_DIR / "data" / "merged_personas_backup.xlsx"
|
||
|
||
if not file_path.exists():
|
||
print("❌ FILE NOT FOUND")
|
||
return False
|
||
|
||
# Create backup
|
||
print("\n📦 Creating backup...")
|
||
df_original = pd.read_excel(file_path, engine='openpyxl')
|
||
df_original.to_excel(backup_path, index=False)
|
||
print(f" ✅ Backup created: {backup_path.name}")
|
||
|
||
# Load data
|
||
df = df_original.copy()
|
||
|
||
print(f"\n📊 Original file: {len(df)} rows, {len(df.columns)} columns")
|
||
|
||
# Columns to remove (redundant/DB-derived)
|
||
columns_to_remove = []
|
||
|
||
# Remove Class_DB if it matches Current Grade/Class
|
||
if 'Class_DB' in df.columns and 'Current Grade/Class' in df.columns:
|
||
if (df['Class_DB'].astype(str) == df['Current Grade/Class'].astype(str)).all():
|
||
columns_to_remove.append('Class_DB')
|
||
print(f" 🗑️ Removing 'Class_DB' (duplicate of 'Current Grade/Class')")
|
||
|
||
# Remove Section_DB if it matches Section
|
||
if 'Section_DB' in df.columns and 'Section' in df.columns:
|
||
if (df['Section_DB'].astype(str) == df['Section'].astype(str)).all():
|
||
columns_to_remove.append('Section_DB')
|
||
print(f" 🗑️ Removing 'Section_DB' (duplicate of 'Section')")
|
||
|
||
# Remove SchoolCode_DB if School Code exists
|
||
if 'SchoolCode_DB' in df.columns and 'School Code' in df.columns:
|
||
if (df['SchoolCode_DB'].astype(str) == df['School Code'].astype(str)).all():
|
||
columns_to_remove.append('SchoolCode_DB')
|
||
print(f" 🗑️ Removing 'SchoolCode_DB' (duplicate of 'School Code')")
|
||
|
||
# Remove SchoolName_DB if School Name exists
|
||
if 'SchoolName_DB' in df.columns and 'School Name' in df.columns:
|
||
if (df['SchoolName_DB'].astype(str) == df['School Name'].astype(str)).all():
|
||
columns_to_remove.append('SchoolName_DB')
|
||
print(f" 🗑️ Removing 'SchoolName_DB' (duplicate of 'School Name')")
|
||
|
||
# Remove columns
|
||
if columns_to_remove:
|
||
df = df.drop(columns=columns_to_remove)
|
||
print(f"\n ✅ Removed {len(columns_to_remove)} redundant columns")
|
||
else:
|
||
print(f"\n ℹ️ No redundant columns found to remove")
|
||
|
||
# Final validation
|
||
print(f"\n📊 Cleaned file: {len(df)} rows, {len(df.columns)} columns")
|
||
|
||
# Verify critical columns still present
|
||
critical_cols = ['StudentCPID', 'First Name', 'Last Name', 'Age', 'Age Category']
|
||
missing = [c for c in critical_cols if c not in df.columns]
|
||
if missing:
|
||
print(f" ❌ ERROR: Removed critical columns: {missing}")
|
||
return False
|
||
|
||
# Save cleaned file
|
||
print(f"\n💾 Saving cleaned file...")
|
||
df.to_excel(file_path, index=False)
|
||
print(f" ✅ Cleaned file saved")
|
||
|
||
print(f"\n" + "=" * 80)
|
||
print(f"✅ CLEANUP COMPLETE")
|
||
print(f" Removed: {len(columns_to_remove)} redundant columns")
|
||
print(f" Final columns: {len(df.columns)}")
|
||
print(f" Backup saved: {backup_path.name}")
|
||
print("=" * 80)
|
||
|
||
return True
|
||
|
||
if __name__ == "__main__":
|
||
success = cleanup_merged_personas()
|
||
sys.exit(0 if success else 1)
|