""" Deep investigation of merged_personas.xlsx issues """ import pandas as pd from pathlib import Path import sys import io if sys.platform == 'win32': sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') BASE_DIR = Path(__file__).resolve().parent.parent def investigate(): df = pd.read_excel(BASE_DIR / "data" / "merged_personas.xlsx", engine='openpyxl') print("=" * 80) print("🔍 DEEP INVESTIGATION: merged_personas.xlsx Issues") print("=" * 80) # Check Current Grade/Class vs Class_DB print("\n1. GRADE/CLASS COLUMN ANALYSIS:") print("-" * 80) if 'Current Grade/Class' in df.columns and 'Class_DB' in df.columns: print(" Comparing 'Current Grade/Class' vs 'Class_DB':") # Check if they match matches = (df['Current Grade/Class'].astype(str) == df['Class_DB'].astype(str)).sum() total = len(df) mismatches = total - matches print(f" Matching rows: {matches}/{total}") print(f" Mismatches: {mismatches}") if mismatches > 0: print(f" ⚠️ MISMATCH FOUND - Showing sample mismatches:") mismatched = df[df['Current Grade/Class'].astype(str) != df['Class_DB'].astype(str)] for idx, row in mismatched.head(5).iterrows(): print(f" Row {idx}: '{row['Current Grade/Class']}' vs '{row['Class_DB']}'") else: print(f" ✅ Columns match perfectly - 'Class_DB' is redundant") # Check Section vs Section_DB print("\n2. SECTION COLUMN ANALYSIS:") print("-" * 80) if 'Section' in df.columns and 'Section_DB' in df.columns: matches = (df['Section'].astype(str) == df['Section_DB'].astype(str)).sum() total = len(df) mismatches = total - matches print(f" Matching rows: {matches}/{total}") print(f" Mismatches: {mismatches}") if mismatches > 0: print(f" ⚠️ MISMATCH FOUND") else: print(f" ✅ Columns match perfectly - 'Section_DB' is redundant") # Check Nationality and Native State print("\n3. NATIONALITY/NATIVE STATE ANALYSIS:") print("-" * 80) if 'Nationality' in df.columns: unique_nationality = df['Nationality'].nunique() print(f" Nationality unique values: {unique_nationality}") if unique_nationality == 1: print(f" ⚠️ All students have same nationality: {df['Nationality'].iloc[0]}") print(f" ⚠️ This may be intentional but could be flagged by client") if 'Native State' in df.columns: unique_state = df['Native State'].nunique() print(f" Native State unique values: {unique_state}") if unique_state == 1: print(f" ⚠️ All students from same state: {df['Native State'].iloc[0]}") print(f" ⚠️ This may be intentional but could be flagged by client") # Check for other potential issues print("\n4. OTHER POTENTIAL ISSUES:") print("-" * 80) # Check for empty columns empty_cols = [] for col in df.columns: non_null = df[col].notna().sum() if non_null == 0: empty_cols.append(col) if empty_cols: print(f" ⚠️ EMPTY COLUMNS: {empty_cols}") else: print(f" ✅ No completely empty columns") # Check for columns with mostly empty values mostly_empty = [] for col in df.columns: non_null_pct = (df[col].notna().sum() / len(df)) * 100 if non_null_pct < 10 and non_null_pct > 0: mostly_empty.append((col, non_null_pct)) if mostly_empty: print(f" ⚠️ MOSTLY EMPTY COLUMNS (<10% filled):") for col, pct in mostly_empty: print(f" {col}: {pct:.1f}% filled") # Recommendations print("\n" + "=" * 80) print("💡 RECOMMENDATIONS:") print("=" * 80) recommendations = [] if 'Class_DB' in df.columns and 'Current Grade/Class' in df.columns: if (df['Current Grade/Class'].astype(str) == df['Class_DB'].astype(str)).all(): recommendations.append("Remove 'Class_DB' column (duplicate of 'Current Grade/Class')") if 'Section_DB' in df.columns and 'Section' in df.columns: if (df['Section'].astype(str) == df['Section_DB'].astype(str)).all(): recommendations.append("Remove 'Section_DB' column (duplicate of 'Section')") if 'Nationality' in df.columns and df['Nationality'].nunique() == 1: recommendations.append("Review 'Nationality' column - all students have same value (may be intentional)") if 'Native State' in df.columns and df['Native State'].nunique() == 1: recommendations.append("Review 'Native State' column - all students from same state (may be intentional)") if recommendations: for i, rec in enumerate(recommendations, 1): print(f" {i}. {rec}") else: print(" ✅ No critical issues requiring action") print("=" * 80) if __name__ == "__main__": investigate()