138 lines
5.0 KiB
Python
138 lines
5.0 KiB
Python
"""
|
|
Deep investigation of merged_personas.xlsx issues
|
|
"""
|
|
import pandas as pd
|
|
from pathlib import Path
|
|
import sys
|
|
import io
|
|
|
|
if sys.platform == 'win32':
|
|
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
|
|
|
|
BASE_DIR = Path(__file__).resolve().parent.parent
|
|
|
|
def investigate():
|
|
df = pd.read_excel(BASE_DIR / "data" / "merged_personas.xlsx", engine='openpyxl')
|
|
|
|
print("=" * 80)
|
|
print("🔍 DEEP INVESTIGATION: merged_personas.xlsx Issues")
|
|
print("=" * 80)
|
|
|
|
# Check Current Grade/Class vs Class_DB
|
|
print("\n1. GRADE/CLASS COLUMN ANALYSIS:")
|
|
print("-" * 80)
|
|
|
|
if 'Current Grade/Class' in df.columns and 'Class_DB' in df.columns:
|
|
print(" Comparing 'Current Grade/Class' vs 'Class_DB':")
|
|
|
|
# Check if they match
|
|
matches = (df['Current Grade/Class'].astype(str) == df['Class_DB'].astype(str)).sum()
|
|
total = len(df)
|
|
mismatches = total - matches
|
|
|
|
print(f" Matching rows: {matches}/{total}")
|
|
print(f" Mismatches: {mismatches}")
|
|
|
|
if mismatches > 0:
|
|
print(f" ⚠️ MISMATCH FOUND - Showing sample mismatches:")
|
|
mismatched = df[df['Current Grade/Class'].astype(str) != df['Class_DB'].astype(str)]
|
|
for idx, row in mismatched.head(5).iterrows():
|
|
print(f" Row {idx}: '{row['Current Grade/Class']}' vs '{row['Class_DB']}'")
|
|
else:
|
|
print(f" ✅ Columns match perfectly - 'Class_DB' is redundant")
|
|
|
|
# Check Section vs Section_DB
|
|
print("\n2. SECTION COLUMN ANALYSIS:")
|
|
print("-" * 80)
|
|
|
|
if 'Section' in df.columns and 'Section_DB' in df.columns:
|
|
matches = (df['Section'].astype(str) == df['Section_DB'].astype(str)).sum()
|
|
total = len(df)
|
|
mismatches = total - matches
|
|
|
|
print(f" Matching rows: {matches}/{total}")
|
|
print(f" Mismatches: {mismatches}")
|
|
|
|
if mismatches > 0:
|
|
print(f" ⚠️ MISMATCH FOUND")
|
|
else:
|
|
print(f" ✅ Columns match perfectly - 'Section_DB' is redundant")
|
|
|
|
# Check Nationality and Native State
|
|
print("\n3. NATIONALITY/NATIVE STATE ANALYSIS:")
|
|
print("-" * 80)
|
|
|
|
if 'Nationality' in df.columns:
|
|
unique_nationality = df['Nationality'].nunique()
|
|
print(f" Nationality unique values: {unique_nationality}")
|
|
if unique_nationality == 1:
|
|
print(f" ⚠️ All students have same nationality: {df['Nationality'].iloc[0]}")
|
|
print(f" ⚠️ This may be intentional but could be flagged by client")
|
|
|
|
if 'Native State' in df.columns:
|
|
unique_state = df['Native State'].nunique()
|
|
print(f" Native State unique values: {unique_state}")
|
|
if unique_state == 1:
|
|
print(f" ⚠️ All students from same state: {df['Native State'].iloc[0]}")
|
|
print(f" ⚠️ This may be intentional but could be flagged by client")
|
|
|
|
# Check for other potential issues
|
|
print("\n4. OTHER POTENTIAL ISSUES:")
|
|
print("-" * 80)
|
|
|
|
# Check for empty columns
|
|
empty_cols = []
|
|
for col in df.columns:
|
|
non_null = df[col].notna().sum()
|
|
if non_null == 0:
|
|
empty_cols.append(col)
|
|
|
|
if empty_cols:
|
|
print(f" ⚠️ EMPTY COLUMNS: {empty_cols}")
|
|
else:
|
|
print(f" ✅ No completely empty columns")
|
|
|
|
# Check for columns with mostly empty values
|
|
mostly_empty = []
|
|
for col in df.columns:
|
|
non_null_pct = (df[col].notna().sum() / len(df)) * 100
|
|
if non_null_pct < 10 and non_null_pct > 0:
|
|
mostly_empty.append((col, non_null_pct))
|
|
|
|
if mostly_empty:
|
|
print(f" ⚠️ MOSTLY EMPTY COLUMNS (<10% filled):")
|
|
for col, pct in mostly_empty:
|
|
print(f" {col}: {pct:.1f}% filled")
|
|
|
|
# Recommendations
|
|
print("\n" + "=" * 80)
|
|
print("💡 RECOMMENDATIONS:")
|
|
print("=" * 80)
|
|
|
|
recommendations = []
|
|
|
|
if 'Class_DB' in df.columns and 'Current Grade/Class' in df.columns:
|
|
if (df['Current Grade/Class'].astype(str) == df['Class_DB'].astype(str)).all():
|
|
recommendations.append("Remove 'Class_DB' column (duplicate of 'Current Grade/Class')")
|
|
|
|
if 'Section_DB' in df.columns and 'Section' in df.columns:
|
|
if (df['Section'].astype(str) == df['Section_DB'].astype(str)).all():
|
|
recommendations.append("Remove 'Section_DB' column (duplicate of 'Section')")
|
|
|
|
if 'Nationality' in df.columns and df['Nationality'].nunique() == 1:
|
|
recommendations.append("Review 'Nationality' column - all students have same value (may be intentional)")
|
|
|
|
if 'Native State' in df.columns and df['Native State'].nunique() == 1:
|
|
recommendations.append("Review 'Native State' column - all students from same state (may be intentional)")
|
|
|
|
if recommendations:
|
|
for i, rec in enumerate(recommendations, 1):
|
|
print(f" {i}. {rec}")
|
|
else:
|
|
print(" ✅ No critical issues requiring action")
|
|
|
|
print("=" * 80)
|
|
|
|
if __name__ == "__main__":
|
|
investigate()
|