CP_Assessment_engine/scripts/investigate_persona_issues.py
2026-02-10 12:59:40 +05:30

138 lines
5.0 KiB
Python

"""
Deep investigation of merged_personas.xlsx issues
"""
import pandas as pd
from pathlib import Path
import sys
import io
if sys.platform == 'win32':
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
BASE_DIR = Path(__file__).resolve().parent.parent
def investigate():
df = pd.read_excel(BASE_DIR / "data" / "merged_personas.xlsx", engine='openpyxl')
print("=" * 80)
print("🔍 DEEP INVESTIGATION: merged_personas.xlsx Issues")
print("=" * 80)
# Check Current Grade/Class vs Class_DB
print("\n1. GRADE/CLASS COLUMN ANALYSIS:")
print("-" * 80)
if 'Current Grade/Class' in df.columns and 'Class_DB' in df.columns:
print(" Comparing 'Current Grade/Class' vs 'Class_DB':")
# Check if they match
matches = (df['Current Grade/Class'].astype(str) == df['Class_DB'].astype(str)).sum()
total = len(df)
mismatches = total - matches
print(f" Matching rows: {matches}/{total}")
print(f" Mismatches: {mismatches}")
if mismatches > 0:
print(f" ⚠️ MISMATCH FOUND - Showing sample mismatches:")
mismatched = df[df['Current Grade/Class'].astype(str) != df['Class_DB'].astype(str)]
for idx, row in mismatched.head(5).iterrows():
print(f" Row {idx}: '{row['Current Grade/Class']}' vs '{row['Class_DB']}'")
else:
print(f" ✅ Columns match perfectly - 'Class_DB' is redundant")
# Check Section vs Section_DB
print("\n2. SECTION COLUMN ANALYSIS:")
print("-" * 80)
if 'Section' in df.columns and 'Section_DB' in df.columns:
matches = (df['Section'].astype(str) == df['Section_DB'].astype(str)).sum()
total = len(df)
mismatches = total - matches
print(f" Matching rows: {matches}/{total}")
print(f" Mismatches: {mismatches}")
if mismatches > 0:
print(f" ⚠️ MISMATCH FOUND")
else:
print(f" ✅ Columns match perfectly - 'Section_DB' is redundant")
# Check Nationality and Native State
print("\n3. NATIONALITY/NATIVE STATE ANALYSIS:")
print("-" * 80)
if 'Nationality' in df.columns:
unique_nationality = df['Nationality'].nunique()
print(f" Nationality unique values: {unique_nationality}")
if unique_nationality == 1:
print(f" ⚠️ All students have same nationality: {df['Nationality'].iloc[0]}")
print(f" ⚠️ This may be intentional but could be flagged by client")
if 'Native State' in df.columns:
unique_state = df['Native State'].nunique()
print(f" Native State unique values: {unique_state}")
if unique_state == 1:
print(f" ⚠️ All students from same state: {df['Native State'].iloc[0]}")
print(f" ⚠️ This may be intentional but could be flagged by client")
# Check for other potential issues
print("\n4. OTHER POTENTIAL ISSUES:")
print("-" * 80)
# Check for empty columns
empty_cols = []
for col in df.columns:
non_null = df[col].notna().sum()
if non_null == 0:
empty_cols.append(col)
if empty_cols:
print(f" ⚠️ EMPTY COLUMNS: {empty_cols}")
else:
print(f" ✅ No completely empty columns")
# Check for columns with mostly empty values
mostly_empty = []
for col in df.columns:
non_null_pct = (df[col].notna().sum() / len(df)) * 100
if non_null_pct < 10 and non_null_pct > 0:
mostly_empty.append((col, non_null_pct))
if mostly_empty:
print(f" ⚠️ MOSTLY EMPTY COLUMNS (<10% filled):")
for col, pct in mostly_empty:
print(f" {col}: {pct:.1f}% filled")
# Recommendations
print("\n" + "=" * 80)
print("💡 RECOMMENDATIONS:")
print("=" * 80)
recommendations = []
if 'Class_DB' in df.columns and 'Current Grade/Class' in df.columns:
if (df['Current Grade/Class'].astype(str) == df['Class_DB'].astype(str)).all():
recommendations.append("Remove 'Class_DB' column (duplicate of 'Current Grade/Class')")
if 'Section_DB' in df.columns and 'Section' in df.columns:
if (df['Section'].astype(str) == df['Section_DB'].astype(str)).all():
recommendations.append("Remove 'Section_DB' column (duplicate of 'Section')")
if 'Nationality' in df.columns and df['Nationality'].nunique() == 1:
recommendations.append("Review 'Nationality' column - all students have same value (may be intentional)")
if 'Native State' in df.columns and df['Native State'].nunique() == 1:
recommendations.append("Review 'Native State' column - all students from same state (may be intentional)")
if recommendations:
for i, rec in enumerate(recommendations, 1):
print(f" {i}. {rec}")
else:
print(" ✅ No critical issues requiring action")
print("=" * 80)
if __name__ == "__main__":
investigate()