"""
Analysis script to check compatibility of additional persona columns.

When run as a script it loads three workbooks, all located relative to the
parent of this file's directory:

* ``support/fixed_3k_personas.xlsx``
* ``support/3000-students.xlsx``
* ``data/merged_personas.xlsx``

and prints a plain-text report with these sections: row/column counts per
file, which target columns are present in the fixed and merged workbooks,
which column names the fixed and students workbooks share, whether a roll
number column is present, a sample of the first fixed-personas row, and
which extra columns of interest exist in the fixed workbook.

All I/O happens inside ``main()``; importing this module only defines
constants and helpers.
"""
from pathlib import Path

import pandas as pd

BASE_DIR = Path(__file__).resolve().parent.parent

BANNER = "=" * 80

# Target columns to check
TARGET_COLUMNS = [
    'short_term_focus_1', 'short_term_focus_2', 'short_term_focus_3',
    'long_term_focus_1', 'long_term_focus_2', 'long_term_focus_3',
    'strength_1', 'strength_2', 'strength_3',
    'improvement_area_1', 'improvement_area_2', 'improvement_area_3',
    'hobby_1', 'hobby_2', 'hobby_3',
    'clubs', 'achievements',
]

# Columns whose first-row value is echoed in the sample section.
SAMPLE_COLUMNS = ['short_term_focus_1', 'strength_1', 'hobby_1', 'clubs']

# Extra columns reported (when present) at the end of the report.
ADDITIONAL_USEFUL_COLUMNS = [
    'expectation_1', 'expectation_2', 'expectation_3', 'segment', 'archetype',
]

OVERLAP_PREVIEW_LIMIT = 10   # max overlapping column names listed
SAMPLE_VALUE_WIDTH = 60      # sample values are truncated to this many chars


def split_present_missing(wanted, available):
    """Partition *wanted* into names found in *available* and names that are not.

    Args:
        wanted: Iterable of column names to look for.
        available: Any container supporting ``in`` (e.g. ``DataFrame.columns``).

    Returns:
        A ``(present, missing)`` tuple of lists, each in the order of *wanted*.
    """
    present, missing = [], []
    for name in wanted:
        (present if name in available else missing).append(name)
    return present, missing


def preview_overlap(columns_a, columns_b, limit=OVERLAP_PREVIEW_LIMIT):
    """Summarise the column names shared by two column collections.

    Args:
        columns_a: Iterable of column names from the first frame.
        columns_b: Iterable of column names from the second frame.
        limit: Maximum number of shared names to include in the preview.

    Returns:
        A ``(total, shown, hidden)`` tuple: the number of shared names, the
        first *limit* of them in sorted order, and how many were left out.
    """
    shared = set(columns_a) & set(columns_b)
    shown = sorted(shared)[:limit]
    hidden = max(len(shared) - limit, 0)
    return len(shared), shown, hidden


def has_any_column(df, names):
    """Return True if *df* has at least one column whose name is in *names*."""
    return any(name in df.columns for name in names)


def _print_target_check(label, df):
    """Print how many of ``TARGET_COLUMNS`` exist in *df*, and any missing ones."""
    present, missing = split_present_missing(TARGET_COLUMNS, df.columns)
    print(f"\n [OK] In {label}: {len(present)}/{len(TARGET_COLUMNS)}")
    if missing:
        print(f" [MISSING] Missing: {missing}")


def main():
    """Load the three workbooks and print the compatibility report."""
    # The banner is printed before loading so it still appears if a
    # workbook fails to load.
    print(BANNER)
    print("PERSONA COLUMNS COMPATIBILITY ANALYSIS")
    print(BANNER)

    # Load files
    df_fixed = pd.read_excel(BASE_DIR / 'support' / 'fixed_3k_personas.xlsx')
    df_students = pd.read_excel(BASE_DIR / 'support' / '3000-students.xlsx')
    df_merged = pd.read_excel(BASE_DIR / 'data' / 'merged_personas.xlsx')

    print("\nFILE STATISTICS:")
    for label, df in (
        ('fixed_3k_personas.xlsx', df_fixed),
        ('3000-students.xlsx', df_students),
        ('merged_personas.xlsx', df_merged),
    ):
        print(f" {label}: {len(df)} rows, {len(df.columns)} columns")

    print("\nTARGET COLUMNS CHECK:")
    print(f" Checking {len(TARGET_COLUMNS)} columns...")
    _print_target_check('fixed_3k_personas.xlsx', df_fixed)
    _print_target_check('merged_personas.xlsx', df_merged)

    # Check for column conflicts
    print("\nCOLUMN CONFLICT CHECK:")
    total, shown, hidden = preview_overlap(df_fixed.columns, df_students.columns)
    print(f" Overlapping columns between fixed_3k and 3000-students: {total}")
    if total:
        print(" [WARNING] These columns exist in both files (may need suffix handling):")
        for col in shown:
            print(f" - {col}")
        if hidden:
            print(f" ... and {hidden} more")

    # Check merge key
    # NOTE(review): the personas check accepts either spelling while the
    # students check accepts only 'Roll Number'. Kept as-is; confirm whether
    # the asymmetry is intentional.
    print("\nMERGE KEY CHECK:")
    print(
        " Roll Number in fixed_3k_personas: "
        f"{has_any_column(df_fixed, ('Roll Number', 'roll_number'))}"
    )
    print(f" Roll Number in 3000-students: {has_any_column(df_students, ('Roll Number',))}")

    # Sample data quality check
    print("\nSAMPLE DATA QUALITY:")
    if len(df_fixed) > 0:
        first_row = df_fixed.iloc[0]
        print(" Sample row from fixed_3k_personas.xlsx:")
        for col in SAMPLE_COLUMNS:
            if col in df_fixed.columns:
                val = str(first_row[col])
                print(f" {col}: {val[:SAMPLE_VALUE_WIDTH]}")

    # Additional useful columns
    print("\nADDITIONAL USEFUL COLUMNS IN fixed_3k_personas.xlsx:")
    for col in ADDITIONAL_USEFUL_COLUMNS:
        if col in df_fixed.columns:
            print(f" [OK] {col}")

    print("\n" + BANNER)
    print("ANALYSIS COMPLETE")
    print(BANNER)


if __name__ == "__main__":
    main()