"""
Analysis script to check compatibility of additional persona columns.
"""

from pathlib import Path

import pandas as pd

# Parent of the directory that contains this script; the workbooks are
# looked up under its 'support' and 'data' sub-directories.
BASE_DIR = Path(__file__).resolve().parent.parent


def _split_by_presence(columns, frame):
    """Partition *columns* into ``(present, missing)`` relative to ``frame.columns``.

    Both returned lists keep the order of *columns*.
    """
    present = [col for col in columns if col in frame.columns]
    missing = [col for col in columns if col not in frame.columns]
    return present, missing


print("=" * 80)
print("PERSONA COLUMNS COMPATIBILITY ANALYSIS")
print("=" * 80)

# Load files
df_fixed = pd.read_excel(BASE_DIR / 'support' / 'fixed_3k_personas.xlsx')
df_students = pd.read_excel(BASE_DIR / 'support' / '3000-students.xlsx')
df_merged = pd.read_excel(BASE_DIR / 'data' / 'merged_personas.xlsx')

print("\nFILE STATISTICS:")
print(f" fixed_3k_personas.xlsx: {len(df_fixed)} rows, {len(df_fixed.columns)} columns")
print(f" 3000-students.xlsx: {len(df_students)} rows, {len(df_students.columns)} columns")
print(f" merged_personas.xlsx: {len(df_merged)} rows, {len(df_merged.columns)} columns")

# Target columns to check
target_columns = [
    'short_term_focus_1', 'short_term_focus_2', 'short_term_focus_3',
    'long_term_focus_1', 'long_term_focus_2', 'long_term_focus_3',
    'strength_1', 'strength_2', 'strength_3',
    'improvement_area_1', 'improvement_area_2', 'improvement_area_3',
    'hobby_1', 'hobby_2', 'hobby_3',
    'clubs', 'achievements',
]

print("\nTARGET COLUMNS CHECK:")
print(f" Checking {len(target_columns)} columns...")

# Check in fixed_3k_personas
in_fixed, missing_in_fixed = _split_by_presence(target_columns, df_fixed)

print(f"\n [OK] In fixed_3k_personas.xlsx: {len(in_fixed)}/{len(target_columns)}")
if missing_in_fixed:
    print(f" [MISSING] Missing: {missing_in_fixed}")

# Check in merged_personas
in_merged, missing_in_merged = _split_by_presence(target_columns, df_merged)

print(f"\n [OK] In merged_personas.xlsx: {len(in_merged)}/{len(target_columns)}")
if missing_in_merged:
    print(f" [MISSING] Missing: {missing_in_merged}")

# Check for column conflicts
print("\nCOLUMN CONFLICT CHECK:")
fixed_cols = set(df_fixed.columns)
students_cols = set(df_students.columns)
overlap = fixed_cols.intersection(students_cols)
print(f" Overlapping columns between fixed_3k and 3000-students: {len(overlap)}")
if overlap:
    print(" [WARNING] These columns exist in both files (may need suffix handling):")
    # key=str keeps the usual alphabetical order for string labels and avoids a
    # TypeError if a workbook header produced non-string (e.g. integer) labels.
    for col in sorted(overlap, key=str)[:10]:
        print(f" - {col}")
    if len(overlap) > 10:
        print(f" ... and {len(overlap) - 10} more")

# Check merge key
# NOTE(review): the fixed_3k check also accepts 'roll_number', while the
# 3000-students check only accepts 'Roll Number' -- confirm this asymmetry
# is intentional.
print("\nMERGE KEY CHECK:")
print(f" Roll Number in fixed_3k_personas: {'Roll Number' in df_fixed.columns or 'roll_number' in df_fixed.columns}")
print(f" Roll Number in 3000-students: {'Roll Number' in df_students.columns}")

# Sample data quality check
print("\nSAMPLE DATA QUALITY:")
if len(df_fixed) > 0:
    sample = df_fixed.iloc[0]
    print(" Sample row from fixed_3k_personas.xlsx:")
    for col in ['short_term_focus_1', 'strength_1', 'hobby_1', 'clubs']:
        if col in df_fixed.columns:
            # The column is known to exist here, so index it directly.
            val = str(sample[col])
            # Show at most the first 60 characters of the value.
            print(f" {col}: {val[:60]}")

# Additional useful columns (only those present in the file are listed)
print("\nADDITIONAL USEFUL COLUMNS IN fixed_3k_personas.xlsx:")
additional_useful = ['expectation_1', 'expectation_2', 'expectation_3', 'segment', 'archetype']
for col in additional_useful:
    if col in df_fixed.columns:
        print(f" [OK] {col}")

print("\n" + "=" * 80)
print("ANALYSIS COMPLETE")
print("=" * 80)