# CP_Assessment_engine/scripts/analyze_persona_columns.py

"""
Analysis script to check compatibility of additional persona columns
"""
import pandas as pd
from pathlib import Path

# Two levels above this file: the directory that contains this script's folder.
BASE_DIR = Path(__file__).resolve().parent.parent

print("=" * 80)
print("PERSONA COLUMNS COMPATIBILITY ANALYSIS")
print("=" * 80)

# Read the three workbooks under comparison. Two of them share the
# 'support' folder; the merged output is read from 'data'.
support_dir = BASE_DIR / 'support'
df_fixed = pd.read_excel(support_dir / 'fixed_3k_personas.xlsx')
df_students = pd.read_excel(support_dir / '3000-students.xlsx')
df_merged = pd.read_excel(BASE_DIR / 'data' / 'merged_personas.xlsx')

# Report (rows, columns) for each workbook.
print("\nFILE STATISTICS:")
print(f"   fixed_3k_personas.xlsx: {df_fixed.shape[0]} rows, {df_fixed.shape[1]} columns")
print(f"   3000-students.xlsx: {df_students.shape[0]} rows, {df_students.shape[1]} columns")
print(f"   merged_personas.xlsx: {df_merged.shape[0]} rows, {df_merged.shape[1]} columns")

# Persona columns whose presence is verified in each workbook.
target_columns = [
    'short_term_focus_1', 'short_term_focus_2', 'short_term_focus_3',
    'long_term_focus_1', 'long_term_focus_2', 'long_term_focus_3',
    'strength_1', 'strength_2', 'strength_3',
    'improvement_area_1', 'improvement_area_2', 'improvement_area_3',
    'hobby_1', 'hobby_2', 'hobby_3',
    'clubs', 'achievements'
]
total_targets = len(target_columns)

print("\nTARGET COLUMNS CHECK:")
print(f"   Checking {total_targets} columns...")

# Split the targets into present / absent for fixed_3k_personas in one pass,
# preserving the order of target_columns in both lists.
in_fixed, missing_in_fixed = [], []
for col in target_columns:
    bucket = in_fixed if col in df_fixed.columns else missing_in_fixed
    bucket.append(col)

print(f"\n   [OK] In fixed_3k_personas.xlsx: {len(in_fixed)}/{total_targets}")
if missing_in_fixed:
    print(f"   [MISSING] Missing: {missing_in_fixed}")

# Same split for merged_personas.
in_merged, missing_in_merged = [], []
for col in target_columns:
    bucket = in_merged if col in df_merged.columns else missing_in_merged
    bucket.append(col)

print(f"\n   [OK] In merged_personas.xlsx: {len(in_merged)}/{total_targets}")
if missing_in_merged:
    print(f"   [MISSING] Missing: {missing_in_merged}")

# Check for column conflicts: list labels that appear in both source
# workbooks (the warning below suggests these may need suffix handling).
# NOTE(review): a merge key shared by both files (e.g. 'Roll Number') would be
# counted here as well -- confirm whether it should be excluded from the count.
print("\nCOLUMN CONFLICT CHECK:")
fixed_cols = set(df_fixed.columns)
students_cols = set(df_students.columns)
overlap = fixed_cols & students_cols
print(f"   Overlapping columns between fixed_3k and 3000-students: {len(overlap)}")
if overlap:
    print("   [WARNING] These columns exist in both files (may need suffix handling):")
    preview_limit = 10  # show at most this many names, then summarise the rest
    # Sort on the string form: read_excel can produce non-string labels
    # (e.g. numeric header cells), and sorting mixed types raises TypeError.
    # For all-string labels the resulting order is unchanged.
    ordered_overlap = sorted(overlap, key=str)
    for col in ordered_overlap[:preview_limit]:
        print(f"      - {col}")
    if len(ordered_overlap) > preview_limit:
        print(f"      ... and {len(ordered_overlap) - preview_limit} more")

# Check merge key: report whether each source workbook has a roll-number column.
# fixed_3k_personas is accepted under either spelling; 3000-students only
# under 'Roll Number'.
# NOTE(review): the asymmetry looks intentional (different header styles per
# file) -- confirm against the merge step.
fixed_has_key = any(
    name in df_fixed.columns for name in ('Roll Number', 'roll_number')
)
students_has_key = 'Roll Number' in df_students.columns

print("\nMERGE KEY CHECK:")
print(f"   Roll Number in fixed_3k_personas: {fixed_has_key}")
print(f"   Roll Number in 3000-students: {students_has_key}")

# Sample data quality check: preview a few persona fields from the first row
# of fixed_3k_personas, skipping any field the workbook does not have.
print("\nSAMPLE DATA QUALITY:")
if df_fixed.shape[0] > 0:
    sample = df_fixed.iloc[0]
    print("   Sample row from fixed_3k_personas.xlsx:")
    preview_cols = ['short_term_focus_1', 'strength_1', 'hobby_1', 'clubs']
    for col in preview_cols:
        if col not in df_fixed.columns:
            continue
        # Cap each value at 60 characters so the preview stays on one line.
        val = str(sample.get(col, 'N/A'))[:60]
        print(f"      {col}: {val}")

# Additional useful columns: list which of these optional extras are present
# in fixed_3k_personas (absent ones are silently skipped).
print("\nADDITIONAL USEFUL COLUMNS IN fixed_3k_personas.xlsx:")
additional_useful = ['expectation_1', 'expectation_2', 'expectation_3', 'segment', 'archetype']
for col in additional_useful:
    if col not in df_fixed.columns:
        continue
    print(f"   [OK] {col}")

# Closing banner.
rule = "=" * 80
print("\n" + rule)
print("ANALYSIS COMPLETE")
print(rule)