CP_Assessment_engine/scripts/analyze_persona_columns.py
2026-02-10 12:59:40 +05:30

90 lines
3.4 KiB
Python

"""
Analysis script to check compatibility of additional persona columns
"""
import pandas as pd
from pathlib import Path
# Project root: two directory levels above this script's resolved path.
BASE_DIR = Path(__file__).resolve().parent.parent

print("=" * 80)
print("PERSONA COLUMNS COMPATIBILITY ANALYSIS")
print("=" * 80)

# Load the three Excel workbooks: two live under support/, one under data/.
support_dir = BASE_DIR / 'support'
df_fixed = pd.read_excel(support_dir / 'fixed_3k_personas.xlsx')
df_students = pd.read_excel(support_dir / '3000-students.xlsx')
df_merged = pd.read_excel(BASE_DIR / 'data' / 'merged_personas.xlsx')

# Report row and column counts for each loaded frame.
print("\nFILE STATISTICS:")
n_rows, n_cols = df_fixed.shape
print(f" fixed_3k_personas.xlsx: {n_rows} rows, {n_cols} columns")
n_rows, n_cols = df_students.shape
print(f" 3000-students.xlsx: {n_rows} rows, {n_cols} columns")
n_rows, n_cols = df_merged.shape
print(f" merged_personas.xlsx: {n_rows} rows, {n_cols} columns")
# Persona columns whose presence is verified in each workbook.
target_columns = [
    'short_term_focus_1', 'short_term_focus_2', 'short_term_focus_3',
    'long_term_focus_1', 'long_term_focus_2', 'long_term_focus_3',
    'strength_1', 'strength_2', 'strength_3',
    'improvement_area_1', 'improvement_area_2', 'improvement_area_3',
    'hobby_1', 'hobby_2', 'hobby_3',
    'clubs', 'achievements'
]


def _partition_by_presence(wanted, available):
    """Split *wanted* into ``(present, missing)`` lists relative to *available*.

    The relative order of *wanted* is kept in both returned lists.
    """
    present = []
    missing = []
    for name in wanted:
        if name in available:
            present.append(name)
        else:
            missing.append(name)
    return present, missing


print("\nTARGET COLUMNS CHECK:")
print(f" Checking {len(target_columns)} columns...")

# Presence of the target columns in fixed_3k_personas.
in_fixed, missing_in_fixed = _partition_by_presence(target_columns, df_fixed.columns)
print(f"\n [OK] In fixed_3k_personas.xlsx: {len(in_fixed)}/{len(target_columns)}")
if missing_in_fixed:
    print(f" [MISSING] Missing: {missing_in_fixed}")

# Presence of the target columns in merged_personas.
in_merged, missing_in_merged = _partition_by_presence(target_columns, df_merged.columns)
print(f"\n [OK] In merged_personas.xlsx: {len(in_merged)}/{len(target_columns)}")
if missing_in_merged:
    print(f" [MISSING] Missing: {missing_in_merged}")
# Column names that appear in both fixed_3k_personas and 3000-students.
print("\nCOLUMN CONFLICT CHECK:")
fixed_cols = set(df_fixed.columns)
students_cols = set(df_students.columns)
overlap = fixed_cols & students_cols
print(f" Overlapping columns between fixed_3k and 3000-students: {len(overlap)}")
if overlap:
    print(" [WARNING] These columns exist in both files (may need suffix handling):")
    # List at most the first ten names (sorted), then summarize the remainder.
    preview_limit = 10
    ordered_overlap = sorted(overlap)
    for name in ordered_overlap[:preview_limit]:
        print(f" - {name}")
    hidden_count = len(ordered_overlap) - preview_limit
    if hidden_count > 0:
        print(f" ... and {hidden_count} more")
# Report whether the roll-number key column is present in each input frame.
print("\nMERGE KEY CHECK:")
# The fixed personas frame is accepted under either spelling of the key.
fixed_has_key = any(
    key in df_fixed.columns for key in ('Roll Number', 'roll_number')
)
# NOTE(review): only the exact 'Roll Number' spelling is accepted for the
# students frame, unlike the check above -- confirm this is intentional.
students_has_key = 'Roll Number' in df_students.columns
print(f" Roll Number in fixed_3k_personas: {fixed_has_key}")
print(f" Roll Number in 3000-students: {students_has_key}")
# Preview a few persona fields from the first row of fixed_3k_personas.
print("\nSAMPLE DATA QUALITY:")
if len(df_fixed):
    sample = df_fixed.iloc[0]
    print(" Sample row from fixed_3k_personas.xlsx:")
    for name in ('short_term_focus_1', 'strength_1', 'hobby_1', 'clubs'):
        if name not in df_fixed.columns:
            continue
        # Values are stringified and cut to 60 characters to keep lines short.
        preview = str(sample.get(name, 'N/A'))[:60]
        print(f" {name}: {preview}")
# Other columns of interest: list those that exist in fixed_3k_personas.
print("\nADDITIONAL USEFUL COLUMNS IN fixed_3k_personas.xlsx:")
additional_useful = ['expectation_1', 'expectation_2', 'expectation_3', 'segment', 'archetype']
available_extras = [name for name in additional_useful if name in df_fixed.columns]
for name in available_extras:
    print(f" [OK] {name}")

# Closing banner.
print("\n" + "=" * 80)
print("ANALYSIS COMPLETE")
print("=" * 80)