CP_Assessment_engine/scripts/prepare_data.py

# Data Preparation: Create merged personas with zero schema drift
import pandas as pd
from pathlib import Path

# Use relative path from script location
BASE_DIR = Path(__file__).resolve().parent.parent
OUTPUT_FILE = BASE_DIR / 'data' / 'merged_personas.xlsx'

print("="*80)
print("DATA PREPARATION - ZERO RISK MERGE")
print("="*80)

# Step 1: Load ground truth sources
print("\n📂 Loading ground truth sources...")

# Try multiple possible locations for files
possible_students = [
    BASE_DIR / '3000-students.xlsx',
    BASE_DIR / 'support' / '3000-students.xlsx',
]
possible_cpids = [
    BASE_DIR / '3000_students_output.xlsx',
    BASE_DIR / 'support' / '3000_students_output.xlsx',
]
possible_personas = [
    BASE_DIR / 'fixed_3k_personas.xlsx',
    BASE_DIR / 'support' / 'fixed_3k_personas.xlsx',
]

# Find existing files
students_file = next((f for f in possible_students if f.exists()), None)
cpids_file = next((f for f in possible_cpids if f.exists()), None)
personas_file = next((f for f in possible_personas if f.exists()), None)

if not students_file:
    raise FileNotFoundError(f"3000-students.xlsx not found in: {possible_students}")
if not cpids_file:
    raise FileNotFoundError(f"3000_students_output.xlsx not found in: {possible_cpids}")
if not personas_file:
    raise FileNotFoundError(f"fixed_3k_personas.xlsx not found in: {possible_personas}")

df_students = pd.read_excel(students_file)
df_cpids = pd.read_excel(cpids_file)
df_personas = pd.read_excel(personas_file)

print(f"   3000-students.xlsx: {len(df_students)} rows, {len(df_students.columns)} columns")
print(f"   3000_students_output.xlsx: {len(df_cpids)} rows")
print(f"   fixed_3k_personas.xlsx: {len(df_personas)} rows")

# Step 2: Join on Roll Number
print("\n🔗 Merging on Roll Number...")

# Rename for consistency
df_cpids_clean = df_cpids[['RollNo', 'StudentCPID', 'SchoolCode', 'SchoolName', 'Class', 'Section']].copy()
df_cpids_clean.columns = ['Roll Number', 'StudentCPID', 'SchoolCode_DB', 'SchoolName_DB', 'Class_DB', 'Section_DB']

merged = df_students.merge(df_cpids_clean, on='Roll Number', how='inner')
print(f"   After joining with CPIDs: {len(merged)} rows")

# Step 3: Add behavioral fingerprint and additional persona columns
print("\n🧠 Adding behavioral fingerprint and persona enrichment columns...")

# Define columns to add from fixed_3k_personas.xlsx
persona_columns = [
    'short_term_focus_1', 'short_term_focus_2', 'short_term_focus_3',
    'long_term_focus_1', 'long_term_focus_2', 'long_term_focus_3',
    'strength_1', 'strength_2', 'strength_3',
    'improvement_area_1', 'improvement_area_2', 'improvement_area_3',
    'hobby_1', 'hobby_2', 'hobby_3',
    'clubs', 'achievements',
    'expectation_1', 'expectation_2', 'expectation_3',
    'segment', 'archetype',
    'behavioral_fingerprint'
]

# Extract available columns from df_personas
available_cols = [col for col in persona_columns if col in df_personas.columns]
print(f"   Found {len(available_cols)} persona enrichment columns in fixed_3k_personas.xlsx")

# Add columns positionally (both files have 3000 rows, safe positional match)
if available_cols:
    for col in available_cols:
        if len(df_personas) == len(merged):
            merged[col] = df_personas[col].values
        else:
            # Fallback: match by index if row counts differ
            merged[col] = df_personas[col].values[:len(merged)]

    # Count non-null values for behavioral_fingerprint
    if 'behavioral_fingerprint' in merged.columns:
        fp_count = merged['behavioral_fingerprint'].notna().sum()
        print(f"   Behavioral fingerprints added: {fp_count}/{len(merged)}")

    print(f"   ✅ Added {len(available_cols)} persona enrichment columns")
else:
    print(f"   ⚠️ No persona enrichment columns found in fixed_3k_personas.xlsx")

# Step 4: Validate columns
print("\n✅ VALIDATION:")
required_cols = [
    'Roll Number', 'First Name', 'Last Name', 'Age', 'Gender', 'Age Category',
    'StudentCPID',
    'Openness Score', 'Conscientiousness Score', 'Extraversion Score',
    'Agreeableness Score', 'Neuroticism Score',
    'Cognitive Style', 'Learning Preferences', 'Emotional Intelligence Profile'
]
missing = [c for c in required_cols if c not in merged.columns]
if missing:
    print(f"   ❌ MISSING COLUMNS: {missing}")
else:
    print(f"   ✅ All required columns present")

# Step 5: Split by age group
adolescents = merged[merged['Age Category'].str.lower().str.contains('adolescent', na=False)]
adults = merged[merged['Age Category'].str.lower().str.contains('adult', na=False)]
print(f"\n📊 DISTRIBUTION:")
print(f"   Adolescents (14-17): {len(adolescents)}")
print(f"   Adults (18-23):      {len(adults)}")

# Step 6: Save output
print(f"\n💾 Saving to: {OUTPUT_FILE}")
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
merged.to_excel(OUTPUT_FILE, index=False)
print(f"   ✅ Saved {len(merged)} rows, {len(merged.columns)} columns")

# Step 7: Show sample
print(f"\n📋 SAMPLE PERSONA:")
sample = merged.iloc[0]
key_cols = ['StudentCPID', 'First Name', 'Last Name', 'Age', 'Age Category',
            'Openness Score', 'Conscientiousness Score', 'Cognitive Style']
for col in key_cols:
    val = str(sample.get(col, 'N/A'))[:80]
    print(f"   {col}: {val}")