CP_Assessment_engine/scripts/prepare_data.py
2026-02-10 12:59:40 +05:30

134 lines
5.2 KiB
Python

# Data Preparation: Create merged personas with zero schema drift
import pandas as pd
from pathlib import Path
# Use relative path from script location
BASE_DIR = Path(__file__).resolve().parent.parent
OUTPUT_FILE = BASE_DIR / 'data' / 'merged_personas.xlsx'
print("="*80)
print("DATA PREPARATION - ZERO RISK MERGE")
print("="*80)
# Step 1: Load ground truth sources
print("\n📂 Loading ground truth sources...")
# Try multiple possible locations for files
possible_students = [
BASE_DIR / '3000-students.xlsx',
BASE_DIR / 'support' / '3000-students.xlsx',
]
possible_cpids = [
BASE_DIR / '3000_students_output.xlsx',
BASE_DIR / 'support' / '3000_students_output.xlsx',
]
possible_personas = [
BASE_DIR / 'fixed_3k_personas.xlsx',
BASE_DIR / 'support' / 'fixed_3k_personas.xlsx',
]
# Find existing files
students_file = next((f for f in possible_students if f.exists()), None)
cpids_file = next((f for f in possible_cpids if f.exists()), None)
personas_file = next((f for f in possible_personas if f.exists()), None)
if not students_file:
raise FileNotFoundError(f"3000-students.xlsx not found in: {possible_students}")
if not cpids_file:
raise FileNotFoundError(f"3000_students_output.xlsx not found in: {possible_cpids}")
if not personas_file:
raise FileNotFoundError(f"fixed_3k_personas.xlsx not found in: {possible_personas}")
df_students = pd.read_excel(students_file)
df_cpids = pd.read_excel(cpids_file)
df_personas = pd.read_excel(personas_file)
print(f" 3000-students.xlsx: {len(df_students)} rows, {len(df_students.columns)} columns")
print(f" 3000_students_output.xlsx: {len(df_cpids)} rows")
print(f" fixed_3k_personas.xlsx: {len(df_personas)} rows")
# Step 2: Join on Roll Number
print("\n🔗 Merging on Roll Number...")
# Rename for consistency
df_cpids_clean = df_cpids[['RollNo', 'StudentCPID', 'SchoolCode', 'SchoolName', 'Class', 'Section']].copy()
df_cpids_clean.columns = ['Roll Number', 'StudentCPID', 'SchoolCode_DB', 'SchoolName_DB', 'Class_DB', 'Section_DB']
merged = df_students.merge(df_cpids_clean, on='Roll Number', how='inner')
print(f" After joining with CPIDs: {len(merged)} rows")
# Step 3: Add behavioral fingerprint and additional persona columns
print("\n🧠 Adding behavioral fingerprint and persona enrichment columns...")
# Define columns to add from fixed_3k_personas.xlsx
persona_columns = [
'short_term_focus_1', 'short_term_focus_2', 'short_term_focus_3',
'long_term_focus_1', 'long_term_focus_2', 'long_term_focus_3',
'strength_1', 'strength_2', 'strength_3',
'improvement_area_1', 'improvement_area_2', 'improvement_area_3',
'hobby_1', 'hobby_2', 'hobby_3',
'clubs', 'achievements',
'expectation_1', 'expectation_2', 'expectation_3',
'segment', 'archetype',
'behavioral_fingerprint'
]
# Extract available columns from df_personas
available_cols = [col for col in persona_columns if col in df_personas.columns]
print(f" Found {len(available_cols)} persona enrichment columns in fixed_3k_personas.xlsx")
# Add columns positionally (both files have 3000 rows, safe positional match)
if available_cols:
for col in available_cols:
if len(df_personas) == len(merged):
merged[col] = df_personas[col].values
else:
# Fallback: match by index if row counts differ
merged[col] = df_personas[col].values[:len(merged)]
# Count non-null values for behavioral_fingerprint
if 'behavioral_fingerprint' in merged.columns:
fp_count = merged['behavioral_fingerprint'].notna().sum()
print(f" Behavioral fingerprints added: {fp_count}/{len(merged)}")
print(f" ✅ Added {len(available_cols)} persona enrichment columns")
else:
print(f" ⚠️ No persona enrichment columns found in fixed_3k_personas.xlsx")
# Step 4: Validate columns
print("\n✅ VALIDATION:")
required_cols = [
'Roll Number', 'First Name', 'Last Name', 'Age', 'Gender', 'Age Category',
'StudentCPID',
'Openness Score', 'Conscientiousness Score', 'Extraversion Score',
'Agreeableness Score', 'Neuroticism Score',
'Cognitive Style', 'Learning Preferences', 'Emotional Intelligence Profile'
]
missing = [c for c in required_cols if c not in merged.columns]
if missing:
print(f" ❌ MISSING COLUMNS: {missing}")
else:
print(f" ✅ All required columns present")
# Step 5: Split by age group
adolescents = merged[merged['Age Category'].str.lower().str.contains('adolescent', na=False)]
adults = merged[merged['Age Category'].str.lower().str.contains('adult', na=False)]
print(f"\n📊 DISTRIBUTION:")
print(f" Adolescents (14-17): {len(adolescents)}")
print(f" Adults (18-23): {len(adults)}")
# Step 6: Save output
print(f"\n💾 Saving to: {OUTPUT_FILE}")
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
merged.to_excel(OUTPUT_FILE, index=False)
print(f" ✅ Saved {len(merged)} rows, {len(merged.columns)} columns")
# Step 7: Show sample
print(f"\n📋 SAMPLE PERSONA:")
sample = merged.iloc[0]
key_cols = ['StudentCPID', 'First Name', 'Last Name', 'Age', 'Age Category',
'Openness Score', 'Conscientiousness Score', 'Cognitive Style']
for col in key_cols:
val = str(sample.get(col, 'N/A'))[:80]
print(f" {col}: {val}")