134 lines
5.2 KiB
Python
134 lines
5.2 KiB
Python
# Data Preparation: Create merged personas with zero schema drift
|
|
import pandas as pd
|
|
from pathlib import Path
|
|
|
|
# Use relative path from script location
|
|
BASE_DIR = Path(__file__).resolve().parent.parent
|
|
OUTPUT_FILE = BASE_DIR / 'data' / 'merged_personas.xlsx'
|
|
|
|
print("="*80)
|
|
print("DATA PREPARATION - ZERO RISK MERGE")
|
|
print("="*80)
|
|
|
|
# Step 1: Load ground truth sources
|
|
print("\n📂 Loading ground truth sources...")
|
|
|
|
# Try multiple possible locations for files
|
|
possible_students = [
|
|
BASE_DIR / '3000-students.xlsx',
|
|
BASE_DIR / 'support' / '3000-students.xlsx',
|
|
]
|
|
possible_cpids = [
|
|
BASE_DIR / '3000_students_output.xlsx',
|
|
BASE_DIR / 'support' / '3000_students_output.xlsx',
|
|
]
|
|
possible_personas = [
|
|
BASE_DIR / 'fixed_3k_personas.xlsx',
|
|
BASE_DIR / 'support' / 'fixed_3k_personas.xlsx',
|
|
]
|
|
|
|
# Find existing files
|
|
students_file = next((f for f in possible_students if f.exists()), None)
|
|
cpids_file = next((f for f in possible_cpids if f.exists()), None)
|
|
personas_file = next((f for f in possible_personas if f.exists()), None)
|
|
|
|
if not students_file:
|
|
raise FileNotFoundError(f"3000-students.xlsx not found in: {possible_students}")
|
|
if not cpids_file:
|
|
raise FileNotFoundError(f"3000_students_output.xlsx not found in: {possible_cpids}")
|
|
if not personas_file:
|
|
raise FileNotFoundError(f"fixed_3k_personas.xlsx not found in: {possible_personas}")
|
|
|
|
df_students = pd.read_excel(students_file)
|
|
df_cpids = pd.read_excel(cpids_file)
|
|
df_personas = pd.read_excel(personas_file)
|
|
|
|
print(f" 3000-students.xlsx: {len(df_students)} rows, {len(df_students.columns)} columns")
|
|
print(f" 3000_students_output.xlsx: {len(df_cpids)} rows")
|
|
print(f" fixed_3k_personas.xlsx: {len(df_personas)} rows")
|
|
|
|
# Step 2: Join on Roll Number
|
|
print("\n🔗 Merging on Roll Number...")
|
|
|
|
# Rename for consistency
|
|
df_cpids_clean = df_cpids[['RollNo', 'StudentCPID', 'SchoolCode', 'SchoolName', 'Class', 'Section']].copy()
|
|
df_cpids_clean.columns = ['Roll Number', 'StudentCPID', 'SchoolCode_DB', 'SchoolName_DB', 'Class_DB', 'Section_DB']
|
|
|
|
merged = df_students.merge(df_cpids_clean, on='Roll Number', how='inner')
|
|
print(f" After joining with CPIDs: {len(merged)} rows")
|
|
|
|
# Step 3: Add behavioral fingerprint and additional persona columns
|
|
print("\n🧠 Adding behavioral fingerprint and persona enrichment columns...")
|
|
|
|
# Define columns to add from fixed_3k_personas.xlsx
|
|
persona_columns = [
|
|
'short_term_focus_1', 'short_term_focus_2', 'short_term_focus_3',
|
|
'long_term_focus_1', 'long_term_focus_2', 'long_term_focus_3',
|
|
'strength_1', 'strength_2', 'strength_3',
|
|
'improvement_area_1', 'improvement_area_2', 'improvement_area_3',
|
|
'hobby_1', 'hobby_2', 'hobby_3',
|
|
'clubs', 'achievements',
|
|
'expectation_1', 'expectation_2', 'expectation_3',
|
|
'segment', 'archetype',
|
|
'behavioral_fingerprint'
|
|
]
|
|
|
|
# Extract available columns from df_personas
|
|
available_cols = [col for col in persona_columns if col in df_personas.columns]
|
|
print(f" Found {len(available_cols)} persona enrichment columns in fixed_3k_personas.xlsx")
|
|
|
|
# Add columns positionally (both files have 3000 rows, safe positional match)
|
|
if available_cols:
|
|
for col in available_cols:
|
|
if len(df_personas) == len(merged):
|
|
merged[col] = df_personas[col].values
|
|
else:
|
|
# Fallback: match by index if row counts differ
|
|
merged[col] = df_personas[col].values[:len(merged)]
|
|
|
|
# Count non-null values for behavioral_fingerprint
|
|
if 'behavioral_fingerprint' in merged.columns:
|
|
fp_count = merged['behavioral_fingerprint'].notna().sum()
|
|
print(f" Behavioral fingerprints added: {fp_count}/{len(merged)}")
|
|
|
|
print(f" ✅ Added {len(available_cols)} persona enrichment columns")
|
|
else:
|
|
print(f" ⚠️ No persona enrichment columns found in fixed_3k_personas.xlsx")
|
|
|
|
# Step 4: Validate columns
|
|
print("\n✅ VALIDATION:")
|
|
required_cols = [
|
|
'Roll Number', 'First Name', 'Last Name', 'Age', 'Gender', 'Age Category',
|
|
'StudentCPID',
|
|
'Openness Score', 'Conscientiousness Score', 'Extraversion Score',
|
|
'Agreeableness Score', 'Neuroticism Score',
|
|
'Cognitive Style', 'Learning Preferences', 'Emotional Intelligence Profile'
|
|
]
|
|
missing = [c for c in required_cols if c not in merged.columns]
|
|
if missing:
|
|
print(f" ❌ MISSING COLUMNS: {missing}")
|
|
else:
|
|
print(f" ✅ All required columns present")
|
|
|
|
# Step 5: Split by age group
|
|
adolescents = merged[merged['Age Category'].str.lower().str.contains('adolescent', na=False)]
|
|
adults = merged[merged['Age Category'].str.lower().str.contains('adult', na=False)]
|
|
print(f"\n📊 DISTRIBUTION:")
|
|
print(f" Adolescents (14-17): {len(adolescents)}")
|
|
print(f" Adults (18-23): {len(adults)}")
|
|
|
|
# Step 6: Save output
|
|
print(f"\n💾 Saving to: {OUTPUT_FILE}")
|
|
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
|
|
merged.to_excel(OUTPUT_FILE, index=False)
|
|
print(f" ✅ Saved {len(merged)} rows, {len(merged.columns)} columns")
|
|
|
|
# Step 7: Show sample
|
|
print(f"\n📋 SAMPLE PERSONA:")
|
|
sample = merged.iloc[0]
|
|
key_cols = ['StudentCPID', 'First Name', 'Last Name', 'Age', 'Age Category',
|
|
'Openness Score', 'Conscientiousness Score', 'Cognitive Style']
|
|
for col in key_cols:
|
|
val = str(sample.get(col, 'N/A'))[:80]
|
|
print(f" {col}: {val}")
|