# Data Preparation: Create merged personas with zero schema drift import pandas as pd from pathlib import Path # Use relative path from script location BASE_DIR = Path(__file__).resolve().parent.parent OUTPUT_FILE = BASE_DIR / 'data' / 'merged_personas.xlsx' print("="*80) print("DATA PREPARATION - ZERO RISK MERGE") print("="*80) # Step 1: Load ground truth sources print("\nšŸ“‚ Loading ground truth sources...") # Try multiple possible locations for files possible_students = [ BASE_DIR / '3000-students.xlsx', BASE_DIR / 'support' / '3000-students.xlsx', ] possible_cpids = [ BASE_DIR / '3000_students_output.xlsx', BASE_DIR / 'support' / '3000_students_output.xlsx', ] possible_personas = [ BASE_DIR / 'fixed_3k_personas.xlsx', BASE_DIR / 'support' / 'fixed_3k_personas.xlsx', ] # Find existing files students_file = next((f for f in possible_students if f.exists()), None) cpids_file = next((f for f in possible_cpids if f.exists()), None) personas_file = next((f for f in possible_personas if f.exists()), None) if not students_file: raise FileNotFoundError(f"3000-students.xlsx not found in: {possible_students}") if not cpids_file: raise FileNotFoundError(f"3000_students_output.xlsx not found in: {possible_cpids}") if not personas_file: raise FileNotFoundError(f"fixed_3k_personas.xlsx not found in: {possible_personas}") df_students = pd.read_excel(students_file) df_cpids = pd.read_excel(cpids_file) df_personas = pd.read_excel(personas_file) print(f" 3000-students.xlsx: {len(df_students)} rows, {len(df_students.columns)} columns") print(f" 3000_students_output.xlsx: {len(df_cpids)} rows") print(f" fixed_3k_personas.xlsx: {len(df_personas)} rows") # Step 2: Join on Roll Number print("\nšŸ”— Merging on Roll Number...") # Rename for consistency df_cpids_clean = df_cpids[['RollNo', 'StudentCPID', 'SchoolCode', 'SchoolName', 'Class', 'Section']].copy() df_cpids_clean.columns = ['Roll Number', 'StudentCPID', 'SchoolCode_DB', 'SchoolName_DB', 'Class_DB', 'Section_DB'] merged = df_students.merge(df_cpids_clean, on='Roll Number', how='inner') print(f" After joining with CPIDs: {len(merged)} rows") # Step 3: Add behavioral fingerprint and additional persona columns print("\n🧠 Adding behavioral fingerprint and persona enrichment columns...") # Define columns to add from fixed_3k_personas.xlsx persona_columns = [ 'short_term_focus_1', 'short_term_focus_2', 'short_term_focus_3', 'long_term_focus_1', 'long_term_focus_2', 'long_term_focus_3', 'strength_1', 'strength_2', 'strength_3', 'improvement_area_1', 'improvement_area_2', 'improvement_area_3', 'hobby_1', 'hobby_2', 'hobby_3', 'clubs', 'achievements', 'expectation_1', 'expectation_2', 'expectation_3', 'segment', 'archetype', 'behavioral_fingerprint' ] # Extract available columns from df_personas available_cols = [col for col in persona_columns if col in df_personas.columns] print(f" Found {len(available_cols)} persona enrichment columns in fixed_3k_personas.xlsx") # Add columns positionally (both files have 3000 rows, safe positional match) if available_cols: for col in available_cols: if len(df_personas) == len(merged): merged[col] = df_personas[col].values else: # Fallback: match by index if row counts differ merged[col] = df_personas[col].values[:len(merged)] # Count non-null values for behavioral_fingerprint if 'behavioral_fingerprint' in merged.columns: fp_count = merged['behavioral_fingerprint'].notna().sum() print(f" Behavioral fingerprints added: {fp_count}/{len(merged)}") print(f" āœ… Added {len(available_cols)} persona enrichment columns") else: print(f" āš ļø No persona enrichment columns found in fixed_3k_personas.xlsx") # Step 4: Validate columns print("\nāœ… VALIDATION:") required_cols = [ 'Roll Number', 'First Name', 'Last Name', 'Age', 'Gender', 'Age Category', 'StudentCPID', 'Openness Score', 'Conscientiousness Score', 'Extraversion Score', 'Agreeableness Score', 'Neuroticism Score', 'Cognitive Style', 'Learning Preferences', 'Emotional Intelligence Profile' ] missing = [c for c in required_cols if c not in merged.columns] if missing: print(f" āŒ MISSING COLUMNS: {missing}") else: print(f" āœ… All required columns present") # Step 5: Split by age group adolescents = merged[merged['Age Category'].str.lower().str.contains('adolescent', na=False)] adults = merged[merged['Age Category'].str.lower().str.contains('adult', na=False)] print(f"\nšŸ“Š DISTRIBUTION:") print(f" Adolescents (14-17): {len(adolescents)}") print(f" Adults (18-23): {len(adults)}") # Step 6: Save output print(f"\nšŸ’¾ Saving to: {OUTPUT_FILE}") OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True) merged.to_excel(OUTPUT_FILE, index=False) print(f" āœ… Saved {len(merged)} rows, {len(merged.columns)} columns") # Step 7: Show sample print(f"\nšŸ“‹ SAMPLE PERSONA:") sample = merged.iloc[0] key_cols = ['StudentCPID', 'First Name', 'Last Name', 'Age', 'Age Category', 'Openness Score', 'Conscientiousness Score', 'Cognitive Style'] for col in key_cols: val = str(sample.get(col, 'N/A'))[:80] print(f" {col}: {val}")