""" Data Loader v2.0 - Zero Risk Edition Loads merged personas and questions with full psychometric profiles. """ import pandas as pd import json from pathlib import Path from typing import List, Dict, Tuple, Any import ast # Path Configuration BASE_DIR = Path(__file__).resolve().parent.parent PERSONAS_FILE = BASE_DIR / "data" / "merged_personas.xlsx" # Questions file - now internal to project QUESTIONS_FILE = BASE_DIR / "data" / "AllQuestions.xlsx" def load_personas() -> Tuple[List[Dict], List[Dict]]: """ Load merged personas sorted by age group. Returns: (adolescents, adults) each as list of dicts """ if not PERSONAS_FILE.exists(): raise FileNotFoundError(f"Merged personas file not found: {PERSONAS_FILE}") df = pd.read_excel(PERSONAS_FILE) # Split by age group df_adolescent = df[df['Age Category'].str.lower().str.contains('adolescent', na=False)].copy() df_adult = df[df['Age Category'].str.lower().str.contains('adult', na=False)].copy() # Convert to list of dicts adolescents = df_adolescent.to_dict('records') adults = df_adult.to_dict('records') print(f"๐Ÿ“Š Loaded {len(adolescents)} adolescents, {len(adults)} adults") return adolescents, adults def parse_behavioral_fingerprint(fp_str: Any) -> Dict[str, Any]: """ Safely parse behavioral fingerprint (JSON or Python dict literal). """ if pd.isna(fp_str) or not fp_str: return {} if isinstance(fp_str, dict): return fp_str fp_str = str(fp_str).strip() # Try JSON try: return json.loads(fp_str) except: pass # Try Python literal try: return ast.literal_eval(fp_str) except: pass return {} def load_questions() -> Dict[str, List[Dict]]: """ Load questions grouped by domain. Returns: { 'Personality': [q1, q2, ...], 'Grit': [...], ... } """ if not QUESTIONS_FILE.exists(): raise FileNotFoundError(f"Questions file not found: {QUESTIONS_FILE}") df = pd.read_excel(QUESTIONS_FILE) # Normalize column names df.columns = [c.strip() for c in df.columns] # Build questions by domain questions_by_domain: Dict[str, List[Dict[str, Any]]] = {} # Domain mapping (normalize case variations) domain_map = { 'Personality': 'Personality', 'personality': 'Personality', 'Grit': 'Grit', 'grit': 'Grit', 'GRIT': 'Grit', 'Emotional Intelligence': 'Emotional Intelligence', 'emotional intelligence': 'Emotional Intelligence', 'EI': 'Emotional Intelligence', 'Vocational Interest': 'Vocational Interest', 'vocational interest': 'Vocational Interest', 'Learning Strategies': 'Learning Strategies', 'learning strategies': 'Learning Strategies', } for _, row in df.iterrows(): raw_domain = str(row.get('domain', '')).strip() domain = domain_map.get(raw_domain, raw_domain) if domain not in questions_by_domain: questions_by_domain[domain] = [] # Build options list options = [] for i in range(1, 6): # option1 to option5 opt = row.get(f'option{i}', '') if pd.notna(opt) and str(opt).strip(): options.append(str(opt).strip()) # Check reverse scoring tag = str(row.get('tag', '')).strip().lower() is_reverse = 'reverse' in tag question = { 'q_code': str(row.get('code', '')).strip(), 'domain': domain, 'dimension': str(row.get('dimension', '')).strip(), 'subdimension': str(row.get('subdimension', '')).strip(), 'age_group': str(row.get('age-group', '')).strip(), 'question': str(row.get('question', '')).strip(), 'options_list': options, 'is_reverse_scored': is_reverse, 'type': str(row.get('Type', '')).strip(), } questions_by_domain[domain].append(question) # Print summary print("๐Ÿ“‹ Questions loaded:") for domain, qs in questions_by_domain.items(): reverse_count = sum(1 for q in qs if q['is_reverse_scored']) print(f" {domain}: {len(qs)} questions ({reverse_count} reverse-scored)") return questions_by_domain def get_questions_by_age(questions_by_domain: Dict[str, List[Dict[str, Any]]], age_group: str) -> Dict[str, List[Dict[str, Any]]]: """ Filter questions by age group (14-17 or 18-23). """ filtered = {} for domain, questions in questions_by_domain.items(): filtered[domain] = [q for q in questions if age_group in q.get('age_group', '')] # If no age-specific questions, include all (fallback) if not filtered[domain]: filtered[domain] = questions return filtered if __name__ == "__main__": # Test loading print("๐Ÿงช Testing Data Loader v2.0...") adolescents, adults = load_personas() print(f"\n๐Ÿ‘ค Sample Adolescent:") sample = adolescents[0] print(f" CPID: {sample.get('StudentCPID')}") print(f" Name: {sample.get('First Name')} {sample.get('Last Name')}") print(f" Openness: {sample.get('Openness Score')}") questions = load_questions() print(f"\n๐Ÿ“ Total Domains: {len(questions)}")