167 lines
5.3 KiB
Python
167 lines
5.3 KiB
Python
"""
|
|
Data Loader v2.0 - Zero Risk Edition
|
|
Loads merged personas and questions with full psychometric profiles.
|
|
"""
|
|
import pandas as pd
|
|
import json
|
|
from pathlib import Path
|
|
from typing import List, Dict, Tuple, Any
|
|
import ast
|
|
|
|
# Path Configuration
# The project root is two directory levels above this module's resolved path.
_THIS_FILE = Path(__file__).resolve()
BASE_DIR = _THIS_FILE.parent.parent

# Merged persona workbook, stored in the project's data directory.
PERSONAS_FILE = BASE_DIR.joinpath("data", "merged_personas.xlsx")

# Question bank workbook, stored inside the project alongside the personas.
QUESTIONS_FILE = BASE_DIR.joinpath("data", "AllQuestions.xlsx")
|
|
|
|
|
|
|
|
def load_personas() -> Tuple[List[Dict], List[Dict]]:
    """
    Read the merged personas workbook and split it by age category.

    A row is an adolescent when its 'Age Category' cell contains
    'adolescent' (case-insensitive) and an adult when it contains 'adult';
    rows with a missing category match neither group.

    Returns: (adolescents, adults), each a list of row dicts.
    Raises: FileNotFoundError if PERSONAS_FILE does not exist.
    """
    if not PERSONAS_FILE.exists():
        raise FileNotFoundError(f"Merged personas file not found: {PERSONAS_FILE}")

    personas = pd.read_excel(PERSONAS_FILE)

    def _records_matching(keyword: str) -> List[Dict]:
        # na=False makes rows with a missing category evaluate to "no match".
        mask = personas['Age Category'].str.lower().str.contains(keyword, na=False)
        return personas[mask].copy().to_dict('records')

    adolescents = _records_matching('adolescent')
    adults = _records_matching('adult')

    print(f"📊 Loaded {len(adolescents)} adolescents, {len(adults)} adults")
    return adolescents, adults
|
|
|
|
|
|
def parse_behavioral_fingerprint(fp_str: Any) -> Dict[str, Any]:
    """
    Safely parse a behavioral fingerprint into a dict.

    Accepts an already-parsed dict, a JSON object string, or a Python dict
    literal string. Anything else yields an empty dict, so the function
    always honours its Dict return type.

    Args:
        fp_str: Raw cell value (dict, string, None/NaN, or any other object).

    Returns:
        The parsed dict, or {} when the value is missing, unparseable, or
        parses to something other than a dict.
    """
    if isinstance(fp_str, dict):
        return fp_str

    # pd.isna() returns an array for list-likes, whose truth value is
    # ambiguous, so only ask it about scalars (None, NaN, pd.NA, NaT, ...).
    if pd.api.types.is_scalar(fp_str) and pd.isna(fp_str):
        return {}

    text = str(fp_str).strip()
    if not text:
        return {}

    # Try JSON first: it is the stricter and more common encoding.
    try:
        parsed = json.loads(text)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError subclass.
        parsed = None
    else:
        return parsed if isinstance(parsed, dict) else {}

    # Fall back to a Python literal (e.g. single-quoted dict repr).
    # ast.literal_eval only evaluates literals, never arbitrary code.
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return {}

    return parsed if isinstance(parsed, dict) else {}
|
|
|
|
|
|
# Canonical domain names keyed by the case-folded spelling or alias found in
# the workbook, so any capitalisation of a known domain maps to one bucket.
_DOMAIN_ALIASES: Dict[str, str] = {
    'personality': 'Personality',
    'grit': 'Grit',
    'emotional intelligence': 'Emotional Intelligence',
    'ei': 'Emotional Intelligence',
    'vocational interest': 'Vocational Interest',
    'learning strategies': 'Learning Strategies',
}


def _cell_text(value: Any) -> str:
    """Return a spreadsheet cell as stripped text; missing cells become ''."""
    if value is None:
        return ''
    # Without this check str(NaN) would leak the literal text 'nan'.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ''
    return str(value).strip()


def load_questions() -> Dict[str, List[Dict]]:
    """
    Load questions from QUESTIONS_FILE grouped by domain.

    Column headers are stripped of surrounding whitespace. Known domain
    names are normalised case-insensitively to their canonical spelling;
    unknown domains are kept as written. Empty cells are stored as '' and
    empty option cells are omitted from 'options_list'. A question is
    reverse-scored when its 'tag' cell contains 'reverse' (any case).

    Returns: { 'Personality': [q1, q2, ...], 'Grit': [...], ... } where each
        question dict has the keys q_code, domain, dimension, subdimension,
        age_group, question, options_list, is_reverse_scored and type.
    Raises: FileNotFoundError if QUESTIONS_FILE does not exist.
    """
    if not QUESTIONS_FILE.exists():
        raise FileNotFoundError(f"Questions file not found: {QUESTIONS_FILE}")

    df = pd.read_excel(QUESTIONS_FILE)

    # Normalize column names; str() guards against non-string headers.
    df.columns = [str(c).strip() for c in df.columns]

    questions_by_domain: Dict[str, List[Dict[str, Any]]] = {}

    for _, row in df.iterrows():
        raw_domain = _cell_text(row.get('domain'))
        domain = _DOMAIN_ALIASES.get(raw_domain.casefold(), raw_domain)

        # Collect option1..option5, skipping blank cells.
        option_cells = (_cell_text(row.get(f'option{i}')) for i in range(1, 6))
        options = [text for text in option_cells if text]

        question = {
            'q_code': _cell_text(row.get('code')),
            'domain': domain,
            'dimension': _cell_text(row.get('dimension')),
            'subdimension': _cell_text(row.get('subdimension')),
            'age_group': _cell_text(row.get('age-group')),
            'question': _cell_text(row.get('question')),
            'options_list': options,
            'is_reverse_scored': 'reverse' in _cell_text(row.get('tag')).lower(),
            'type': _cell_text(row.get('Type')),
        }

        questions_by_domain.setdefault(domain, []).append(question)

    # Print summary
    print("📋 Questions loaded:")
    for domain, qs in questions_by_domain.items():
        reverse_count = sum(1 for q in qs if q['is_reverse_scored'])
        print(f" {domain}: {len(qs)} questions ({reverse_count} reverse-scored)")

    return questions_by_domain
|
|
|
|
|
|
def get_questions_by_age(questions_by_domain: Dict[str, List[Dict[str, Any]]], age_group: str) -> Dict[str, List[Dict[str, Any]]]:
    """
    Filter questions by age group (14-17 or 18-23).

    A question matches when `age_group` occurs as a substring of its
    'age_group' value. A domain with no matching questions falls back to
    all of its questions.

    Args:
        questions_by_domain: Mapping of domain name to its question dicts.
        age_group: Age-group label to look for, e.g. '14-17'.

    Returns:
        A new dict with the same domain keys. Every value is a new list, so
        mutating the result never alters the lists in `questions_by_domain`
        (the question dicts themselves are shared, not copied).
    """
    filtered: Dict[str, List[Dict[str, Any]]] = {}
    for domain, questions in questions_by_domain.items():
        # str(... or '') tolerates a missing, None, or non-string age_group.
        matching = [
            q for q in questions
            if age_group in str(q.get('age_group') or '')
        ]
        # If no age-specific questions, include all (fallback) — as a copy,
        # so the caller's list is not aliased into the result.
        filtered[domain] = matching if matching else list(questions)
    return filtered
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: load both workbooks and print a short summary.
    print("🧪 Testing Data Loader v2.0...")

    adolescents, adults = load_personas()
    print("\n👤 Sample Adolescent:")
    if adolescents:
        sample = adolescents[0]
        print(f" CPID: {sample.get('StudentCPID')}")
        print(f" Name: {sample.get('First Name')} {sample.get('Last Name')}")
        print(f" Openness: {sample.get('Openness Score')}")
    else:
        # Avoid an IndexError when the workbook has no adolescent rows.
        print(" (no adolescent personas loaded)")

    questions = load_questions()
    print(f"\n📝 Total Domains: {len(questions)}")
|