# CP_Assessment_engine/services/data_loader.py
# 2026-02-10 12:59:40 +05:30
#
# 167 lines
# 5.3 KiB
# Python

"""
Data Loader v2.0 - Zero Risk Edition
Loads merged personas and questions with full psychometric profiles.
"""
import pandas as pd
import json
from pathlib import Path
from typing import List, Dict, Tuple, Any
import ast
# Path configuration.
# BASE_DIR is the directory two levels above this file; both workbooks are
# looked up in its "data" sub-directory.
BASE_DIR = Path(__file__).resolve().parent.parent
PERSONAS_FILE = BASE_DIR.joinpath("data", "merged_personas.xlsx")
# The questions workbook is kept inside the project tree as well.
QUESTIONS_FILE = BASE_DIR.joinpath("data", "AllQuestions.xlsx")
def load_personas() -> Tuple[List[Dict], List[Dict]]:
    """
    Read the merged personas workbook and split its rows by age category.

    A row lands in a group when its 'Age Category' text contains the
    group keyword ('adolescent' or 'adult'), compared case-insensitively.
    Rows with a missing category match neither group.

    Returns: (adolescents, adults), each a list of row dicts
    Raises: FileNotFoundError if the personas workbook does not exist
    """
    if not PERSONAS_FILE.exists():
        raise FileNotFoundError(f"Merged personas file not found: {PERSONAS_FILE}")

    personas = pd.read_excel(PERSONAS_FILE)

    # Lower-case the category column once; both keyword checks reuse it.
    age_category = personas['Age Category'].str.lower()

    def rows_matching(keyword: str) -> List[Dict]:
        # na=False makes rows without a category count as "no match".
        mask = age_category.str.contains(keyword, na=False)
        return personas[mask].copy().to_dict('records')

    adolescents = rows_matching('adolescent')
    adults = rows_matching('adult')

    print(f"📊 Loaded {len(adolescents)} adolescents, {len(adults)} adults")
    return adolescents, adults
def parse_behavioral_fingerprint(fp_str: Any) -> Dict[str, Any]:
    """
    Safely parse a behavioral fingerprint (JSON or Python dict literal).

    Args:
        fp_str: A dict (returned unchanged), a string holding a JSON object
            or a Python dict literal, or a missing value (None / NaN).

    Returns:
        The parsed dict. An empty dict is returned when the value is
        missing, blank, unparseable, or parses to something that is not a
        dict (e.g. a number or a list).
    """
    if isinstance(fp_str, dict):
        return fp_str

    # pd.isna returns an array for list-like input, whose truth value is
    # ambiguous, so the NaN check is restricted to scalars.
    if fp_str is None or (pd.api.types.is_scalar(fp_str) and pd.isna(fp_str)):
        return {}

    text = str(fp_str).strip()
    if not text:
        return {}

    # Try strict JSON first, then fall back to a Python literal
    # (handles single-quoted keys, True/False/None, ...).
    parsers = (
        (json.loads, (ValueError, RecursionError)),
        (ast.literal_eval,
         (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)),
    )
    for parse, expected_errors in parsers:
        try:
            parsed = parse(text)
        except expected_errors:
            continue
        # Only a dict satisfies the declared return type; scalars and lists
        # that happen to parse are not fingerprints.
        if isinstance(parsed, dict):
            return parsed

    return {}
def _cell_text(row: Any, column: str) -> str:
    """Return the stripped text of ``row[column]``; '' if the column is absent or the cell is NaN."""
    value = row.get(column, '')
    # Missing cells come back from pandas as NaN, and str(NaN) would leak
    # the literal text "nan" into the question record.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ''
    return str(value).strip()


def load_questions() -> Dict[str, List[Dict]]:
    """
    Load questions grouped by domain.

    Each question is a dict with the keys 'q_code', 'domain', 'dimension',
    'subdimension', 'age_group', 'question', 'options_list',
    'is_reverse_scored' and 'type'. Text fields are stripped; a missing
    cell yields an empty string.

    Returns: { 'Personality': [q1, q2, ...], 'Grit': [...], ... }
    Raises: FileNotFoundError if the questions workbook does not exist
    """
    if not QUESTIONS_FILE.exists():
        raise FileNotFoundError(f"Questions file not found: {QUESTIONS_FILE}")

    df = pd.read_excel(QUESTIONS_FILE)

    # Normalize column names (str() guards against non-string headers).
    df.columns = [str(c).strip() for c in df.columns]

    # Canonical domain names keyed by lower-cased spelling, so any casing
    # of a known domain (or the 'EI' abbreviation) maps to one group.
    domain_aliases = {
        'personality': 'Personality',
        'grit': 'Grit',
        'emotional intelligence': 'Emotional Intelligence',
        'ei': 'Emotional Intelligence',
        'vocational interest': 'Vocational Interest',
        'learning strategies': 'Learning Strategies',
    }

    questions_by_domain: Dict[str, List[Dict[str, Any]]] = {}

    for _, row in df.iterrows():
        raw_domain = _cell_text(row, 'domain')
        # Unknown domains are kept under their own (stripped) name.
        domain = domain_aliases.get(raw_domain.lower(), raw_domain)

        # Collect the non-empty answer options from option1..option5.
        option_texts = (_cell_text(row, f'option{i}') for i in range(1, 6))
        options = [text for text in option_texts if text]

        # A question is reverse-scored when its tag mentions "reverse".
        is_reverse = 'reverse' in _cell_text(row, 'tag').lower()

        question = {
            'q_code': _cell_text(row, 'code'),
            'domain': domain,
            'dimension': _cell_text(row, 'dimension'),
            'subdimension': _cell_text(row, 'subdimension'),
            'age_group': _cell_text(row, 'age-group'),
            'question': _cell_text(row, 'question'),
            'options_list': options,
            'is_reverse_scored': is_reverse,
            'type': _cell_text(row, 'Type'),
        }
        questions_by_domain.setdefault(domain, []).append(question)

    # Print summary
    print("📋 Questions loaded:")
    for domain, qs in questions_by_domain.items():
        reverse_count = sum(1 for q in qs if q['is_reverse_scored'])
        print(f" {domain}: {len(qs)} questions ({reverse_count} reverse-scored)")

    return questions_by_domain
def get_questions_by_age(questions_by_domain: Dict[str, List[Dict[str, Any]]], age_group: str) -> Dict[str, List[Dict[str, Any]]]:
    """
    Narrow every domain's questions to those whose 'age_group' text
    contains `age_group` (e.g. 14-17 or 18-23).

    A domain with no matching question keeps its complete, original
    question list as a fallback.
    """
    return {
        domain_name: (
            [item for item in domain_questions
             if age_group in item.get('age_group', '')]
            # Empty match list is falsy -> fall back to the full list.
            or domain_questions
        )
        for domain_name, domain_questions in questions_by_domain.items()
    }
if __name__ == "__main__":
    # Manual smoke test: load both workbooks and print a short summary.
    print("🧪 Testing Data Loader v2.0...")
    adolescents, adults = load_personas()

    print("\n👤 Sample Adolescent:")
    if adolescents:
        sample = adolescents[0]
        print(f" CPID: {sample.get('StudentCPID')}")
        print(f" Name: {sample.get('First Name')} {sample.get('Last Name')}")
        print(f" Openness: {sample.get('Openness Score')}")
    else:
        # Indexing an empty list would abort the run before the
        # questions workbook is ever exercised.
        print(" (no adolescent personas loaded)")

    questions = load_questions()
    print(f"\n📝 Total Domains: {len(questions)}")