247 lines
9.3 KiB
Python
247 lines
9.3 KiB
Python
"""
|
||
Comprehensive Quality Check - 100% Verification
|
||
Checks completion, data quality, schema accuracy, and completeness
|
||
"""
|
||
import pandas as pd
|
||
from pathlib import Path
|
||
import sys
|
||
import io
|
||
|
||
# Fix Windows console encoding: the report below prints emoji, which legacy
# Windows code pages cannot encode.
if sys.platform == 'win32':
    if hasattr(sys.stdout, 'reconfigure'):
        # Switch the existing stream to UTF-8 in place; this keeps its
        # buffering mode and avoids creating a second wrapper over one buffer.
        sys.stdout.reconfigure(encoding='utf-8')
    elif hasattr(sys.stdout, 'buffer'):
        # Fallback for stream objects without reconfigure(): re-wrap the raw buffer.
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
    # Otherwise stdout was replaced by an object exposing neither API
    # (e.g. an IDE or capture shim); leave it alone rather than crash on import.
|
||
|
||
# Project root: two directory levels above this script's resolved location.
BASE_DIR = Path(__file__).resolve().parent.parent

# Input data and generated-output locations, all anchored at BASE_DIR.
DATA_DIR = BASE_DIR.joinpath("data")
OUTPUT_DIR = BASE_DIR.joinpath("output", "full_run")
QUESTIONS_FILE = DATA_DIR.joinpath("AllQuestions.xlsx")

# Expected counts
# Row counts every generated workbook must match, per age group.
EXPECTED_ADOLESCENTS = 1507
EXPECTED_ADULTS = 1493
# Number of domain workbooks and cognition tests expected per age group.
EXPECTED_DOMAINS = 5
EXPECTED_COGNITION_TESTS = 12
|
||
|
||
def load_questions():
    """Read the question bank and count its rows per domain / age-group pair.

    Returns:
        tuple: ``(counts, questions)`` where ``counts`` maps
        ``"<domain>_<age-group>"`` to the number of rows in QUESTIONS_FILE
        carrying that combination, and ``questions`` is the full DataFrame.
        If anything goes wrong (unreadable file, missing column, ...), a
        warning is printed and ``({}, empty DataFrame)`` is returned instead.
    """
    try:
        questions = pd.read_excel(QUESTIONS_FILE, engine='openpyxl')

        counts = {}
        domain_column = questions['domain']
        for domain_name in domain_column.unique():
            per_domain = questions[domain_column == domain_name]
            age_column = per_domain['age-group']
            for age in age_column.unique():
                # Summing the boolean mask counts the rows for this pair.
                counts[f"{domain_name}_{age}"] = int((age_column == age).sum())
    except Exception as err:
        print(f"⚠️ Error loading questions: {err}")
        return {}, pd.DataFrame()

    return counts, questions
|
||
|
||
def check_file_completeness(file_path, expected_rows, domain_name, age_group):
    """Validate one generated workbook and report the first problem found.

    Checks run in this order, stopping at the first failure:
      1. the file exists;
      2. its row count equals ``expected_rows``;
      3. it has a 'Student CPID' or 'Participant' column;
      4. that ID column contains no missing values;
      5. at least 95% of all cells are non-null.

    Args:
        file_path: ``pathlib.Path`` of the .xlsx file to inspect.
        expected_rows: Exact number of data rows the sheet must contain.
        domain_name: Not used by the checks; kept so existing callers work.
        age_group: Not used by the checks; kept so existing callers work.

    Returns:
        tuple: ``(passed, message)`` where ``passed`` is a bool and
        ``message`` is a printable status line. Any exception raised while
        reading or inspecting the file is reported as a failed check
        rather than propagated.
    """
    if not file_path.exists():
        return False, f"❌ MISSING: {file_path.name}"

    try:
        df = pd.read_excel(file_path, engine='openpyxl')
        actual_rows = len(df)

        if actual_rows != expected_rows:
            return False, f"❌ ROW COUNT MISMATCH: Expected {expected_rows}, got {actual_rows}"

        # Check for required columns: 'Student CPID' wins when both are present.
        id_col = next(
            (name for name in ('Student CPID', 'Participant') if name in df.columns),
            None,
        )
        if id_col is None:
            return False, "❌ MISSING ID COLUMN: No Student CPID or Participant column"

        # Check for NaN in ID column
        nan_count = int(df[id_col].isna().sum())
        if nan_count > 0:
            return False, f"❌ {nan_count} NaN values in ID column"

        # Check data density (non-null percentage)
        total_cells = df.size
        null_cells = int(df.isna().sum().sum())
        if total_cells == 0:
            # A header-only sheet has no cells that could be missing; treat
            # it as fully dense instead of dividing by zero (which used to
            # surface as "nan% density").
            density = 100.0
        else:
            density = (total_cells - null_cells) / total_cells * 100

        if density < 95:
            return False, f"⚠️ LOW DATA DENSITY: {density:.2f}% (expected >95%)"

        return True, f"✅ {actual_rows} rows, {density:.2f}% density"
    except Exception as e:
        return False, f"❌ ERROR: {e}"
|
||
|
||
def check_question_completeness(file_path, domain_name, age_group, questions_df):
    """Compare a workbook's question columns with the question bank.

    Only column presence is checked: a question counts as "answered" when a
    column labelled with its code exists, whatever the cell contents are.

    Args:
        file_path: Path of the .xlsx file to inspect.
        domain_name: Value matched against the 'domain' column of ``questions_df``.
        age_group: Value matched against the 'age-group' column of ``questions_df``.
        questions_df: Question bank with 'domain', 'age-group' and 'code' columns.

    Returns:
        tuple: ``(passed, message)``. Fails when an expected question code
        has no column, or when the workbook has a non-metadata column that
        is not an expected code. Any exception is reported as a failed
        check rather than propagated.
    """
    try:
        df = pd.read_excel(file_path, engine='openpyxl')

        # Get expected questions for this domain/age
        in_scope = (
            (questions_df['domain'] == domain_name)
            & (questions_df['age-group'] == age_group)
        )
        expected_q_codes = set(questions_df.loc[in_scope, 'code'].astype(str))

        # Get answered question codes (columns minus metadata). Labels are
        # normalised to str because expected codes are compared as strings,
        # while numeric headers come back from Excel as numbers.
        metadata_cols = {'Student CPID', 'Participant', 'Name', 'Age', 'Gender', 'Age Category'}
        question_cols = {str(col) for col in df.columns} - metadata_cols

        missing = expected_q_codes - question_cols
        # Diff against every non-metadata column; diffing only the columns
        # already known to be expected could never report anything.
        extra = question_cols - expected_q_codes

        if missing:
            return False, f"❌ MISSING QUESTIONS: {len(missing)} questions not answered"
        if extra:
            return False, f"⚠️ EXTRA QUESTIONS: {len(extra)} unexpected columns"

        return True, f"✅ All {len(expected_q_codes)} questions answered"
    except Exception as e:
        return False, f"❌ ERROR checking questions: {e}"
|
||
|
||
def _check_domain_files(title, group_dir, age_group, expected_rows, domain_file_stems, questions_df):
    """Print and evaluate the domain workbook checks for one age group.

    Args:
        title: Section heading printed before the results.
        group_dir: Sub-directory of OUTPUT_DIR holding this group's files.
        age_group: Age-range suffix used in file names and question lookup.
        expected_rows: Row count every workbook of this group must have.
        domain_file_stems: Mapping of domain name to file-name stem.
        questions_df: Question bank; when empty, question checks are skipped.

    Returns:
        bool: True only if every check that ran for this group passed.
    """
    print(title)
    print("-" * 80)

    group_passed = True
    for domain, stem in domain_file_stems.items():
        file_path = OUTPUT_DIR / group_dir / "5_domain" / f"{stem}_{age_group}.xlsx"
        passed, msg = check_file_completeness(file_path, expected_rows, domain, age_group)
        print(f" {domain:30} {msg}")
        if not passed:
            group_passed = False
            continue

        # Check question completeness only for a structurally valid file,
        # and only when the question bank could be loaded.
        if questions_df.empty:
            continue
        q_passed, q_msg = check_question_completeness(file_path, domain, age_group, questions_df)
        print(f" {q_msg}")
        if not q_passed:
            group_passed = False

    print()
    return group_passed


def _check_cognition_file(label, group_dir, test, age_group, expected_rows):
    """Print and evaluate one cognition workbook; a missing file counts as skipped.

    Returns:
        bool: False only when the file exists and fails its completeness
        check; a file that was not generated does not fail the run.
    """
    file_path = OUTPUT_DIR / group_dir / "cognition" / f"{test}_{age_group}.xlsx"
    if not file_path.exists():
        print(f" {label} {test:35} ⏭️ SKIPPED (not generated)")
        return True

    passed, msg = check_file_completeness(file_path, expected_rows, test, age_group)
    print(f" {label} {test:35} {msg}")
    return passed


def main():
    """Run every quality check, print a report, and return the overall verdict.

    Checks the five domain workbooks and the cognition workbooks for both
    age groups under OUTPUT_DIR, then prints summary statistics.

    Returns:
        bool: True if every check that ran passed, False otherwise.
    """
    print("=" * 80)
    print("🔍 COMPREHENSIVE QUALITY CHECK - 100% VERIFICATION")
    print("=" * 80)
    print()

    # Load questions (only the full table is needed here, not the counts).
    _, questions_df = load_questions()

    # (row label, section title, output sub-directory, age suffix, expected rows)
    # NOTE(review): 'adolescense' is kept exactly as spelled; presumably it
    # matches the generator's on-disk directory - confirm before renaming.
    groups = (
        ("Adolescent", "📊 ADOLESCENTS (14-17) - 5 DOMAINS", "adolescense", "14-17", EXPECTED_ADOLESCENTS),
        ("Adult", "📊 ADULTS (18-23) - 5 DOMAINS", "adults", "18-23", EXPECTED_ADULTS),
    )

    # Domain name to file-name stem; the age suffix and extension are appended
    # per group (the mapping is described in SOURCE as coming from config.py).
    domain_file_stems = {
        'Personality': 'Personality',
        'Grit': 'Grit',
        'Emotional Intelligence': 'Emotional_Intelligence',
        'Vocational Interest': 'Vocational_Interest',
        'Learning Strategies': 'Learning_Strategies',
    }

    all_passed = True

    # Check 5 domains for each age group
    for _label, title, group_dir, age_group, expected_rows in groups:
        if not _check_domain_files(title, group_dir, age_group, expected_rows,
                                   domain_file_stems, questions_df):
            all_passed = False

    # Check cognition tests
    print("🧠 COGNITION TESTS")
    print("-" * 80)
    cognition_tests = [
        'Cognitive_Flexibility_Test', 'Color_Stroop_Task',
        'Problem_Solving_Test_MRO', 'Problem_Solving_Test_MR',
        'Problem_Solving_Test_NPS', 'Problem_Solving_Test_SBDM',
        'Reasoning_Tasks_AR', 'Reasoning_Tasks_DR', 'Reasoning_Tasks_NR',
        'Response_Inhibition_Task', 'Sternberg_Working_Memory_Task',
        'Visual_Paired_Associates_Test',
    ]

    for test in cognition_tests:
        # Each group carries its own age suffix. Reusing a variable left over
        # from the adult domain loop made the adolescent lookup search for
        # "*_18-23.xlsx" and report every adolescent test as skipped.
        for label, _title, group_dir, age_group, expected_rows in groups:
            if not _check_cognition_file(label, group_dir, test, age_group, expected_rows):
                all_passed = False

    print()
    print("=" * 80)

    # Summary
    if all_passed:
        print("✅ ALL CHECKS PASSED - 100% COMPLETE AND ACCURATE")
    else:
        print("❌ SOME CHECKS FAILED - REVIEW REQUIRED")

    print("=" * 80)

    # Calculate totals
    total_domain_files = len(domain_file_stems) * len(groups)  # 5 domains × 2 age groups
    # Count the cognition workbooks actually present on disk.
    cognition_file_count = sum(
        1
        for _label, _title, group_dir, _age, _rows in groups
        for _ in (OUTPUT_DIR / group_dir / 'cognition').glob('*.xlsx')
    )

    print()
    print("📈 SUMMARY STATISTICS")
    print("-" * 80)
    print(f"Total Domain Files: {total_domain_files}")
    print(f"Total Cognition Files: {cognition_file_count}")
    print(f"Adolescent Students: {EXPECTED_ADOLESCENTS}")
    print(f"Adult Students: {EXPECTED_ADULTS}")
    print(f"Total Students: {EXPECTED_ADOLESCENTS + EXPECTED_ADULTS}")

    return all_passed
|
||
|
||
if __name__ == "__main__":
    # Report the overall verdict to the shell: 0 when every check passed, 1 otherwise.
    exit_code = 0 if main() else 1
    sys.exit(exit_code)
|