"""
Comprehensive Quality Check - 100% Verification
Checks completion, data quality, schema accuracy, and completeness
"""

import io
import sys
from pathlib import Path
from typing import Dict, Iterable, NamedTuple, Tuple

import pandas as pd

# Fix Windows console encoding so the emoji status markers can be printed.
# `reconfigure` switches the encoding of the existing stream in place; the
# wrapper fallback is only used when the stream has no `reconfigure` but does
# expose a binary buffer. Streams with neither (or a missing stdout) are left
# untouched instead of raising at import time.
if sys.platform == 'win32':
    if hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8')
    elif hasattr(sys.stdout, 'buffer'):
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

BASE_DIR = Path(__file__).resolve().parent.parent
OUTPUT_DIR = BASE_DIR / "output" / "full_run"
DATA_DIR = BASE_DIR / "data"
QUESTIONS_FILE = BASE_DIR / "data" / "AllQuestions.xlsx"

# Expected counts
EXPECTED_ADOLESCENTS = 1507
EXPECTED_ADULTS = 1493
EXPECTED_DOMAINS = 5
EXPECTED_COGNITION_TESTS = 12

# Minimum percentage of non-null cells a generated sheet must contain.
_MIN_DENSITY_PERCENT = 95

# Either of these columns identifies a participant; the first one present wins.
_ID_COLUMNS = ('Student CPID', 'Participant')

# Columns that describe the participant rather than answer a question.
_METADATA_COLUMNS = frozenset(
    {'Student CPID', 'Participant', 'Name', 'Age', 'Gender', 'Age Category'}
)

# Domain name -> file name stem (from config.py); the age-group suffix and
# the .xlsx extension are appended when the path is built.
_DOMAIN_FILE_STEMS = {
    'Personality': 'Personality',
    'Grit': 'Grit',
    'Emotional Intelligence': 'Emotional_Intelligence',
    'Vocational Interest': 'Vocational_Interest',
    'Learning Strategies': 'Learning_Strategies',
}

_COGNITION_TESTS = (
    'Cognitive_Flexibility_Test',
    'Color_Stroop_Task',
    'Problem_Solving_Test_MRO',
    'Problem_Solving_Test_MR',
    'Problem_Solving_Test_NPS',
    'Problem_Solving_Test_SBDM',
    'Reasoning_Tasks_AR',
    'Reasoning_Tasks_DR',
    'Reasoning_Tasks_NR',
    'Response_Inhibition_Task',
    'Sternberg_Working_Memory_Task',
    'Visual_Paired_Associates_Test',
)


class _Cohort(NamedTuple):
    """Everything that differs between the two age groups being verified."""

    heading: str        # banner printed above the 5-domain section
    label: str          # prefix used on cognition result lines
    folder: str         # sub-directory of OUTPUT_DIR (spelling matches disk)
    age_group: str      # file name suffix and question-bank 'age-group' value
    expected_rows: int  # number of participants every sheet must contain


_COHORTS = (
    _Cohort("📊 ADOLESCENTS (14-17) - 5 DOMAINS", "Adolescent",
            "adolescense", "14-17", EXPECTED_ADOLESCENTS),
    _Cohort("📊 ADULTS (18-23) - 5 DOMAINS", "Adult",
            "adults", "18-23", EXPECTED_ADULTS),
)


def load_questions() -> Tuple[Dict[str, int], pd.DataFrame]:
    """Load all questions to verify completeness.

    Returns:
        A pair ``(counts, df)`` where ``df`` is the question sheet read from
        ``QUESTIONS_FILE`` and ``counts`` maps ``"<domain>_<age-group>"`` to
        the number of question rows for that combination.  If the sheet
        cannot be read or lacks the expected columns, a warning is printed
        and ``({}, <empty DataFrame>)`` is returned so callers can skip the
        question checks.
    """
    # Deliberately broad: any failure here degrades to "no question bank".
    try:
        df = pd.read_excel(QUESTIONS_FILE, engine='openpyxl')
        # sort=False keeps first-appearance order of the groups.
        group_sizes = df.groupby(['domain', 'age-group'], sort=False).size()
        questions_by_domain = {
            f"{domain}_{age_group}": int(count)
            for (domain, age_group), count in group_sizes.items()
        }
        return questions_by_domain, df
    except Exception as e:
        print(f"⚠️ Error loading questions: {e}")
        return {}, pd.DataFrame()


def _frame_density(df: pd.DataFrame) -> float:
    """Return the percentage of non-null cells in *df*."""
    total_cells = df.size
    if total_cells == 0:
        # No cells means nothing is missing; also avoids dividing by zero.
        return 100.0
    filled_cells = int(df.notna().to_numpy().sum())
    return filled_cells / total_cells * 100


def _validate_frame(df: pd.DataFrame, expected_rows: int) -> Tuple[bool, str]:
    """Run the row-count, ID-column and density checks on a loaded sheet."""
    actual_rows = len(df)
    if actual_rows != expected_rows:
        return False, f"❌ ROW COUNT MISMATCH: Expected {expected_rows}, got {actual_rows}"

    # Check for required columns
    id_col = next((col for col in _ID_COLUMNS if col in df.columns), None)
    if id_col is None:
        return False, "❌ MISSING ID COLUMN: No Student CPID or Participant column"

    # Check for NaN in ID column
    nan_count = int(df[id_col].isna().to_numpy().sum())
    if nan_count > 0:
        return False, f"❌ {nan_count} NaN values in ID column"

    # Check data density (non-null percentage)
    density = _frame_density(df)
    if density < _MIN_DENSITY_PERCENT:
        return False, f"⚠️ LOW DATA DENSITY: {density:.2f}% (expected >95%)"

    return True, f"✅ {actual_rows} rows, {density:.2f}% density"


def check_file_completeness(file_path: Path, expected_rows: int,
                            domain_name: str, age_group: str) -> Tuple[bool, str]:
    """Check if file exists and has correct row count.

    Args:
        file_path: Excel workbook to verify.
        expected_rows: Exact number of data rows the sheet must contain.
        domain_name: Unused; kept so existing callers keep working.
        age_group: Unused; kept so existing callers keep working.

    Returns:
        ``(passed, message)``.  The check fails when the file is missing or
        unreadable, the row count differs, neither ID column is present, the
        ID column contains NaN, or fewer than 95% of the cells are non-null.
    """
    if not file_path.exists():
        return False, f"❌ MISSING: {file_path.name}"

    # Any read/parse problem is reported as a failed check, never raised.
    try:
        df = pd.read_excel(file_path, engine='openpyxl')
        return _validate_frame(df, expected_rows)
    except Exception as e:
        return False, f"❌ ERROR: {str(e)}"


def _compare_question_columns(columns: Iterable,
                              expected_codes: Iterable) -> Tuple[bool, str]:
    """Compare a sheet's non-metadata columns with the expected question codes.

    Both sides are compared as strings, so numeric column labels match the
    string codes taken from the question bank.
    """
    expected_q_codes = {str(code) for code in expected_codes}
    answered_cols = {str(col) for col in columns} - _METADATA_COLUMNS

    missing = expected_q_codes - answered_cols
    if missing:
        return False, f"❌ MISSING QUESTIONS: {len(missing)} questions not answered"

    # Computed from ALL non-metadata columns; filtering to the expected codes
    # first would make this set always empty and the check unreachable.
    extra = answered_cols - expected_q_codes
    if extra:
        return False, f"⚠️ EXTRA QUESTIONS: {len(extra)} unexpected columns"

    return True, f"✅ All {len(expected_q_codes)} questions answered"


def check_question_completeness(file_path: Path, domain_name: str, age_group: str,
                                questions_df: pd.DataFrame) -> Tuple[bool, str]:
    """Check if all questions are answered.

    Args:
        file_path: Generated Excel workbook for one domain and age group.
        domain_name: Value matched against the ``domain`` column of
            *questions_df*.
        age_group: Value matched against the ``age-group`` column of
            *questions_df*.
        questions_df: Question bank with ``domain``, ``age-group`` and
            ``code`` columns.

    Returns:
        ``(passed, message)``.  The check fails when an expected question
        code has no column, when the sheet has a column that is neither
        metadata nor an expected code, or when anything raises.
    """
    # Any read/lookup problem is reported as a failed check, never raised.
    try:
        # Only the header row is needed to compare column names.
        header = pd.read_excel(file_path, engine='openpyxl', nrows=0)

        # Get expected questions for this domain/age
        selector = (
            (questions_df['domain'] == domain_name)
            & (questions_df['age-group'] == age_group)
        )
        expected_q_codes = questions_df.loc[selector, 'code'].astype(str).unique()

        return _compare_question_columns(header.columns, expected_q_codes)
    except Exception as e:
        return False, f"❌ ERROR checking questions: {str(e)}"


def _cognition_file(cohort: _Cohort, test: str) -> Path:
    """Return the expected cognition workbook path for *test* in *cohort*."""
    # The suffix comes from the cohort itself, never from a shared variable,
    # so each age group is always looked up under its own file name.
    return OUTPUT_DIR / cohort.folder / "cognition" / f"{test}_{cohort.age_group}.xlsx"


def _check_domains(cohort: _Cohort, questions_df: pd.DataFrame) -> bool:
    """Verify the five domain workbooks of one cohort; return True if all pass."""
    print(cohort.heading)
    print("-" * 80)

    cohort_passed = True
    for domain, stem in _DOMAIN_FILE_STEMS.items():
        file_path = (OUTPUT_DIR / cohort.folder / "5_domain"
                     / f"{stem}_{cohort.age_group}.xlsx")
        passed, msg = check_file_completeness(
            file_path, cohort.expected_rows, domain, cohort.age_group)
        print(f" {domain:30} {msg}")
        if not passed:
            cohort_passed = False
            continue

        # Check question completeness (skipped when the bank failed to load)
        if questions_df.empty:
            continue
        q_passed, q_msg = check_question_completeness(
            file_path, domain, cohort.age_group, questions_df)
        print(f" {q_msg}")
        if not q_passed:
            cohort_passed = False

    print()
    return cohort_passed


def _check_cognition() -> bool:
    """Verify every generated cognition workbook; return True if all pass.

    Workbooks that do not exist are reported as skipped and do not count as
    failures.
    """
    print("🧠 COGNITION TESTS")
    print("-" * 80)

    all_ok = True
    for test in _COGNITION_TESTS:
        for cohort in _COHORTS:
            file_path = _cognition_file(cohort, test)
            if not file_path.exists():
                print(f" {cohort.label} {test:35} ⏭️ SKIPPED (not generated)")
                continue
            passed, msg = check_file_completeness(
                file_path, cohort.expected_rows, test, cohort.age_group)
            print(f" {cohort.label} {test:35} {msg}")
            if not passed:
                all_ok = False

    print()
    return all_ok


def main() -> bool:
    """Run every quality check, print a report, and return True if all passed."""
    print("=" * 80)
    print("🔍 COMPREHENSIVE QUALITY CHECK - 100% VERIFICATION")
    print("=" * 80)
    print()

    # Load questions (the per-group counts are not needed for the report)
    _, questions_df = load_questions()

    all_passed = True

    # Check 5 domains for adolescents, then adults
    for cohort in _COHORTS:
        if not _check_domains(cohort, questions_df):
            all_passed = False

    # Check cognition tests
    if not _check_cognition():
        all_passed = False

    print("=" * 80)

    # Summary
    if all_passed:
        print("✅ ALL CHECKS PASSED - 100% COMPLETE AND ACCURATE")
    else:
        print("❌ SOME CHECKS FAILED - REVIEW REQUIRED")

    print("=" * 80)

    # Calculate totals
    total_domain_files = len(_DOMAIN_FILE_STEMS) * len(_COHORTS)  # 5 domains × 2 age groups
    cognition_files_found = sum(
        1
        for cohort in _COHORTS
        for _ in (OUTPUT_DIR / cohort.folder / 'cognition').glob('*.xlsx')
    )

    print()
    print("📈 SUMMARY STATISTICS")
    print("-" * 80)
    print(f"Total Domain Files: {total_domain_files}")
    print(f"Total Cognition Files: {cognition_files_found}")
    print(f"Adolescent Students: {EXPECTED_ADOLESCENTS}")
    print(f"Adult Students: {EXPECTED_ADULTS}")
    print(f"Total Students: {EXPECTED_ADOLESCENTS + EXPECTED_ADULTS}")

    return all_passed


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)