CP_Assessment_engine/scripts/comprehensive_quality_check.py
2026-02-10 12:59:40 +05:30

247 lines
9.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Comprehensive Quality Check - 100% Verification
Checks completion, data quality, schema accuracy, and completeness
"""
import pandas as pd
from pathlib import Path
import sys
import io
# Fix Windows console encoding: this script prints emoji, which a console
# using a non-UTF-8 encoding may be unable to encode.
if sys.platform == 'win32':
    if hasattr(sys.stdout, 'reconfigure'):
        # Switch the encoding in place. Unlike re-wrapping, this keeps the
        # existing stream object and its other settings (e.g. line buffering).
        sys.stdout.reconfigure(encoding='utf-8')
    elif hasattr(sys.stdout, 'buffer'):
        # Fallback for streams without reconfigure(): wrap the raw byte
        # stream. Streams that expose neither attribute are left untouched
        # instead of crashing at import time.
        sys.stdout = io.TextIOWrapper(
            sys.stdout.buffer, encoding='utf-8', line_buffering=True
        )

# Directory two levels above this file; all other paths hang off it.
BASE_DIR = Path(__file__).resolve().parent.parent
# Folder holding the generated workbooks that this script verifies.
OUTPUT_DIR = BASE_DIR / "output" / "full_run"
DATA_DIR = BASE_DIR / "data"
# Question bank workbook (read for its 'domain', 'age-group' and 'code' columns).
QUESTIONS_FILE = DATA_DIR / "AllQuestions.xlsx"

# Expected counts
EXPECTED_ADOLESCENTS = 1507  # rows required in every 14-17 workbook
EXPECTED_ADULTS = 1493  # rows required in every 18-23 workbook
EXPECTED_DOMAINS = 5
EXPECTED_COGNITION_TESTS = 12
def load_questions():
    """Read the question bank and count its rows per domain/age-group pair.

    Returns:
        A ``(counts, questions)`` tuple. ``counts`` maps the string
        ``"<domain>_<age-group>"`` to the number of rows of
        ``QUESTIONS_FILE`` carrying that pair, and ``questions`` is the whole
        sheet as a DataFrame. If reading or counting raises, a warning is
        printed and ``({}, <empty DataFrame>)`` is returned instead.
    """
    try:
        questions = pd.read_excel(QUESTIONS_FILE, engine='openpyxl')
        counts = {}
        for domain in questions['domain'].unique():
            # Age-group column restricted to the rows of this domain.
            age_groups = questions.loc[questions['domain'] == domain, 'age-group']
            for age_group in age_groups.unique():
                matching = age_groups[age_groups == age_group]
                counts[f"{domain}_{age_group}"] = len(matching)
    except Exception as e:
        # Best effort: callers treat an empty frame as "no question bank".
        print(f"⚠️ Error loading questions: {e}")
        return {}, pd.DataFrame()
    return counts, questions
def check_file_completeness(file_path, expected_rows, domain_name, age_group):
    """Validate one output workbook: existence, row count, ID column, density.

    The checks run in order and the first failure is reported:

    1. the file exists;
    2. it holds exactly ``expected_rows`` data rows;
    3. it has a ``Student CPID`` or ``Participant`` column;
    4. that ID column has no missing values;
    5. at least 95% of all cells are non-null.

    Args:
        file_path: ``pathlib.Path`` of the ``.xlsx`` file to inspect.
        expected_rows: Exact number of data rows the sheet must contain.
        domain_name: Domain or test name. Accepted for call-site symmetry;
            not used by any check.
        age_group: Age-group label. Likewise unused.

    Returns:
        ``(passed, message)`` where ``passed`` is a bool and ``message`` is a
        human-readable result line. Any exception raised while reading or
        inspecting the file is reported as ``(False, "❌ ERROR: ...")`` rather
        than propagated.
    """
    if not file_path.exists():
        return False, f"❌ MISSING: {file_path.name}"

    try:
        df = pd.read_excel(file_path, engine='openpyxl')

        actual_rows = len(df)
        if actual_rows != expected_rows:
            return False, f"❌ ROW COUNT MISMATCH: Expected {expected_rows}, got {actual_rows}"

        # Resolve the ID column once; 'Student CPID' wins when both exist.
        id_col = next(
            (col for col in ('Student CPID', 'Participant') if col in df.columns),
            None,
        )
        if id_col is None:
            return False, "❌ MISSING ID COLUMN: No Student CPID or Participant column"

        nan_count = int(df[id_col].isna().sum())
        if nan_count > 0:
            return False, f"{nan_count} NaN values in ID column"

        # Data density = percentage of non-null cells over the whole sheet.
        total_cells = df.size
        if total_cells:
            null_cells = int(df.isnull().sum().sum())
            density = ((total_cells - null_cells) / total_cells) * 100
        else:
            # A sheet with zero cells has nothing missing; report 100% rather
            # than dividing by zero (which previously produced "nan%").
            density = 100.0

        if density < 95:
            return False, f"⚠️ LOW DATA DENSITY: {density:.2f}% (expected >95%)"
        return True, f"{actual_rows} rows, {density:.2f}% density"
    except Exception as e:
        # Reporting boundary: a broken workbook must fail the check, not
        # abort the whole run.
        return False, f"❌ ERROR: {str(e)}"
def check_question_completeness(file_path, domain_name, age_group, questions_df):
    """Check that a workbook has exactly one column per expected question code.

    Expected codes are the non-null ``code`` values of the ``questions_df``
    rows whose ``domain`` and ``age-group`` match the arguments. They are
    compared, as strings, with the workbook's column labels after the known
    metadata columns are removed. Only column presence is checked; cell
    values are not inspected here.

    Args:
        file_path: Path of the ``.xlsx`` output file to inspect.
        domain_name: Value to match in the ``domain`` column of ``questions_df``.
        age_group: Value to match in the ``age-group`` column of ``questions_df``.
        questions_df: Question bank with ``domain``, ``age-group`` and
            ``code`` columns.

    Returns:
        ``(passed, message)``. Fails when no questions are defined for the
        pair, when expected codes have no column, or when unexpected
        non-metadata columns are present. Any exception is reported as
        ``(False, "❌ ERROR checking questions: ...")`` rather than propagated.
    """
    try:
        df = pd.read_excel(file_path, engine='openpyxl')

        # Expected question codes for this domain/age group. NaN codes are
        # dropped so they do not turn into a bogus "nan" question.
        in_scope = (
            (questions_df['domain'] == domain_name)
            & (questions_df['age-group'] == age_group)
        )
        expected_q_codes = set(
            questions_df.loc[in_scope, 'code'].dropna().astype(str)
        )
        if not expected_q_codes:
            # An empty expectation would otherwise "pass" with 0 questions
            # and hide a domain/age-group label mismatch.
            return False, f"❌ NO EXPECTED QUESTIONS: none defined for {domain_name} ({age_group})"

        # Column labels minus metadata, normalised to str so that numeric
        # codes compare equal to the stringified expected codes.
        metadata_cols = {'Student CPID', 'Participant', 'Name', 'Age', 'Gender', 'Age Category'}
        present_cols = {str(col) for col in df.columns} - metadata_cols

        missing = expected_q_codes - present_cols
        # Computed from ALL non-metadata columns; filtering to expected codes
        # first would make this set always empty.
        extra = present_cols - expected_q_codes

        if missing:
            return False, f"❌ MISSING QUESTIONS: {len(missing)} questions not answered"
        if extra:
            return False, f"⚠️ EXTRA QUESTIONS: {len(extra)} unexpected columns"
        return True, f"✅ All {len(expected_q_codes)} questions answered"
    except Exception as e:
        # Reporting boundary: surface the problem as a failed check.
        return False, f"❌ ERROR checking questions: {str(e)}"
def main():
print("=" * 80)
print("🔍 COMPREHENSIVE QUALITY CHECK - 100% VERIFICATION")
print("=" * 80)
print()
# Load questions
questions_by_domain, questions_df = load_questions()
results = {
'adolescents': {'domains': {}, 'cognition': {}},
'adults': {'domains': {}, 'cognition': {}}
}
all_passed = True
# Check 5 domains for adolescents
print("📊 ADOLESCENTS (14-17) - 5 DOMAINS")
print("-" * 80)
# Domain name to file name mapping (from config.py)
domain_file_map = {
'Personality': 'Personality_14-17.xlsx',
'Grit': 'Grit_14-17.xlsx',
'Emotional Intelligence': 'Emotional_Intelligence_14-17.xlsx',
'Vocational Interest': 'Vocational_Interest_14-17.xlsx',
'Learning Strategies': 'Learning_Strategies_14-17.xlsx'
}
age_group = '14-17'
for domain, file_name in domain_file_map.items():
file_path = OUTPUT_DIR / "adolescense" / "5_domain" / file_name
passed, msg = check_file_completeness(file_path, EXPECTED_ADOLESCENTS, domain, age_group)
results['adolescents']['domains'][domain] = {'passed': passed, 'message': msg}
print(f" {domain:30} {msg}")
if not passed:
all_passed = False
# Check question completeness
if passed and not questions_df.empty:
q_passed, q_msg = check_question_completeness(file_path, domain, age_group, questions_df)
if not q_passed:
print(f" {q_msg}")
all_passed = False
else:
print(f" {q_msg}")
print()
# Check 5 domains for adults
print("📊 ADULTS (18-23) - 5 DOMAINS")
print("-" * 80)
# Domain name to file name mapping (from config.py)
domain_file_map_adults = {
'Personality': 'Personality_18-23.xlsx',
'Grit': 'Grit_18-23.xlsx',
'Emotional Intelligence': 'Emotional_Intelligence_18-23.xlsx',
'Vocational Interest': 'Vocational_Interest_18-23.xlsx',
'Learning Strategies': 'Learning_Strategies_18-23.xlsx'
}
age_group = '18-23'
for domain, file_name in domain_file_map_adults.items():
file_path = OUTPUT_DIR / "adults" / "5_domain" / file_name
passed, msg = check_file_completeness(file_path, EXPECTED_ADULTS, domain, age_group)
results['adults']['domains'][domain] = {'passed': passed, 'message': msg}
print(f" {domain:30} {msg}")
if not passed:
all_passed = False
# Check question completeness
if passed and not questions_df.empty:
q_passed, q_msg = check_question_completeness(file_path, domain, age_group, questions_df)
if not q_passed:
print(f" {q_msg}")
all_passed = False
else:
print(f" {q_msg}")
print()
# Check cognition tests
print("🧠 COGNITION TESTS")
print("-" * 80)
cognition_tests = [
'Cognitive_Flexibility_Test', 'Color_Stroop_Task',
'Problem_Solving_Test_MRO', 'Problem_Solving_Test_MR',
'Problem_Solving_Test_NPS', 'Problem_Solving_Test_SBDM',
'Reasoning_Tasks_AR', 'Reasoning_Tasks_DR', 'Reasoning_Tasks_NR',
'Response_Inhibition_Task', 'Sternberg_Working_Memory_Task',
'Visual_Paired_Associates_Test'
]
for test in cognition_tests:
# Adolescents
file_path = OUTPUT_DIR / "adolescense" / "cognition" / f"{test}_{age_group}.xlsx"
if file_path.exists():
passed, msg = check_file_completeness(file_path, EXPECTED_ADOLESCENTS, test, '14-17')
results['adolescents']['cognition'][test] = {'passed': passed, 'message': msg}
print(f" Adolescent {test:35} {msg}")
if not passed:
all_passed = False
else:
print(f" Adolescent {test:35} ⏭️ SKIPPED (not generated)")
# Adults
file_path = OUTPUT_DIR / "adults" / "cognition" / f"{test}_18-23.xlsx"
if file_path.exists():
passed, msg = check_file_completeness(file_path, EXPECTED_ADULTS, test, '18-23')
results['adults']['cognition'][test] = {'passed': passed, 'message': msg}
print(f" Adult {test:35} {msg}")
if not passed:
all_passed = False
else:
print(f" Adult {test:35} ⏭️ SKIPPED (not generated)")
print()
print("=" * 80)
# Summary
if all_passed:
print("✅ ALL CHECKS PASSED - 100% COMPLETE AND ACCURATE")
else:
print("❌ SOME CHECKS FAILED - REVIEW REQUIRED")
print("=" * 80)
# Calculate totals
total_domain_files = 10 # 5 domains × 2 age groups
total_cognition_files = 24 # 12 tests × 2 age groups (if all generated)
print()
print("📈 SUMMARY STATISTICS")
print("-" * 80)
print(f"Total Domain Files: {total_domain_files}")
print(f"Total Cognition Files: {len([f for age in ['adolescense', 'adults'] for f in (OUTPUT_DIR / age / 'cognition').glob('*.xlsx')])}")
print(f"Adolescent Students: {EXPECTED_ADOLESCENTS}")
print(f"Adult Students: {EXPECTED_ADULTS}")
print(f"Total Students: {EXPECTED_ADOLESCENTS + EXPECTED_ADULTS}")
return all_passed
if __name__ == "__main__":
    # Process exit status mirrors the overall result:
    # 0 when main() reports success, 1 otherwise.
    exit_code = 0 if main() else 1
    sys.exit(exit_code)