# File-viewer listing header: 134 lines, 4.6 KiB, Python
"""
Final 100% Verification Report
"""
|
|
import pandas as pd
|
|
from pathlib import Path
|
|
import sys
|
|
import io
|
|
|
|
if sys.platform == 'win32':
    # Force UTF-8 output on Windows consoles. Prefer reconfiguring the existing
    # stream in place: it keeps the stream's buffering settings and avoids
    # stacking a second text wrapper on the same underlying buffer.
    if hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8')
    elif hasattr(sys.stdout, 'buffer'):
        # Fallback for stream objects that expose a binary buffer but cannot
        # be reconfigured.
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
    # Otherwise (e.g. sys.stdout is None or a text-only replacement) leave the
    # stream untouched instead of failing at import time.
|
|
|
|
# Two directory levels above this script's resolved location.
BASE_DIR = Path(__file__).resolve().parent.parent
# Root folder under which verify_domain_files() looks for each age group's files.
OUTPUT_DIR = BASE_DIR.joinpath("output", "full_run")

# Row count (and unique-ID count) each domain file must have to pass.
EXPECTED_ADOLESCENTS = 1507
EXPECTED_ADULTS = 1493
|
|
|
|
def _check_domain_file(file_path, expected_count):
    """Load one domain workbook and classify it.

    Args:
        file_path: Path of the ``.xlsx`` file, read with the openpyxl engine.
        expected_count: Number of rows, and of distinct non-null IDs, the
            file must contain to pass.

    Returns:
        A result dict whose ``'status'`` is ``'PASS'``, ``'ROW_MISMATCH'`` or
        ``'NO_ID_COLUMN'``; the other keys depend on the status.

    Raises:
        Exception: Anything raised while reading or inspecting the workbook;
            the caller records it as an ``'ERROR'`` result.
    """
    df = pd.read_excel(file_path, engine='openpyxl')
    row_count = len(df)
    col_count = len(df.columns)

    # The ID column is 'Student CPID' when present, otherwise 'Participant'.
    id_col = next(
        (name for name in ('Student CPID', 'Participant') if name in df.columns),
        None,
    )
    if id_col is None:
        return {'status': 'NO_ID_COLUMN', 'rows': row_count}

    # Null IDs are dropped first, so blank IDs as well as duplicated IDs make
    # this count fall below the row count.
    unique_ids = df[id_col].dropna().nunique()

    if row_count != expected_count or unique_ids != expected_count:
        return {
            'status': 'ROW_MISMATCH',
            'rows': row_count,
            'expected': expected_count,
            'unique_ids': unique_ids,
        }

    # Data density: percentage of non-null cells across the whole sheet.
    total_cells = row_count * col_count
    null_cells = df.isnull().sum().sum()
    density = ((total_cells - null_cells) / total_cells) * 100 if total_cells > 0 else 0

    return {
        'status': 'PASS',
        'rows': row_count,
        'cols': col_count,
        'unique_ids': unique_ids,
        'density': round(density, 2),
    }


def verify_domain_files(output_dir=None, expected_counts=None):
    """Verify all 5 domain files for both age groups.

    Each file is looked up at ``<output_dir>/<age_group>/5_domain/<file>`` and
    passes when its row count and its number of distinct non-null IDs both
    equal the expected count for that age group.

    Args:
        output_dir: Root folder containing the age-group folders. Defaults to
            the module-level ``OUTPUT_DIR``.
        expected_counts: Mapping with the keys ``'adolescense'`` and
            ``'adults'`` giving the required row count per age group.
            Defaults to ``EXPECTED_ADOLESCENTS`` / ``EXPECTED_ADULTS``.

    Returns:
        ``(results, all_passed)`` where ``results`` maps age group -> domain
        name -> result dict (``'status'`` is one of ``'PASS'``, ``'MISSING'``,
        ``'NO_ID_COLUMN'``, ``'ROW_MISMATCH'``, ``'ERROR'``) and
        ``all_passed`` is True only if every file has status ``'PASS'``.

    Raises:
        KeyError: If a caller-supplied ``expected_counts`` lacks an age group.
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR
    if expected_counts is None:
        expected_counts = {
            'adolescense': EXPECTED_ADOLESCENTS,
            'adults': EXPECTED_ADULTS,
        }
    output_dir = Path(output_dir)

    # NOTE: 'adolescense' (sic) is used as a folder name on disk, so the
    # spelling must stay exactly as is.
    domain_files = {
        'adolescense': {
            'Personality': 'Personality_14-17.xlsx',
            'Grit': 'Grit_14-17.xlsx',
            'Emotional Intelligence': 'Emotional_Intelligence_14-17.xlsx',
            'Vocational Interest': 'Vocational_Interest_14-17.xlsx',
            'Learning Strategies': 'Learning_Strategies_14-17.xlsx',
        },
        'adults': {
            'Personality': 'Personality_18-23.xlsx',
            'Grit': 'Grit_18-23.xlsx',
            'Emotional Intelligence': 'Emotional_Intelligence_18-23.xlsx',
            'Vocational Interest': 'Vocational_Interest_18-23.xlsx',
            'Learning Strategies': 'Learning_Strategies_18-23.xlsx',
        },
    }

    results = {}
    all_passed = True

    for age_group, domains in domain_files.items():
        expected_count = expected_counts[age_group]
        age_results = {}

        for domain, file_name in domains.items():
            file_path = output_dir / age_group / "5_domain" / file_name

            if not file_path.exists():
                result = {'status': 'MISSING', 'rows': 0}
            else:
                try:
                    result = _check_domain_file(file_path, expected_count)
                except Exception as e:
                    # Report boundary: an unreadable or malformed workbook is
                    # recorded as a failed check rather than aborting the
                    # remaining files.
                    result = {'status': 'ERROR', 'error': str(e)}

            age_results[domain] = result
            if result['status'] != 'PASS':
                all_passed = False

        results[age_group] = age_results

    return results, all_passed
|
|
|
|
def main():
    """Print the verification report to stdout.

    Runs verify_domain_files(), prints one section per age group with one
    line per domain, then prints the overall verdict.

    Returns:
        True if every domain file passed, otherwise False.
    """
    banner = "=" * 80

    print(banner)
    print("FINAL 100% VERIFICATION REPORT")
    print(banner)
    print()

    results, all_passed = verify_domain_files()

    # One section per age group.
    for age_group, domains in results.items():
        if age_group == 'adolescense':
            age_label, expected = "ADOLESCENTS (14-17)", EXPECTED_ADOLESCENTS
        else:
            age_label, expected = "ADULTS (18-23)", EXPECTED_ADULTS

        print(f"{age_label} - Expected: {expected} students")
        print("-" * 80)

        for domain, result in domains.items():
            status = result['status']
            if status != 'PASS':
                # Failed checks dump the whole result dict for diagnosis.
                print(f" {domain:30} {status} - {result}")
                continue
            print(f" {domain:30} PASS - {result['rows']} rows, {result['cols']} cols, {result['density']}% density")
        print()

    if all_passed:
        verdict = "VERIFICATION RESULT: 100% PASS - ALL DOMAINS COMPLETE"
    else:
        verdict = "VERIFICATION RESULT: FAILED - REVIEW REQUIRED"

    print(banner)
    print(verdict)
    print(banner)

    return all_passed
|
|
|
|
if __name__ == "__main__":
    # Process exit status: 0 when every check passed, 1 otherwise.
    exit_code = 0 if main() else 1
    sys.exit(exit_code)
|