# CP_Assessment_engine/scripts/final_report_verification.py
# 2026-02-10 12:59:40 +05:30
#
# 106 lines
# 3.8 KiB
# Python
#
"""Final verification of all data for FINAL_QUALITY_REPORT.md"""
import pandas as pd
from pathlib import Path
import sys
import io
if sys.platform == 'win32':
    # Force UTF-8 on stdout under Windows (presumably to avoid
    # UnicodeEncodeError from the console's legacy code page -- confirm).
    if hasattr(sys.stdout, 'reconfigure'):
        # Switch the encoding in place: keeps the existing stream object and
        # its buffering settings instead of stacking a second wrapper on it.
        sys.stdout.reconfigure(encoding='utf-8')
    elif hasattr(sys.stdout, 'buffer'):
        # Fallback for stream objects without reconfigure(); skipped entirely
        # when stdout exposes no underlying binary buffer to wrap.
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# Root used to locate the data/ and output/ folders: two directory levels
# above this file.
BASE_DIR = Path(__file__).resolve().parent.parent
# Expected figures quoted in the printed report.
_EXPECTED_PERSONA_ROWS = 3000
_EXPECTED_PERSONA_COLUMNS = 79
_EXPECTED_QUESTIONS = 1297
_EXPECTED_DOMAIN_FILES = 10
_EXPECTED_COGNITION_FILES = 24

# Age-group folder -> domain workbooks expected under <group>/5_domain/.
# NOTE(review): 'adolescense' is spelled this way on purpose here -- it is a
# path component and presumably matches the folder written by the pipeline.
_DOMAIN_FILES = {
    'adolescense': ['Personality_14-17.xlsx', 'Grit_14-17.xlsx', 'Emotional_Intelligence_14-17.xlsx',
                    'Vocational_Interest_14-17.xlsx', 'Learning_Strategies_14-17.xlsx'],
    'adults': ['Personality_18-23.xlsx', 'Grit_18-23.xlsx', 'Emotional_Intelligence_18-23.xlsx',
               'Vocational_Interest_18-23.xlsx', 'Learning_Strategies_18-23.xlsx'],
}


def _verify_personas(data_dir):
    """Check merged_personas.xlsx in *data_dir*; return True if no ERROR was printed.

    A column-count mismatch is only a WARNING and does not fail the check.
    """
    print("\n1. merged_personas.xlsx:")
    personas_file = data_dir / "merged_personas.xlsx"
    if not personas_file.exists():
        print(" ERROR: File not found")
        return False

    df = pd.read_excel(personas_file, engine='openpyxl')
    row_count = len(df)
    column_count = len(df.columns)
    # Counted once and reused for both the summary line and the error check.
    db_column_count = sum('_DB' in str(col) for col in df.columns)
    ok = True

    print(f" Rows: {row_count} (Expected: {_EXPECTED_PERSONA_ROWS})")
    print(f" Columns: {column_count} (Expected: {_EXPECTED_PERSONA_COLUMNS})")
    print(f" DB columns: {db_column_count} (Expected: 0)")
    if 'StudentCPID' in df.columns:
        print(f" StudentCPID unique: {df['StudentCPID'].nunique()}/{row_count}")
    else:
        # Report the missing column instead of aborting the whole run with a
        # KeyError, so the remaining checks still execute.
        print(" ERROR: StudentCPID column missing")
        ok = False

    if row_count != _EXPECTED_PERSONA_ROWS:
        print(" ERROR: Row count mismatch")
        ok = False
    if column_count != _EXPECTED_PERSONA_COLUMNS:
        print(f" WARNING: Column count is {column_count}, expected {_EXPECTED_PERSONA_COLUMNS}")
    if db_column_count > 0:
        print(" ERROR: DB columns still present")
        ok = False
    return ok


def _verify_questions(data_dir):
    """Check AllQuestions.xlsx in *data_dir*; return True if no ERROR was printed.

    A total that differs from the expected count is only a WARNING.
    """
    print("\n2. AllQuestions.xlsx:")
    questions_file = data_dir / "AllQuestions.xlsx"
    if not questions_file.exists():
        print(" ERROR: File not found")
        return False

    df = pd.read_excel(questions_file, engine='openpyxl')
    question_count = len(df)
    print(f" Total questions: {question_count} (Expected: {_EXPECTED_QUESTIONS})")
    if question_count != _EXPECTED_QUESTIONS:
        # Non-failing, like the persona column-count check: the report states
        # an expectation, so a mismatch should at least be visible.
        print(f" WARNING: Question count is {question_count}, expected {_EXPECTED_QUESTIONS}")

    if 'code' not in df.columns:
        return True

    # nunique() ignores missing values by default, so blank codes also make
    # the unique count fall short of the row count.
    unique_codes = df['code'].nunique()
    print(f" Unique question codes: {unique_codes}")
    if unique_codes != question_count:
        print(" ERROR: Duplicate question codes found")
        return False
    return True


def _verify_output_files(output_dir):
    """Count domain and cognition workbooks under *output_dir*.

    Returns True only when all expected domain files exist and exactly the
    expected number of cognition ``*.xlsx`` files is found.
    """
    print("\n3. Output Files:")

    domain_count = 0
    for age_group, file_names in _DOMAIN_FILES.items():
        domain_dir = output_dir / age_group / "5_domain"
        for file_name in file_names:
            if (domain_dir / file_name).exists():
                domain_count += 1
            else:
                print(f" ERROR: Missing {file_name}")
    print(f" Domain files: {domain_count}/{_EXPECTED_DOMAIN_FILES}")

    # Cognition workbooks are only counted, not checked by name.
    cog_count = 0
    for age_group in ['adolescense', 'adults']:
        cog_dir = output_dir / age_group / "cognition"
        if cog_dir.exists():
            cog_count += sum(1 for _ in cog_dir.glob("*.xlsx"))
    print(f" Cognition files: {cog_count}/{_EXPECTED_COGNITION_FILES}")
    if cog_count != _EXPECTED_COGNITION_FILES:
        print(f" WARNING: Expected {_EXPECTED_COGNITION_FILES} cognition files, found {cog_count}")

    return domain_count == _EXPECTED_DOMAIN_FILES and cog_count == _EXPECTED_COGNITION_FILES


def verify_all(base_dir=None) -> bool:
    """Run every report verification check and print a summary.

    Checks, in order: ``data/merged_personas.xlsx``, ``data/AllQuestions.xlsx``
    and the workbooks under ``output/full_run``. All three checks always run,
    even when an earlier one fails.

    Args:
        base_dir: Root directory holding ``data/`` and ``output/``. Defaults
            to the module-level ``BASE_DIR`` when omitted.

    Returns:
        True when no check reported an ERROR and the domain and cognition
        file counts match their expected totals; False otherwise.
    """
    root = BASE_DIR if base_dir is None else Path(base_dir)
    data_dir = root / "data"

    print("=" * 80)
    print("FINAL REPORT VERIFICATION")
    print("=" * 80)

    # Built as a tuple so every check executes before the verdict is formed
    # (no short-circuiting past a failed check).
    results = (
        _verify_personas(data_dir),
        _verify_questions(data_dir),
        _verify_output_files(root / "output" / "full_run"),
    )
    passed = all(results)

    # Final summary
    print("\n" + "=" * 80)
    if passed:
        print("VERIFICATION PASSED - All checks successful")
    else:
        print("VERIFICATION ISSUES FOUND - Review required")
    print("=" * 80)
    return passed
if __name__ == "__main__":
    # Script entry point: process exit status is 0 when verification passes
    # and 1 otherwise.
    exit_code = 0 if verify_all() else 1
    sys.exit(exit_code)