"""Final verification of all data for FINAL_QUALITY_REPORT.md""" import pandas as pd from pathlib import Path import sys import io if sys.platform == 'win32': sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') BASE_DIR = Path(__file__).resolve().parent.parent def verify_all(): print("=" * 80) print("FINAL REPORT VERIFICATION") print("=" * 80) all_good = True # 1. Verify merged_personas.xlsx print("\n1. merged_personas.xlsx:") personas_file = BASE_DIR / "data" / "merged_personas.xlsx" if personas_file.exists(): df = pd.read_excel(personas_file, engine='openpyxl') print(f" Rows: {len(df)} (Expected: 3000)") print(f" Columns: {len(df.columns)} (Expected: 79)") print(f" DB columns: {len([c for c in df.columns if '_DB' in str(c)])} (Expected: 0)") print(f" StudentCPID unique: {df['StudentCPID'].nunique()}/{len(df)}") if len(df) != 3000: print(f" ERROR: Row count mismatch") all_good = False if len(df.columns) != 79: print(f" WARNING: Column count is {len(df.columns)}, expected 79") if len([c for c in df.columns if '_DB' in str(c)]) > 0: print(f" ERROR: DB columns still present") all_good = False else: print(" ERROR: File not found") all_good = False # 2. Verify AllQuestions.xlsx print("\n2. AllQuestions.xlsx:") questions_file = BASE_DIR / "data" / "AllQuestions.xlsx" if questions_file.exists(): df = pd.read_excel(questions_file, engine='openpyxl') print(f" Total questions: {len(df)} (Expected: 1297)") if 'code' in df.columns: unique_codes = df['code'].nunique() print(f" Unique question codes: {unique_codes}") if unique_codes != len(df): print(f" ERROR: Duplicate question codes found") all_good = False else: print(" ERROR: File not found") all_good = False # 3. Verify output files print("\n3. Output Files:") output_dir = BASE_DIR / "output" / "full_run" domain_files = { 'adolescense': ['Personality_14-17.xlsx', 'Grit_14-17.xlsx', 'Emotional_Intelligence_14-17.xlsx', 'Vocational_Interest_14-17.xlsx', 'Learning_Strategies_14-17.xlsx'], 'adults': ['Personality_18-23.xlsx', 'Grit_18-23.xlsx', 'Emotional_Intelligence_18-23.xlsx', 'Vocational_Interest_18-23.xlsx', 'Learning_Strategies_18-23.xlsx'] } domain_count = 0 for age_group, files in domain_files.items(): for file_name in files: file_path = output_dir / age_group / "5_domain" / file_name if file_path.exists(): domain_count += 1 else: print(f" ERROR: Missing {file_name}") all_good = False print(f" Domain files: {domain_count}/10") # Check cognition files cog_count = 0 for age_group in ['adolescense', 'adults']: cog_dir = output_dir / age_group / "cognition" if cog_dir.exists(): cog_files = list(cog_dir.glob("*.xlsx")) cog_count += len(cog_files) print(f" Cognition files: {cog_count}/24") if cog_count != 24: print(f" WARNING: Expected 24 cognition files, found {cog_count}") # Final summary print("\n" + "=" * 80) if all_good and domain_count == 10 and cog_count == 24: print("VERIFICATION PASSED - All checks successful") else: print("VERIFICATION ISSUES FOUND - Review required") print("=" * 80) return all_good and domain_count == 10 and cog_count == 24 if __name__ == "__main__": success = verify_all() sys.exit(0 if success else 1)