106 lines
3.8 KiB
Python
106 lines
3.8 KiB
Python
"""Final verification of all data for FINAL_QUALITY_REPORT.md"""
|
|
import pandas as pd
|
|
from pathlib import Path
|
|
import sys
|
|
import io
|
|
|
|
# On Windows, switch stdout to UTF-8 (presumably so non-ASCII report text
# cannot fail on a legacy console code page - confirm against real output).
if sys.platform == 'win32':
    if hasattr(sys.stdout, 'reconfigure'):
        # Re-encode the existing stream in place: no second wrapper is created
        # over the same buffer, and the stream's buffering settings are kept.
        sys.stdout.reconfigure(encoding='utf-8')
    elif hasattr(sys.stdout, 'buffer'):
        # Fallback for text streams without reconfigure() that still expose a
        # binary buffer. Streams with neither attribute (e.g. a replaced or
        # captured stdout, or None) are left untouched instead of crashing.
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# Directory two levels above this file; the data/ and output/ paths used by
# the checks are resolved relative to it.
BASE_DIR = Path(__file__).resolve().parent.parent
|
|
|
|
# Expected sizes that the report checks against.
_EXPECTED_PERSONA_ROWS = 3000
_EXPECTED_PERSONA_COLUMNS = 79
_EXPECTED_QUESTIONS = 1297
_EXPECTED_COGNITION_FILES = 24

# Domain workbooks expected under <output>/<age group>/5_domain.
# NOTE(review): the 'adolescense' spelling is used verbatim as a directory
# name; presumably it matches the generated output tree - verify before
# "correcting" it.
_DOMAIN_FILES = {
    'adolescense': ['Personality_14-17.xlsx', 'Grit_14-17.xlsx', 'Emotional_Intelligence_14-17.xlsx',
                    'Vocational_Interest_14-17.xlsx', 'Learning_Strategies_14-17.xlsx'],
    'adults': ['Personality_18-23.xlsx', 'Grit_18-23.xlsx', 'Emotional_Intelligence_18-23.xlsx',
               'Vocational_Interest_18-23.xlsx', 'Learning_Strategies_18-23.xlsx'],
}


def _check_personas(personas_file):
    """Print the merged_personas.xlsx checks; return True if no ERROR was reported."""
    print("\n1. merged_personas.xlsx:")
    if not personas_file.exists():
        print(" ERROR: File not found")
        return False

    df = pd.read_excel(personas_file, engine='openpyxl')
    row_count = len(df)
    column_count = len(df.columns)
    # Column labels may be non-strings, hence the str() before the substring test.
    db_columns = [c for c in df.columns if '_DB' in str(c)]
    ok = True

    print(f" Rows: {row_count} (Expected: {_EXPECTED_PERSONA_ROWS})")
    print(f" Columns: {column_count} (Expected: {_EXPECTED_PERSONA_COLUMNS})")
    print(f" DB columns: {len(db_columns)} (Expected: 0)")
    if 'StudentCPID' in df.columns:
        print(f" StudentCPID unique: {df['StudentCPID'].nunique()}/{row_count}")
    else:
        # Report the missing identifier column instead of crashing with KeyError.
        print(" ERROR: StudentCPID column not found")
        ok = False

    if row_count != _EXPECTED_PERSONA_ROWS:
        print(" ERROR: Row count mismatch")
        ok = False
    if column_count != _EXPECTED_PERSONA_COLUMNS:
        # A column-count difference is only a warning; it does not fail the run.
        print(f" WARNING: Column count is {column_count}, expected {_EXPECTED_PERSONA_COLUMNS}")
    if db_columns:
        print(" ERROR: DB columns still present")
        ok = False
    return ok


def _check_questions(questions_file):
    """Print the AllQuestions.xlsx checks; return True if no ERROR was reported."""
    print("\n2. AllQuestions.xlsx:")
    if not questions_file.exists():
        print(" ERROR: File not found")
        return False

    df = pd.read_excel(questions_file, engine='openpyxl')
    total = len(df)
    # NOTE(review): the expected total is displayed but never enforced -
    # confirm whether a mismatch should fail the verification.
    print(f" Total questions: {total} (Expected: {_EXPECTED_QUESTIONS})")

    if 'code' not in df.columns:
        # Without a 'code' column the uniqueness check is skipped.
        return True

    unique_codes = df['code'].nunique()
    print(f" Unique question codes: {unique_codes}")
    if unique_codes != total:
        print(" ERROR: Duplicate question codes found")
        return False
    return True


def _count_domain_files(output_dir):
    """Return how many expected domain workbooks exist, printing an ERROR per missing one."""
    found = 0
    for age_group, file_names in _DOMAIN_FILES.items():
        domain_dir = output_dir / age_group / "5_domain"
        for file_name in file_names:
            if (domain_dir / file_name).exists():
                found += 1
            else:
                print(f" ERROR: Missing {file_name}")
    return found


def _count_cognition_files(output_dir):
    """Return the number of *.xlsx files in each age group's cognition directory."""
    total = 0
    for age_group in _DOMAIN_FILES:
        cog_dir = output_dir / age_group / "cognition"
        # A missing cognition directory simply contributes zero files.
        if cog_dir.exists():
            total += sum(1 for _ in cog_dir.glob("*.xlsx"))
    return total


def verify_all(base_dir=None):
    """Run every report check, print the results, and return the overall verdict.

    Checks, in order: data/merged_personas.xlsx (row count, column count,
    leftover '_DB' columns, StudentCPID uniqueness), data/AllQuestions.xlsx
    (question-code uniqueness), and the presence of the domain and cognition
    workbooks under output/full_run.

    Args:
        base_dir: Project root containing ``data/`` and ``output/``. Defaults
            to the module-level ``BASE_DIR`` when omitted.

    Returns:
        True only if no ERROR was reported and the domain and cognition file
        counts both match their expected values; False otherwise.
    """
    root = BASE_DIR if base_dir is None else Path(base_dir)
    banner = "=" * 80

    print(banner)
    print("FINAL REPORT VERIFICATION")
    print(banner)

    # 1. Verify merged_personas.xlsx
    personas_ok = _check_personas(root / "data" / "merged_personas.xlsx")

    # 2. Verify AllQuestions.xlsx
    questions_ok = _check_questions(root / "data" / "AllQuestions.xlsx")

    # 3. Verify output files
    print("\n3. Output Files:")
    output_dir = root / "output" / "full_run"

    expected_domain = sum(len(names) for names in _DOMAIN_FILES.values())
    domain_count = _count_domain_files(output_dir)
    print(f" Domain files: {domain_count}/{expected_domain}")

    # Check cognition files
    cog_count = _count_cognition_files(output_dir)
    print(f" Cognition files: {cog_count}/{_EXPECTED_COGNITION_FILES}")
    if cog_count != _EXPECTED_COGNITION_FILES:
        print(f" WARNING: Expected {_EXPECTED_COGNITION_FILES} cognition files, found {cog_count}")

    # Final summary: the verdict is computed once and used for both the
    # printed message and the return value.
    passed = (
        personas_ok
        and questions_ok
        and domain_count == expected_domain
        and cog_count == _EXPECTED_COGNITION_FILES
    )

    print("\n" + banner)
    if passed:
        print("VERIFICATION PASSED - All checks successful")
    else:
        print("VERIFICATION ISSUES FOUND - Review required")
    print(banner)

    return passed
|
|
|
|
if __name__ == "__main__":
    # The process exit status mirrors the verification verdict:
    # 0 when verify_all() returns a truthy value, 1 otherwise.
    exit_code = 0 if verify_all() else 1
    sys.exit(exit_code)
|