"""
Comprehensive Quality Check for Client Deliverables

Perfectionist-level review of all files to be shared with client/BOD
"""
import pandas as pd
import numpy as np
from pathlib import Path
import sys
import io
# Fix Windows console encoding: the legacy Windows console code page cannot
# encode the emoji used in this report, so rewrap stdout as UTF-8.
if sys.platform == 'win32':
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# Repository root, resolved two levels above this script.
# NOTE(review): assumes this file lives exactly one directory deep under the
# repo root (e.g. scripts/check.py) — confirm against the actual layout.
BASE_DIR = Path(__file__).resolve().parent.parent
def check_merged_personas():
    """Comprehensive check of merged_personas.xlsx.

    Runs a battery of data-quality checks on the merged personas workbook:
    row count, problematic/duplicate/missing columns, StudentCPID uniqueness
    and completeness, Age Category distribution, numeric-type consistency,
    constant-value columns, and column-naming hygiene.

    Returns:
        bool: True when the file loaded cleanly and no potentially
        problematic columns were found; False otherwise (missing file,
        read error, or problematic columns present).
    """
    print("=" * 80)
    print("📋 CHECKING: merged_personas.xlsx")
    print("=" * 80)

    file_path = BASE_DIR / "data" / "merged_personas.xlsx"

    if not file_path.exists():
        print("❌ FILE NOT FOUND")
        return False

    try:
        df = pd.read_excel(file_path, engine='openpyxl')

        print("\n📊 Basic Statistics:")
        print(f" Total rows: {len(df)}")
        print(f" Total columns: {len(df.columns)}")
        print(" Expected rows: 3,000")

        if len(df) != 3000:
            print(f" ⚠️ ROW COUNT MISMATCH: Expected 3,000, got {len(df)}")

        # Columns whose names hint at school-structure data that may not
        # belong in the client deliverable.
        print("\n🔍 Column Analysis:")

        problematic_keywords = ['grade', 'division', 'class', 'section']
        problematic_cols = [
            col for col in df.columns
            if any(kw in str(col).lower() for kw in problematic_keywords)
        ]

        if problematic_cols:
            print(" ⚠️ POTENTIALLY PROBLEMATIC COLUMNS FOUND:")
            for col in problematic_cols:
                # Surface cardinality (and a small sample) so a reviewer can
                # judge whether the column's data is consistent.
                unique_vals = df[col].dropna().unique()
                print(f" - {col}: {len(unique_vals)} unique values")
                if len(unique_vals) <= 20:
                    print(f" Sample values: {list(unique_vals[:10])}")

        # Duplicate column names silently shadow each other downstream.
        print("\n🔍 Duplicate Column Check:")
        duplicate_cols = df.columns[df.columns.duplicated()].tolist()
        if duplicate_cols:
            print(f" ❌ DUPLICATE COLUMNS: {duplicate_cols}")
        else:
            print(" ✅ No duplicate columns")

        # Columns the deliverable cannot ship without.
        print("\n🔍 Critical Column Check:")
        critical_cols = ['StudentCPID', 'First Name', 'Last Name', 'Age', 'Age Category']
        missing_critical = [c for c in critical_cols if c not in df.columns]
        if missing_critical:
            print(f" ❌ MISSING CRITICAL COLUMNS: {missing_critical}")
        else:
            print(" ✅ All critical columns present")

        print("\n🔍 Data Quality Check:")

        if 'StudentCPID' in df.columns:
            # Uniqueness: every non-null CPID must appear exactly once.
            unique_cpids = df['StudentCPID'].dropna().nunique()
            total_cpids = df['StudentCPID'].notna().sum()
            if unique_cpids != total_cpids:
                print(f" ❌ DUPLICATE CPIDs: {total_cpids - unique_cpids} duplicates found")
            else:
                print(f" ✅ All StudentCPIDs unique ({unique_cpids} unique)")

            # Completeness: no row may lack a CPID.
            nan_cpids = df['StudentCPID'].isna().sum()
            if nan_cpids > 0:
                print(f" ❌ MISSING CPIDs: {nan_cpids} rows with NaN StudentCPID")
            else:
                print(" ✅ No missing StudentCPIDs")

        if 'Age Category' in df.columns:
            age_dist = df['Age Category'].value_counts()
            print(" Age Category distribution:")
            for age_cat, count in age_dist.items():
                print(f" {age_cat}: {count}")

        # Columns that must be fully numeric for downstream scoring.
        print("\n🔍 Data Type Consistency:")
        for col in ['Age', 'Openness Score', 'Conscientiousness Score']:
            if col in df.columns:
                try:
                    numeric_vals = pd.to_numeric(df[col], errors='coerce')
                    # Values that became NaN only after coercion were
                    # non-numeric to begin with.
                    non_numeric = numeric_vals.isna().sum() - df[col].isna().sum()
                    if non_numeric > 0:
                        print(f" ⚠️ {col}: {non_numeric} non-numeric values")
                    else:
                        print(f" ✅ {col}: All values numeric")
                except (ValueError, TypeError):  # was a bare except: keep it narrow
                    print(f" ⚠️ {col}: Could not verify numeric")

        print("\n🔍 Suspicious Pattern Check:")

        # A column whose every row holds one value often signals a bad merge.
        for col in df.columns[:10]:  # Check first 10 columns
            if df[col].nunique() == 1 and len(df) > 1:
                print(f" ⚠️ {col}: All rows have same value (possible issue)")

        print("\n🔍 Column Naming Check:")
        suspicious_names = []
        for col in df.columns:
            col_str = str(col)
            # Leading/trailing whitespace in a header breaks exact lookups.
            if col_str.strip() != col_str:
                suspicious_names.append(f"{col} (has leading/trailing spaces)")
            # FIX: the original condition was
            #   '_DB' in col_str and 'Class_DB' in col_str or 'Section_DB' in col_str
            # where `and` binds tighter than `or`. Rewritten to the intended
            # meaning (equivalent here, since 'Class_DB' implies '_DB').
            if 'Class_DB' in col_str or 'Section_DB' in col_str:
                print(f" ℹ️ {col}: Database-derived column (from 3000_students_output.xlsx)")

        if suspicious_names:
            print(f" ⚠️ SUSPICIOUS COLUMN NAMES: {suspicious_names}")

        print("\n" + "=" * 80)
        print("📊 SUMMARY:")
        print(f" Total issues found: {len(problematic_cols)} potentially problematic columns")
        if problematic_cols:
            print(" ⚠️ REVIEW REQUIRED: Check if these columns should be included")
            print(f" Columns: {problematic_cols}")
        else:
            print(" ✅ No obvious issues found")
        print("=" * 80)

        return len(problematic_cols) == 0

    except Exception as e:
        # Boundary handler: report and fail this check instead of crashing
        # the whole quality run.
        print(f"❌ ERROR: {e}")
        import traceback
        traceback.print_exc()
        return False
def check_all_questions():
    """Run quality checks on AllQuestions.xlsx.

    Verifies the required columns exist, checks question codes for
    duplicates, and reports the domain and age-group distributions.

    Returns:
        bool: True when the file loads and its structure looks sound;
        False when the file is missing or unreadable.
    """
    print("\n" + "=" * 80)
    print("📋 CHECKING: AllQuestions.xlsx")
    print("=" * 80)

    file_path = BASE_DIR / "data" / "AllQuestions.xlsx"

    if not file_path.exists():
        print("❌ FILE NOT FOUND")
        return False

    try:
        df = pd.read_excel(file_path, engine='openpyxl')

        print("\n📊 Basic Statistics:")
        print(f" Total questions: {len(df)}")
        print(f" Total columns: {len(df.columns)}")

        # Structural requirement: these four columns must all be present.
        missing = [c for c in ('code', 'domain', 'age-group', 'question')
                   if c not in df.columns]
        if missing:
            print(f" ❌ MISSING REQUIRED COLUMNS: {missing}")
        else:
            print(" ✅ All required columns present")

        # Question codes act as identifiers and may not repeat.
        if 'code' in df.columns:
            duplicate_codes = df['code'][df['code'].duplicated()].tolist()
            if duplicate_codes:
                print(f" ❌ DUPLICATE QUESTION CODES: {len(duplicate_codes)} duplicates")
            else:
                print(" ✅ All question codes unique")

        # Informational breakdown by domain.
        if 'domain' in df.columns:
            domain_counts = df['domain'].value_counts()
            print("\n Domain distribution:")
            for domain, count in domain_counts.items():
                print(f" {domain}: {count} questions")

        # Informational breakdown by age group.
        if 'age-group' in df.columns:
            age_counts = df['age-group'].value_counts()
            print("\n Age group distribution:")
            for age, count in age_counts.items():
                print(f" {age}: {count} questions")

        print(" ✅ File structure looks good")
        return True

    except Exception as e:
        # Report and fail this check rather than abort the whole run.
        print(f"❌ ERROR: {e}")
        return False
def check_output_files():
    """Spot-check sample output files for quality.

    Opens one representative workbook per age bracket, verifies it loads,
    reports its dimensions, and counts how many omitted answers are marked
    with the '--' placeholder in non-identity columns.

    Returns:
        bool: True when every sample file that exists was readable; False
        if any existing sample file failed to load. Missing files only warn.
    """
    print("\n" + "=" * 80)
    print("📋 CHECKING: Output Files (Sample)")
    print("=" * 80)

    output_dir = BASE_DIR / "output" / "full_run"

    # Check one file from each category.
    test_files = [
        output_dir / "adolescense" / "5_domain" / "Personality_14-17.xlsx",
        output_dir / "adults" / "5_domain" / "Personality_18-23.xlsx",
    ]

    # Identity columns are expected to hold real values, never '--'.
    identity_cols = {'Participant', 'First Name', 'Last Name', 'Student CPID',
                     'Age', 'Gender', 'Age Category'}

    all_good = True

    for file_path in test_files:
        if not file_path.exists():
            print(f" ⚠️ {file_path.name}: NOT FOUND")
            continue

        try:
            df = pd.read_excel(file_path, engine='openpyxl')

            if 'Student CPID' in df.columns or 'Participant' in df.columns:
                # FIX: removed the original's unused `sample_row = df.iloc[0]`,
                # which raised IndexError on an empty workbook and wrongly
                # flipped all_good via the broad except below.
                print(f"\n {file_path.name}:")
                print(f" Rows: {len(df)}, Columns: {len(df.columns)}")

                # Count '--' placeholders across all non-identity columns.
                dash_count = sum(
                    (df[col] == '--').sum()
                    for col in df.columns if col not in identity_cols
                )

                if dash_count > 0:
                    print(f" ✅ Omitted values marked with '--': {dash_count} values")
                else:
                    print(" ℹ️ No '--' values found (may be normal if no omitted questions)")

        except Exception as e:
            # A sample file that exists but cannot be read fails the check.
            print(f" ❌ ERROR reading {file_path.name}: {e}")
            all_good = False

    return all_good
def main():
    """Drive every deliverable quality check and print a final summary.

    Returns:
        bool: True when all individual checks passed.
    """
    print("=" * 80)
    print("🔍 COMPREHENSIVE CLIENT DELIVERABLE QUALITY CHECK")
    print("Perfectionist-Level Review")
    print("=" * 80)
    print()

    # Run each check in order; dict-literal values evaluate left to right,
    # so the checks execute in the same sequence they are listed.
    results = {
        'merged_personas': check_merged_personas(),
        'all_questions': check_all_questions(),
        'output_files': check_output_files(),
    }

    print("\n" + "=" * 80)
    print("📊 FINAL QUALITY ASSESSMENT")
    print("=" * 80)

    all_passed = all(results.values())

    # Per-check verdict table.
    for file_type, passed in results.items():
        status = "✅ PASS" if passed else "❌ FAIL"
        print(f" {file_type:20} {status}")

    print()
    if all_passed:
        print("✅ ALL CHECKS PASSED - FILES READY FOR CLIENT")
    else:
        print("⚠️ SOME ISSUES FOUND - REVIEW REQUIRED BEFORE CLIENT DELIVERY")

    print("=" * 80)

    return all_passed
if __name__ == "__main__":
    # Exit 0 on success so shell/CI callers can gate on the result.
    sys.exit(0 if main() else 1)