CP_Assessment_engine/scripts/client_deliverable_quality_check.py
2026-02-10 12:59:40 +05:30

311 lines
11 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Comprehensive Quality Check for Client Deliverables
Perfectionist-level review of all files to be shared with client/BOD
"""
import pandas as pd
import numpy as np
from pathlib import Path
import sys
import io
# Fix Windows console encoding
if sys.platform == 'win32':
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
BASE_DIR = Path(__file__).resolve().parent.parent
def check_merged_personas():
    """Comprehensive check of merged_personas.xlsx.

    Audits the merged persona workbook before client delivery: row count,
    potentially problematic columns (grade/division/class/section), duplicate
    columns, critical-column presence, StudentCPID uniqueness/completeness,
    Age Category distribution, numeric-type consistency, constant-valued
    columns, and column-name hygiene.

    Returns:
        bool: True when no potentially problematic columns were found,
        False when the file is missing, unreadable, or flagged columns exist.
    """
    print("=" * 80)
    print("📋 CHECKING: merged_personas.xlsx")
    print("=" * 80)
    file_path = BASE_DIR / "data" / "merged_personas.xlsx"
    if not file_path.exists():
        print("❌ FILE NOT FOUND")
        return False
    try:
        df = pd.read_excel(file_path, engine='openpyxl')
        print(f"\n📊 Basic Statistics:")
        print(f" Total rows: {len(df)}")
        print(f" Total columns: {len(df.columns)}")
        print(f" Expected rows: 3,000")
        if len(df) != 3000:
            print(f" ⚠️ ROW COUNT MISMATCH: Expected 3,000, got {len(df)}")

        # Check for problematic columns (school-structure data that may not
        # belong in a client deliverable).
        print(f"\n🔍 Column Analysis:")
        problematic_keywords = ('grade', 'division', 'class', 'section')
        problematic_cols = [
            col for col in df.columns
            if any(kw in str(col).lower() for kw in problematic_keywords)
        ]
        if problematic_cols:
            print(f" ⚠️ POTENTIALLY PROBLEMATIC COLUMNS FOUND:")
            for col in problematic_cols:
                # Show cardinality (and samples for low-cardinality columns)
                # so a reviewer can judge whether the data is consistent.
                unique_vals = df[col].dropna().unique()
                print(f" - {col}: {len(unique_vals)} unique values")
                if len(unique_vals) <= 20:
                    print(f" Sample values: {list(unique_vals[:10])}")

        # Check for duplicate columns
        print(f"\n🔍 Duplicate Column Check:")
        duplicate_cols = df.columns[df.columns.duplicated()].tolist()
        if duplicate_cols:
            print(f" ❌ DUPLICATE COLUMNS: {duplicate_cols}")
        else:
            print(f" ✅ No duplicate columns")

        # Check for missing critical columns
        print(f"\n🔍 Critical Column Check:")
        critical_cols = ['StudentCPID', 'First Name', 'Last Name', 'Age', 'Age Category']
        missing_critical = [c for c in critical_cols if c not in df.columns]
        if missing_critical:
            print(f" ❌ MISSING CRITICAL COLUMNS: {missing_critical}")
        else:
            print(f" ✅ All critical columns present")

        # Check for data quality issues
        print(f"\n🔍 Data Quality Check:")
        # StudentCPID uniqueness and completeness (single guard — the
        # original duplicated the `in df.columns` check).
        if 'StudentCPID' in df.columns:
            unique_cpids = df['StudentCPID'].dropna().nunique()
            total_cpids = df['StudentCPID'].notna().sum()
            if unique_cpids != total_cpids:
                print(f" ❌ DUPLICATE CPIDs: {total_cpids - unique_cpids} duplicates found")
            else:
                print(f" ✅ All StudentCPIDs unique ({unique_cpids} unique)")
            nan_cpids = df['StudentCPID'].isna().sum()
            if nan_cpids > 0:
                print(f" ❌ MISSING CPIDs: {nan_cpids} rows with NaN StudentCPID")
            else:
                print(f" ✅ No missing StudentCPIDs")

        # Check Age Category distribution
        if 'Age Category' in df.columns:
            age_dist = df['Age Category'].value_counts()
            print(f" Age Category distribution:")
            for age_cat, count in age_dist.items():
                print(f" {age_cat}: {count}")

        # Check for inconsistent data types in score/age columns.
        print(f"\n🔍 Data Type Consistency:")
        for col in ['Age', 'Openness Score', 'Conscientiousness Score']:
            if col in df.columns:
                try:
                    numeric_vals = pd.to_numeric(df[col], errors='coerce')
                    # Values that were non-null but failed numeric coercion.
                    non_numeric = numeric_vals.isna().sum() - df[col].isna().sum()
                except Exception:  # FIX: was a bare `except:` (caught SystemExit/KeyboardInterrupt too)
                    print(f" ⚠️ {col}: Could not verify numeric")
                else:
                    if non_numeric > 0:
                        print(f" ⚠️ {col}: {non_numeric} non-numeric values")
                    else:
                        # FIX: message formatted consistently with the other status lines.
                        print(f" ✅ {col}: All values numeric")

        # Check for suspicious patterns
        print(f"\n🔍 Suspicious Pattern Check:")
        # A column where every row holds the same value often indicates a
        # broken merge or data corruption.
        for col in df.columns[:10]:  # Check first 10 columns
            unique_count = df[col].nunique()
            if unique_count == 1 and len(df) > 1:
                print(f" ⚠️ {col}: All rows have same value (possible issue)")

        # Check column naming consistency
        print(f"\n🔍 Column Naming Check:")
        suspicious_names = []
        for col in df.columns:
            col_str = str(col)
            if col_str.strip() != col_str:
                suspicious_names.append(f"{col} (has leading/trailing spaces)")
            # FIX: parenthesized the `or` — the original relied on `and`
            # binding tighter than `or`, so the '_DB' guard never applied
            # to the Section_DB branch.
            if '_DB' in col_str and ('Class_DB' in col_str or 'Section_DB' in col_str):
                print(f" {col}: Database-derived column (from 3000_students_output.xlsx)")
        if suspicious_names:
            print(f" ⚠️ SUSPICIOUS COLUMN NAMES: {suspicious_names}")

        # Summary
        print(f"\n" + "=" * 80)
        print(f"📊 SUMMARY:")
        print(f" Total issues found: {len(problematic_cols)} potentially problematic columns")
        if problematic_cols:
            print(f" ⚠️ REVIEW REQUIRED: Check if these columns should be included")
            print(f" Columns: {problematic_cols}")
        else:
            print(f" ✅ No obvious issues found")
        print("=" * 80)
        # Pass/fail is driven solely by the problematic-column scan; other
        # findings above are informational.
        return len(problematic_cols) == 0
    except Exception as e:
        print(f"❌ ERROR: {e}")
        import traceback
        traceback.print_exc()
        return False
def check_all_questions():
    """Validate the structure of AllQuestions.xlsx.

    Reports basic statistics, presence of required columns, duplicate
    question codes, and the per-domain / per-age-group distributions.

    Returns:
        bool: True when the file was read successfully, False when it is
        missing or unreadable.
    """
    print("\n" + "=" * 80)
    print("📋 CHECKING: AllQuestions.xlsx")
    print("=" * 80)
    questions_path = BASE_DIR / "data" / "AllQuestions.xlsx"
    if not questions_path.exists():
        print("❌ FILE NOT FOUND")
        return False
    try:
        questions = pd.read_excel(questions_path, engine='openpyxl')
        print(f"\n📊 Basic Statistics:")
        print(f" Total questions: {len(questions)}")
        print(f" Total columns: {len(questions.columns)}")

        # Required-column audit (report follows the required_cols order).
        required_cols = ['code', 'domain', 'age-group', 'question']
        missing = [name for name in required_cols if name not in questions.columns]
        if missing:
            print(f" ❌ MISSING REQUIRED COLUMNS: {missing}")
        else:
            print(f" ✅ All required columns present")

        # Every question code must be unique.
        if 'code' in questions.columns:
            duplicate_codes = questions.loc[questions['code'].duplicated(), 'code'].tolist()
            if duplicate_codes:
                print(f" ❌ DUPLICATE QUESTION CODES: {len(duplicate_codes)} duplicates")
            else:
                print(f" ✅ All question codes unique")

        # Distribution reports (informational only).
        if 'domain' in questions.columns:
            print(f"\n Domain distribution:")
            for domain, count in questions['domain'].value_counts().items():
                print(f" {domain}: {count} questions")

        if 'age-group' in questions.columns:
            print(f"\n Age group distribution:")
            for age, count in questions['age-group'].value_counts().items():
                print(f" {age}: {count} questions")

        print(f" ✅ File structure looks good")
        return True
    except Exception as e:
        print(f"❌ ERROR: {e}")
        return False
def check_output_files():
    """Spot-check representative output workbooks for quality.

    Reads one personality export per age band and verifies that omitted
    answers are marked with the '--' placeholder in non-identity columns.

    Returns:
        bool: False only when a sample file exists but cannot be read.
        Missing files are reported as warnings and skipped (best-effort
        sampling, not a hard failure).
    """
    print("\n" + "=" * 80)
    print("📋 CHECKING: Output Files (Sample)")
    print("=" * 80)
    output_dir = BASE_DIR / "output" / "full_run"
    # Check one file from each category
    test_files = [
        output_dir / "adolescense" / "5_domain" / "Personality_14-17.xlsx",
        output_dir / "adults" / "5_domain" / "Personality_18-23.xlsx",
    ]
    # Identity columns are never omitted, so '--' is not expected in them.
    # (Set for O(1) membership tests inside the column loop.)
    identity_cols = {'Participant', 'First Name', 'Last Name', 'Student CPID',
                     'Age', 'Gender', 'Age Category'}
    all_good = True
    for file_path in test_files:
        if not file_path.exists():
            print(f" ⚠️ {file_path.name}: NOT FOUND")
            continue
        try:
            df = pd.read_excel(file_path, engine='openpyxl')
            # Only report files that look like student exports.
            # FIX: dropped the unused `sample_row = df.iloc[0]` local.
            if 'Student CPID' in df.columns or 'Participant' in df.columns:
                print(f"\n {file_path.name}:")
                print(f" Rows: {len(df)}, Columns: {len(df.columns)}")
                # Count '--' placeholders outside the identity columns.
                dash_count = sum(
                    (df[col] == '--').sum()
                    for col in df.columns
                    if col not in identity_cols
                )
                if dash_count > 0:
                    print(f" ✅ Omitted values marked with '--': {dash_count} values")
                else:
                    print(f" No '--' values found (may be normal if no omitted questions)")
        except Exception as e:
            print(f" ❌ ERROR reading {file_path.name}: {e}")
            all_good = False
    return all_good
def main():
    """Run every deliverable check and print a final pass/fail summary.

    Returns:
        bool: True only when all three checks passed.
    """
    banner = "=" * 80
    print(banner)
    print("🔍 COMPREHENSIVE CLIENT DELIVERABLE QUALITY CHECK")
    print("Perfectionist-Level Review")
    print(banner)
    print()
    # Dict literal preserves the execution/report order of the checks:
    # merged personas, then question bank, then sampled output files.
    results = {
        'merged_personas': check_merged_personas(),
        'all_questions': check_all_questions(),
        'output_files': check_output_files(),
    }
    # Final summary
    print("\n" + banner)
    print("📊 FINAL QUALITY ASSESSMENT")
    print(banner)
    all_passed = all(results.values())
    for file_type, passed in results.items():
        print(f" {file_type:20} {'✅ PASS' if passed else '❌ FAIL'}")
    print()
    if all_passed:
        print("✅ ALL CHECKS PASSED - FILES READY FOR CLIENT")
    else:
        print("⚠️ SOME ISSUES FOUND - REVIEW REQUIRED BEFORE CLIENT DELIVERY")
    print(banner)
    return all_passed


if __name__ == "__main__":
    # Exit code 0 on full pass, 1 otherwise (CI/shell friendly).
    sys.exit(0 if main() else 1)