""" Comprehensive Quality Check for Client Deliverables Perfectionist-level review of all files to be shared with client/BOD """ import pandas as pd import numpy as np from pathlib import Path import sys import io # Fix Windows console encoding if sys.platform == 'win32': sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') BASE_DIR = Path(__file__).resolve().parent.parent def check_merged_personas(): """Comprehensive check of merged_personas.xlsx""" print("=" * 80) print("šŸ“‹ CHECKING: merged_personas.xlsx") print("=" * 80) file_path = BASE_DIR / "data" / "merged_personas.xlsx" if not file_path.exists(): print("āŒ FILE NOT FOUND") return False try: df = pd.read_excel(file_path, engine='openpyxl') print(f"\nšŸ“Š Basic Statistics:") print(f" Total rows: {len(df)}") print(f" Total columns: {len(df.columns)}") print(f" Expected rows: 3,000") if len(df) != 3000: print(f" āš ļø ROW COUNT MISMATCH: Expected 3,000, got {len(df)}") # Check for problematic columns print(f"\nšŸ” Column Analysis:") # Check for Grade/Division/Class columns problematic_keywords = ['grade', 'division', 'class', 'section'] problematic_cols = [] for col in df.columns: col_lower = str(col).lower() for keyword in problematic_keywords: if keyword in col_lower: problematic_cols.append(col) break if problematic_cols: print(f" āš ļø POTENTIALLY PROBLEMATIC COLUMNS FOUND:") for col in problematic_cols: # Check for data inconsistencies unique_vals = df[col].dropna().unique() print(f" - {col}: {len(unique_vals)} unique values") if len(unique_vals) <= 20: print(f" Sample values: {list(unique_vals[:10])}") # Check for duplicate columns print(f"\nšŸ” Duplicate Column Check:") duplicate_cols = df.columns[df.columns.duplicated()].tolist() if duplicate_cols: print(f" āŒ DUPLICATE COLUMNS: {duplicate_cols}") else: print(f" āœ… No duplicate columns") # Check for missing critical columns print(f"\nšŸ” Critical Column Check:") critical_cols = ['StudentCPID', 'First Name', 'Last Name', 'Age', 'Age Category'] missing_critical = [c for c in critical_cols if c not in df.columns] if missing_critical: print(f" āŒ MISSING CRITICAL COLUMNS: {missing_critical}") else: print(f" āœ… All critical columns present") # Check for data quality issues print(f"\nšŸ” Data Quality Check:") # Check StudentCPID uniqueness if 'StudentCPID' in df.columns: unique_cpids = df['StudentCPID'].dropna().nunique() total_cpids = df['StudentCPID'].notna().sum() if unique_cpids != total_cpids: print(f" āŒ DUPLICATE CPIDs: {total_cpids - unique_cpids} duplicates found") else: print(f" āœ… All StudentCPIDs unique ({unique_cpids} unique)") # Check for NaN in critical columns if 'StudentCPID' in df.columns: nan_cpids = df['StudentCPID'].isna().sum() if nan_cpids > 0: print(f" āŒ MISSING CPIDs: {nan_cpids} rows with NaN StudentCPID") else: print(f" āœ… No missing StudentCPIDs") # Check Age Category distribution if 'Age Category' in df.columns: age_dist = df['Age Category'].value_counts() print(f" Age Category distribution:") for age_cat, count in age_dist.items(): print(f" {age_cat}: {count}") # Check for inconsistent data types print(f"\nšŸ” Data Type Consistency:") for col in ['Age', 'Openness Score', 'Conscientiousness Score']: if col in df.columns: try: numeric_vals = pd.to_numeric(df[col], errors='coerce') non_numeric = numeric_vals.isna().sum() - df[col].isna().sum() if non_numeric > 0: print(f" āš ļø {col}: {non_numeric} non-numeric values") else: print(f" āœ… {col}: All values numeric") except: print(f" āš ļø {col}: Could not verify numeric") # Check for suspicious patterns print(f"\nšŸ” Suspicious Pattern Check:") # Check if all rows have same values (data corruption) for col in df.columns[:10]: # Check first 10 columns unique_count = df[col].nunique() if unique_count == 1 and len(df) > 1: print(f" āš ļø {col}: All rows have same value (possible issue)") # Check column naming consistency print(f"\nšŸ” Column Naming Check:") suspicious_names = [] for col in df.columns: col_str = str(col) # Check for inconsistent naming if col_str.strip() != col_str: suspicious_names.append(f"{col} (has leading/trailing spaces)") if '_DB' in col_str and 'Class_DB' in col_str or 'Section_DB' in col_str: print(f" ā„¹ļø {col}: Database-derived column (from 3000_students_output.xlsx)") if suspicious_names: print(f" āš ļø SUSPICIOUS COLUMN NAMES: {suspicious_names}") # Summary print(f"\n" + "=" * 80) print(f"šŸ“Š SUMMARY:") print(f" Total issues found: {len(problematic_cols)} potentially problematic columns") if problematic_cols: print(f" āš ļø REVIEW REQUIRED: Check if these columns should be included") print(f" Columns: {problematic_cols}") else: print(f" āœ… No obvious issues found") print("=" * 80) return len(problematic_cols) == 0 except Exception as e: print(f"āŒ ERROR: {e}") import traceback traceback.print_exc() return False def check_all_questions(): """Check AllQuestions.xlsx quality""" print("\n" + "=" * 80) print("šŸ“‹ CHECKING: AllQuestions.xlsx") print("=" * 80) file_path = BASE_DIR / "data" / "AllQuestions.xlsx" if not file_path.exists(): print("āŒ FILE NOT FOUND") return False try: df = pd.read_excel(file_path, engine='openpyxl') print(f"\nšŸ“Š Basic Statistics:") print(f" Total questions: {len(df)}") print(f" Total columns: {len(df.columns)}") # Check required columns required_cols = ['code', 'domain', 'age-group', 'question'] missing = [c for c in required_cols if c not in df.columns] if missing: print(f" āŒ MISSING REQUIRED COLUMNS: {missing}") else: print(f" āœ… All required columns present") # Check for duplicate question codes if 'code' in df.columns: duplicate_codes = df[df['code'].duplicated()]['code'].tolist() if duplicate_codes: print(f" āŒ DUPLICATE QUESTION CODES: {len(duplicate_codes)} duplicates") else: print(f" āœ… All question codes unique") # Check domain distribution if 'domain' in df.columns: domain_counts = df['domain'].value_counts() print(f"\n Domain distribution:") for domain, count in domain_counts.items(): print(f" {domain}: {count} questions") # Check age-group distribution if 'age-group' in df.columns: age_counts = df['age-group'].value_counts() print(f"\n Age group distribution:") for age, count in age_counts.items(): print(f" {age}: {count} questions") print(f" āœ… File structure looks good") return True except Exception as e: print(f"āŒ ERROR: {e}") return False def check_output_files(): """Check sample output files for quality""" print("\n" + "=" * 80) print("šŸ“‹ CHECKING: Output Files (Sample)") print("=" * 80) output_dir = BASE_DIR / "output" / "full_run" # Check one file from each category test_files = [ output_dir / "adolescense" / "5_domain" / "Personality_14-17.xlsx", output_dir / "adults" / "5_domain" / "Personality_18-23.xlsx", ] all_good = True for file_path in test_files: if not file_path.exists(): print(f" āš ļø {file_path.name}: NOT FOUND") continue try: df = pd.read_excel(file_path, engine='openpyxl') # Check for "--" in omitted columns if 'Student CPID' in df.columns or 'Participant' in df.columns: # Check a few rows for data quality sample_row = df.iloc[0] print(f"\n {file_path.name}:") print(f" Rows: {len(df)}, Columns: {len(df.columns)}") # Check for proper "--" usage dash_count = 0 for col in df.columns: if col not in ['Participant', 'First Name', 'Last Name', 'Student CPID', 'Age', 'Gender', 'Age Category']: dash_in_col = (df[col] == '--').sum() if dash_in_col > 0: dash_count += dash_in_col if dash_count > 0: print(f" āœ… Omitted values marked with '--': {dash_count} values") else: print(f" ā„¹ļø No '--' values found (may be normal if no omitted questions)") except Exception as e: print(f" āŒ ERROR reading {file_path.name}: {e}") all_good = False return all_good def main(): print("=" * 80) print("šŸ” COMPREHENSIVE CLIENT DELIVERABLE QUALITY CHECK") print("Perfectionist-Level Review") print("=" * 80) print() results = {} # Check merged_personas.xlsx results['merged_personas'] = check_merged_personas() # Check AllQuestions.xlsx results['all_questions'] = check_all_questions() # Check output files results['output_files'] = check_output_files() # Final summary print("\n" + "=" * 80) print("šŸ“Š FINAL QUALITY ASSESSMENT") print("=" * 80) all_passed = all(results.values()) for file_type, passed in results.items(): status = "āœ… PASS" if passed else "āŒ FAIL" print(f" {file_type:20} {status}") print() if all_passed: print("āœ… ALL CHECKS PASSED - FILES READY FOR CLIENT") else: print("āš ļø SOME ISSUES FOUND - REVIEW REQUIRED BEFORE CLIENT DELIVERY") print("=" * 80) return all_passed if __name__ == "__main__": success = main() sys.exit(0 if success else 1)