# CP_Assessment_engine/scripts/verify_omitted_replacement.py
# 2026-02-10 12:59:40 +05:30
#
# 93 lines
# 2.8 KiB
# Python

"""
Verify that omitted question values were replaced with "--"
"""
import pandas as pd
from pathlib import Path
import sys
import io
# Force UTF-8 on Windows stdout (presumably so the emoji printed by this
# script do not fail on a legacy console code page -- confirm).
# Reconfiguring the existing stream in place keeps its buffering settings;
# wrapping sys.stdout.buffer in a second TextIOWrapper is kept only as a
# fallback for streams that lack reconfigure().
if sys.platform == 'win32':
    if hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8')
    elif hasattr(sys.stdout, 'buffer'):
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# Paths are resolved relative to the directory two levels above this file.
BASE_DIR = Path(__file__).resolve().parent.parent
# Directory holding the generated workbooks that get verified.
OUTPUT_DIR = BASE_DIR / "output" / "full_run"
# Workbook read for the 'Type' and 'code' columns that identify omitted questions.
MAPPING_FILE = BASE_DIR / "data" / "AllQuestions.xlsx"
def _is_replaced(value):
    """Return True when a cell is blank (NaN) or holds the "--" placeholder."""
    return str(value) == '--' or bool(pd.isna(value))


def verify_replacement(test_file=None, max_columns=None):
    """Verify that omitted-question columns contain only "--".

    Reads the mapping workbook, collects the codes whose ``Type`` is
    ``omission`` (case-insensitive), then inspects the columns of one output
    workbook whose header matches one of those codes. A column passes when
    every cell is either "--" or blank (NaN). A report is printed to stdout.

    Args:
        test_file: Workbook to inspect. Defaults to the adolescent
            Personality_14-17 workbook under ``OUTPUT_DIR``.
        max_columns: Upper bound on how many omitted columns are inspected.
            ``None`` (default) inspects all of them, so that a "PASSED"
            result really covers every omitted column in the file.

    Returns:
        bool: True when every inspected column passed, False when the
        workbook is missing or any inspected column holds another value.
    """
    print("=" * 80)
    print("✅ VERIFICATION: Omitted Values Replacement")
    print("=" * 80)
    print()

    # Load omitted codes
    map_df = pd.read_excel(MAPPING_FILE, engine='openpyxl')
    is_omission = map_df['Type'].str.lower() == 'omission'
    omitted_codes = set(map_df.loc[is_omission, 'code'].astype(str).str.strip())
    print(f"📊 Total omitted question codes: {len(omitted_codes)}")
    print()

    # Test a sample file
    if test_file is None:
        test_file = OUTPUT_DIR / "adolescense" / "5_domain" / "Personality_14-17.xlsx"
    test_file = Path(test_file)
    if not test_file.exists():
        print(f"❌ Test file not found: {test_file}")
        return False

    df = pd.read_excel(test_file, engine='openpyxl')

    # Find omitted columns in this file (headers are compared after stripping)
    omitted_cols_in_file = [
        col for col in df.columns if str(col).strip() in omitted_codes
    ]
    print(f"📋 Testing file: {test_file.name}")
    print(f" Found {len(omitted_cols_in_file)} omitted question columns")
    if not omitted_cols_in_file:
        # A pass with nothing to check is vacuous; make that visible.
        print("⚠️ No omitted question columns found in this file; nothing to verify")
    print()

    # Verify replacement
    if max_columns is None:
        cols_to_check = omitted_cols_in_file
    else:
        cols_to_check = omitted_cols_in_file[:max_columns]

    all_correct = True
    sample_checked = 0
    for col in cols_to_check:
        non_dash_vals = [v for v in df[col].unique() if not _is_replaced(v)]
        if non_dash_vals:
            print(f"{col}: Found non-'--' values: {non_dash_vals[:3]}")
            all_correct = False
            continue
        sample_checked += 1
        # Only the first three passing columns are listed individually.
        if sample_checked <= 3:
            print(f"{col}: All values are '--' (verified)")
    if sample_checked > 3:
        print(f" ✅ ... and {sample_checked - 3} more columns verified")
    print()

    # Check a few random rows
    print("📊 Sample Row Check (first 3 omitted columns):")
    for col in omitted_cols_in_file[:3]:
        sample_values = df[col].head(5).tolist()
        # Same acceptance rule as the main check, so the marker cannot
        # contradict the overall verdict.
        all_dash = all(_is_replaced(v) for v in sample_values)
        status = "✅" if all_dash else "❌"
        print(f" {status} {col}: {sample_values}")
    print()

    print("=" * 80)
    if all_correct:
        print("✅ VERIFICATION PASSED: All omitted values replaced with '--'")
    else:
        print("❌ VERIFICATION FAILED: Some values not replaced")
    print("=" * 80)
    return all_correct
if __name__ == "__main__":
    # The process exit status mirrors the verification result:
    # 0 when verify_replacement() returns a truthy value, 1 otherwise.
    exit_code = 1
    if verify_replacement():
        exit_code = 0
    sys.exit(exit_code)