CP_Assessment_engine/scripts/replace_omitted_values.py
2026-02-10 12:59:40 +05:30

181 lines
6.5 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Replace Omitted Question Values with "--"
For all questions marked as "Omission" type, replace all values with "--"
PRESERVES header colors (green for omission, red for reverse-scored)
"""
import pandas as pd
from openpyxl import load_workbook
from openpyxl.styles import Font
from pathlib import Path
import sys
import io
# Fix Windows console encoding: the status messages printed by this script
# contain emoji, so stdout is switched to UTF-8 on Windows.
if sys.platform == 'win32':
    if hasattr(sys.stdout, 'reconfigure'):
        # Change the encoding of the existing stream in place instead of
        # rebinding sys.stdout, so the stream keeps its buffering behaviour.
        sys.stdout.reconfigure(encoding='utf-8')
    elif hasattr(sys.stdout, 'buffer'):
        # Fallback for stream objects that cannot be reconfigured.
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# Project root: two directory levels above this script.
BASE_DIR = Path(__file__).resolve().parent.parent
# Directory holding the generated workbooks that get rewritten.
OUTPUT_DIR = BASE_DIR / "output" / "full_run"
# Spreadsheet that maps question codes to their type.
MAPPING_FILE = BASE_DIR / "data" / "AllQuestions.xlsx"
def get_omitted_question_codes():
    """Load the codes of all questions marked as omitted in the mapping file.

    Reads ``MAPPING_FILE`` and collects the ``code`` value of every row whose
    ``Type`` column equals ``"omission"`` (case-insensitive, surrounding
    whitespace ignored). Rows with a blank ``code`` cell are skipped.

    Returns:
        set[str]: Whitespace-stripped question codes. The set is empty when
        the mapping file is missing, cannot be read, lacks the expected
        columns, or contains no omission rows.
    """
    if not MAPPING_FILE.exists():
        print(f"❌ ERROR: Mapping file not found: {MAPPING_FILE}")
        return set()
    try:
        map_df = pd.read_excel(MAPPING_FILE, engine='openpyxl')
        # astype(str) keeps the .str accessor usable even when the Type column
        # holds non-text cells; strip() lets "Omission " still match.
        type_labels = map_df['Type'].astype(str).str.strip().str.lower()
        # Drop blank cells before converting to text, otherwise a missing code
        # would turn into the literal string "nan" and be treated as a code.
        raw_codes = map_df.loc[type_labels == 'omission', 'code'].dropna()
        omitted_codes = {
            code for code in raw_codes.astype(str).str.strip() if code
        }
        print(f"📊 Loaded {len(omitted_codes)} omitted question codes from mapping file")
        return omitted_codes
    except Exception as e:
        # Best-effort loader: report the problem and let the caller decide
        # what to do with an empty result.
        print(f"❌ ERROR loading mapping file: {e}")
        return set()
def replace_omitted_in_file(file_path, omitted_codes, domain_name, age_group):
    """Overwrite the data cells of omitted-question columns with "--".

    The workbook is opened, edited and saved in place through openpyxl. Only
    cell values in rows 2 and below are written; the header row and all cell
    styling are never touched, which is what keeps the header colors intact.

    Args:
        file_path: Path of the ``.xlsx`` workbook to rewrite in place.
        omitted_codes: Collection of question codes (strings); a column is
            rewritten when its stripped header text is in this collection.
        domain_name: Not used by this function; kept for call compatibility.
        age_group: Not used by this function; kept for call compatibility.

    Returns:
        bool: True when the file was handled (including the case where it has
        no omitted columns), False when any error occurred while processing.
    """
    print(f" 🔄 Processing: {file_path.name}")
    try:
        wb = load_workbook(file_path)
        ws = wb.active

        # Headers are read from the very worksheet that is edited, so column
        # positions cannot drift from a separately parsed copy of the file.
        omitted_columns = []
        for col_idx in range(1, ws.max_column + 1):
            header = ws.cell(row=1, column=col_idx).value
            if header is not None and str(header).strip() in omitted_codes:
                omitted_columns.append(col_idx)

        if not omitted_columns:
            print(" No omitted questions found in this file")
            return True
        print(f" 📋 Found {len(omitted_columns)} omitted question columns")

        # Row 1 is the header; everything below it is data.
        data_rows = range(2, ws.max_row + 1)
        rows_replaced = 0
        for col_idx in omitted_columns:
            filled_cells = sum(
                1
                for row_idx in data_rows
                if ws.cell(row=row_idx, column=col_idx).value is not None
            )
            # Columns that hold no data at all are left untouched.
            if filled_cells == 0:
                continue
            for row_idx in data_rows:
                ws.cell(row=row_idx, column=col_idx).value = "--"
            rows_replaced += filled_cells

        # Saving through openpyxl writes back the styles that were loaded.
        wb.save(file_path)
        print(f" ✅ Replaced values in {len(omitted_columns)} columns ({rows_replaced} total values)")
        print(" ✅ Header colors preserved")
        print(" 💾 File saved successfully")
        return True
    except Exception as e:
        # Per-file boundary: report the failure with its traceback and let the
        # caller continue with the remaining files.
        print(f" ❌ ERROR processing file: {e}")
        import traceback
        traceback.print_exc()
        return False
def main():
    """Blank out omitted-question columns in every known domain workbook.

    Loads the omitted question codes, then visits each expected workbook under
    ``OUTPUT_DIR / <age group> / "5_domain"`` and hands it to
    ``replace_omitted_in_file``. A summary is printed at the end.

    Returns:
        bool: True when no file failed. A workbook that does not exist on disk
        is recorded as a failure, as is one whose processing returned False.
        False is also returned when no omitted codes could be loaded.
    """
    banner = "=" * 80
    print(banner)
    print("🔄 REPLACING OMITTED QUESTION VALUES WITH '--'")
    print(banner)
    print()

    codes = get_omitted_question_codes()
    if not codes:
        print("❌ ERROR: No omitted codes loaded. Cannot proceed.")
        return False
    print()

    # Workbooks to rewrite, grouped by age-group key. The key is also used as
    # a path component below, so its spelling is kept exactly as-is.
    workbooks_by_group = {
        'adolescense': {
            'Personality': 'Personality_14-17.xlsx',
            'Grit': 'Grit_14-17.xlsx',
            'Emotional Intelligence': 'Emotional_Intelligence_14-17.xlsx',
            'Vocational Interest': 'Vocational_Interest_14-17.xlsx',
            'Learning Strategies': 'Learning_Strategies_14-17.xlsx',
        },
        'adults': {
            'Personality': 'Personality_18-23.xlsx',
            'Grit': 'Grit_18-23.xlsx',
            'Emotional Intelligence': 'Emotional_Intelligence_18-23.xlsx',
            'Vocational Interest': 'Vocational_Interest_18-23.xlsx',
            'Learning Strategies': 'Learning_Strategies_18-23.xlsx',
        },
    }

    attempted = 0
    succeeded = 0
    failures = []  # (file name, reason) pairs for the final summary

    for age_group, workbooks in workbooks_by_group.items():
        if age_group == 'adolescense':
            age_label = "14-17"
        else:
            age_label = "18-23"
        print(f"📂 Processing {age_group.upper()} files (Age: {age_label})...")
        print("-" * 80)

        for domain_name, file_name in workbooks.items():
            attempted += 1
            target = OUTPUT_DIR / age_group / "5_domain" / file_name
            if target.exists():
                if replace_omitted_in_file(target, codes, domain_name, age_label):
                    succeeded += 1
                else:
                    failures.append((file_name, "Processing error"))
            else:
                print(f" ⚠️ SKIP: {file_name} (file not found)")
                failures.append((file_name, "File not found"))

        # NOTE(review): blank separator assumed to be printed once per age
        # group; the original indentation was not available - confirm.
        print()

    print(banner)
    print("✅ REPLACEMENT COMPLETE")
    print(f" Processed: {succeeded}/{attempted} files")
    if failures:
        print(f" Failed: {len(failures)} files")
        for failed_name, reason in failures:
            print(f" - {failed_name}: {reason}")
    else:
        print(" ✅ All files processed successfully")
    print(banner)
    return not failures
if __name__ == "__main__":
    # Map main()'s boolean result onto the process exit status:
    # 0 when every file succeeded, 1 otherwise.
    exit_status = 0 if main() else 1
    sys.exit(exit_status)