181 lines
6.5 KiB
Python
181 lines
6.5 KiB
Python
"""
Replace Omitted Question Values with "--"

For every question marked as "Omission" type, replace all of its values with "--".
Header colors are PRESERVED (green for omission, red for reverse-scored).
"""
|
||
import pandas as pd
|
||
from openpyxl import load_workbook
|
||
from openpyxl.styles import Font
|
||
from pathlib import Path
|
||
import sys
|
||
import io
|
||
|
||
# Fix Windows console encoding so the emoji status markers printed by this
# script do not raise UnicodeEncodeError on legacy code pages.
# The `.buffer` guard avoids an AttributeError at import time when stdout is
# missing (e.g. pythonw, where sys.stdout is None) or has been replaced by a
# text-only stream that exposes no underlying binary buffer.
if sys.platform == 'win32' and hasattr(sys.stdout, 'buffer'):
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
|
||
|
||
# Project root: two directory levels above this script's resolved location.
BASE_DIR = Path(__file__).resolve().parent.parent
# Directory holding the generated per-age-group workbooks.
OUTPUT_DIR = BASE_DIR.joinpath("output", "full_run")
# Workbook that maps every question code to its type.
MAPPING_FILE = BASE_DIR.joinpath("data", "AllQuestions.xlsx")
|
||
|
||
def get_omitted_question_codes():
    """Load the codes of all questions marked as "Omission" in the mapping file.

    Reads ``MAPPING_FILE`` and selects the rows whose ``Type`` column equals
    "omission" (case-insensitive, surrounding whitespace ignored).

    Returns:
        set: The stripped ``code`` values (as strings) of the omitted
        questions. An empty set is returned, after printing an error, when
        the mapping file does not exist or cannot be read/parsed.
    """
    if not MAPPING_FILE.exists():
        print(f"❌ ERROR: Mapping file not found: {MAPPING_FILE}")
        return set()

    try:
        map_df = pd.read_excel(MAPPING_FILE, engine='openpyxl')

        # Cast to str first so blank or non-text cells cannot break the .str
        # accessor, and strip so values such as "Omission " still match.
        question_type = map_df['Type'].astype(str).str.strip().str.lower()

        # Drop rows without a code: astype(str) would otherwise turn a
        # missing cell into the bogus code "nan".
        codes = map_df.loc[question_type == 'omission', 'code'].dropna()
        omitted_codes = set(codes.astype(str).str.strip())
        # A whitespace-only cell collapses to "" and is not a real code.
        omitted_codes.discard('')

        print(f"📊 Loaded {len(omitted_codes)} omitted question codes from mapping file")
        return omitted_codes
    except Exception as e:
        # Best-effort loader: report the problem and let the caller decide
        # what to do with an empty result.
        print(f"❌ ERROR loading mapping file: {e}")
        return set()
|
||
|
||
def replace_omitted_in_file(file_path, omitted_codes, domain_name, age_group):
    """Overwrite the data cells of omitted-question columns with "--" in one workbook.

    The workbook is edited in place through openpyxl: only rows 2..max_row of
    the matching columns are rewritten, so the header row (row 1) and its
    styling are not modified. The file is saved back to ``file_path``.

    Args:
        file_path: ``pathlib.Path`` of the Excel workbook to modify.
        omitted_codes: Collection of question codes (strings); a column is
            rewritten when its stripped header text is in this collection.
        domain_name: Domain label of the file. Currently unused; kept so
            existing callers keep working.
        age_group: Age label of the file. Currently unused; kept so existing
            callers keep working.

    Returns:
        bool: True when the file was processed (including the case where it
        contains no omitted-question columns), False when any exception
        occurred (the error and traceback are printed).
    """
    print(f" 🔄 Processing: {file_path.name}")

    # Participant metadata columns; these are never overwritten, even if a
    # question code happens to collide with one of these header names.
    metadata_cols = frozenset({
        'Participant', 'First Name', 'Last Name', 'Student CPID',
        'Age', 'Gender', 'Age Category',
    })

    try:
        # openpyxl is used for the actual edit so cell formatting survives.
        wb = load_workbook(file_path)
        ws = wb.active

        # pandas is used only to read headers and count non-null values.
        # Read the very sheet openpyxl will edit: pandas defaults to the
        # first sheet, which is not necessarily the active one.
        df = pd.read_excel(file_path, sheet_name=ws.title, engine='openpyxl')

        # (1-based worksheet column index, pandas column label) per match.
        omitted_cols_info = []
        for col_idx, col_name in enumerate(df.columns, start=1):
            col_str = str(col_name).strip()
            if col_str in omitted_codes and col_str not in metadata_cols:
                omitted_cols_info.append((col_idx, col_name))

        if not omitted_cols_info:
            print(f" ℹ️ No omitted questions found in this file")
            return True

        print(f" 📋 Found {len(omitted_cols_info)} omitted question columns")

        last_row = ws.max_row  # writing inside existing rows does not change it
        rows_replaced = 0
        for col_idx, col_name in omitted_cols_info:
            non_null_count = int(df[col_name].notna().sum())
            if non_null_count == 0:
                # Column holds no values; it is left exactly as it is.
                continue

            # Row 1 is the header, so data starts at row 2.
            for row_idx in range(2, last_row + 1):
                ws.cell(row=row_idx, column=col_idx).value = "--"

            rows_replaced += non_null_count

        # Save through openpyxl to keep the workbook's formatting.
        wb.save(file_path)
        print(f" ✅ Replaced values in {len(omitted_cols_info)} columns ({rows_replaced} total values)")
        print(f" ✅ Header colors preserved")
        print(f" 💾 File saved successfully")

        return True

    except Exception as e:
        # Per-file boundary: report and signal failure so the caller can
        # continue with the remaining files.
        print(f" ❌ ERROR processing file: {e}")
        import traceback
        traceback.print_exc()
        return False
|
||
|
||
def main():
    """Run the "--" replacement over every per-domain workbook of both age groups.

    Loads the omitted question codes, then visits each expected workbook under
    ``OUTPUT_DIR / <age_group> / "5_domain"``. Missing files and files that
    fail to process are collected and listed in the final summary.

    Returns:
        bool: True when every expected file was found and processed, False
        when no omitted codes could be loaded or at least one file failed.
    """
    rule = "=" * 80

    print(rule)
    print("🔄 REPLACING OMITTED QUESTION VALUES WITH '--'")
    print(rule)
    print()

    # Nothing can be replaced without the list of omitted codes.
    omitted_codes = get_omitted_question_codes()
    if not omitted_codes:
        print("❌ ERROR: No omitted codes loaded. Cannot proceed.")
        return False

    print()

    # Expected workbooks, keyed by age-group directory name, then by domain.
    domain_files = {
        'adolescense': {
            'Personality': 'Personality_14-17.xlsx',
            'Grit': 'Grit_14-17.xlsx',
            'Emotional Intelligence': 'Emotional_Intelligence_14-17.xlsx',
            'Vocational Interest': 'Vocational_Interest_14-17.xlsx',
            'Learning Strategies': 'Learning_Strategies_14-17.xlsx',
        },
        'adults': {
            'Personality': 'Personality_18-23.xlsx',
            'Grit': 'Grit_18-23.xlsx',
            'Emotional Intelligence': 'Emotional_Intelligence_18-23.xlsx',
            'Vocational Interest': 'Vocational_Interest_18-23.xlsx',
            'Learning Strategies': 'Learning_Strategies_18-23.xlsx',
        },
    }

    total_files = 0
    processed_files = 0
    failed_files = []

    for age_group, domains in domain_files.items():
        if age_group == 'adolescense':
            age_label = "14-17"
        else:
            age_label = "18-23"

        print(f"📂 Processing {age_group.upper()} files (Age: {age_label})...")
        print("-" * 80)

        group_dir = OUTPUT_DIR / age_group / "5_domain"
        for domain_name, file_name in domains.items():
            total_files += 1
            file_path = group_dir / file_name

            if file_path.exists():
                if replace_omitted_in_file(file_path, omitted_codes, domain_name, age_label):
                    processed_files += 1
                else:
                    failed_files.append((file_name, "Processing error"))
            else:
                print(f" ⚠️ SKIP: {file_name} (file not found)")
                failed_files.append((file_name, "File not found"))

        print()

    # Final summary.
    print(rule)
    print("✅ REPLACEMENT COMPLETE")
    print(f" Processed: {processed_files}/{total_files} files")
    if not failed_files:
        print(" ✅ All files processed successfully")
    else:
        print(f" Failed: {len(failed_files)} files")
        for file_name, error in failed_files:
            print(f" - {file_name}: {error}")
    print(rule)

    return not failed_files
|
||
|
||
# Script entry point: exit status 0 when main() reports success, 1 otherwise.
if __name__ == "__main__":
    sys.exit(0 if main() else 1)
|