""" Replace Omitted Question Values with "--" For all questions marked as "Omission" type, replace all values with "--" PRESERVES header colors (green for omission, red for reverse-scored) """ import pandas as pd from openpyxl import load_workbook from openpyxl.styles import Font from pathlib import Path import sys import io # Fix Windows console encoding if sys.platform == 'win32': sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') BASE_DIR = Path(__file__).resolve().parent.parent OUTPUT_DIR = BASE_DIR / "output" / "full_run" MAPPING_FILE = BASE_DIR / "data" / "AllQuestions.xlsx" def get_omitted_question_codes(): """Load all omitted question codes from mapping file""" if not MAPPING_FILE.exists(): print(f"❌ ERROR: Mapping file not found: {MAPPING_FILE}") return set() try: map_df = pd.read_excel(MAPPING_FILE, engine='openpyxl') # Get all questions where Type == 'Omission' omitted_df = map_df[map_df['Type'].str.lower() == 'omission'] omitted_codes = set(omitted_df['code'].astype(str).str.strip().tolist()) print(f"📊 Loaded {len(omitted_codes)} omitted question codes from mapping file") return omitted_codes except Exception as e: print(f"❌ ERROR loading mapping file: {e}") return set() def replace_omitted_in_file(file_path, omitted_codes, domain_name, age_group): """Replace omitted question values with '--' in a single file, preserving header colors""" print(f" 🔄 Processing: {file_path.name}") try: # Load the Excel file with openpyxl to preserve formatting wb = load_workbook(file_path) ws = wb.active # Also load with pandas for data manipulation df = pd.read_excel(file_path, engine='openpyxl') # Identify metadata columns (don't touch these) metadata_cols = {'Participant', 'First Name', 'Last Name', 'Student CPID', 'Age', 'Gender', 'Age Category'} # Find omitted question columns and their column indices omitted_cols_info = [] for col_idx, col_name in enumerate(df.columns, start=1): col_str = str(col_name).strip() if col_str in omitted_codes: omitted_cols_info.append({ 'name': col_name, 'index': col_idx, 'pandas_idx': col_idx - 1 # pandas is 0-indexed }) if not omitted_cols_info: print(f" â„šī¸ No omitted questions found in this file") return True print(f" 📋 Found {len(omitted_cols_info)} omitted question columns") # Replace all values in omitted columns with "--" rows_replaced = 0 for col_info in omitted_cols_info: col_name = col_info['name'] col_idx = col_info['index'] pandas_idx = col_info['pandas_idx'] # Count non-null values before replacement non_null_count = df[col_name].notna().sum() if non_null_count > 0: # Replace in pandas dataframe df[col_name] = "--" # Also replace in openpyxl worksheet (for all rows except header) for row_idx in range(2, ws.max_row + 1): # Start from row 2 (skip header) ws.cell(row=row_idx, column=col_idx).value = "--" rows_replaced += non_null_count # Save using openpyxl to preserve formatting wb.save(file_path) print(f" ✅ Replaced values in {len(omitted_cols_info)} columns ({rows_replaced} total values)") print(f" ✅ Header colors preserved") print(f" 💾 File saved successfully") return True except Exception as e: print(f" ❌ ERROR processing file: {e}") import traceback traceback.print_exc() return False def main(): print("=" * 80) print("🔄 REPLACING OMITTED QUESTION VALUES WITH '--'") print("=" * 80) print() # Load omitted question codes omitted_codes = get_omitted_question_codes() if not omitted_codes: print("❌ ERROR: No omitted codes loaded. Cannot proceed.") return False print() # Domain files to process domain_files = { 'adolescense': { 'Personality': 'Personality_14-17.xlsx', 'Grit': 'Grit_14-17.xlsx', 'Emotional Intelligence': 'Emotional_Intelligence_14-17.xlsx', 'Vocational Interest': 'Vocational_Interest_14-17.xlsx', 'Learning Strategies': 'Learning_Strategies_14-17.xlsx' }, 'adults': { 'Personality': 'Personality_18-23.xlsx', 'Grit': 'Grit_18-23.xlsx', 'Emotional Intelligence': 'Emotional_Intelligence_18-23.xlsx', 'Vocational Interest': 'Vocational_Interest_18-23.xlsx', 'Learning Strategies': 'Learning_Strategies_18-23.xlsx' } } total_files = 0 processed_files = 0 failed_files = [] for age_group, domains in domain_files.items(): age_label = "14-17" if age_group == 'adolescense' else "18-23" print(f"📂 Processing {age_group.upper()} files (Age: {age_label})...") print("-" * 80) for domain_name, file_name in domains.items(): total_files += 1 file_path = OUTPUT_DIR / age_group / "5_domain" / file_name if not file_path.exists(): print(f" âš ī¸ SKIP: {file_name} (file not found)") failed_files.append((file_name, "File not found")) continue success = replace_omitted_in_file(file_path, omitted_codes, domain_name, age_label) if success: processed_files += 1 else: failed_files.append((file_name, "Processing error")) print() print("=" * 80) print(f"✅ REPLACEMENT COMPLETE") print(f" Processed: {processed_files}/{total_files} files") if failed_files: print(f" Failed: {len(failed_files)} files") for file_name, error in failed_files: print(f" - {file_name}: {error}") else: print(f" ✅ All files processed successfully") print("=" * 80) return len(failed_files) == 0 if __name__ == "__main__": success = main() sys.exit(0 if success else 1)