# CP_Assessment_engine/scripts/check_resume_logic.py

"""Check the difference between old and new resume logic"""
import pandas as pd

# Load the workbook whose 'Student CPID' column is being inspected.
df = pd.read_excel(
    'output/full_run/adolescense/5_domain/Emotional_Intelligence_14-17.xlsx',
    engine='openpyxl',
)
cpid_col = 'Student CPID'
cpid_series = df[cpid_col]

# OLD logic (what the currently running process used): every cell is
# stringified as-is, so a missing cell can end up in the set as the text 'nan'.
old_logic = set(cpid_series.astype(str).tolist())

# NEW logic (what the fixed code will use): drop missing cells, trim
# surrounding whitespace, then discard blanks and any literal 'nan' text.
trimmed_ids = (text.strip() for text in cpid_series.dropna().astype(str))
new_logic = {text for text in trimmed_ids if text and text.lower() != 'nan'}

# Entries of the old set that would still count as real CPIDs.
valid_old_count = sum(1 for entry in old_logic if entry and entry.lower() != 'nan')

# Number of students the sheet is expected to contain.
expected_total = 1507

rule = "=" * 60
report_lines = [
    rule,
    "RESUME LOGIC COMPARISON",
    rule,
    f"OLD logic count (includes NaN): {len(old_logic)}",
    f"NEW logic count (valid only): {len(new_logic)}",
    f"Difference: {len(old_logic) - len(new_logic)}",
    f"\n'nan' in old set: {'nan' in old_logic}",
    f"Valid CPIDs in old set: {valid_old_count}",
    f"\nExpected total: {expected_total}",
    f"Missing with OLD logic: {expected_total - valid_old_count}",
    f"Missing with NEW logic: {expected_total - len(new_logic)}",
    rule,
]
print("\n".join(report_lines))