CP_AUTOMATION/scripts/generate_synthetic_data.py
2025-12-12 19:54:54 +05:30

274 lines
9.7 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Synthetic Data Generator
Generates synthetic data for profile completion based on student information.
This matches the format provided by the user for creating students.
"""
import json
import random
import sys
from datetime import datetime
from pathlib import Path

import pandas as pd
# Directory two levels above this file, prepended to the import search path.
# NOTE(review): presumably this lets project-local packages be imported when
# the script is run directly; nothing in this file visibly imports one, so
# confirm before removing.
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))
class SyntheticDataGenerator:
    """Generate synthetic profile-completion data for students.

    Each generated record holds random picks from fixed option lists (focus
    areas, strengths, hobbies, clubs, expectations) plus one canned
    achievement sentence per category. Pass ``seed`` to make the picks
    reproducible; without it the shared ``random`` module state is used.
    """

    # Candidate achievement sentences keyed by category. Kept at class level
    # so the mapping is built once instead of on every call.
    _ACHIEVEMENT_TEMPLATES = {
        'academic': [
            "Merit scholarship recipient for academic excellence",
            "Top 10% in class for three consecutive years",
            "Won inter-school science quiz competition",
            "Published research paper in school journal"
        ],
        'sports': [
            "Represented school in district level basketball tournament",
            "Won gold medal in inter-school athletics meet",
            "Captain of school cricket team",
            "Participated in state level swimming championship"
        ],
        'cultural': [
            "Won first prize in school drama competition",
            "Participated in inter-school music festival",
            "Organized annual cultural day event",
            "Won best performer award in dance competition"
        ],
        'other': [
            "Active member of community service club",
            "Volunteered for environmental awareness campaign",
            "Organized charity fundraiser event",
            "Received appreciation for leadership skills"
        ]
    }

    # Source of randomness. Defaults to the ``random`` module itself so that
    # callers who seed the global generator keep getting the same behaviour;
    # replaced per instance in ``__init__`` when a seed is supplied.
    _rng = random

    def __init__(self, seed=None):
        """Initialize the option lists and, optionally, a private RNG.

        Args:
            seed: Optional seed. When given, this instance draws from its own
                ``random.Random(seed)`` so repeated runs yield identical data.
                When ``None`` (default) the global ``random`` state is used.
        """
        if seed is not None:
            self._rng = random.Random(seed)

        self.focus_areas = [
            '01. Academics',
            '02. Family',
            '03. Health',
            '04. Friendship',
            '05. Emotional management',
            '06. Personal Growth',
            '07. Hobbies',
            '08. Physical Activities',
            '09. Future Aspiration',
            '10. Others'
        ]
        self.strengths = [
            '1. Quick Learning',
            '2. Curiosity',
            '3. Problem Solving',
            '4. Justice',
            '5. Empathy',
            '6. Risk Taking',
            '7. Compassion',
            '8. Creative',
            '9. Technical',
            '10. Leadership',
            '11. Communication',
            '12. Athletic',
            '13. Languages',
            '14. Research',
            '15. Critical Thinking',
            '16. Artistic',
            '17. Others'
        ]
        self.hobbies = [
            '1. Reading',
            '2. Musical',
            '3. Sports',
            '4. Arts & Crafts',
            '5. Cooking',
            '6. Gardening',
            '7. Gaming',
            '8. Traveling',
            '9. Volunteering',
            '10. Learning',
            '11. Singing',
            '12. Other'
        ]
        self.clubs = [
            '1. Science',
            '2. Mathematics',
            '3. Quiz',
            '4. Literary',
            '5. Robotics',
            '6. Art',
            '7. Music',
            '8. Dramatics',
            '9. Sports',
            '10. Community',
            '11. MUN',
            '12. Other'
        ]
        self.expectations = [
            '1. Self-Understanding: Gain deeper insights into their personality, strengths, and areas for growth.',
            '2. Career Guidance: Clear recommendations on suitable career paths or college majors based on their interests and abilities, backed by scientific tool.',
            '3. Academic Support: Help in identifying their learning styles, study habits, or cognitive strengths to improve performance.',
            '4. Validation / Reassurance: Confirmation of their self-perceptions or reassurance that they\'re "on the right path."',
            '5. Improved Decision-Making: Help making informed choices',
            '6. Clarity About Strengths and Weaknesses: Hope to learn what I\'m naturally good at and what skills I may need to develop.',
            '7. Personal Growth: To help them build confidence, motivation, or emotional intelligence.',
            '8. Objective Feedback: Want an unbiased, science-based perspective rather than subjective opinions from others.',
            '9. Actionable Next Steps: Concrete advice or recommendations they can follow after the assessment.',
            '10. Others'
        ]

    def generate_for_student(self, student_data):
        """Generate synthetic profile-completion data for one student.

        Args:
            student_data: Dictionary with student information. It is accepted
                for interface compatibility; the generated values do not
                currently depend on it.

        Returns:
            dict: Synthetic data for profile completion.
        """
        # NOTE(review): the "Others"/"Other" options can be picked while every
        # ``*_others_text`` / ``*_other_text`` field stays empty, and
        # ``areas_of_improvement`` may overlap with ``strengths``. Confirm the
        # consumer of this data accepts both.
        synthetic = {
            # Step 5: Focus Areas (Pick 3 each)
            'short_term_focus_areas': self._pick_random(self.focus_areas, 3),
            'long_term_focus_areas': self._pick_random(self.focus_areas, 3),
            # Step 6: Self-Assessment (Pick 3 each)
            'strengths': self._pick_random(self.strengths, 3),
            'areas_of_improvement': self._pick_random(self.strengths, 3),
            # Step 7: Hobbies & Clubs (Pick 3 hobbies, up to 4 clubs)
            'hobbies_interests': self._pick_random(self.hobbies, 3),
            'clubs_or_teams': self._pick_random(self.clubs, 4),
            # Step 8: Achievements (Text areas)
            'achievements_academics': self._generate_achievement_text('academic'),
            'achievements_sports': self._generate_achievement_text('sports'),
            'achievements_cultural': self._generate_achievement_text('cultural'),
            'achievements_others': self._generate_achievement_text('other'),
            # Step 9: Expectations (Pick 3)
            'expectations': self._pick_random(self.expectations, 3),
            # Additional fields if needed
            'specially_abled': False,
            'specially_abled_details': '',
            'short_term_focus_others_text': '',
            'long_term_focus_others_text': '',
            'strength_others_text': '',
            'improvement_others_text': '',
            'hobby_other_text': '',
            'club_other_text': '',
            'expectation_others_text': ''
        }
        return synthetic

    def _pick_random(self, options, count):
        """Return up to ``count`` distinct random items from ``options``.

        The sample size is clamped to ``0..len(options)``, so an oversized or
        negative ``count`` never raises.
        """
        size = max(0, min(count, len(options)))
        return self._rng.sample(options, size)

    def _generate_achievement_text(self, category):
        """Return one random achievement sentence for ``category``.

        Unknown categories fall back to the ``'other'`` templates.
        """
        templates = self._ACHIEVEMENT_TEMPLATES
        return self._rng.choice(templates.get(category, templates['other']))

    @staticmethod
    def _blank_if_missing(value):
        """Map a missing cell (``None`` or float NaN) to ``''``.

        Rows loaded from a spreadsheet through pandas typically carry float
        NaN for empty cells; left alone they would surface as ``"nan"`` in
        names and as the non-standard ``NaN`` token in JSON output.
        """
        if value is None:
            return ''
        # NaN is the only float that is not equal to itself.
        if isinstance(value, float) and value != value:
            return ''
        return value

    def generate_batch(self, students_data):
        """Generate synthetic data for multiple students.

        Args:
            students_data: List of student dictionaries. The keys
                ``'Student CPID'``, ``'First Name'`` and ``'Last Name'`` are
                read when present; missing or empty values become ``''``.

        Returns:
            list: One synthetic data dictionary per student, each extended
            with ``student_cpid`` and ``student_name``.
        """
        results = []
        for student in students_data:
            synthetic = self.generate_for_student(student)
            synthetic['student_cpid'] = self._blank_if_missing(
                student.get('Student CPID', '')
            )
            first_name = self._blank_if_missing(student.get('First Name', ''))
            last_name = self._blank_if_missing(student.get('Last Name', ''))
            # strip() drops the stray space left when one or both parts are empty.
            synthetic['student_name'] = f"{first_name} {last_name}".strip()
            results.append(synthetic)
        return results

    def save_to_excel(self, synthetic_data, output_path):
        """Save synthetic data to an Excel file.

        Args:
            synthetic_data: List of synthetic data dictionaries.
            output_path: Path to save the Excel file.
        """
        # One row per record; the index column is omitted from the sheet.
        df = pd.DataFrame(synthetic_data)
        df.to_excel(output_path, index=False)
        print(f"✅ Saved synthetic data to {output_path}")

    def save_to_json(self, synthetic_data, output_path):
        """Save synthetic data to a JSON file.

        Args:
            synthetic_data: List of synthetic data dictionaries.
            output_path: Path to save the JSON file.
        """
        # Explicit encoding keeps the output independent of the platform locale.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(synthetic_data, f, indent=2)
        print(f"✅ Saved synthetic data to {output_path}")
def main():
    """Command-line entry point.

    Reads student rows from the Excel file given by ``--students``, generates
    one synthetic record per row, and writes the result as Excel or JSON.
    When ``--output`` is omitted the file is written to the ``test_data``
    directory two levels above this script, named
    ``synthetic_data_<timestamp>`` with an extension matching ``--format``.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Generate synthetic data for students')
    parser.add_argument('--students', type=str, required=True, help='Path to Excel file with student data')
    parser.add_argument('--output', type=str, default=None, help='Output file path (default: synthetic_data_<timestamp>.xlsx)')
    parser.add_argument('--format', choices=['excel', 'json'], default='excel', help='Output format')
    args = parser.parse_args()

    # Load students
    students_df = pd.read_excel(args.students)
    students_data = students_df.to_dict('records')
    print(f"✅ Loaded {len(students_data)} students")

    # Generate synthetic data
    generator = SyntheticDataGenerator()
    synthetic_data = generator.generate_batch(students_data)

    # Save output
    if args.output:
        output_path = Path(args.output)
    else:
        # 'excel' is a format name, not a file extension: pandas picks its
        # writer engine from the suffix and cannot handle '.excel'.
        extension = {'excel': 'xlsx', 'json': 'json'}[args.format]
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = Path(__file__).parent.parent / "test_data" / f"synthetic_data_{timestamp}.{extension}"
    # Create the target directory for explicit --output paths as well.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    if args.format == 'excel':
        generator.save_to_excel(synthetic_data, output_path)
    else:
        generator.save_to_json(synthetic_data, output_path)
    print(f"\n✅ Generated synthetic data for {len(synthetic_data)} students")
if __name__ == "__main__":
    # Run the CLI only when executed as a script, not when imported.
    main()