#!/usr/bin/env python3
"""
Synthetic Data Generator

Generates synthetic data for profile completion based on student information.
This matches the format provided by the user for creating students.
"""

import json
import random
import sys
from datetime import datetime
from pathlib import Path

import pandas as pd

# Put the directory two levels above this file at the front of the import
# path (presumably so project-local modules resolve when this file is run
# directly as a script -- confirm against the repository layout).
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))


class SyntheticDataGenerator:
    """Generate synthetic profile-completion data for student records.

    Multi-select answers are drawn at random from the fixed option lists set
    up in ``__init__``; achievement texts are drawn from canned templates.
    """

    # Canned achievement sentences keyed by category. Built once at class
    # creation instead of on every call; unknown categories fall back to the
    # 'other' list (see ``_generate_achievement_text``).
    _ACHIEVEMENT_TEMPLATES = {
        'academic': [
            "Merit scholarship recipient for academic excellence",
            "Top 10% in class for three consecutive years",
            "Won inter-school science quiz competition",
            "Published research paper in school journal"
        ],
        'sports': [
            "Represented school in district level basketball tournament",
            "Won gold medal in inter-school athletics meet",
            "Captain of school cricket team",
            "Participated in state level swimming championship"
        ],
        'cultural': [
            "Won first prize in school drama competition",
            "Participated in inter-school music festival",
            "Organized annual cultural day event",
            "Won best performer award in dance competition"
        ],
        'other': [
            "Active member of community service club",
            "Volunteered for environmental awareness campaign",
            "Organized charity fundraiser event",
            "Received appreciation for leadership skills"
        ]
    }

    def __init__(self, seed=None):
        """Initialize the option lists and the random source.

        Args:
            seed: Optional seed. When given, the generator draws from its own
                ``random.Random(seed)`` so its output is reproducible. When
                omitted, the shared ``random`` module is used, so callers
                that seed it globally keep the behaviour they had before.
        """
        self._rng = random if seed is None else random.Random(seed)

        self.focus_areas = [
            '01. Academics',
            '02. Family',
            '03. Health',
            '04. Friendship',
            '05. Emotional management',
            '06. Personal Growth',
            '07. Hobbies',
            '08. Physical Activities',
            '09. Future Aspiration',
            '10. Others'
        ]

        self.strengths = [
            '1. Quick Learning',
            '2. Curiosity',
            '3. Problem Solving',
            '4. Justice',
            '5. Empathy',
            '6. Risk Taking',
            '7. Compassion',
            '8. Creative',
            '9. Technical',
            '10. Leadership',
            '11. Communication',
            '12. Athletic',
            '13. Languages',
            '14. Research',
            '15. Critical Thinking',
            '16. Artistic',
            '17. Others'
        ]

        self.hobbies = [
            '1. Reading',
            '2. Musical',
            '3. Sports',
            '4. Arts & Crafts',
            '5. Cooking',
            '6. Gardening',
            '7. Gaming',
            '8. Traveling',
            '9. Volunteering',
            '10. Learning',
            '11. Singing',
            '12. Other'
        ]

        self.clubs = [
            '1. Science',
            '2. Mathematics',
            '3. Quiz',
            '4. Literary',
            '5. Robotics',
            '6. Art',
            '7. Music',
            '8. Dramatics',
            '9. Sports',
            '10. Community',
            '11. MUN',
            '12. Other'
        ]

        self.expectations = [
            '1. Self-Understanding: Gain deeper insights into their personality, strengths, and areas for growth.',
            '2. Career Guidance: Clear recommendations on suitable career paths or college majors based on their interests and abilities, backed by scientific tool.',
            '3. Academic Support: Help in identifying their learning styles, study habits, or cognitive strengths to improve performance.',
            '4. Validation / Reassurance: Confirmation of their self-perceptions or reassurance that they\'re "on the right path."',
            '5. Improved Decision-Making: Help making informed choices',
            '6. Clarity About Strengths and Weaknesses: Hope to learn what I\'m naturally good at and what skills I may need to develop.',
            '7. Personal Growth: To help them build confidence, motivation, or emotional intelligence.',
            '8. Objective Feedback: Want an unbiased, science-based perspective rather than subjective opinions from others.',
            '9. Actionable Next Steps: Concrete advice or recommendations they can follow after the assessment.',
            '10. Others'
        ]

    def generate_for_student(self, student_data):
        """
        Generate synthetic data for a student

        Args:
            student_data: Dictionary with student information. It is accepted
                for interface symmetry but is not consulted: every value is
                random or constant.

        Returns:
            dict: Synthetic data for profile completion
        """
        # Key order is preserved deliberately: it becomes the column order
        # when the result is turned into a DataFrame in ``save_to_excel``.
        synthetic = {
            # Step 5: Focus Areas (Pick 3 each)
            'short_term_focus_areas': self._pick_random(self.focus_areas, 3),
            'long_term_focus_areas': self._pick_random(self.focus_areas, 3),

            # Step 6: Self-Assessment (Pick 3 each)
            # NOTE(review): both lists are sampled independently from the same
            # options, so an item can appear as a strength and as an area of
            # improvement -- confirm that is acceptable for the target form.
            'strengths': self._pick_random(self.strengths, 3),
            'areas_of_improvement': self._pick_random(self.strengths, 3),

            # Step 7: Hobbies & Clubs (Pick 3 hobbies, multiple clubs)
            'hobbies_interests': self._pick_random(self.hobbies, 3),
            # _pick_random already caps the count at the number of options.
            'clubs_or_teams': self._pick_random(self.clubs, 4),

            # Step 8: Achievements (Text areas)
            'achievements_academics': self._generate_achievement_text('academic'),
            'achievements_sports': self._generate_achievement_text('sports'),
            'achievements_cultural': self._generate_achievement_text('cultural'),
            'achievements_others': self._generate_achievement_text('other'),

            # Step 9: Expectations (Pick 3)
            'expectations': self._pick_random(self.expectations, 3),

            # Additional fields if needed
            'specially_abled': False,
            'specially_abled_details': '',
            'short_term_focus_others_text': '',
            'long_term_focus_others_text': '',
            'strength_others_text': '',
            'improvement_others_text': '',
            'hobby_other_text': '',
            'club_other_text': '',
            'expectation_others_text': ''
        }

        return synthetic

    def _pick_random(self, options, count):
        """Pick up to ``count`` distinct random items from ``options``.

        The sample size is clamped to ``[0, len(options)]`` so an oversized
        or negative ``count`` yields a shorter (possibly empty) list instead
        of raising.
        """
        sample_size = max(0, min(count, len(options)))
        return self._rng.sample(options, sample_size)

    def _generate_achievement_text(self, category):
        """Return one random achievement sentence for ``category``.

        Categories without templates fall back to the 'other' templates.
        """
        templates = self._ACHIEVEMENT_TEMPLATES
        return self._rng.choice(templates.get(category, templates['other']))

    @staticmethod
    def _clean_cell(value):
        """Map a missing spreadsheet cell (None or NaN) to an empty string."""
        if value is None:
            return ''
        # NaN is the only float that is not equal to itself.
        if isinstance(value, float) and value != value:
            return ''
        return value

    def generate_batch(self, students_data):
        """
        Generate synthetic data for multiple students

        Args:
            students_data: List of student dictionaries. The keys
                'Student CPID', 'First Name' and 'Last Name' are read when
                present; missing keys and None/NaN values count as empty.

        Returns:
            list: List of synthetic data dictionaries, each extended with
                'student_cpid' and 'student_name'
        """
        results = []
        for student in students_data:
            synthetic = self.generate_for_student(student)
            synthetic['student_cpid'] = self._clean_cell(student.get('Student CPID', ''))

            # Join only the name parts that are actually present so a blank
            # cell yields neither a literal "nan" nor a dangling space.
            name_parts = (
                str(self._clean_cell(student.get(key, ''))).strip()
                for key in ('First Name', 'Last Name')
            )
            synthetic['student_name'] = ' '.join(part for part in name_parts if part)

            results.append(synthetic)

        return results

    def save_to_excel(self, synthetic_data, output_path):
        """
        Save synthetic data to Excel file

        Args:
            synthetic_data: List of synthetic data dictionaries
            output_path: Path to save Excel file
        """
        # Convert to DataFrame
        df = pd.DataFrame(synthetic_data)

        # Save to Excel
        df.to_excel(output_path, index=False)
        print(f"✅ Saved synthetic data to {output_path}")

    def save_to_json(self, synthetic_data, output_path):
        """
        Save synthetic data to JSON file

        Args:
            synthetic_data: List of synthetic data dictionaries
            output_path: Path to save JSON file
        """
        # Explicit encoding keeps the result independent of the platform's
        # default locale encoding.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(synthetic_data, f, indent=2)
        print(f"✅ Saved synthetic data to {output_path}")


def main(argv=None):
    """Main entry point: load students, generate synthetic data, save it.

    Args:
        argv: Optional list of command-line arguments. When None, argparse
            reads them from ``sys.argv`` as before.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Generate synthetic data for students')
    parser.add_argument('--students', type=str, required=True, help='Path to Excel file with student data')
    parser.add_argument('--output', type=str, default=None, help='Output file path (default: synthetic_data_<timestamp>.xlsx)')
    parser.add_argument('--format', choices=['excel', 'json'], default='excel', help='Output format')

    args = parser.parse_args(argv)

    # Load students
    students_df = pd.read_excel(args.students)
    students_data = students_df.to_dict('records')

    print(f"✅ Loaded {len(students_data)} students")

    # Generate synthetic data
    generator = SyntheticDataGenerator()
    synthetic_data = generator.generate_batch(students_data)

    # Save output
    if args.output:
        output_path = Path(args.output)
    else:
        # The format name is not a file suffix: pandas chooses its Excel
        # writer from the suffix, so 'excel' has to become '.xlsx' (a
        # '.excel' file name is not a recognised Excel file type).
        suffix_by_format = {'excel': 'xlsx', 'json': 'json'}
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = (
            Path(__file__).parent.parent
            / "test_data"
            / f"synthetic_data_{timestamp}.{suffix_by_format[args.format]}"
        )

    # Create the target directory for explicit --output paths as well, so a
    # not-yet-existing folder does not abort the run after generation.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    if args.format == 'excel':
        generator.save_to_excel(synthetic_data, output_path)
    else:
        generator.save_to_json(synthetic_data, output_path)

    print(f"\n✅ Generated synthetic data for {len(synthetic_data)} students")


# Run the CLI only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()