# saas-market-analysis-dubai/load_csv_batch.py
# Last modified: 2025-09-17 03:04:22 +05:30
# 590 lines, 24 KiB — executable Python file
#!/usr/bin/env python
"""
CSV Data Loading Script - Batch Processing with Error Handling
Loads CSV data in small batches (10 entries at a time) with comprehensive error handling.
"""
import os
import sys
import django
import pandas as pd
from datetime import datetime
from decimal import Decimal
import uuid
import traceback
# Add the project directory to Python path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# Set Django settings
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'dubai_analytics.settings')
# Setup Django
django.setup()
from apps.analytics.models import (
Broker, Developer, Project, Valuation, Land, Rent, Transaction, Forecast
)
from apps.users.models import User
from apps.core.models import APIRateLimit, SystemConfiguration
def safe_get(row, key, default=None):
    """Safely read *key* from a pandas row, mapping missing/NaN values to *default*.

    Args:
        row: A pandas Series representing one CSV row.
        key: Column name to read.
        default: Returned when the column is absent, NaN, empty, or the text 'nan'.

    Returns:
        The cell value, or *default* when the cell is effectively missing.
    """
    try:
        value = row.get(key, default)
        # Treat NaN, empty strings, and the literal text 'nan' as missing data.
        if pd.isna(value) or value == '' or str(value).lower() == 'nan':
            return default
        return value
    except Exception:
        # Bug fix: was a bare `except:`, which also swallowed KeyboardInterrupt
        # and SystemExit. Exception still guards against odd cell types that make
        # the checks above raise.
        return default
def safe_decimal(value, default=0):
    """Convert *value* to Decimal, falling back to Decimal(*default*) on bad input.

    NaN, empty strings, the literal text 'nan', and unparseable values all map
    to the default.
    """
    try:
        if pd.isna(value) or value == '' or str(value).lower() == 'nan':
            return Decimal(str(default))
        # Round-trip through str so floats convert via their printed form rather
        # than their exact binary expansion.
        return Decimal(str(value))
    except Exception:
        # Bug fix: was a bare `except:` (also caught KeyboardInterrupt/SystemExit).
        return Decimal(str(default))
def safe_datetime(value):
    """Convert *value* to a pandas Timestamp, or None when missing/unparseable.

    Returns None for NaN, empty strings, the literal text 'nan', and anything
    pd.to_datetime cannot parse.
    """
    try:
        if pd.isna(value) or value == '' or str(value).lower() == 'nan':
            return None
        return pd.to_datetime(value)
    except Exception:
        # Bug fix: was a bare `except:` (also caught KeyboardInterrupt/SystemExit).
        return None
def safe_int(value, default=0):
    """Convert *value* to int, falling back to *default* on missing/bad input.

    Goes through float first so strings like '3.0' or '3.7' convert (truncating
    toward zero) instead of raising.
    """
    try:
        if pd.isna(value) or value == '' or str(value).lower() == 'nan':
            return default
        return int(float(value))
    except Exception:
        # Bug fix: was a bare `except:` (also caught KeyboardInterrupt/SystemExit).
        return default
def safe_str(value, default=''):
    """Convert *value* to a stripped string, falling back to *default*.

    NaN, empty strings, and the literal text 'nan' map to the default;
    everything else is stringified and whitespace-trimmed.
    """
    try:
        if pd.isna(value) or value == '' or str(value).lower() == 'nan':
            return default
        return str(value).strip()
    except Exception:
        # Bug fix: was a bare `except:` (also caught KeyboardInterrupt/SystemExit).
        return default
def load_brokers_batch(csv_path, batch_size=10, max_batches=1):
    """Load broker rows from *csv_path* into the Broker model in batches.

    Args:
        csv_path: Path to the brokers CSV export.
        batch_size: Rows processed per batch.
        max_batches: Number of batches to process. Defaults to 1, preserving
            the original hard-coded "first batch only, for testing" behavior;
            pass None to load the whole file.

    Returns:
        Count of Broker rows created (0 on a file-level failure).
    """
    print("📊 Loading brokers data in batches...")
    try:
        df = pd.read_csv(csv_path)
        print(f" Found {len(df)} broker records")
        total_created = 0
        total_errors = 0
        for i in range(0, len(df), batch_size):
            batch_number = i // batch_size + 1
            # Generalization of the old `if i == 0: break` testing cap.
            if max_batches is not None and batch_number > max_batches:
                break
            batch_df = df.iloc[i:i + batch_size]
            batch_created = 0
            batch_errors = 0
            print(f" Processing batch {batch_number} (rows {i+1}-{min(i+batch_size, len(df))})...")
            for _, row in batch_df.iterrows():
                try:
                    broker_number = safe_str(row.get('BROKER_NUMBER', ''))
                    # BROKER_NUMBER is the natural key; rows without it are errors.
                    if not broker_number:
                        batch_errors += 1
                        continue
                    broker, created = Broker.objects.get_or_create(
                        broker_number=broker_number,
                        defaults={
                            'broker_name_en': safe_str(row.get('BROKER_EN', '')),
                            'gender': safe_str(row.get('GENDER_EN', 'male')),
                            # NOTE(review): falls back to naive datetime.now() when the
                            # CSV date is missing/unparseable — confirm timezone policy.
                            'license_start_date': safe_datetime(row.get('LICENSE_START_DATE')) or datetime.now(),
                            'license_end_date': safe_datetime(row.get('LICENSE_END_DATE')) or datetime.now(),
                            'webpage': safe_str(row.get('WEBPAGE', '')),
                            'phone': safe_str(row.get('PHONE', '')),
                            'fax': safe_str(row.get('FAX', '')),
                            'real_estate_number': safe_str(row.get('REAL_ESTATE_NUMBER', '')),
                            'real_estate_name_en': safe_str(row.get('REAL_ESTATE_EN', '')),
                        }
                    )
                    if created:
                        batch_created += 1
                except Exception as e:
                    # Per-row errors are logged and counted; the batch continues.
                    print(f" ❌ Error creating broker {safe_str(row.get('BROKER_NUMBER', 'unknown'))}: {str(e)[:100]}")
                    batch_errors += 1
            print(f" ✅ Batch {batch_number}: {batch_created} created, {batch_errors} errors")
            total_created += batch_created
            total_errors += batch_errors
        print(f" 📊 Brokers Summary: {total_created} created, {total_errors} errors")
        return total_created
    except Exception as e:
        # File-level failure (missing file, malformed CSV): report and return 0.
        print(f" ❌ Error loading brokers: {e}")
        return 0
def load_projects_batch(csv_path, batch_size=10, max_batches=1):
    """Load project rows from *csv_path* into the Project model in batches.

    Creates the owning Developer on the fly when a DEVELOPER_NUMBER is present.

    Args:
        csv_path: Path to the projects CSV export.
        batch_size: Rows processed per batch.
        max_batches: Number of batches to process. Defaults to 1, preserving
            the original hard-coded "first batch only, for testing" behavior;
            pass None to load the whole file.

    Returns:
        Count of Project rows created (0 on a file-level failure).
    """
    print("🏢 Loading projects data in batches...")
    try:
        df = pd.read_csv(csv_path)
        print(f" Found {len(df)} project records")
        total_created = 0
        total_errors = 0
        for i in range(0, len(df), batch_size):
            batch_number = i // batch_size + 1
            # Generalization of the old `if i == 0: break` testing cap.
            if max_batches is not None and batch_number > max_batches:
                break
            batch_df = df.iloc[i:i + batch_size]
            batch_created = 0
            batch_errors = 0
            print(f" Processing batch {batch_number} (rows {i+1}-{min(i+batch_size, len(df))})...")
            for _, row in batch_df.iterrows():
                try:
                    project_number = safe_str(row.get('PROJECT_NUMBER', ''))
                    # PROJECT_NUMBER is the natural key; rows without it are errors.
                    if not project_number:
                        batch_errors += 1
                        continue
                    # Resolve (or create) the developer referenced by this row.
                    developer = None
                    dev_number = safe_str(row.get('DEVELOPER_NUMBER', ''))
                    if dev_number:
                        developer, _ = Developer.objects.get_or_create(
                            developer_number=dev_number,
                            defaults={'developer_name_en': safe_str(row.get('DEVELOPER_EN', ''))}
                        )
                    project, created = Project.objects.get_or_create(
                        project_number=project_number,
                        defaults={
                            'project_name_en': safe_str(row.get('PROJECT_EN', '')),
                            'project_status': safe_str(row.get('PROJECT_STATUS', 'active')),
                            'area_en': safe_str(row.get('AREA_EN', '')),
                            'zone_en': safe_str(row.get('ZONE_EN', '')),
                            'developer': developer,
                            'total_units': safe_int(row.get('CNT_UNIT', 0)),
                            'completion_percentage': safe_decimal(row.get('PERCENT_COMPLETED', 0)),
                            # NOTE(review): naive "today" fallback when START_DATE is
                            # missing — confirm this is the intended default.
                            'launch_date': safe_datetime(row.get('START_DATE')) or datetime.now().date(),
                            'completion_date': safe_datetime(row.get('COMPLETION_DATE')),
                        }
                    )
                    if created:
                        batch_created += 1
                except Exception as e:
                    # Per-row errors are logged and counted; the batch continues.
                    print(f" ❌ Error creating project {safe_str(row.get('PROJECT_NUMBER', 'unknown'))}: {str(e)[:100]}")
                    batch_errors += 1
            print(f" ✅ Batch {batch_number}: {batch_created} created, {batch_errors} errors")
            total_created += batch_created
            total_errors += batch_errors
        print(f" 🏢 Projects Summary: {total_created} created, {total_errors} errors")
        return total_created
    except Exception as e:
        # File-level failure (missing file, malformed CSV): report and return 0.
        print(f" ❌ Error loading projects: {e}")
        return 0
def load_lands_batch(csv_path, batch_size=10, max_batches=1):
    """Load land rows from *csv_path* into the Land model in batches.

    Args:
        csv_path: Path to the lands CSV export.
        batch_size: Rows processed per batch.
        max_batches: Number of batches to process. Defaults to 1, preserving
            the original hard-coded "first batch only, for testing" behavior;
            pass None to load the whole file.

    Returns:
        Count of Land rows created (0 on a file-level failure).
    """
    print("🏞️ Loading lands data in batches...")
    try:
        df = pd.read_csv(csv_path)
        print(f" Found {len(df)} land records")
        total_created = 0
        total_errors = 0
        for i in range(0, len(df), batch_size):
            batch_number = i // batch_size + 1
            # Generalization of the old `if i == 0: break` testing cap.
            if max_batches is not None and batch_number > max_batches:
                break
            batch_df = df.iloc[i:i + batch_size]
            batch_created = 0
            batch_errors = 0
            print(f" Processing batch {batch_number} (rows {i+1}-{min(i+batch_size, len(df))})...")
            for _, row in batch_df.iterrows():
                try:
                    # Fall back to a random key when the CSV lacks a registration
                    # number. NOTE(review): this makes reruns non-idempotent for
                    # such rows — every run mints a new land_number and a new row.
                    land_number = safe_str(row.get('PRE_REGISTRATION_NUMBER', ''))
                    if not land_number:
                        land_number = f"LAND_{uuid.uuid4().hex[:8]}"
                    land, created = Land.objects.get_or_create(
                        land_number=land_number,
                        defaults={
                            'land_type': safe_str(row.get('LAND_TYPE_EN', '')),
                            'area_en': safe_str(row.get('AREA_EN', '')),
                            'zone_en': safe_str(row.get('ZONE_EN', '')),
                            'actual_area': safe_decimal(row.get('ACTUAL_AREA', 0)),
                            # CSV stores the flag as text; only 'true' maps to True.
                            'is_freehold': bool(safe_str(row.get('IS_FREE_HOLD_EN', 'False')).lower() == 'true'),
                            'land_use': safe_str(row.get('PROP_SUB_TYPE_EN', '')),
                            # NOTE(review): plot/street fields are filled from
                            # DM_ZIP_CODE / PROJECT_NUMBER — confirm this mapping.
                            'plot_number': safe_str(row.get('DM_ZIP_CODE', '')),
                            'street_number': safe_str(row.get('PROJECT_NUMBER', '')),
                        }
                    )
                    if created:
                        batch_created += 1
                except Exception as e:
                    # Per-row errors are logged and counted; the batch continues.
                    print(f" ❌ Error creating land: {str(e)[:100]}")
                    batch_errors += 1
            print(f" ✅ Batch {batch_number}: {batch_created} created, {batch_errors} errors")
            total_created += batch_created
            total_errors += batch_errors
        print(f" 🏞️ Lands Summary: {total_created} created, {total_errors} errors")
        return total_created
    except Exception as e:
        # File-level failure (missing file, malformed CSV): report and return 0.
        print(f" ❌ Error loading lands: {e}")
        return 0
def load_rents_batch(csv_path, batch_size=10, max_batches=1):
    """Load rent rows from *csv_path* into the Rent model in batches.

    Args:
        csv_path: Path to the rents CSV export.
        batch_size: Rows processed per batch.
        max_batches: Number of batches to process. Defaults to 1, preserving
            the original hard-coded "first batch only, for testing" behavior;
            pass None to load the whole file.

    Returns:
        Count of Rent rows created (0 on a file-level failure).
    """
    print("🏠 Loading rents data in batches...")
    try:
        df = pd.read_csv(csv_path)
        print(f" Found {len(df)} rent records")
        total_created = 0
        total_errors = 0
        for i in range(0, len(df), batch_size):
            batch_number = i // batch_size + 1
            # Generalization of the old `if i == 0: break` testing cap.
            if max_batches is not None and batch_number > max_batches:
                break
            batch_df = df.iloc[i:i + batch_size]
            batch_created = 0
            batch_errors = 0
            print(f" Processing batch {batch_number} (rows {i+1}-{min(i+batch_size, len(df))})...")
            for _, row in batch_df.iterrows():
                try:
                    # NOTE(review): the key is a fresh UUID per row, so
                    # get_or_create always creates — rerunning this loader
                    # duplicates the data. Confirm whether the CSV offers a
                    # natural key to dedupe on.
                    rent_number = f"RENT_{uuid.uuid4().hex[:8]}"
                    rent, created = Rent.objects.get_or_create(
                        rent_number=rent_number,
                        defaults={
                            'property_type': safe_str(row.get('PROP_TYPE_EN', '')),
                            'area_en': safe_str(row.get('AREA_EN', '')),
                            'zone_en': safe_str(row.get('ZONE_EN', '')),
                            # NOTE(review): naive datetime.now() fallback — confirm
                            # timezone policy.
                            'rent_date': safe_datetime(row.get('INSTANCE_DATE')) or datetime.now(),
                            'annual_rent': safe_decimal(row.get('ANNUAL_RENT', 0)),
                            'monthly_rent': safe_decimal(row.get('MONTHLY_RENT', 0)),
                            'property_area': safe_decimal(row.get('PROCEDURE_AREA', 0)),
                            'rent_per_sqft': safe_decimal(row.get('RENT_PER_SQFT', 0)),
                        }
                    )
                    if created:
                        batch_created += 1
                except Exception as e:
                    # Per-row errors are logged and counted; the batch continues.
                    print(f" ❌ Error creating rent: {str(e)[:100]}")
                    batch_errors += 1
            print(f" ✅ Batch {batch_number}: {batch_created} created, {batch_errors} errors")
            total_created += batch_created
            total_errors += batch_errors
        print(f" 🏠 Rents Summary: {total_created} created, {total_errors} errors")
        return total_created
    except Exception as e:
        # File-level failure (missing file, malformed CSV): report and return 0.
        print(f" ❌ Error loading rents: {e}")
        return 0
def load_transactions_batch(csv_path, batch_size=10, max_batches=1):
    """Load transaction rows from *csv_path* into the Transaction model in batches.

    Links each transaction to a Project (created on the fly) when the row
    carries both a project name and a project number.

    Args:
        csv_path: Path to the transactions CSV export.
        batch_size: Rows processed per batch.
        max_batches: Number of batches to process. Defaults to 1, preserving
            the original hard-coded "first batch only, for testing" behavior;
            pass None to load the whole file.

    Returns:
        Count of Transaction rows created (0 on a file-level failure).
    """
    print("💼 Loading transactions data in batches...")
    try:
        df = pd.read_csv(csv_path)
        print(f" Found {len(df)} transaction records")
        total_created = 0
        total_errors = 0
        for i in range(0, len(df), batch_size):
            batch_number = i // batch_size + 1
            # Generalization of the old `if i == 0: break` testing cap.
            if max_batches is not None and batch_number > max_batches:
                break
            batch_df = df.iloc[i:i + batch_size]
            batch_created = 0
            batch_errors = 0
            print(f" Processing batch {batch_number} (rows {i+1}-{min(i+batch_size, len(df))})...")
            for _, row in batch_df.iterrows():
                try:
                    transaction_number = safe_str(row.get('TRANSACTION_NUMBER', ''))
                    # TRANSACTION_NUMBER is the natural key; rows without it are errors.
                    if not transaction_number:
                        batch_errors += 1
                        continue
                    # Bug fix: the old code keyed get_or_create on PROJECT_NUMBER
                    # while only checking PROJECT_EN, so every named row lacking a
                    # number collapsed into one bogus project with an empty
                    # project_number. Require both fields before linking.
                    project = None
                    project_name = safe_str(row.get('PROJECT_EN', ''))
                    project_number = safe_str(row.get('PROJECT_NUMBER', ''))
                    if project_name and project_number:
                        project, _ = Project.objects.get_or_create(
                            project_number=project_number,
                            defaults={'project_name_en': project_name}
                        )
                    transaction, created = Transaction.objects.get_or_create(
                        transaction_number=transaction_number,
                        defaults={
                            # NOTE(review): naive datetime.now() fallback — confirm
                            # timezone policy.
                            'instance_date': safe_datetime(row.get('INSTANCE_DATE')) or datetime.now(),
                            'area_en': safe_str(row.get('AREA_EN', '')),
                            'zone_en': safe_str(row.get('ZONE_EN', '')),
                            'property_type': safe_str(row.get('PROP_TYPE_EN', '')),
                            'transaction_value': safe_decimal(row.get('TRANS_VALUE', 0)),
                            'property_area': safe_decimal(row.get('PROCEDURE_AREA', 0)),
                            'price_per_sqft': safe_decimal(row.get('PRICE_PER_SQFT', 0)),
                            'group': safe_str(row.get('GROUP_EN', '')),
                            'usage': safe_str(row.get('USAGE_EN', '')),
                            'master_project': safe_str(row.get('MASTER_PROJECT_EN', '')),
                            'project': project,
                        }
                    )
                    if created:
                        batch_created += 1
                except Exception as e:
                    # Per-row errors are logged and counted; the batch continues.
                    print(f" ❌ Error creating transaction {safe_str(row.get('TRANSACTION_NUMBER', 'unknown'))}: {str(e)[:100]}")
                    batch_errors += 1
            print(f" ✅ Batch {batch_number}: {batch_created} created, {batch_errors} errors")
            total_created += batch_created
            total_errors += batch_errors
        print(f" 💼 Transactions Summary: {total_created} created, {total_errors} errors")
        return total_created
    except Exception as e:
        # File-level failure (missing file, malformed CSV): report and return 0.
        print(f" ❌ Error loading transactions: {e}")
        return 0
def load_valuations_batch(csv_path, batch_size=10, max_batches=1):
    """Load valuation rows from *csv_path* into the Valuation model in batches.

    Args:
        csv_path: Path to the valuations CSV export.
        batch_size: Rows processed per batch.
        max_batches: Number of batches to process. Defaults to 1, preserving
            the original hard-coded "first batch only, for testing" behavior;
            pass None to load the whole file.

    Returns:
        Count of Valuation rows created (0 on a file-level failure).
    """
    print("💰 Loading valuations data in batches...")
    try:
        df = pd.read_csv(csv_path)
        print(f" Found {len(df)} valuation records")
        total_created = 0
        total_errors = 0
        for i in range(0, len(df), batch_size):
            batch_number = i // batch_size + 1
            # Generalization of the old `if i == 0: break` testing cap.
            if max_batches is not None and batch_number > max_batches:
                break
            batch_df = df.iloc[i:i + batch_size]
            batch_created = 0
            batch_errors = 0
            print(f" Processing batch {batch_number} (rows {i+1}-{min(i+batch_size, len(df))})...")
            for _, row in batch_df.iterrows():
                try:
                    # NOTE(review): key is a fresh UUID per row, so get_or_create
                    # always creates — rerunning this loader duplicates the data.
                    valuation_number = f"VAL_{uuid.uuid4().hex[:8]}"
                    valuation, created = Valuation.objects.get_or_create(
                        valuation_number=valuation_number,
                        defaults={
                            'property_type': safe_str(row.get('PROP_TYPE_EN', '')),
                            'area_en': safe_str(row.get('AREA_EN', '')),
                            'zone_en': safe_str(row.get('ZONE_EN', '')),
                            # NOTE(review): naive datetime.now() fallback — confirm
                            # timezone policy.
                            'valuation_date': safe_datetime(row.get('VALUATION_DATE')) or datetime.now(),
                            'property_value': safe_decimal(row.get('PROPERTY_VALUE', 0)),
                            'land_value': safe_decimal(row.get('LAND_VALUE', 0)),
                            'building_value': safe_decimal(row.get('BUILDING_VALUE', 0)),
                            'total_area': safe_decimal(row.get('TOTAL_AREA', 0)),
                            'land_area': safe_decimal(row.get('LAND_AREA', 0)),
                            'building_area': safe_decimal(row.get('BUILDING_AREA', 0)),
                        }
                    )
                    if created:
                        batch_created += 1
                except Exception as e:
                    # Per-row errors are logged and counted; the batch continues.
                    print(f" ❌ Error creating valuation: {str(e)[:100]}")
                    batch_errors += 1
            print(f" ✅ Batch {batch_number}: {batch_created} created, {batch_errors} errors")
            total_created += batch_created
            total_errors += batch_errors
        print(f" 💰 Valuations Summary: {total_created} created, {total_errors} errors")
        return total_created
    except Exception as e:
        # File-level failure (missing file, malformed CSV): report and return 0.
        print(f" ❌ Error loading valuations: {e}")
        return 0
def create_sample_forecasts():
    """Create placeholder Forecast rows for a sample of area/property-type pairs.

    Samples up to 5 distinct areas and 3 distinct property types from existing
    transactions, then upserts one fixed-value forecast per (area, type) pair.

    Returns:
        Number of Forecast rows created (0 on failure).
    """
    print("🔮 Creating sample forecasts...")
    try:
        # Draw the sample dimensions from data already loaded.
        sample_areas = Transaction.objects.values_list('area_en', flat=True).distinct()[:5]
        sample_types = Transaction.objects.values_list('property_type', flat=True).distinct()[:3]
        created_total = 0
        for area_name in sample_areas:
            if not area_name:
                continue  # skip blank area values
            for type_name in sample_types:
                if not type_name:
                    continue  # skip blank property types
                _, was_created = Forecast.objects.get_or_create(
                    area_en=area_name,
                    property_type=type_name,
                    defaults={
                        'forecast_date': datetime.now().date(),
                        'predicted_price': Decimal('1000000.00'),
                        'confidence_interval_lower': Decimal('800000.00'),
                        'confidence_interval_upper': Decimal('1200000.00'),
                        'model_version': '1.0',
                        'accuracy_score': Decimal('0.85'),
                        'metadata': {'source': 'sample_data', 'model': 'linear_regression'}
                    }
                )
                if was_created:
                    created_total += 1
        print(f" ✅ Created {created_total} sample forecasts")
        return created_total
    except Exception as e:
        print(f" ❌ Error creating forecasts: {e}")
        return 0
def setup_rate_limits():
    """Create the default APIRateLimit row for each subscription tier.

    Tiers are free/paid/premium with fixed per-minute/hour/day quotas; existing
    rows are left untouched.

    Returns:
        Number of rate-limit rows created (0 on failure).
    """
    print("⚙️ Setting up rate limits...")
    try:
        # (tier, per-minute, per-hour, per-day) quota table.
        tier_quotas = (
            ('free', 10, 100, 1000),
            ('paid', 60, 1000, 10000),
            ('premium', 120, 2000, 20000),
        )
        new_rows = 0
        for tier, per_minute, per_hour, per_day in tier_quotas:
            _, was_created = APIRateLimit.objects.get_or_create(
                subscription_type=tier,
                defaults={
                    'subscription_type': tier,
                    'requests_per_minute': per_minute,
                    'requests_per_hour': per_hour,
                    'requests_per_day': per_day,
                },
            )
            if was_created:
                new_rows += 1
        print(f" ✅ Created {new_rows} rate limit configurations")
        return new_rows
    except Exception as e:
        print(f" ❌ Error setting up rate limits: {e}")
        return 0
def verify_data_loaded():
    """Print and return per-model row counts as a post-load sanity check.

    Returns:
        Dict mapping display label to row count, or {} on failure.
    """
    print("\n🔍 Verifying loaded data...")
    try:
        # Display label paired with the model whose rows we count.
        labelled_models = (
            ('Brokers', Broker),
            ('Developers', Developer),
            ('Projects', Project),
            ('Lands', Land),
            ('Rents', Rent),
            ('Transactions', Transaction),
            ('Valuations', Valuation),
            ('Forecasts', Forecast),
        )
        counts = {label: model.objects.count() for label, model in labelled_models}
        print(" 📊 Current database counts:")
        for model_name, count in counts.items():
            print(f" {model_name}: {count}")
        return counts
    except Exception as e:
        print(f" ❌ Error verifying data: {e}")
        return {}
def main():
    """Drive the batch load: CSV files, sample forecasts, rate limits, verification."""
    banner = "=" * 70
    print(banner)
    print(" Dubai Analytics Platform - Batch CSV Data Loader")
    print(" Loading 10 entries per table for testing")
    print(banner)
    print()
    # All CSVs are expected under this directory, relative to the CWD.
    data_dir = "sample data"
    if not os.path.exists(data_dir):
        print(f"❌ Sample data directory '{data_dir}' not found!")
        print(" Please ensure the CSV files are in the 'sample data' directory.")
        return
    grand_total = 0
    # Each CSV file is paired with the loader that knows its schema.
    loaders = (
        ('brokers.csv', load_brokers_batch),
        ('projects.csv', load_projects_batch),
        ('lands.csv', load_lands_batch),
        ('rents.csv', load_rents_batch),
        ('transactions.csv', load_transactions_batch),
        ('valuations.csv', load_valuations_batch),
    )
    for filename, loader in loaders:
        full_path = os.path.join(data_dir, filename)
        if not os.path.exists(full_path):
            print(f"⚠️ File {filename} not found, skipping...")
            continue
        print(f"\n📁 Processing {filename}...")
        grand_total += loader(full_path, batch_size=10)
    print("\n🔮 Creating sample forecasts...")
    grand_total += create_sample_forecasts()
    print("\n⚙️ Setting up rate limits...")
    rate_limits_created = setup_rate_limits()
    verify_data_loaded()
    print("\n" + banner)
    print(" Data Loading Summary")
    print(banner)
    print(f"📊 Total records created: {grand_total}")
    print(f"⚙️ Rate limits configured: {rate_limits_created}")
    print()
    print("✅ Batch data loading completed successfully!")
    print()
    print("Next steps:")
    print("1. Access Django Admin: http://localhost:8000/admin/")
    print("2. Login with: admin@dubai-analytics.com / admin123")
    print("3. View the loaded data in the admin interface")
    print("4. Test the API endpoints with the sample data")


if __name__ == '__main__':
    main()