saas-market-analysis-dubai/load_all_data_chunks.py
2025-09-17 03:04:22 +05:30

654 lines
28 KiB
Python
Executable File

#!/usr/bin/env python
"""
Load All Data in Chunks - Comprehensive Script
Loads all CSV data in manageable chunks with progress tracking.
"""
import os
import sys
import django
import pandas as pd
from datetime import datetime
from decimal import Decimal
import uuid
import traceback
# Add the project directory to Python path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# Set Django settings
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'dubai_analytics.settings')
# Setup Django
django.setup()
from apps.analytics.models import (
Broker, Developer, Project, Valuation, Land, Rent, Transaction, Forecast
)
from apps.users.models import User
from apps.core.models import APIRateLimit, SystemConfiguration
def safe_get(row, key, default=None):
    """Safely read *key* from a pandas row (or any mapping with .get).

    Returns *default* when the value is NaN/NaT/None, an empty string,
    the literal string "nan" (any case), or when the lookup itself fails.
    """
    try:
        value = row.get(key, default)
        # pd.isna covers NaN/NaT/None; the string checks catch CSV artifacts.
        if pd.isna(value) or value == '' or str(value).lower() == 'nan':
            return default
        return value
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit propagate.
        return default
def safe_decimal(value, default=0):
    """Safely convert *value* to Decimal, falling back to Decimal(*default*).

    NaN/None, empty strings, and the literal string "nan" map to the
    default, as does any value Decimal cannot parse.
    """
    try:
        if pd.isna(value) or value == '' or str(value).lower() == 'nan':
            return Decimal(str(default))
        # Route through str() so floats keep their repr instead of binary noise.
        return Decimal(str(value))
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit propagate.
        return Decimal(str(default))
def safe_datetime(value):
    """Safely convert *value* to a pandas Timestamp; return None when missing/unparseable."""
    try:
        if pd.isna(value) or value == '' or str(value).lower() == 'nan':
            return None
        return pd.to_datetime(value)
    except Exception:
        # Narrowed from a bare `except:`; pd.to_datetime raises on garbage input.
        return None
def safe_int(value, default=0):
    """Safely convert *value* to int, falling back to *default*.

    Converts via float first so strings like "3.7" become 3 (truncation
    toward zero) instead of raising ValueError.
    """
    try:
        if pd.isna(value) or value == '' or str(value).lower() == 'nan':
            return default
        return int(float(value))
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit propagate.
        return default
def safe_str(value, default=''):
    """Safely convert *value* to a stripped string, falling back to *default*.

    NaN/None, empty strings, and the literal string "nan" map to the default.
    """
    try:
        if pd.isna(value) or value == '' or str(value).lower() == 'nan':
            return default
        return str(value).strip()
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit propagate.
        return default
def load_brokers_chunks(csv_path, chunk_size=1000, max_chunks=5):
    """Load broker records from *csv_path* into the Broker model, in chunks.

    Reads the entire CSV into memory, then processes at most *max_chunks*
    slices of *chunk_size* rows.  Rows without a BROKER_NUMBER are counted
    as errors; existing brokers (same broker_number) are left untouched.

    Returns:
        int: number of Broker rows created (0 if the CSV cannot be read).
    """
    print("📊 Loading brokers data in chunks...")
    try:
        df = pd.read_csv(csv_path)
        print(f" Found {len(df)} broker records")
        total_created = 0
        total_errors = 0
        # Ceiling division gives the exact chunk count; the previous
        # `len(df) // chunk_size + 1` scheduled a spurious empty trailing
        # chunk whenever len(df) was an exact multiple of chunk_size.
        num_chunks = min(max_chunks, -(-len(df) // chunk_size))
        for chunk_num in range(num_chunks):
            start_idx = chunk_num * chunk_size
            end_idx = min((chunk_num + 1) * chunk_size, len(df))
            chunk_df = df.iloc[start_idx:end_idx]
            print(f" Processing chunk {chunk_num + 1} (rows {start_idx + 1}-{end_idx})...")
            chunk_created = 0
            chunk_errors = 0
            for _, row in chunk_df.iterrows():
                try:
                    broker_number = safe_str(row.get('BROKER_NUMBER', ''))
                    if not broker_number:
                        chunk_errors += 1
                        continue
                    broker, created = Broker.objects.get_or_create(
                        broker_number=broker_number,
                        defaults={
                            'broker_name_en': safe_str(row.get('BROKER_EN', '')),
                            'gender': safe_str(row.get('GENDER_EN', 'male')),
                            # NOTE(review): datetime.now() fallback assumes naive
                            # local time is acceptable here — confirm against the
                            # project's USE_TZ setting.
                            'license_start_date': safe_datetime(row.get('LICENSE_START_DATE')) or datetime.now(),
                            'license_end_date': safe_datetime(row.get('LICENSE_END_DATE')) or datetime.now(),
                            'webpage': safe_str(row.get('WEBPAGE', '')),
                            'phone': safe_str(row.get('PHONE', '')),
                            'fax': safe_str(row.get('FAX', '')),
                            'real_estate_number': safe_str(row.get('REAL_ESTATE_NUMBER', '')),
                            'real_estate_name_en': safe_str(row.get('REAL_ESTATE_EN', '')),
                        }
                    )
                    if created:
                        chunk_created += 1
                except Exception:
                    # Row-level failures are tallied, not fatal; the summary
                    # lines below surface the error counts.
                    chunk_errors += 1
            print(f" ✅ Chunk {chunk_num + 1}: {chunk_created} created, {chunk_errors} errors")
            total_created += chunk_created
            total_errors += chunk_errors
        print(f" 📊 Brokers Summary: {total_created} created, {total_errors} errors")
        return total_created
    except Exception as e:
        print(f" ❌ Error loading brokers: {e}")
        return 0
def load_developers_chunks(csv_path, chunk_size=1000, max_chunks=5):
    """Load developer records from *csv_path* into the Developer model, in chunks.

    Rows without a DEVELOPER_NUMBER are counted as errors; existing
    developers (same developer_number) are left untouched.

    Returns:
        int: number of Developer rows created (0 if the CSV cannot be read).
    """
    print("🏗️ Loading developers data in chunks...")
    try:
        df = pd.read_csv(csv_path)
        print(f" Found {len(df)} developer records")
        total_created = 0
        total_errors = 0
        # Ceiling division: the previous `len(df) // chunk_size + 1` scheduled
        # a spurious empty chunk when len(df) was a multiple of chunk_size.
        num_chunks = min(max_chunks, -(-len(df) // chunk_size))
        for chunk_num in range(num_chunks):
            start_idx = chunk_num * chunk_size
            end_idx = min((chunk_num + 1) * chunk_size, len(df))
            chunk_df = df.iloc[start_idx:end_idx]
            print(f" Processing chunk {chunk_num + 1} (rows {start_idx + 1}-{end_idx})...")
            chunk_created = 0
            chunk_errors = 0
            for _, row in chunk_df.iterrows():
                try:
                    developer_number = safe_str(row.get('DEVELOPER_NUMBER', ''))
                    if not developer_number:
                        chunk_errors += 1
                        continue
                    developer, created = Developer.objects.get_or_create(
                        developer_number=developer_number,
                        defaults={
                            'developer_name_en': safe_str(row.get('DEVELOPER_EN', '')),
                        }
                    )
                    if created:
                        chunk_created += 1
                except Exception:
                    # Row-level failures are tallied rather than aborting the load.
                    chunk_errors += 1
            print(f" ✅ Chunk {chunk_num + 1}: {chunk_created} created, {chunk_errors} errors")
            total_created += chunk_created
            total_errors += chunk_errors
        print(f" 🏗️ Developers Summary: {total_created} created, {total_errors} errors")
        return total_created
    except Exception as e:
        print(f" ❌ Error loading developers: {e}")
        return 0
def load_projects_chunks(csv_path, chunk_size=1000, max_chunks=5):
    """Load project records from *csv_path* into the Project model, in chunks.

    For each row the referenced developer is get_or_create'd first (when a
    DEVELOPER_NUMBER is present), then the project itself.  Rows without a
    PROJECT_NUMBER are counted as errors.

    Returns:
        int: number of Project rows created (0 if the CSV cannot be read).
    """
    print("🏢 Loading projects data in chunks...")
    try:
        df = pd.read_csv(csv_path)
        print(f" Found {len(df)} project records")
        total_created = 0
        total_errors = 0
        # Ceiling division: the previous `len(df) // chunk_size + 1` scheduled
        # a spurious empty chunk when len(df) was a multiple of chunk_size.
        num_chunks = min(max_chunks, -(-len(df) // chunk_size))
        for chunk_num in range(num_chunks):
            start_idx = chunk_num * chunk_size
            end_idx = min((chunk_num + 1) * chunk_size, len(df))
            chunk_df = df.iloc[start_idx:end_idx]
            print(f" Processing chunk {chunk_num + 1} (rows {start_idx + 1}-{end_idx})...")
            chunk_created = 0
            chunk_errors = 0
            for _, row in chunk_df.iterrows():
                try:
                    project_number = safe_str(row.get('PROJECT_NUMBER', ''))
                    if not project_number:
                        chunk_errors += 1
                        continue
                    # Resolve the developer FK first; stays None when the CSV
                    # has no developer number (assumes the FK is nullable —
                    # NOTE(review): confirm against the Project model).
                    developer = None
                    dev_number = safe_str(row.get('DEVELOPER_NUMBER', ''))
                    if dev_number:
                        developer, _ = Developer.objects.get_or_create(
                            developer_number=dev_number,
                            defaults={'developer_name_en': safe_str(row.get('DEVELOPER_EN', ''))}
                        )
                    project, created = Project.objects.get_or_create(
                        project_number=project_number,
                        defaults={
                            'project_name_en': safe_str(row.get('PROJECT_EN', '')),
                            'developer': developer,
                            'start_date': safe_datetime(row.get('START_DATE')) or datetime.now(),
                            'end_date': safe_datetime(row.get('END_DATE')),
                            'adoption_date': safe_datetime(row.get('ADOPTION_DATE')),
                            'project_type': safe_str(row.get('PRJ_TYPE_EN', 'Normal')),
                            'project_value': safe_decimal(row.get('PROJECT_VALUE', 0)),
                            'escrow_account_number': safe_str(row.get('ESCROW_ACCOUNT_NUMBER', '')),
                            'project_status': safe_str(row.get('PROJECT_STATUS', 'ACTIVE')),
                            'percent_completed': safe_decimal(row.get('PERCENT_COMPLETED', 0)),
                            'inspection_date': safe_datetime(row.get('INSPECTION_DATE')),
                            'completion_date': safe_datetime(row.get('COMPLETION_DATE')),
                            'description_en': safe_str(row.get('DESCRIPTION_EN', '')),
                            'area_en': safe_str(row.get('AREA_EN', '')),
                            'zone_en': safe_str(row.get('ZONE_EN', '')),
                            'count_land': safe_int(row.get('CNT_LAND', 0)),
                            'count_building': safe_int(row.get('CNT_BUILDING', 0)),
                            'count_villa': safe_int(row.get('CNT_VILLA', 0)),
                            'count_unit': safe_int(row.get('CNT_UNIT', 0)),
                            'master_project_en': safe_str(row.get('MASTER_PROJECT_EN', '')),
                        }
                    )
                    if created:
                        chunk_created += 1
                except Exception:
                    # Row-level failures are tallied rather than aborting the load.
                    chunk_errors += 1
            print(f" ✅ Chunk {chunk_num + 1}: {chunk_created} created, {chunk_errors} errors")
            total_created += chunk_created
            total_errors += chunk_errors
        print(f" 🏢 Projects Summary: {total_created} created, {total_errors} errors")
        return total_created
    except Exception as e:
        print(f" ❌ Error loading projects: {e}")
        return 0
def load_lands_chunks(csv_path, chunk_size=1000, max_chunks=5):
    """Load land records from *csv_path* into the Land model, in chunks.

    Each land gets a freshly generated pre_registration_number, so the
    get_or_create below effectively always creates a new row — reruns will
    duplicate data rather than dedupe it.

    Returns:
        int: number of Land rows created (0 if the CSV cannot be read).
    """
    print("🏞️ Loading lands data in chunks...")
    try:
        df = pd.read_csv(csv_path)
        print(f" Found {len(df)} land records")
        total_created = 0
        total_errors = 0
        # Ceiling division: the previous `len(df) // chunk_size + 1` scheduled
        # a spurious empty chunk when len(df) was a multiple of chunk_size.
        num_chunks = min(max_chunks, -(-len(df) // chunk_size))
        for chunk_num in range(num_chunks):
            start_idx = chunk_num * chunk_size
            end_idx = min((chunk_num + 1) * chunk_size, len(df))
            chunk_df = df.iloc[start_idx:end_idx]
            print(f" Processing chunk {chunk_num + 1} (rows {start_idx + 1}-{end_idx})...")
            chunk_created = 0
            chunk_errors = 0
            for _, row in chunk_df.iterrows():
                try:
                    # Synthetic unique identifier (CSV has no stable land key).
                    land_id = f"LAND_{uuid.uuid4().hex[:8]}"
                    # Only link a project when a real project number exists.
                    # The previous code keyed get_or_create on an empty
                    # project_number whenever only the name was present,
                    # collapsing all such lands onto one bogus project.
                    project = None
                    project_name = safe_str(row.get('PROJECT_EN', ''))
                    project_number = safe_str(row.get('PROJECT_NUMBER', ''))
                    if project_name and project_number:
                        project, _ = Project.objects.get_or_create(
                            project_number=project_number,
                            defaults={'project_name_en': project_name}
                        )
                    land, created = Land.objects.get_or_create(
                        pre_registration_number=land_id,
                        defaults={
                            'land_type': safe_str(row.get('LAND_TYPE_EN', 'Residential')),
                            'property_sub_type': safe_str(row.get('PROP_SUB_TYPE_EN', '')),
                            'actual_area': safe_decimal(row.get('ACTUAL_AREA', 0)),
                            'is_offplan': safe_str(row.get('IS_OFFPLAN_EN', 'Ready')),
                            'is_freehold': safe_str(row.get('IS_FREE_HOLD_EN', 'Free Hold')),
                            'dm_zip_code': safe_str(row.get('DM_ZIP_CODE', '')),
                            'master_project': safe_str(row.get('MASTER_PROJECT_EN', '')),
                            'project': project,
                            'area_en': safe_str(row.get('AREA_EN', '')),
                            'zone_en': safe_str(row.get('ZONE_EN', '')),
                        }
                    )
                    if created:
                        chunk_created += 1
                except Exception:
                    # Row-level failures are tallied rather than aborting the load.
                    chunk_errors += 1
            print(f" ✅ Chunk {chunk_num + 1}: {chunk_created} created, {chunk_errors} errors")
            total_created += chunk_created
            total_errors += chunk_errors
        print(f" 🏞️ Lands Summary: {total_created} created, {total_errors} errors")
        return total_created
    except Exception as e:
        print(f" ❌ Error loading lands: {e}")
        return 0
def load_rents_chunks(csv_path, chunk_size=1000, max_chunks=5):
    """Load rent records from *csv_path* into the Rent model, in chunks.

    NOTE(review): get_or_create is keyed only on the three contract dates,
    so distinct contracts that share registration/start/end dates will be
    deduplicated into one row — confirm this is intended.

    Returns:
        int: number of Rent rows created (0 if the CSV cannot be read).
    """
    print("🏠 Loading rents data in chunks...")
    try:
        df = pd.read_csv(csv_path)
        print(f" Found {len(df)} rent records")
        total_created = 0
        total_errors = 0
        # Ceiling division: the previous `len(df) // chunk_size + 1` scheduled
        # a spurious empty chunk when len(df) was a multiple of chunk_size.
        num_chunks = min(max_chunks, -(-len(df) // chunk_size))
        for chunk_num in range(num_chunks):
            start_idx = chunk_num * chunk_size
            end_idx = min((chunk_num + 1) * chunk_size, len(df))
            chunk_df = df.iloc[start_idx:end_idx]
            print(f" Processing chunk {chunk_num + 1} (rows {start_idx + 1}-{end_idx})...")
            chunk_created = 0
            chunk_errors = 0
            for _, row in chunk_df.iterrows():
                try:
                    # Dead `rent_id = f"RENT_{uuid...}"` assignment removed:
                    # it was never used as a lookup key or field value.
                    rent, created = Rent.objects.get_or_create(
                        registration_date=safe_datetime(row.get('REGISTRATION_DATE')) or datetime.now(),
                        start_date=safe_datetime(row.get('START_DATE')) or datetime.now(),
                        end_date=safe_datetime(row.get('END_DATE')) or datetime.now(),
                        defaults={
                            'version': safe_str(row.get('VERSION', 'New')),
                            'area_en': safe_str(row.get('AREA_EN', '')),
                            'contract_amount': safe_decimal(row.get('CONTRACT_AMOUNT', 0)),
                            'annual_amount': safe_decimal(row.get('ANNUAL_AMOUNT', 0)),
                            'is_freehold': safe_str(row.get('IS_FREE_HOLD', 'Free Hold')),
                            'actual_area': safe_decimal(row.get('ACTUAL_AREA', 0)),
                            'property_type': safe_str(row.get('PROPERTY_TYPE', 'Unit')),
                            'property_sub_type': safe_str(row.get('PROPERTY_SUB_TYPE', '')),
                            'rooms': safe_str(row.get('ROOMS', '')),
                            'usage': safe_str(row.get('USAGE', 'Residential')),
                        }
                    )
                    if created:
                        chunk_created += 1
                except Exception:
                    # Row-level failures are tallied rather than aborting the load.
                    chunk_errors += 1
            print(f" ✅ Chunk {chunk_num + 1}: {chunk_created} created, {chunk_errors} errors")
            total_created += chunk_created
            total_errors += chunk_errors
        print(f" 🏠 Rents Summary: {total_created} created, {total_errors} errors")
        return total_created
    except Exception as e:
        print(f" ❌ Error loading rents: {e}")
        return 0
def load_transactions_chunks(csv_path, chunk_size=1000, max_chunks=5):
    """Load transaction records from *csv_path* into the Transaction model, in chunks.

    Rows without a TRANSACTION_NUMBER are counted as errors.  When a
    project name is present, a Project (with a placeholder 'DEFAULT'
    developer) is get_or_create'd and linked.

    Returns:
        int: number of Transaction rows created (0 if the CSV cannot be read).
    """
    print("💼 Loading transactions data in chunks...")
    try:
        df = pd.read_csv(csv_path)
        print(f" Found {len(df)} transaction records")
        total_created = 0
        total_errors = 0
        # Ceiling division: the previous `len(df) // chunk_size + 1` scheduled
        # a spurious empty chunk when len(df) was a multiple of chunk_size.
        num_chunks = min(max_chunks, -(-len(df) // chunk_size))
        for chunk_num in range(num_chunks):
            start_idx = chunk_num * chunk_size
            end_idx = min((chunk_num + 1) * chunk_size, len(df))
            chunk_df = df.iloc[start_idx:end_idx]
            print(f" Processing chunk {chunk_num + 1} (rows {start_idx + 1}-{end_idx})...")
            chunk_created = 0
            chunk_errors = 0
            for _, row in chunk_df.iterrows():
                try:
                    transaction_number = safe_str(row.get('TRANSACTION_NUMBER', ''))
                    if not transaction_number:
                        chunk_errors += 1
                        continue
                    # Link (or lazily create) the project, backed by a
                    # placeholder developer so the FK constraint is satisfied.
                    project = None
                    project_name = safe_str(row.get('PROJECT_EN', ''))
                    if project_name:
                        default_developer, _ = Developer.objects.get_or_create(
                            developer_number='DEFAULT',
                            defaults={'developer_name_en': 'Default Developer'}
                        )
                        project, _ = Project.objects.get_or_create(
                            project_number=safe_str(row.get('PROJECT_NUMBER', '')),
                            defaults={
                                'project_name_en': project_name,
                                'developer': default_developer,
                                'start_date': datetime.now(),
                                'project_status': 'ACTIVE',
                                'area_en': safe_str(row.get('AREA_EN', '')),
                                'zone_en': safe_str(row.get('ZONE_EN', '')),
                            }
                        )
                    transaction, created = Transaction.objects.get_or_create(
                        transaction_number=transaction_number,
                        defaults={
                            'instance_date': safe_datetime(row.get('INSTANCE_DATE')) or datetime.now(),
                            'group': safe_str(row.get('GROUP_EN', 'Sale')),
                            'procedure': safe_str(row.get('PROCEDURE_EN', '')),
                            'is_offplan': safe_str(row.get('IS_OFFPLAN_EN', 'Ready')),
                            'is_freehold': safe_str(row.get('IS_FREE_HOLD_EN', 'Free Hold')),
                            'usage': safe_str(row.get('USAGE_EN', 'Residential')),
                            'area_en': safe_str(row.get('AREA_EN', '')),
                            'property_type': safe_str(row.get('PROP_TYPE_EN', 'Unit')),
                            'property_sub_type': safe_str(row.get('PROP_SB_TYPE_EN', '')),
                            'transaction_value': safe_decimal(row.get('TRANS_VALUE', 0)),
                            'procedure_area': safe_decimal(row.get('PROCEDURE_AREA', 0)),
                            'actual_area': safe_decimal(row.get('ACTUAL_AREA', 0)),
                            'rooms': safe_str(row.get('ROOMS_EN', '')),
                            # safe_int replaces int(safe_str(...) or 0), which
                            # raised ValueError on float-formatted strings like
                            # "2.0" and silently dropped the whole row.
                            'parking': safe_int(row.get('PARKING', 0)),
                            'nearest_metro': safe_str(row.get('NEAREST_METRO_EN', '')),
                            'nearest_mall': safe_str(row.get('NEAREST_MALL_EN', '')),
                            'nearest_landmark': safe_str(row.get('NEAREST_LANDMARK_EN', '')),
                            'total_buyer': safe_int(row.get('TOTAL_BUYER', 0)),
                            'total_seller': safe_int(row.get('TOTAL_SELLER', 0)),
                            'master_project': safe_str(row.get('MASTER_PROJECT_EN', '')),
                            'project': project,
                        }
                    )
                    if created:
                        chunk_created += 1
                except Exception:
                    # Row-level failures are tallied rather than aborting the load.
                    chunk_errors += 1
            print(f" ✅ Chunk {chunk_num + 1}: {chunk_created} created, {chunk_errors} errors")
            total_created += chunk_created
            total_errors += chunk_errors
        print(f" 💼 Transactions Summary: {total_created} created, {total_errors} errors")
        return total_created
    except Exception as e:
        print(f" ❌ Error loading transactions: {e}")
        return 0
def load_valuations_chunks(csv_path, chunk_size=1000, max_chunks=5):
    """Load valuation records from *csv_path* into the Valuation model, in chunks.

    Each valuation gets a freshly generated procedure_number, so the
    get_or_create below effectively always creates a new row — reruns will
    duplicate data rather than dedupe it.

    Returns:
        int: number of Valuation rows created (0 if the CSV cannot be read).
    """
    print("💰 Loading valuations data in chunks...")
    try:
        df = pd.read_csv(csv_path)
        print(f" Found {len(df)} valuation records")
        total_created = 0
        total_errors = 0
        # Ceiling division: the previous `len(df) // chunk_size + 1` scheduled
        # a spurious empty chunk when len(df) was a multiple of chunk_size.
        num_chunks = min(max_chunks, -(-len(df) // chunk_size))
        for chunk_num in range(num_chunks):
            start_idx = chunk_num * chunk_size
            end_idx = min((chunk_num + 1) * chunk_size, len(df))
            chunk_df = df.iloc[start_idx:end_idx]
            print(f" Processing chunk {chunk_num + 1} (rows {start_idx + 1}-{end_idx})...")
            chunk_created = 0
            chunk_errors = 0
            for _, row in chunk_df.iterrows():
                try:
                    # Synthetic unique identifier (CSV has no stable valuation key).
                    valuation_id = f"VAL_{uuid.uuid4().hex[:8]}"
                    valuation, created = Valuation.objects.get_or_create(
                        procedure_number=valuation_id,
                        defaults={
                            'property_total_value': safe_decimal(row.get('PROPERTY_TOTAL_VALUE', 0)),
                            'area_en': safe_str(row.get('AREA_EN', '')),
                            'actual_area': safe_decimal(row.get('ACTUAL_AREA', 0)),
                            'procedure_year': safe_int(row.get('PROCEDURE_YEAR', datetime.now().year)),
                            'instance_date': safe_datetime(row.get('INSTANCE_DATE')) or datetime.now(),
                            'actual_worth': safe_decimal(row.get('ACTUAL_WORTH', 0)),
                            'procedure_area': safe_decimal(row.get('PROCEDURE_AREA', 0)),
                            'property_type': safe_str(row.get('PROPERTY_TYPE', 'Unit')),
                            'property_sub_type': safe_str(row.get('PROPERTY_SUB_TYPE', '')),
                        }
                    )
                    if created:
                        chunk_created += 1
                except Exception:
                    # Row-level failures are tallied rather than aborting the load.
                    chunk_errors += 1
            print(f" ✅ Chunk {chunk_num + 1}: {chunk_created} created, {chunk_errors} errors")
            total_created += chunk_created
            total_errors += chunk_errors
        print(f" 💰 Valuations Summary: {total_created} created, {total_errors} errors")
        return total_created
    except Exception as e:
        print(f" ❌ Error loading valuations: {e}")
        return 0
def create_sample_forecasts():
    """Generate placeholder Forecast rows for a few area/property-type pairs.

    Samples up to 5 distinct areas and 3 distinct property types from the
    loaded transactions and creates one fixed-value forecast per pair.
    Returns the number of forecasts created (0 on failure).
    """
    print("🔮 Creating sample forecasts...")
    try:
        # Pull a small sample of distinct dimensions from existing transactions.
        sample_areas = Transaction.objects.values_list('area_en', flat=True).distinct()[:5]
        sample_types = Transaction.objects.values_list('property_type', flat=True).distinct()[:3]
        created_count = 0
        fixed_defaults = {
            'forecast_date': datetime.now().date(),
            'predicted_price': Decimal('1000000.00'),
            'confidence_interval_lower': Decimal('800000.00'),
            'confidence_interval_upper': Decimal('1200000.00'),
            'model_version': '1.0',
            'accuracy_score': Decimal('0.85'),
            'metadata': {'source': 'sample_data', 'model': 'linear_regression'}
        }
        for area_name in sample_areas:
            for type_name in sample_types:
                # Skip blank dimension values.
                if not (area_name and type_name):
                    continue
                _, was_created = Forecast.objects.get_or_create(
                    area_en=area_name,
                    property_type=type_name,
                    defaults=fixed_defaults,
                )
                if was_created:
                    created_count += 1
        print(f" ✅ Created {created_count} sample forecasts")
        return created_count
    except Exception as e:
        print(f" ❌ Error creating forecasts: {e}")
        return 0
def verify_data_loaded():
    """Print and return the current row count of every analytics model.

    Returns a {label: count} dict, or an empty dict if counting fails.
    """
    print("\n🔍 Verifying loaded data...")
    try:
        # Table-driven: label/model pairs in the order they should print.
        checks = [
            ('Brokers', Broker),
            ('Developers', Developer),
            ('Projects', Project),
            ('Lands', Land),
            ('Rents', Rent),
            ('Transactions', Transaction),
            ('Valuations', Valuation),
            ('Forecasts', Forecast),
        ]
        counts = {label: model.objects.count() for label, model in checks}
        print(" 📊 Current database counts:")
        for model_name, count in counts.items():
            print(f" {model_name}: {count}")
        return counts
    except Exception as e:
        print(f" ❌ Error verifying data: {e}")
        return {}
def main():
    """Entry point: load every CSV from 'sample data' in chunks, then verify."""
    banner = "=" * 80
    print(banner)
    print(" Dubai Analytics Platform - Load All Data in Chunks")
    print(" Loading data in manageable chunks with progress tracking")
    print(banner)
    print()
    # Guard clause: nothing to do without the data directory.
    data_dir = "sample data"
    if not os.path.exists(data_dir):
        print(f"❌ Sample data directory '{data_dir}' not found!")
        print(" Please ensure the CSV files are in the 'sample data' directory.")
        return
    # Chunking configuration shared by every loader.
    records_per_chunk = 1000
    chunk_limit = 10
    grand_total = 0
    # (filename, loader) pairs, processed in dependency-friendly order.
    loaders = (
        ('brokers.csv', load_brokers_chunks),
        ('developers.csv', load_developers_chunks),
        ('projects.csv', load_projects_chunks),
        ('lands.csv', load_lands_chunks),
        ('rents.csv', load_rents_chunks),
        ('transactions.csv', load_transactions_chunks),
        ('valuations.csv', load_valuations_chunks),
    )
    for filename, loader in loaders:
        full_path = os.path.join(data_dir, filename)
        if not os.path.exists(full_path):
            print(f"⚠️ File {filename} not found, skipping...")
            continue
        print(f"\n📁 Processing {filename}...")
        grand_total += loader(full_path, chunk_size=records_per_chunk, max_chunks=chunk_limit)
    # Seed a handful of forecasts derived from the loaded transactions.
    print(f"\n🔮 Creating sample forecasts...")
    grand_total += create_sample_forecasts()
    # Report what actually landed in the database.
    verify_data_loaded()
    print("\n" + banner)
    print(" Data Loading Summary")
    print(banner)
    print(f"📊 Total records created: {grand_total}")
    print()
    print("✅ All data loading completed successfully!")
    print()
    print("Next steps:")
    print("1. Access Django Admin: http://localhost:8000/admin/")
    print("2. Login with: admin@dubai-analytics.com / admin123")
    print("3. View the loaded data in the admin interface")
    print("4. Test the API endpoints with the sample data")


if __name__ == '__main__':
    main()