PDF_Generation_and_Automation/python-pdf-generator/app.py

#!/usr/bin/env python3
"""
Advanced HTML to PDF Generator API with Intelligent Content Analysis
Supports URLs, HTML files, HTML strings, and batch processing
Always uses A4 size for consistent output
"""

from flask import Flask, request, send_file, jsonify
import os
import asyncio
import tempfile
import zipfile
from playwright.async_api import async_playwright, TimeoutError, Page
from werkzeug.utils import secure_filename
import uuid
from datetime import datetime
import re
import logging

# Configure logging
# The root logger is configured at INFO level and `logger` is this module's
# named logger.
# NOTE(review): the functions visible in this file report progress with
# print() rather than through `logger`.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Flask application object. The route and after_request decorators further
# down in this file register their handlers on it.
app = Flask(__name__)

# Configure temp folder
# Working directory for intermediate HTML files and generated PDFs. The path is
# relative, so it resolves against the process's current working directory.
TEMP_FOLDER = 'temp'
# exist_ok=True makes creation idempotent and removes the check-then-create
# race when several processes import this module at the same time.
os.makedirs(TEMP_FOLDER, exist_ok=True)

class HTMLPreprocessor:
    """Regex-based HTML/CSS clean-up applied before rendering a document to PDF.

    The helpers work on the raw HTML text (no DOM or CSS parsing). They look
    for page-like elements, strip document-level spacing declared in ``body``
    and ``html`` rules, and inject override CSS intended for printing.
    """

    # Class-name fragments that mark an element as a "page", in priority
    # order. The first fragment with at least one match decides the count.
    _PAGE_CLASS_TOKENS = ('brochure-page', 'page', 'pdf-page', 'slide', 'section')

    # CSS fragments hinting that the document contains an A4-sized container.
    _A4_SIZE_PATTERNS = (
        r'width:\s*210mm',
        r'height:\s*297mm',
        r'width:\s*794px',
        r'height:\s*1123px',
        r'width:\s*8\.27in',
        r'height:\s*11\.7in',
    )

    # Regex fragments for the property names covered by each spacing family
    # (e.g. "padding" also covers "padding-left", "gap" also covers "row-gap").
    _PROPERTY_NAMES = {
        'padding': r'padding(?:-[a-z-]+)?',
        'margin': r'margin(?:-[a-z-]+)?',
        'gap': r'(?:[a-z-]+-)?gap',
    }

    @staticmethod
    def preprocess_html(html_content: str) -> str:
        """Run the full pipeline: analyse, strip document spacing, inject print CSS.

        Args:
            html_content: Raw HTML document or fragment.

        Returns:
            The rewritten HTML text.
        """
        print("🔧 Preprocessing HTML for optimal PDF generation...")

        # Step 1: Detect page elements and their structure
        page_info = HTMLPreprocessor._analyze_page_structure(html_content)

        # Step 2: Remove problematic spacing
        html_content = HTMLPreprocessor._remove_spacing_issues(html_content, page_info)

        # Step 3: Optimize for PDF generation
        html_content = HTMLPreprocessor._optimize_for_pdf(html_content, page_info)

        print(f"✅ HTML preprocessing completed - {page_info['page_count']} pages optimized")
        return html_content

    @staticmethod
    def _rule_regex(selector: str, guard_tag: bool = True):
        """Compile a pattern matching ``selector { ... }`` as (open, body, close).

        Args:
            selector: Regex fragment for the selector (e.g. ``'body'``).
            guard_tag: When true, refuse matches preceded by a word character,
                ``.``, ``#`` or ``-`` so that ``body`` does not match ``tbody``
                or ``.body``.
        """
        prefix = r'(?<![\w.#-])' if guard_tag else ''
        return re.compile(prefix + r'(' + selector + r'\s*\{)([^}]*)(\})', re.IGNORECASE)

    @staticmethod
    def _declaration_regex(prop: str):
        """Compile a pattern matching one ``prop: value;`` declaration.

        The value stops at ``;`` or ``}``, so a final declaration written
        without a trailing semicolon cannot swallow the rules that follow it.
        """
        name = HTMLPreprocessor._PROPERTY_NAMES[prop]
        return re.compile(r'(?<![\w-])' + name + r'\s*:[^;}]*;?', re.IGNORECASE)

    @staticmethod
    def _has_declaration(html_content: str, selector: str, prop: str, guard_tag: bool = True) -> bool:
        """Return True if any ``selector`` rule declares a property of family ``prop``."""
        rule_re = HTMLPreprocessor._rule_regex(selector, guard_tag)
        decl_re = HTMLPreprocessor._declaration_regex(prop)
        return any(decl_re.search(rule.group(2)) for rule in rule_re.finditer(html_content))

    @staticmethod
    def _strip_declarations(html_content: str, selector: str, prop: str) -> str:
        """Remove every ``prop`` declaration from every ``selector`` rule."""
        rule_re = HTMLPreprocessor._rule_regex(selector)
        decl_re = HTMLPreprocessor._declaration_regex(prop)

        def _clean(rule):
            return rule.group(1) + decl_re.sub('', rule.group(2)) + rule.group(3)

        return rule_re.sub(_clean, html_content)

    @staticmethod
    def _inject_css(html_content: str, css: str) -> str:
        """Append ``css`` to the document's style blocks, creating one if needed.

        When the document has ``</style>`` tags (any letter case) the CSS is
        appended inside each of them. When it has none, a new ``<style>``
        element is inserted before ``</head>``, or appended to the end of the
        text if there is no ``</head>`` either, so the overrides are never
        silently dropped.
        """
        closing_style = re.compile(r'</style\s*>', re.IGNORECASE)
        # A callable replacement keeps backslashes in the CSS literal.
        injected, count = closing_style.subn(lambda _m: css + '\n    </style>', html_content)
        if count:
            return injected

        style_block = '<style>' + css + '\n    </style>'
        head_close = re.search(r'</head\s*>', html_content, re.IGNORECASE)
        if head_close:
            cut = head_close.start()
            return html_content[:cut] + style_block + html_content[cut:]
        return html_content + style_block

    @staticmethod
    def _analyze_page_structure(html_content: str) -> dict:
        """Collect page-count and layout hints from the raw HTML text.

        Returns:
            A dict with ``page_count``, ``page_elements`` (the matched
            ``class=...`` attribute strings), ``spacing_issues`` (see
            ``_detect_spacing_issues``) and the boolean flags ``has_flexbox``,
            ``has_grid``, ``has_padding``, ``has_margin`` and ``has_gap``.
        """
        page_elements = []
        for token in HTMLPreprocessor._PAGE_CLASS_TOKENS:
            fragment = re.escape(token)
            # Accept both double- and single-quoted class attributes.
            pattern = (
                r'class\s*=\s*(?:"[^"]*' + fragment + r'[^"]*"'
                r"|'[^']*" + fragment + r"[^']*')"
            )
            page_elements = [
                found.group(0)
                for found in re.finditer(pattern, html_content, re.IGNORECASE)
            ]
            if page_elements:
                break
        page_count = len(page_elements)

        # If no page-like class was found, an A4-sized CSS dimension anywhere
        # in the document is treated as a single page.
        if page_count == 0 and any(
            re.search(pattern, html_content, re.IGNORECASE)
            for pattern in HTMLPreprocessor._A4_SIZE_PATTERNS
        ):
            page_count = 1

        def _mentions(css_pattern: str) -> bool:
            return re.search(css_pattern, html_content, re.IGNORECASE) is not None

        return {
            'page_count': page_count,
            'page_elements': page_elements,
            'spacing_issues': HTMLPreprocessor._detect_spacing_issues(html_content),
            # Whitespace around ':' is optional in CSS, so tolerate it here.
            'has_flexbox': _mentions(r'display\s*:\s*flex'),
            'has_grid': _mentions(r'display\s*:\s*grid'),
            'has_padding': _mentions(r'padding\s*:'),
            'has_margin': _mentions(r'margin\s*:'),
            'has_gap': _mentions(r'gap\s*:'),
        }

    @staticmethod
    def _detect_spacing_issues(html_content: str) -> dict:
        """Report which document-level rules declare spacing properties.

        Returns:
            A dict of booleans keyed ``body_padding``, ``body_margin``,
            ``body_gap``, ``document_level_spacing`` (padding or margin on
            ``html``) and ``container_spacing`` (padding on ``.container`` or
            ``.wrapper``).
        """
        has = HTMLPreprocessor._has_declaration
        return {
            'body_padding': has(html_content, 'body', 'padding'),
            'body_margin': has(html_content, 'body', 'margin'),
            'body_gap': has(html_content, 'body', 'gap'),
            'document_level_spacing': (
                has(html_content, 'html', 'padding')
                or has(html_content, 'html', 'margin')
            ),
            # Class selectors may legitimately follow other selector text
            # (e.g. "div.container"), so the tag guard is disabled for them.
            'container_spacing': (
                has(html_content, r'\.container', 'padding', guard_tag=False)
                or has(html_content, r'\.wrapper', 'padding', guard_tag=False)
            ),
        }

    @staticmethod
    def _remove_spacing_issues(html_content: str, page_info: dict) -> str:
        """Strip spacing from ``body``/``html`` rules and inject override CSS.

        Args:
            html_content: Raw HTML text.
            page_info: Must contain ``spacing_issues`` as produced by
                ``_detect_spacing_issues``; only flagged families are stripped.

        Returns:
            The rewritten HTML text. Rules other than ``body``/``html`` are
            left untouched.
        """
        issues = page_info['spacing_issues']

        # Only remove document-level spacing, preserve internal spacing
        if issues['body_padding']:
            html_content = HTMLPreprocessor._strip_declarations(html_content, 'body', 'padding')

        if issues['body_margin']:
            html_content = HTMLPreprocessor._strip_declarations(html_content, 'body', 'margin')

        if issues['body_gap']:
            html_content = HTMLPreprocessor._strip_declarations(html_content, 'body', 'gap')

        if issues['document_level_spacing']:
            html_content = HTMLPreprocessor._strip_declarations(html_content, 'html', 'padding')
            html_content = HTMLPreprocessor._strip_declarations(html_content, 'html', 'margin')

        # Add CSS to ensure continuous flow
        continuous_flow_css = '''
        /* Ensure continuous flow for PDF generation */
        body {
            padding: 0 !important;
            margin: 0 !important;
            gap: 0 !important;
        }

        /* Preserve all internal page spacing and margins */
        .page-layout, .p1-content-side, .p2-grid, .p3-main-content, .p4-info-grid {
            /* Keep all internal spacing intact */
        }

        /* Ensure no page breaks within content */
        .brochure-page, .page, .pdf-page, .slide, .section {
            page-break-after: auto;
            page-break-inside: avoid;
            break-inside: avoid;
        }

        /* Preserve internal margins and padding */
        * {
            page-break-inside: avoid;
            break-inside: avoid;
        }
'''

        return HTMLPreprocessor._inject_css(html_content, continuous_flow_css)

    @staticmethod
    def _optimize_for_pdf(html_content: str, page_info: dict) -> str:
        """Inject print-media overrides and exact-colour rendering CSS.

        Args:
            html_content: Raw HTML text.
            page_info: Accepted for signature symmetry; not read here.

        Returns:
            The HTML text with the extra CSS injected.
        """
        # NOTE(review): the .page-layout / .p1-content-side paddings below are
        # hard-coded values that look tailored to one template - confirm they
        # are wanted for arbitrary documents.
        pdf_optimizations = '''
        /* PDF-specific optimizations - preserve internal spacing */
        @media print {
            /* Only remove document-level spacing, preserve internal spacing */
            body {
                padding: 0 !important;
                margin: 0 !important;
                gap: 0 !important;
            }

            /* Preserve all internal page spacing and margins */
            .page-layout {
                padding: 70px !important; /* Keep internal page padding */
            }

            .p1-content-side {
                padding: 70px 60px !important; /* Keep content padding */
            }

            /* Ensure no page breaks within content */
            .brochure-page, .page, .pdf-page {
                page-break-after: auto !important;
                page-break-inside: avoid !important;
            }
        }

        /* Ensure exact color rendering */
        * {
            -webkit-print-color-adjust: exact !important;
            color-adjust: exact !important;
            print-color-adjust: exact !important;
        }
'''

        return HTMLPreprocessor._inject_css(html_content, pdf_optimizations)

class PageDetector:
    """Browser-side detection of page-like elements and their pixel size."""

    @staticmethod
    async def detect_pages_and_format(page: Page) -> dict:
        """Run the detection script inside ``page`` and return its result.

        The script, evaluated in the browser, works in three stages:

        1. Pick page elements: the first selector in its list that matches
           anything, else elements whose computed size falls in its A4
           ranges, else ``document.body``.
        2. Measure each element several ways (computed style, bounding rect,
           offset, scroll and client sizes) and keep the measurement with the
           highest score; the score favours an A4-like aspect ratio and
           rejects sizes under 50 or over 2000.
        3. Aggregate maximum, total and average sizes, falling back to
           794x1123 when nothing valid was measured.

        Returns:
            The object built by the script, with the keys ``pageCount``,
            ``format``, ``maxWidth``, ``maxHeight``, ``totalWidth``,
            ``totalHeight``, ``aspectRatio``, ``detectedSelector``,
            ``validDimensions``, ``averageWidth``, ``averageHeight``,
            ``dimensionResults``, ``hasReasonableDimensions`` and
            ``measurementMethods``. Every branch of the script's format
            detection assigns ``'a4'``, so ``format`` is always ``'a4'``.
        """
        detection_script = """
            () => {
                // Strategy 1: Direct page element detection
                const pageSelectors = [
                    '.brochure-page',
                    '.brochure',
                    '.page',
                    '[class*="page"]',
                    '.pdf-page',
                    '.slide',
                    '.section'
                ];

                let pageElements = [];
                let detectedSelector = '';

                // Find page elements with priority order
                for (const selector of pageSelectors) {
                    const elements = document.querySelectorAll(selector);
                    if (elements.length > 0) {
                        pageElements = Array.from(elements);
                        detectedSelector = selector;
                        break;
                    }
                }

                // Strategy 2: A4-sized container detection
                if (pageElements.length === 0) {
                    const allElements = document.querySelectorAll('*');
                    const a4Elements = Array.from(allElements).filter(el => {
                        const style = window.getComputedStyle(el);
                        const width = parseFloat(style.width);
                        const height = parseFloat(style.height);

                        // A4 dimensions in different units
                        const isA4Width = (width >= 794 && width <= 800) ||
                                        (width >= 210 && width <= 220) ||
                                        (width >= 8.27 && width <= 8.5);
                        const isA4Height = (height >= 1123 && height <= 1130) ||
                                         (height >= 297 && height <= 300) ||
                                         (height >= 11.69 && height <= 12);

                        return isA4Width && isA4Height;
                    });

                    if (a4Elements.length > 0) {
                        pageElements = a4Elements;
                        detectedSelector = 'A4-sized-element';
                    }
                }

                // Strategy 3: Body as single page
                if (pageElements.length === 0) {
                    pageElements = [document.body];
                    detectedSelector = 'body';
                }

                // Advanced dimension analysis with multiple measurement methods
                let dimensionResults = [];

                pageElements.forEach((element, index) => {
                    const measurements = {};

                    // Method 1: CSS Computed Style
                    const computedStyle = window.getComputedStyle(element);
                    const cssWidth = parseFloat(computedStyle.width);
                    const cssHeight = parseFloat(computedStyle.height);

                    if (cssWidth > 0 && cssHeight > 0) {
                        measurements.css = { width: cssWidth, height: cssHeight };
                    }

                    // Method 2: Bounding Client Rect
                    const rect = element.getBoundingClientRect();
                    if (rect.width > 0 && rect.height > 0) {
                        measurements.bounding = { width: rect.width, height: rect.height };
                    }

                    // Method 3: Offset Dimensions
                    if (element.offsetWidth > 0 && element.offsetHeight > 0) {
                        measurements.offset = { width: element.offsetWidth, height: element.offsetHeight };
                    }

                    // Method 4: Scroll Dimensions
                    if (element.scrollWidth > 0 && element.scrollHeight > 0) {
                        measurements.scroll = { width: element.scrollWidth, height: element.scrollHeight };
                    }

                    // Method 5: Client Dimensions
                    if (element.clientWidth > 0 && element.clientHeight > 0) {
                        measurements.client = { width: element.clientWidth, height: element.clientHeight };
                    }

                    // Select the best measurement method
                    let bestMeasurement = null;
                    let bestScore = 0;

                    Object.entries(measurements).forEach(([method, dims]) => {
                        const score = calculateDimensionScore(dims.width, dims.height);
                        if (score > bestScore) {
                            bestScore = score;
                            bestMeasurement = { method, ...dims };
                        }
                    });

                    if (bestMeasurement) {
                        dimensionResults.push({
                            index,
                            element: element.tagName + (element.className ? '.' + element.className.split(' ')[0] : ''),
                            ...bestMeasurement
                        });
                    }
                });

                // Helper function to score dimensions
                function calculateDimensionScore(width, height) {
                    if (width <= 0 || height <= 0) return 0;
                    if (width > 2000 || height > 2000) return 0; // Too large
                    if (width < 50 || height < 50) return 0; // Too small

                    // Prefer A4-like dimensions
                    const aspectRatio = width / height;
                    const a4Ratio = 210 / 297; // 0.707
                    const ratioScore = 1 - Math.abs(aspectRatio - a4Ratio) / a4Ratio;

                    // Prefer reasonable sizes
                    const sizeScore = Math.min(width / 800, height / 1200);

                    return ratioScore * sizeScore;
                }

                // Calculate final dimensions
                let maxWidth = 0;
                let maxHeight = 0;
                let totalWidth = 0;
                let totalHeight = 0;
                let validCount = 0;

                dimensionResults.forEach(result => {
                    if (result.width > 0 && result.height > 0) {
                        maxWidth = Math.max(maxWidth, result.width);
                        maxHeight = Math.max(maxHeight, result.height);
                        totalWidth += result.width;
                        totalHeight += result.height;
                        validCount++;
                    }
                });

                // Fallback to standard A4 if no valid dimensions
                if (validCount === 0) {
                    maxWidth = 794;
                    maxHeight = 1123;
                    console.warn('No valid dimensions detected, using standard A4');
                }

                // Enhanced format detection
                let format = 'a4';
                const aspectRatio = maxWidth / maxHeight;

                if (Math.abs(aspectRatio - 0.707) < 0.1) { // A4 ratio
                    format = 'a4';
                } else if (Math.abs(aspectRatio - 0.773) < 0.1) { // Letter ratio
                    format = 'a4';
                } else if (Math.abs(aspectRatio - 0.607) < 0.1) { // Legal ratio
                    format = 'a4';
                } else if (aspectRatio > 1.2) { // Landscape
                    format = 'a4';
                } else if (aspectRatio < 0.5) { // Very tall
                    format = 'a4';
                }

                return {
                    pageCount: pageElements.length,
                    format: format,
                    maxWidth: Math.round(maxWidth),
                    maxHeight: Math.round(maxHeight),
                    totalWidth: Math.round(totalWidth),
                    totalHeight: Math.round(totalHeight),
                    aspectRatio: aspectRatio,
                    detectedSelector: detectedSelector,
                    validDimensions: validCount,
                    averageWidth: validCount > 0 ? Math.round(totalWidth / validCount) : 0,
                    averageHeight: validCount > 0 ? Math.round(totalHeight / validCount) : 0,
                    dimensionResults: dimensionResults,
                    hasReasonableDimensions: maxWidth >= 200 && maxHeight >= 200,
                    measurementMethods: dimensionResults.map(r => r.method)
                };
            }
        """
        # The script is a self-contained arrow function; its return value is
        # handed back to the caller unchanged.
        return await page.evaluate(detection_script)

def _compute_pdf_layout(page_info: dict) -> dict:
    """Derive orientation, margins and scale for an A4 page from detected sizes.

    Args:
        page_info: Detection result; ``maxWidth`` and ``maxHeight`` (CSS
            pixels) are required, ``hasTables`` and ``hasImages`` are optional
            booleans that default to False.

    Returns:
        A dict with ``landscape`` (bool), ``pdf_width``/``pdf_height`` (inches),
        ``content_width_in``/``content_height_in`` (inches at 96 dpi),
        ``margins`` (the dict passed to ``page.pdf``) and ``scale`` (clamped to
        the range 0.3-1.0).
    """
    dpi = 96
    content_width_in = page_info['maxWidth'] / dpi
    content_height_in = page_info['maxHeight'] / dpi
    has_tables = bool(page_info.get('hasTables', False))
    has_images = bool(page_info.get('hasImages', False))

    # Landscape when content is clearly wider than tall; tables switch earlier.
    landscape = content_width_in > content_height_in * 1.2 or (
        has_tables and content_width_in > content_height_in * 1.1
    )

    # A4 is 210mm x 297mm, i.e. about 8.27in x 11.69in (8.5in x 11in is Letter).
    a4_short_in, a4_long_in = 8.27, 11.69
    if landscape:
        pdf_width, pdf_height = a4_long_in, a4_short_in
    else:
        pdf_width, pdf_height = a4_short_in, a4_long_in

    # Margin by content type, then overridden for very small or very large content.
    if has_tables:
        margin_in = 0.75
    elif has_images:
        margin_in = 0.6
    else:
        margin_in = 0.5
    if content_width_in < 6.0 and content_height_in < 8.0:
        margin_in = 0.4
    if content_width_in > 10.0 or content_height_in > 12.0:
        margin_in = 0.8
    margin = f'{margin_in}in'
    margins = {'top': margin, 'right': margin, 'bottom': margin, 'left': margin}

    # Scale against the space left by the margin that is actually applied.
    available_width = pdf_width - (2 * margin_in)
    available_height = pdf_height - (2 * margin_in)
    width_scale = available_width / content_width_in if content_width_in > 0 else 1.0
    height_scale = available_height / content_height_in if content_height_in > 0 else 1.0

    # Never scale up past 100%, never below 30% (readability floor).
    scale = max(min(width_scale, height_scale, 1.0), 0.3)

    return {
        'landscape': landscape,
        'pdf_width': pdf_width,
        'pdf_height': pdf_height,
        'content_width_in': content_width_in,
        'content_height_in': content_height_in,
        'margins': margins,
        'scale': scale,
    }


async def generate_single_pdf(input_content: str, output_pdf: str, is_url: bool = False, is_file: bool = False, is_html_string: bool = False):
    """Render one input to an A4 PDF with headless Chromium.

    Exactly one of the mode flags is expected; if several are set the priority
    is ``is_html_string``, then ``is_url``, then ``is_file``.

    Args:
        input_content: HTML text, a URL, or a path to an HTML file, depending
            on the mode flag.
        output_pdf: Path the PDF is written to.
        is_url: Treat ``input_content`` as a URL to navigate to.
        is_file: Treat ``input_content`` as a local HTML file path.
        is_html_string: Treat ``input_content`` as HTML text. It is
            preprocessed, written to a temporary file in ``TEMP_FOLDER`` and
            loaded from there; the file is removed afterwards.

    Raises:
        ValueError: If no mode flag is set.
        Exception: With a timeout message when Playwright reports a timeout;
            any other error is logged to stdout and re-raised unchanged.
    """
    from pathlib import Path  # local import: only needed to build file:// URLs

    temp_file = None
    try:
        # Validate before paying for a browser launch.
        if not (is_html_string or is_url or is_file):
            raise ValueError("Invalid input type")

        async with async_playwright() as p:
            # NOTE(security): --disable-web-security, --no-sandbox and
            # --allow-running-insecure-content weaken browser isolation. With
            # HTML loaded from a file:// URL they may let submitted markup read
            # local files - review before exposing this to untrusted input.
            browser = await p.chromium.launch(
                headless=True,
                args=[
                    '--disable-dev-shm-usage',
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-gpu',
                    '--disable-web-security',
                    '--disable-features=VizDisplayCompositor',
                    '--enable-font-antialiasing',
                    '--font-render-hinting=none',
                    '--disable-background-timer-throttling',
                    '--disable-backgrounding-occluded-windows',
                    '--disable-renderer-backgrounding',
                    '--allow-running-insecure-content',
                    '--disable-extensions',
                    '--disable-plugins',
                    '--disable-images=false',
                    '--enable-javascript',
                    '--enable-css',
                    '--enable-fonts'
                ]
            )

            try:
                page = await browser.new_page()
                await page.set_viewport_size({'width': 1920, 'height': 1080})
                page.set_default_timeout(120000)

                if is_html_string:
                    # Preprocess HTML content
                    processed_html = HTMLPreprocessor.preprocess_html(input_content)

                    # Write processed HTML to temp file
                    with tempfile.NamedTemporaryFile(delete=False, suffix='.html', dir=TEMP_FOLDER) as tmp:
                        tmp.write(processed_html.encode('utf-8'))
                        temp_file = tmp.name
                    # as_uri() percent-encodes the path and emits a valid
                    # file:// URL on every platform.
                    await page.goto(Path(os.path.abspath(temp_file)).as_uri(), wait_until="load")
                elif is_url:
                    await page.goto(input_content, wait_until="domcontentloaded")
                else:
                    await page.goto(Path(os.path.abspath(input_content)).as_uri(), wait_until="load")

                # Wait for content to load and stabilize
                await page.wait_for_timeout(3000)

                # Wait for any dynamic content to finish loading
                try:
                    await page.wait_for_load_state('networkidle', timeout=10000)
                except TimeoutError:
                    # Best effort: a page that never goes network-idle is
                    # still rendered. Other errors are real and propagate.
                    pass

                # Ensure all external resources are loaded
                print("🔄 Loading external resources...")
                await _ensure_resources_loaded(page)

                # Detect pages and format
                print("🔍 Analyzing page structure...")
                page_info = await PageDetector.detect_pages_and_format(page)

                print(f"📄 Detected {page_info['pageCount']} pages")
                print(f"📐 Max dimensions: {page_info['maxWidth']}x{page_info['maxHeight']}px")
                print(f"🎯 Recommended format: {page_info['format']}")
                print(f"🔍 Detected selector: {page_info['detectedSelector']}")
                print(f"✅ Valid dimensions: {page_info['validDimensions']}")
                print(f"📏 Average dimensions: {page_info['averageWidth']}x{page_info['averageHeight']}px")
                print(f"📊 Aspect ratio: {page_info['aspectRatio']:.3f}")
                print(f"🔧 Measurement methods: {', '.join(page_info['measurementMethods'])}")

                layout = _compute_pdf_layout(page_info)

                # 'format' takes priority over explicit width/height in
                # page.pdf(), so only the format is passed; 'landscape'
                # selects the orientation.
                pdf_options = {
                    'path': output_pdf,
                    'print_background': True,
                    'margin': layout['margins'],
                    'scale': layout['scale'],
                    'landscape': layout['landscape'],
                    'prefer_css_page_size': False,  # Disable CSS page size to ensure A4
                    'format': 'A4'  # Explicitly set A4 format
                }

                # Generate PDF
                await page.pdf(**pdf_options)
            finally:
                # Close the browser on the error path too.
                await browser.close()

            print(f"✅ PDF generated: {output_pdf}")
            print(f"📏 A4 Size: {layout['pdf_width']}in x {layout['pdf_height']}in ({'Landscape' if layout['landscape'] else 'Portrait'})")
            print(f"📐 Content: {layout['content_width_in']:.2f}in x {layout['content_height_in']:.2f}in")
            print(f"🔍 Scale: {layout['scale']:.2f} (optimized for A4 fit)")
            print(f"📄 Format: A4 Standard")

    except TimeoutError as exc:
        raise Exception("Timeout: Page took too long to load.") from exc
    except Exception as e:
        print(f"❌ PDF generation error: {str(e)}")
        raise
    finally:
        if temp_file and os.path.exists(temp_file):
            os.remove(temp_file)

async def _ensure_resources_loaded(page: Page, timeout_ms: int = 15000):
    """Wait, with an upper bound, for fonts, images and background images.

    Each wait runs inside the page and is raced against a timer, so a resource
    that never fires ``load``/``error`` (for example a stalled download or an
    image whose loading the browser has deferred) cannot block PDF generation
    indefinitely.

    Args:
        page: The Playwright page to inspect.
        timeout_ms: Maximum time, in milliseconds, to spend on each of the
            three waits before moving on.
    """

    # Wait for fonts to load
    await page.evaluate("""
        (timeoutMs) => {
            const deadline = new Promise(resolve => setTimeout(resolve, timeoutMs, false));
            return Promise.race([document.fonts.ready.then(() => true), deadline]);
        }
    """, timeout_ms)

    # Wait for all images to load
    await page.evaluate("""
        (timeoutMs) => {
            const deadline = new Promise(resolve => setTimeout(resolve, timeoutMs, false));
            const pending = Array.from(document.images)
                .filter(img => !img.complete)
                .map(img => new Promise(resolve => {
                    // Listeners (not onload/onerror assignment) leave the
                    // page's own handlers in place.
                    img.addEventListener('load', resolve, { once: true });
                    img.addEventListener('error', resolve, { once: true });
                }));
            return Promise.race([Promise.all(pending).then(() => true), deadline]);
        }
    """, timeout_ms)

    # Wait for background images to load
    await page.evaluate("""
        (timeoutMs) => {
            const deadline = new Promise(resolve => setTimeout(resolve, timeoutMs, false));
            const elementsWithBg = document.querySelectorAll('[style*="background-image"], [class*="image"]');
            const pending = Array.from(elementsWithBg).map(el => {
                const bgImage = window.getComputedStyle(el).backgroundImage;
                if (bgImage && bgImage !== 'none') {
                    return new Promise(resolve => {
                        const img = new Image();
                        img.onload = img.onerror = resolve;
                        img.src = bgImage.replace(/url\\(['"]?(.*?)['"]?\\)/g, '$1');
                    });
                }
                return Promise.resolve();
            });
            return Promise.race([Promise.all(pending).then(() => true), deadline]);
        }
    """, timeout_ms)

    # Wait for CSS to be fully applied
    await page.wait_for_timeout(2000)

def process_input(input_content: str, output_name: str = None):
    """Generate a PDF in ``TEMP_FOLDER`` from a URL or an HTML string.

    Input that starts with ``http://`` or ``https://`` (ignoring surrounding
    whitespace) is fetched as a URL; anything else is rendered as HTML text.

    Args:
        input_content: A URL or HTML markup.
        output_name: Desired output file name. It is sanitised with
            ``secure_filename``; when it is missing or sanitises to an empty
            string a random ``single_output_<hex>.pdf`` name is used instead.

    Returns:
        A ``(pdf_path, mime_type)`` tuple; ``mime_type`` is always
        ``'application/pdf'``.
    """
    candidate = input_content.strip()
    # NOTE(security): URLs are fetched server-side as given; restrict the
    # allowed hosts if this endpoint is reachable by untrusted clients.
    is_url = candidate.startswith(('http://', 'https://'))

    # secure_filename() can return '' (e.g. for '../' or all-non-ASCII names),
    # which would make the output path the temp directory itself.
    safe_name = secure_filename(output_name) if output_name else ''
    if not safe_name:
        safe_name = f'single_output_{uuid.uuid4().hex[:8]}.pdf'

    pdf_path = os.path.join(TEMP_FOLDER, safe_name)

    if is_url:
        asyncio.run(generate_single_pdf(candidate, pdf_path, is_url=True))
    else:
        asyncio.run(generate_single_pdf(input_content, pdf_path, is_html_string=True))

    return pdf_path, 'application/pdf'

@app.route('/')
def root():
    """Describe the API: name, version, endpoints, features and usage."""
    payload = {
        "message": "PDF Generator API",
        "version": "2.0.0",
        "status": "running",
        "timestamp": datetime.now().isoformat(),
    }

    # Paths of the routes this service exposes.
    payload["endpoints"] = {
        "generate-pdf": "/generate-pdf",
        "health": "/health"
    }

    # Human-readable capability list.
    payload["features"] = [
        "HTML string to PDF",
        "URL to PDF",
        "HTML file to PDF",
        "Batch HTML files to ZIP",
        "Standard A4 format",
        "Consistent page sizing"
    ]

    # How to call the generation endpoint.
    request_body = {
        "input": "HTML string, URL, or file path",
        "output": "Optional output filename"
    }
    payload["usage"] = {
        "method": "POST",
        "endpoint": "/generate-pdf",
        "body": request_body
    }

    return jsonify(payload)

@app.route('/health')
def health_check():
    """Report service status and whether the temp folder currently exists."""
    checked_at = datetime.now().isoformat()
    folder_present = os.path.exists(TEMP_FOLDER)

    report = {
        "status": "healthy",
        "timestamp": checked_at,
        "service": "Advanced HTML to PDF Generator",
        "temp_folder": TEMP_FOLDER,
        "temp_folder_exists": folder_present,
        "uptime": "running"
    }
    return jsonify(report)

@app.route('/generate-pdf', methods=['POST'])
def generate_pdf_api():
    """Main PDF generation endpoint.

    Accepts either a JSON body (``{"input": ..., "output": ...}``) or form
    data with an ``input`` field and/or an uploaded ``input`` file, plus an
    optional ``output`` file name.

    Returns:
        The generated PDF as an attachment, or a JSON ``{"error": ...}`` body
        with status 400 for invalid input and 500 for generation failures.
    """
    try:
        # Get request data - handle both JSON and form data more robustly
        input_content = None
        output_name = None

        if request.is_json:
            try:
                data = request.get_json()
            except Exception as json_error:
                print(f"❌ JSON parsing error: {json_error}")
                return jsonify({'error': f'Invalid JSON format: {str(json_error)}'}), 400
            # Only a JSON object can carry the expected keys.
            if isinstance(data, dict):
                input_content = data.get('input')
                output_name = data.get('output')
        else:
            # Handle form data
            input_content = request.form.get('input')
            output_name = request.form.get('output')

            # If input is a file, read its content
            upload = request.files.get('input')
            if upload and upload.filename:
                try:
                    input_content = upload.read().decode('utf-8')
                except UnicodeDecodeError:
                    return jsonify({'error': 'File encoding error. Please ensure the file is UTF-8 encoded.'}), 400
                if not output_name:
                    # Swap only the extension, whatever it is (.html, .htm, ...).
                    stem, _extension = os.path.splitext(upload.filename)
                    output_name = f'{stem}.pdf'

        # Validate input: wrong JSON types are a client error, not a 500.
        if input_content is not None and not isinstance(input_content, str):
            return jsonify({'error': 'Input must be a string containing HTML content or a URL.'}), 400
        if output_name is not None and not isinstance(output_name, str):
            return jsonify({'error': 'Output filename must be a string.'}), 400
        if not input_content or input_content.strip() == '':
            return jsonify({'error': 'Input cannot be empty. Please provide HTML content.'}), 400

        # Clean the HTML content - remove problematic control characters
        input_content = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', input_content)

        # Process input and generate the PDF
        file_path, mime_type = process_input(input_content, output_name)

        # Check if file was created
        if not os.path.exists(file_path):
            return jsonify({'error': 'Failed to generate output file'}), 500

        # Send file response. An absolute path is used so the lookup does not
        # depend on how send_file resolves relative paths.
        response = send_file(
            os.path.abspath(file_path),
            as_attachment=True,
            download_name=os.path.basename(file_path),
            mimetype=mime_type
        )

        # Clean up after sending
        @response.call_on_close
        def cleanup():
            try:
                if os.path.exists(file_path):
                    os.remove(file_path)
                    print(f"🧹 Cleaned up: {file_path}")
            except Exception as e:
                print(f"❌ Cleanup error: {e}")

        return response

    except Exception as e:
        print(f"❌ API Error: {str(e)}")
        return jsonify({'error': str(e)}), 500

@app.after_request
def cleanup_temp_files(response):
    """Clean up temporary files older than 1 hour.

    Runs after every request. Failures are logged to stdout and never
    propagate; the response is always returned unchanged.
    """
    import time

    cutoff = time.time() - 3600  # 1 hour
    try:
        filenames = os.listdir(TEMP_FOLDER)
    except OSError as e:
        print(f"❌ Auto-cleanup error: {e}")
        return response

    for filename in filenames:
        filepath = os.path.join(TEMP_FOLDER, filename)
        # Errors are handled per file so that one entry vanishing (e.g.
        # removed by a concurrent request) does not abort the whole sweep.
        try:
            if os.path.isfile(filepath) and os.path.getmtime(filepath) < cutoff:
                os.remove(filepath)
                print(f"🧹 Auto-cleanup: {filename}")
        except FileNotFoundError:
            continue
        except OSError as e:
            print(f"❌ Auto-cleanup error: {e}")
    return response

if __name__ == '__main__':
    print("🚀 Starting Advanced HTML to PDF Generator API...")
    print("📝 Endpoints available:")
    print("  GET  / - API information")
    print("  GET  /health - Health check")
    print("  POST /generate-pdf - Generate PDF from HTML/URL/file")
    print("")
    print("✨ Features:")
    print("  • HTML string to PDF")
    print("  • URL to PDF")
    print("  • HTML file to PDF")
    print("  • Batch HTML files to ZIP")
    print("  • Standard A4 format")
    print("  • Consistent page sizing")

    # Debug mode enables the interactive debugger, which allows arbitrary code
    # execution. The server binds to all interfaces, so debug is opt-in via
    # FLASK_DEBUG instead of always on.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')

    app.run(host='0.0.0.0', port=8000, debug=debug_enabled)