#!/usr/bin/env python3
"""
Advanced HTML to PDF Generator API with Intelligent Content Analysis
Supports URLs, HTML files, HTML strings, and batch processing
Always uses A4 size for consistent output
"""

import asyncio
import logging
import os
import re
import tempfile
import time
import uuid
import zipfile
from datetime import datetime
from pathlib import Path

from flask import Flask, request, send_file, jsonify
from playwright.async_api import async_playwright, TimeoutError, Page
from werkzeug.utils import secure_filename

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = Flask(__name__)

# Configure temp folder
TEMP_FOLDER = 'temp'
os.makedirs(TEMP_FOLDER, exist_ok=True)

# ISO A4 sheet size in inches (210mm x 297mm) and the CSS pixel density
# Chromium uses when converting pixel measurements to physical units.
A4_WIDTH_IN = 8.27
A4_HEIGHT_IN = 11.69
CSS_PX_PER_INCH = 96

# Scale bounds applied when shrinking content to fit the sheet.
MIN_PDF_SCALE = 0.3
MAX_PDF_SCALE = 1.0

# Temporary files older than this many seconds are removed after each request.
TEMP_FILE_MAX_AGE_SECONDS = 3600

# Chromium launch flags.
# NOTE(security): '--no-sandbox', '--disable-web-security' and
# '--allow-running-insecure-content' weaken browser isolation while this
# service renders caller-supplied HTML and URLs. They are kept unchanged for
# compatibility; review them before exposing the service to untrusted input.
_CHROMIUM_ARGS = [
    '--disable-dev-shm-usage',
    '--no-sandbox',
    '--disable-setuid-sandbox',
    '--disable-gpu',
    '--disable-web-security',
    '--disable-features=VizDisplayCompositor',
    '--enable-font-antialiasing',
    '--font-render-hinting=none',
    '--disable-background-timer-throttling',
    '--disable-backgrounding-occluded-windows',
    '--disable-renderer-backgrounding',
    '--allow-running-insecure-content',
    '--disable-extensions',
    '--disable-plugins',
    '--disable-images=false',
    '--enable-javascript',
    '--enable-css',
    '--enable-fonts',
]

# Selector fragments used by the spacing regexes. The look-behind keeps
# 'body' from matching inside 'tbody', '.card-body', '#body', etc.
_BODY_SELECTOR = r'(?<![\w.#-])body'
_HTML_SELECTOR = r'(?<![\w.#-])html'
_CONTAINER_SELECTOR = r'\.container'
_WRAPPER_SELECTOR = r'\.wrapper'

_STYLE_CLOSE_RE = re.compile(r'</style\s*>', re.IGNORECASE)
_HEAD_CLOSE_RE = re.compile(r'</head\s*>', re.IGNORECASE)
# Optional doctype and <html> opening tag; always matches (possibly empty).
_DOCUMENT_PREFIX_RE = re.compile(
    r'\s*(?:<!DOCTYPE[^>]*>\s*)?(?:<html[^>]*>\s*)?', re.IGNORECASE
)

_CONTINUOUS_FLOW_CSS = '''
/* Ensure continuous flow for PDF generation */
body {
    padding: 0 !important;
    margin: 0 !important;
    gap: 0 !important;
}

/* Preserve all internal page spacing and margins */
.page-layout, .p1-content-side, .p2-grid, .p3-main-content, .p4-info-grid {
    /* Keep all internal spacing intact */
}

/* Ensure no page breaks within content */
.brochure-page, .page, .pdf-page, .slide, .section {
    page-break-after: auto;
    page-break-inside: avoid;
    break-inside: avoid;
}

/* Preserve internal margins and padding */
* {
    page-break-inside: avoid;
    break-inside: avoid;
}
'''

_PDF_OPTIMIZATION_CSS = '''
/* PDF-specific optimizations - preserve internal spacing */
@media print {
    /* Only remove document-level spacing, preserve internal spacing */
    body {
        padding: 0 !important;
        margin: 0 !important;
        gap: 0 !important;
    }

    /* Preserve all internal page spacing and margins */
    .page-layout {
        padding: 70px !important; /* Keep internal page padding */
    }

    .p1-content-side {
        padding: 70px 60px !important; /* Keep content padding */
    }

    /* Ensure no page breaks within content */
    .brochure-page, .page, .pdf-page {
        page-break-after: auto !important;
        page-break-inside: avoid !important;
    }
}

/* Ensure exact color rendering */
* {
    -webkit-print-color-adjust: exact !important;
    color-adjust: exact !important;
}
'''

_DETECT_PAGES_JS = """
() => {
    // Strategy 1: Direct page element detection
    const pageSelectors = [
        '.brochure-page',
        '.brochure',
        '.page',
        '[class*="page"]',
        '.pdf-page',
        '.slide',
        '.section'
    ];

    let pageElements = [];
    let detectedSelector = '';

    // Find page elements with priority order
    for (const selector of pageSelectors) {
        const elements = document.querySelectorAll(selector);
        if (elements.length > 0) {
            pageElements = Array.from(elements);
            detectedSelector = selector;
            break;
        }
    }

    // Strategy 2: A4-sized container detection
    if (pageElements.length === 0) {
        const allElements = document.querySelectorAll('*');
        const a4Elements = Array.from(allElements).filter(el => {
            const style = window.getComputedStyle(el);
            const width = parseFloat(style.width);
            const height = parseFloat(style.height);

            // A4 dimensions in different units
            const isA4Width = (width >= 794 && width <= 800) ||
                              (width >= 210 && width <= 220) ||
                              (width >= 8.27 && width <= 8.5);
            const isA4Height = (height >= 1123 && height <= 1130) ||
                               (height >= 297 && height <= 300) ||
                               (height >= 11.69 && height <= 12);

            return isA4Width && isA4Height;
        });

        if (a4Elements.length > 0) {
            pageElements = a4Elements;
            detectedSelector = 'A4-sized-element';
        }
    }

    // Strategy 3: Body as single page
    if (pageElements.length === 0) {
        pageElements = [document.body];
        detectedSelector = 'body';
    }

    // Helper function to score dimensions
    function calculateDimensionScore(width, height) {
        if (width <= 0 || height <= 0) return 0;
        if (width > 2000 || height > 2000) return 0; // Too large
        if (width < 50 || height < 50) return 0; // Too small

        // Prefer A4-like dimensions
        const aspectRatio = width / height;
        const a4Ratio = 210 / 297; // 0.707
        const ratioScore = 1 - Math.abs(aspectRatio - a4Ratio) / a4Ratio;

        // Prefer reasonable sizes
        const sizeScore = Math.min(width / 800, height / 1200);

        return ratioScore * sizeScore;
    }

    // Advanced dimension analysis with multiple measurement methods
    let dimensionResults = [];

    pageElements.forEach((element, index) => {
        const measurements = {};

        // Method 1: CSS Computed Style
        const computedStyle = window.getComputedStyle(element);
        const cssWidth = parseFloat(computedStyle.width);
        const cssHeight = parseFloat(computedStyle.height);
        if (cssWidth > 0 && cssHeight > 0) {
            measurements.css = { width: cssWidth, height: cssHeight };
        }

        // Method 2: Bounding Client Rect
        const rect = element.getBoundingClientRect();
        if (rect.width > 0 && rect.height > 0) {
            measurements.bounding = { width: rect.width, height: rect.height };
        }

        // Method 3: Offset Dimensions
        if (element.offsetWidth > 0 && element.offsetHeight > 0) {
            measurements.offset = { width: element.offsetWidth, height: element.offsetHeight };
        }

        // Method 4: Scroll Dimensions
        if (element.scrollWidth > 0 && element.scrollHeight > 0) {
            measurements.scroll = { width: element.scrollWidth, height: element.scrollHeight };
        }

        // Method 5: Client Dimensions
        if (element.clientWidth > 0 && element.clientHeight > 0) {
            measurements.client = { width: element.clientWidth, height: element.clientHeight };
        }

        // Select the best measurement method
        let bestMeasurement = null;
        let bestScore = 0;

        Object.entries(measurements).forEach(([method, dims]) => {
            const score = calculateDimensionScore(dims.width, dims.height);
            if (score > bestScore) {
                bestScore = score;
                bestMeasurement = { method, ...dims };
            }
        });

        if (bestMeasurement) {
            // className is not a string on SVG elements, so guard the split.
            const firstClass = (typeof element.className === 'string' && element.className)
                ? '.' + element.className.split(' ')[0]
                : '';
            dimensionResults.push({
                index,
                element: element.tagName + firstClass,
                ...bestMeasurement
            });
        }
    });

    // Calculate final dimensions
    let maxWidth = 0;
    let maxHeight = 0;
    let totalWidth = 0;
    let totalHeight = 0;
    let validCount = 0;

    dimensionResults.forEach(result => {
        if (result.width > 0 && result.height > 0) {
            maxWidth = Math.max(maxWidth, result.width);
            maxHeight = Math.max(maxHeight, result.height);
            totalWidth += result.width;
            totalHeight += result.height;
            validCount++;
        }
    });

    // Fallback to standard A4 if no valid dimensions
    if (validCount === 0) {
        maxWidth = 794;
        maxHeight = 1123;
        console.warn('No valid dimensions detected, using standard A4');
    }

    // Output is always A4 regardless of the detected aspect ratio
    const format = 'a4';
    const aspectRatio = maxWidth / maxHeight;

    return {
        pageCount: pageElements.length,
        format: format,
        maxWidth: Math.round(maxWidth),
        maxHeight: Math.round(maxHeight),
        totalWidth: Math.round(totalWidth),
        totalHeight: Math.round(totalHeight),
        aspectRatio: aspectRatio,
        detectedSelector: detectedSelector,
        validDimensions: validCount,
        averageWidth: validCount > 0 ? Math.round(totalWidth / validCount) : 0,
        averageHeight: validCount > 0 ? Math.round(totalHeight / validCount) : 0,
        dimensionResults: dimensionResults,
        hasReasonableDimensions: maxWidth >= 200 && maxHeight >= 200,
        measurementMethods: dimensionResults.map(r => r.method)
    };
}
"""

_WAIT_FOR_FONTS_JS = """
() => {
    return document.fonts.ready;
}
"""

_WAIT_FOR_IMAGES_JS = """
() => {
    return Promise.all(
        Array.from(document.images)
            .filter(img => !img.complete)
            .map(img => new Promise(resolve => {
                img.onload = img.onerror = resolve;
            }))
    );
}
"""

_WAIT_FOR_BACKGROUNDS_JS = """
() => {
    const elementsWithBg = document.querySelectorAll('[style*="background-image"], [class*="image"]');
    return Promise.all(
        Array.from(elementsWithBg).map(el => {
            const style = window.getComputedStyle(el);
            const bgImage = style.backgroundImage;
            if (bgImage && bgImage !== 'none') {
                return new Promise(resolve => {
                    const img = new Image();
                    img.onload = img.onerror = resolve;
                    img.src = bgImage.replace(/url\\(['"]?(.*?)['"]?\\)/g, '$1');
                });
            }
            return Promise.resolve();
        })
    );
}
"""


class HTMLPreprocessor:
    """Intelligently preprocesses HTML to remove spacing issues and optimize for PDF generation."""

    @staticmethod
    def preprocess_html(html_content: str) -> str:
        """
        Dynamically analyze and fix spacing issues in HTML for perfect PDF generation.

        Runs structure analysis, strips document-level spacing, then injects
        the print-oriented CSS. Returns the modified HTML string.
        """
        print("๐Ÿ”ง Preprocessing HTML for optimal PDF generation...")

        # Step 1: Detect page elements and their structure
        page_info = HTMLPreprocessor._analyze_page_structure(html_content)

        # Step 2: Remove problematic spacing
        html_content = HTMLPreprocessor._remove_spacing_issues(html_content, page_info)

        # Step 3: Optimize for PDF generation
        html_content = HTMLPreprocessor._optimize_for_pdf(html_content, page_info)

        print(f"โœ… HTML preprocessing completed - {page_info['page_count']} pages optimized")
        return html_content

    @staticmethod
    def _analyze_page_structure(html_content: str) -> dict:
        """Analyze the HTML structure to understand page layout and spacing.

        Returns a dict with 'page_count', 'page_elements', 'spacing_issues'
        and several 'has_*' substring flags.
        """
        # Detect page elements; the first pattern that matches wins.
        page_selectors = [
            r'class="[^"]*brochure-page[^"]*"',
            r'class="[^"]*page[^"]*"',
            r'class="[^"]*pdf-page[^"]*"',
            r'class="[^"]*slide[^"]*"',
            r'class="[^"]*section[^"]*"',
        ]

        page_count = 0
        page_elements = []

        for selector in page_selectors:
            matches = re.findall(selector, html_content, re.IGNORECASE)
            if matches:
                page_count = len(matches)
                page_elements = matches
                break

        # If no specific page elements found, look for A4-sized containers
        if page_count == 0:
            # Look for elements with A4-like dimensions in CSS
            a4_patterns = [
                r'width:\s*210mm',
                r'height:\s*297mm',
                r'width:\s*794px',
                r'height:\s*1123px',
                r'width:\s*8\.27in',
                r'height:\s*11\.7in',
            ]
            if any(re.search(pattern, html_content, re.IGNORECASE) for pattern in a4_patterns):
                page_count = 1

        # Analyze body and container spacing
        spacing_issues = HTMLPreprocessor._detect_spacing_issues(html_content)

        return {
            'page_count': page_count,
            'page_elements': page_elements,
            'spacing_issues': spacing_issues,
            'has_flexbox': 'display: flex' in html_content,
            'has_grid': 'display: grid' in html_content,
            'has_padding': 'padding:' in html_content,
            'has_margin': 'margin:' in html_content,
            'has_gap': 'gap:' in html_content,
        }

    @staticmethod
    def _rule_declares(html_content: str, selector: str, prop: str) -> bool:
        """Return True if a CSS rule for *selector* mentions *prop* in its body."""
        pattern = selector + r'\s*{[^}]*' + prop + r'[^}]*}'
        return re.search(pattern, html_content, re.IGNORECASE) is not None

    @staticmethod
    def _strip_declaration(html_content: str, selector: str, prop: str) -> str:
        """Remove the first *prop* declaration from each *selector* rule.

        The declaration value is matched with ``[^;}]*`` so that a final
        declaration without a trailing semicolon cannot run past the closing
        brace and swallow the rules that follow.
        """
        pattern = '(' + selector + r'\s*{[^}]*?)' + prop + r'[^;}]*;?([^}]*})'
        return re.sub(pattern, r'\1\2', html_content, flags=re.IGNORECASE)

    @staticmethod
    def _inject_css(html_content: str, css: str) -> str:
        """Insert *css* so it applies after the document's existing styles.

        Preference order: just before the last ``</style>``; otherwise as a
        new ``<style>`` element before ``</head>``; otherwise right after any
        leading doctype / ``<html>`` tag (so the doctype stays first).
        """
        style_closings = list(_STYLE_CLOSE_RE.finditer(html_content))
        if style_closings:
            position = style_closings[-1].start()
            return html_content[:position] + css + '\n' + html_content[position:]

        style_element = '<style>' + css + '</style>\n'
        head_close = _HEAD_CLOSE_RE.search(html_content)
        if head_close:
            position = head_close.start()
        else:
            position = _DOCUMENT_PREFIX_RE.match(html_content).end()
        return html_content[:position] + style_element + html_content[position:]

    @staticmethod
    def _detect_spacing_issues(html_content: str) -> dict:
        """Detect various types of spacing issues that affect PDF generation."""
        declares = HTMLPreprocessor._rule_declares
        return {
            # Body-level spacing
            'body_padding': declares(html_content, _BODY_SELECTOR, 'padding'),
            'body_margin': declares(html_content, _BODY_SELECTOR, 'margin'),
            'body_gap': declares(html_content, _BODY_SELECTOR, 'gap'),
            # Document-level spacing on the <html> element
            'document_level_spacing': (
                declares(html_content, _HTML_SELECTOR, 'padding')
                or declares(html_content, _HTML_SELECTOR, 'margin')
            ),
            # Common wrapper classes
            'container_spacing': (
                declares(html_content, _CONTAINER_SELECTOR, 'padding')
                or declares(html_content, _WRAPPER_SELECTOR, 'padding')
            ),
        }

    @staticmethod
    def _remove_spacing_issues(html_content: str, page_info: dict) -> str:
        """Remove problematic spacing while preserving internal page spacing."""
        issues = page_info['spacing_issues']
        strip = HTMLPreprocessor._strip_declaration

        # Only remove document-level spacing, preserve internal spacing
        if issues['body_padding']:
            html_content = strip(html_content, _BODY_SELECTOR, 'padding')
        if issues['body_margin']:
            html_content = strip(html_content, _BODY_SELECTOR, 'margin')
        if issues['body_gap']:
            html_content = strip(html_content, _BODY_SELECTOR, 'gap')
        if issues['document_level_spacing']:
            html_content = strip(html_content, _HTML_SELECTOR, 'padding')
            html_content = strip(html_content, _HTML_SELECTOR, 'margin')

        # Add CSS to ensure continuous flow, after the existing styles
        return HTMLPreprocessor._inject_css(html_content, _CONTINUOUS_FLOW_CSS)

    @staticmethod
    def _optimize_for_pdf(html_content: str, page_info: dict) -> str:
        """Add PDF-specific optimizations while preserving internal spacing.

        *page_info* is accepted for signature compatibility; it is not read.
        """
        return HTMLPreprocessor._inject_css(html_content, _PDF_OPTIMIZATION_CSS)


class PageDetector:
    """Detects page elements and their dimensions in HTML documents."""

    @staticmethod
    async def detect_pages_and_format(page: Page) -> dict:
        """
        Advanced page detection with multiple fallback strategies.
        Handles different HTML structures and CSS approaches robustly.

        Evaluates a script in the page and returns its result: page count,
        maximum/average/total dimensions in CSS pixels, the selector that
        matched, and the measurement method used per element. 'format' is
        always 'a4'.
        """
        return await page.evaluate(_DETECT_PAGES_JS)


def _select_margin_in(page_info: dict, content_width_in: float, content_height_in: float) -> float:
    """Pick the uniform page margin (in inches) for the detected content."""
    # NOTE: PageDetector does not currently emit 'hasTables' / 'hasImages',
    # so these lookups use their defaults unless a caller adds the keys.
    if page_info.get('hasTables', False):
        margin_in = 0.75  # Tables need more breathing room on A4
    elif page_info.get('hasImages', False):
        margin_in = 0.6  # Images look better with balanced margins on A4
    else:
        margin_in = 0.5  # Text content works well with standard A4 margins

    # For very small content, use smaller margins to maximize A4 space
    if content_width_in < 6.0 and content_height_in < 8.0:
        margin_in = 0.4

    # For very large content, use larger margins to ensure readability
    if content_width_in > 10.0 or content_height_in > 12.0:
        margin_in = 0.8

    return margin_in


def _plan_pdf_layout(page_info: dict) -> dict:
    """Derive orientation, margins and scale for an A4 sheet from *page_info*.

    Returns a dict with 'landscape', 'pdf_width', 'pdf_height' (inches),
    'content_width_in', 'content_height_in', 'margins' (Playwright margin
    dict) and 'scale'.
    """
    content_width_in = page_info['maxWidth'] / CSS_PX_PER_INCH
    content_height_in = page_info['maxHeight'] / CSS_PX_PER_INCH

    # Landscape when content is 20% wider than tall (10% when tables are present).
    landscape = content_width_in > content_height_in * 1.2 or (
        page_info.get('hasTables', False) and content_width_in > content_height_in * 1.1
    )

    if landscape:
        pdf_width, pdf_height = A4_HEIGHT_IN, A4_WIDTH_IN
    else:
        pdf_width, pdf_height = A4_WIDTH_IN, A4_HEIGHT_IN

    # Choose the margin first so the scale is computed against the space
    # that is actually left on the sheet.
    margin_in = _select_margin_in(page_info, content_width_in, content_height_in)
    available_width = pdf_width - 2 * margin_in
    available_height = pdf_height - 2 * margin_in

    width_scale = available_width / content_width_in if content_width_in > 0 else 1.0
    height_scale = available_height / content_height_in if content_height_in > 0 else 1.0

    # Fit both dimensions, never enlarge, never shrink below readability.
    scale = max(MIN_PDF_SCALE, min(width_scale, height_scale, MAX_PDF_SCALE))

    margin_text = f"{margin_in}in"
    return {
        'landscape': landscape,
        'pdf_width': pdf_width,
        'pdf_height': pdf_height,
        'content_width_in': content_width_in,
        'content_height_in': content_height_in,
        'margins': {
            'top': margin_text,
            'right': margin_text,
            'bottom': margin_text,
            'left': margin_text,
        },
        'scale': scale,
    }


async def generate_single_pdf(input_content: str, output_pdf: str, is_url: bool = False, is_file: bool = False, is_html_string: bool = False):
    """
    Generate PDF for a single input (URL, file path, or HTML string).
    Always uses A4 size for consistent output with intelligent content fitting.

    Args:
        input_content: The HTML markup, URL, or path, depending on the flag set.
        output_pdf: Destination path of the generated PDF.
        is_url: Treat *input_content* as a URL to navigate to.
        is_file: Treat *input_content* as a local HTML file path.
        is_html_string: Treat *input_content* as HTML markup (preprocessed
            and rendered from a temporary file).

    Raises:
        ValueError: If none of the three flags is set.
        Exception: With a "Timeout" message if Playwright times out; any
            other failure is printed and re-raised unchanged.
    """
    temp_file = None
    try:
        if not (is_html_string or is_url or is_file):
            raise ValueError("Invalid input type")

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True, args=_CHROMIUM_ARGS)
            try:
                page = await browser.new_page()
                await page.set_viewport_size({'width': 1920, 'height': 1080})
                page.set_default_timeout(120000)

                if is_html_string:
                    # Preprocess HTML content
                    processed_html = HTMLPreprocessor.preprocess_html(input_content)

                    # Write processed HTML to temp file
                    with tempfile.NamedTemporaryFile(delete=False, suffix='.html', dir=TEMP_FOLDER) as tmp:
                        tmp.write(processed_html.encode('utf-8'))
                        temp_file = tmp.name

                    # as_uri() percent-encodes the path and is valid on every OS.
                    await page.goto(Path(temp_file).resolve().as_uri(), wait_until="load")
                elif is_url:
                    await page.goto(input_content, wait_until="domcontentloaded")
                else:
                    await page.goto(Path(input_content).resolve().as_uri(), wait_until="load")

                # Wait for content to load and stabilize
                await page.wait_for_timeout(3000)

                # Wait for any dynamic content to finish loading
                try:
                    await page.wait_for_load_state('networkidle', timeout=10000)
                except TimeoutError:
                    pass  # Best effort: continue if the network never goes idle

                # Ensure all external resources are loaded
                print("๐Ÿ”„ Loading external resources...")
                await _ensure_resources_loaded(page)

                # Detect pages and format
                print("๐Ÿ” Analyzing page structure...")
                page_info = await PageDetector.detect_pages_and_format(page)

                print(f"๐Ÿ“„ Detected {page_info['pageCount']} pages")
                print(f"๐Ÿ“ Max dimensions: {page_info['maxWidth']}x{page_info['maxHeight']}px")
                print(f"๐ŸŽฏ Recommended format: {page_info['format']}")
                print(f"๐Ÿ” Detected selector: {page_info['detectedSelector']}")
                print(f"โœ… Valid dimensions: {page_info['validDimensions']}")
                print(f"๐Ÿ“ Average dimensions: {page_info['averageWidth']}x{page_info['averageHeight']}px")
                print(f"๐Ÿ“Š Aspect ratio: {page_info['aspectRatio']:.3f}")
                print(f"๐Ÿ”ง Measurement methods: {', '.join(page_info['measurementMethods'])}")

                layout = _plan_pdf_layout(page_info)

                # 'format' takes priority over explicit width/height in
                # Playwright, so only the format and orientation are passed.
                pdf_options = {
                    'path': output_pdf,
                    'print_background': True,
                    'margin': layout['margins'],
                    'scale': layout['scale'],
                    'landscape': layout['landscape'],
                    'prefer_css_page_size': False,  # Disable CSS page size to ensure A4
                    'format': 'A4',  # Explicitly set A4 format
                }

                # Generate PDF
                await page.pdf(**pdf_options)
            finally:
                # Close the browser on failure as well as on success.
                await browser.close()

        print(f"โœ… PDF generated: {output_pdf}")
        print(f"๐Ÿ“ A4 Size: {layout['pdf_width']}in x {layout['pdf_height']}in ({'Landscape' if layout['landscape'] else 'Portrait'})")
        print(f"๐Ÿ“ Content: {layout['content_width_in']:.2f}in x {layout['content_height_in']:.2f}in")
        print(f"๐Ÿ” Scale: {layout['scale']:.2f} (optimized for A4 fit)")
        print(f"๐Ÿ“„ Format: A4 Standard")

    except TimeoutError as exc:
        raise Exception("Timeout: Page took too long to load.") from exc
    except Exception as e:
        print(f"โŒ PDF generation error: {str(e)}")
        raise
    finally:
        if temp_file and os.path.exists(temp_file):
            os.remove(temp_file)


async def _ensure_resources_loaded(page: Page):
    """Ensure all external resources are properly loaded."""
    # Wait for fonts to load
    await page.evaluate(_WAIT_FOR_FONTS_JS)

    # Wait for all images to load
    await page.evaluate(_WAIT_FOR_IMAGES_JS)

    # Wait for background images to load
    await page.evaluate(_WAIT_FOR_BACKGROUNDS_JS)

    # Wait for CSS to be fully applied
    await page.wait_for_timeout(2000)


def process_input(input_content: str, output_name: str = None):
    """
    Process the input: determine type and generate PDF(s).
    Returns path to PDF or ZIP file.

    Input starting with 'http://' or 'https://' is rendered as a URL;
    anything else is treated as HTML markup. Returns a
    ``(pdf_path, mime_type)`` tuple.

    NOTE(security): URL input makes the server fetch arbitrary addresses on
    the caller's behalf; restrict allowed hosts before exposing this publicly.
    """
    default_name = f'single_output_{uuid.uuid4().hex[:8]}.pdf'
    # secure_filename() can reduce a name to '' (e.g. '../'), which would
    # make the output path the temp directory itself.
    safe_name = secure_filename(output_name or default_name) or default_name
    pdf_path = os.path.join(TEMP_FOLDER, safe_name)

    if input_content.startswith(('http://', 'https://')):
        asyncio.run(generate_single_pdf(input_content, pdf_path, is_url=True))
    else:
        asyncio.run(generate_single_pdf(input_content, pdf_path, is_html_string=True))

    return pdf_path, 'application/pdf'


@app.route('/')
def root():
    """Root endpoint"""
    return jsonify({
        "message": "PDF Generator API",
        "version": "2.0.0",
        "status": "running",
        "timestamp": datetime.now().isoformat(),
        "endpoints": {
            "generate-pdf": "/generate-pdf",
            "health": "/health"
        },
        "features": [
            "HTML string to PDF",
            "URL to PDF",
            "HTML file to PDF",
            "Batch HTML files to ZIP",
            "Standard A4 format",
            "Consistent page sizing"
        ],
        "usage": {
            "method": "POST",
            "endpoint": "/generate-pdf",
            "body": {
                "input": "HTML string, URL, or file path",
                "output": "Optional output filename"
            }
        }
    })


@app.route('/health')
def health_check():
    """Health check endpoint"""
    return jsonify({
        "status": "healthy",
        "timestamp": datetime.now().isoformat(),
        "service": "Advanced HTML to PDF Generator",
        "temp_folder": TEMP_FOLDER,
        "temp_folder_exists": os.path.exists(TEMP_FOLDER),
        "uptime": "running"
    })


@app.route('/generate-pdf', methods=['POST'])
def generate_pdf_api():
    """Main PDF generation endpoint

    Accepts JSON (``{"input": ..., "output": ...}``), form fields, or an
    uploaded file named 'input'. Responds with the PDF as an attachment,
    400 for invalid input, or 500 with the error message on failure.
    """
    try:
        # Get request data - handle both JSON and form data more robustly
        input_content = None
        output_name = None

        if request.is_json:
            try:
                data = request.get_json()
                if isinstance(data, dict) and 'input' in data:
                    input_content = data['input']
                    output_name = data.get('output', None)
            except Exception as json_error:
                print(f"โŒ JSON parsing error: {json_error}")
                return jsonify({'error': f'Invalid JSON format: {str(json_error)}'}), 400
        else:
            # Handle form data
            input_content = request.form.get('input')
            output_name = request.form.get('output')

            # If input is a file, read its content
            if 'input' in request.files:
                file = request.files['input']
                if file and file.filename:
                    try:
                        input_content = file.read().decode('utf-8')
                        if not output_name:
                            # Swap whatever extension was uploaded for .pdf
                            output_name = os.path.splitext(file.filename)[0] + '.pdf'
                    except UnicodeDecodeError:
                        return jsonify({'error': 'File encoding error. Please ensure the file is UTF-8 encoded.'}), 400

        # Validate input (JSON callers may send non-string values)
        if not isinstance(input_content, str) or input_content.strip() == '':
            return jsonify({'error': 'Input cannot be empty. Please provide HTML content.'}), 400
        if output_name is not None and not isinstance(output_name, str):
            return jsonify({'error': 'Output filename must be a string.'}), 400

        # Clean the HTML content - remove problematic control characters
        input_content = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', input_content)

        # Process input and generate PDF/ZIP
        file_path, mime_type = process_input(input_content, output_name)

        # Check if file was created
        if not os.path.exists(file_path):
            return jsonify({'error': 'Failed to generate output file'}), 500

        # Send file response. The path is made absolute so it is resolved
        # against the working directory the file was written under.
        response = send_file(
            os.path.abspath(file_path),
            as_attachment=True,
            download_name=os.path.basename(file_path),
            mimetype=mime_type
        )

        # Clean up after sending
        @response.call_on_close
        def cleanup():
            try:
                if os.path.exists(file_path):
                    os.remove(file_path)
                    print(f"๐Ÿงน Cleaned up: {file_path}")
            except Exception as e:
                print(f"โŒ Cleanup error: {e}")

        return response

    except Exception as e:
        print(f"โŒ API Error: {str(e)}")
        return jsonify({'error': str(e)}), 500


@app.after_request
def cleanup_temp_files(response):
    """Clean up temporary files older than 1 hour"""
    try:
        current_time = time.time()
        for filename in os.listdir(TEMP_FOLDER):
            filepath = os.path.join(TEMP_FOLDER, filename)
            try:
                if os.path.isfile(filepath) and current_time - os.path.getmtime(filepath) > TEMP_FILE_MAX_AGE_SECONDS:
                    os.remove(filepath)
                    print(f"๐Ÿงน Auto-cleanup: {filename}")
            except OSError as e:
                # A file may vanish or be locked mid-sweep; keep going.
                print(f"โŒ Auto-cleanup error: {e}")
    except Exception as e:
        print(f"โŒ Auto-cleanup error: {e}")
    return response


if __name__ == '__main__':
    print("๐Ÿš€ Starting Advanced HTML to PDF Generator API...")
    print("๐Ÿ“ Endpoints available:")
    print(" GET / - API information")
    print(" GET /health - Health check")
    print(" POST /generate-pdf - Generate PDF from HTML/URL/file")
    print("")
    print("โœจ Features:")
    print(" โ€ข HTML string to PDF")
    print(" โ€ข URL to PDF")
    print(" โ€ข HTML file to PDF")
    print(" โ€ข Batch HTML files to ZIP")
    print(" โ€ข Standard A4 format")
    print(" โ€ข Consistent page sizing")

    # The Werkzeug debugger allows arbitrary code execution, so it must not
    # be enabled by default on a server bound to all interfaces.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true', 'yes')
    app.run(host='0.0.0.0', port=8000, debug=debug_enabled)