861 lines
34 KiB
Python
861 lines
34 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Advanced HTML to PDF Generator API with Intelligent Content Analysis
|
|
Supports URLs, HTML files, HTML strings, and batch processing
|
|
Always uses A4 size for consistent output
|
|
"""
|
|
|
|
from flask import Flask, request, send_file, jsonify
|
|
import os
|
|
import asyncio
|
|
import tempfile
|
|
import zipfile
|
|
from playwright.async_api import async_playwright, TimeoutError, Page
|
|
from werkzeug.utils import secure_filename
|
|
import uuid
|
|
from datetime import datetime
|
|
import re
|
|
import logging
|
|
|
|
# Logging setup: root configuration at INFO level plus a module-scoped logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Flask application object that the route decorators further down attach to.
app = Flask(__name__)
|
|
|
|
# Scratch directory for intermediate HTML files and generated PDFs.
# The path is relative, so it resolves against the process working directory.
TEMP_FOLDER = 'temp'
# exist_ok=True removes the check-then-create race of exists() + makedirs()
# when several processes start at the same time.
os.makedirs(TEMP_FOLDER, exist_ok=True)
|
|
|
|
class HTMLPreprocessor:
    """Regex-based HTML/CSS preprocessor that prepares markup for PDF rendering.

    The pipeline (see ``preprocess_html``) inspects the raw markup for page-like
    elements and document-level spacing, strips padding/margin/gap declarations
    from ``body``/``html`` rules, and appends override CSS before every
    ``</style>`` tag. Markup without a ``</style>`` tag receives no injected CSS.
    """

    # Negative lookbehind placed in front of element selectors so that ``body``
    # does not match inside ``tbody``, ``.body`` or ``#somebody`` (same for html).
    _SELECTOR_GUARD = r'(?<![\w.#-])'

    @staticmethod
    def preprocess_html(html_content: str) -> str:
        """Run the full preprocessing pipeline and return the rewritten HTML.

        Args:
            html_content: Raw HTML document as text.

        Returns:
            The HTML with document-level spacing declarations removed and the
            PDF override CSS injected.
        """
        print("🔧 Preprocessing HTML for optimal PDF generation...")

        # Step 1: detect page elements and spacing problems.
        page_info = HTMLPreprocessor._analyze_page_structure(html_content)

        # Step 2: remove problematic document-level spacing.
        html_content = HTMLPreprocessor._remove_spacing_issues(html_content, page_info)

        # Step 3: add print-specific CSS.
        html_content = HTMLPreprocessor._optimize_for_pdf(html_content, page_info)

        print(f"✅ HTML preprocessing completed - {page_info['page_count']} pages optimized")
        return html_content

    @staticmethod
    def _rule_mentions(html_content: str, selector: str, prop: str) -> bool:
        """Return True when a ``selector { ... }`` rule body contains ``prop``."""
        pattern = selector + r'\s*\{[^}]*' + prop + r'[^}]*\}'
        return re.search(pattern, html_content, re.IGNORECASE) is not None

    @staticmethod
    def _strip_declaration(html_content: str, selector: str, prop: str) -> str:
        """Remove the first ``prop...`` declaration from each ``selector`` rule.

        The declaration value is matched with ``[^;}]*`` so that a final
        declaration without a trailing semicolon stops at the rule's closing
        brace instead of swallowing the rules that follow it.
        """
        pattern = r'(' + selector + r'\s*\{[^}]*?)' + prop + r'[^;}]*;?([^}]*\})'
        return re.sub(pattern, r'\1\2', html_content, flags=re.IGNORECASE)

    @staticmethod
    def _analyze_page_structure(html_content: str) -> dict:
        """Collect page-count, spacing and layout hints from the raw markup.

        Returns:
            A dict with ``page_count``, ``page_elements`` (the matched ``class``
            attributes), ``spacing_issues`` (see ``_detect_spacing_issues``) and
            the substring flags ``has_flexbox``, ``has_grid``, ``has_padding``,
            ``has_margin`` and ``has_gap``.
        """
        # Checked in priority order; the first pattern with any match wins.
        page_selectors = [
            r'class="[^"]*brochure-page[^"]*"',
            r'class="[^"]*page[^"]*"',
            r'class="[^"]*pdf-page[^"]*"',
            r'class="[^"]*slide[^"]*"',
            r'class="[^"]*section[^"]*"',
        ]

        page_count = 0
        page_elements = []
        for selector in page_selectors:
            matches = re.findall(selector, html_content, re.IGNORECASE)
            if matches:
                page_count = len(matches)
                page_elements = matches
                break

        # No page-like class found: any A4-sized dimension in the CSS counts as
        # a single page.
        if page_count == 0:
            a4_patterns = [
                r'width:\s*210mm',
                r'height:\s*297mm',
                r'width:\s*794px',
                r'height:\s*1123px',
                r'width:\s*8\.27in',
                r'height:\s*11\.7in',
            ]
            if any(re.search(p, html_content, re.IGNORECASE) for p in a4_patterns):
                page_count = 1

        spacing_issues = HTMLPreprocessor._detect_spacing_issues(html_content)

        return {
            'page_count': page_count,
            'page_elements': page_elements,
            'spacing_issues': spacing_issues,
            'has_flexbox': 'display: flex' in html_content,
            'has_grid': 'display: grid' in html_content,
            'has_padding': 'padding:' in html_content,
            'has_margin': 'margin:' in html_content,
            'has_gap': 'gap:' in html_content,
        }

    @staticmethod
    def _detect_spacing_issues(html_content: str) -> dict:
        """Flag spacing declarations on ``body``, ``html`` and wrapper rules.

        Returns:
            A dict of booleans keyed by ``body_padding``, ``body_margin``,
            ``body_gap``, ``document_level_spacing`` and ``container_spacing``.
        """
        mentions = HTMLPreprocessor._rule_mentions
        body = HTMLPreprocessor._SELECTOR_GUARD + 'body'
        html = HTMLPreprocessor._SELECTOR_GUARD + 'html'

        return {
            'body_padding': mentions(html_content, body, 'padding'),
            'body_margin': mentions(html_content, body, 'margin'),
            'body_gap': mentions(html_content, body, 'gap'),
            'document_level_spacing': (
                mentions(html_content, html, 'padding')
                or mentions(html_content, html, 'margin')
            ),
            'container_spacing': (
                mentions(html_content, r'\.container', 'padding')
                or mentions(html_content, r'\.wrapper', 'padding')
            ),
        }

    @staticmethod
    def _remove_spacing_issues(html_content: str, page_info: dict) -> str:
        """Strip document-level spacing and inject the continuous-flow CSS.

        Only ``body`` and ``html`` rules are edited; container and internal
        page spacing is left untouched.

        Args:
            html_content: HTML text to rewrite.
            page_info: Result of ``_analyze_page_structure``; only its
                ``spacing_issues`` entry is read.
        """
        strip = HTMLPreprocessor._strip_declaration
        body = HTMLPreprocessor._SELECTOR_GUARD + 'body'
        html = HTMLPreprocessor._SELECTOR_GUARD + 'html'
        issues = page_info['spacing_issues']

        if issues['body_padding']:
            html_content = strip(html_content, body, 'padding')
        if issues['body_margin']:
            html_content = strip(html_content, body, 'margin')
        if issues['body_gap']:
            html_content = strip(html_content, body, 'gap')
        if issues['document_level_spacing']:
            html_content = strip(html_content, html, 'padding')
            html_content = strip(html_content, html, 'margin')

        # Override CSS appended to the existing style blocks.
        continuous_flow_css = '''
        /* Ensure continuous flow for PDF generation */
        body {
            padding: 0 !important;
            margin: 0 !important;
            gap: 0 !important;
        }

        /* Preserve all internal page spacing and margins */
        .page-layout, .p1-content-side, .p2-grid, .p3-main-content, .p4-info-grid {
            /* Keep all internal spacing intact */
        }

        /* Ensure no page breaks within content */
        .brochure-page, .page, .pdf-page, .slide, .section {
            page-break-after: auto;
            page-break-inside: avoid;
            break-inside: avoid;
        }

        /* Preserve internal margins and padding */
        * {
            page-break-inside: avoid;
            break-inside: avoid;
        }
        '''

        # Insert the CSS at the end of the existing styles.
        if '</style>' in html_content:
            html_content = html_content.replace('</style>', continuous_flow_css + '\n </style>')

        return html_content

    @staticmethod
    def _optimize_for_pdf(html_content: str, page_info: dict) -> str:
        """Inject print-media overrides and exact color rendering rules.

        Args:
            html_content: HTML text to rewrite.
            page_info: Accepted for signature symmetry with the other pipeline
                steps; not read by this method.
        """
        pdf_optimizations = '''
        /* PDF-specific optimizations - preserve internal spacing */
        @media print {
            /* Only remove document-level spacing, preserve internal spacing */
            body {
                padding: 0 !important;
                margin: 0 !important;
                gap: 0 !important;
            }

            /* Preserve all internal page spacing and margins */
            .page-layout {
                padding: 70px !important; /* Keep internal page padding */
            }

            .p1-content-side {
                padding: 70px 60px !important; /* Keep content padding */
            }

            /* Ensure no page breaks within content */
            .brochure-page, .page, .pdf-page {
                page-break-after: auto !important;
                page-break-inside: avoid !important;
            }
        }

        /* Ensure exact color rendering */
        * {
            -webkit-print-color-adjust: exact !important;
            color-adjust: exact !important;
        }
        '''

        # Insert PDF optimizations at the end of the existing styles.
        if '</style>' in html_content:
            html_content = html_content.replace('</style>', pdf_optimizations + '\n </style>')

        return html_content
|
|
|
|
class PageDetector:
    """Detects page-like elements and their dimensions in a loaded document."""

    # JavaScript evaluated inside the page. It picks candidate page elements,
    # measures each with several DOM APIs, scores the measurements against the
    # A4 aspect ratio, and returns a JSON-serializable summary.
    _DETECTION_SCRIPT = """
    () => {
        // Strategy 1: direct page element detection (first selector with hits wins)
        const pageSelectors = [
            '.brochure-page',
            '.brochure',
            '.page',
            '[class*="page"]',
            '.pdf-page',
            '.slide',
            '.section'
        ];

        let pageElements = [];
        let detectedSelector = '';

        for (const selector of pageSelectors) {
            const elements = document.querySelectorAll(selector);
            if (elements.length > 0) {
                pageElements = Array.from(elements);
                detectedSelector = selector;
                break;
            }
        }

        // Strategy 2: A4-sized container detection
        if (pageElements.length === 0) {
            const allElements = document.querySelectorAll('*');
            const a4Elements = Array.from(allElements).filter(el => {
                const style = window.getComputedStyle(el);
                const width = parseFloat(style.width);
                const height = parseFloat(style.height);

                // Ranges for A4 expressed in px, mm and in. NOTE(review): the
                // values compared here are parsed from computed style without a
                // unit, so the mm/in ranges are matched against the same number.
                const isA4Width = (width >= 794 && width <= 800) ||
                                  (width >= 210 && width <= 220) ||
                                  (width >= 8.27 && width <= 8.5);
                const isA4Height = (height >= 1123 && height <= 1130) ||
                                   (height >= 297 && height <= 300) ||
                                   (height >= 11.69 && height <= 12);

                return isA4Width && isA4Height;
            });

            if (a4Elements.length > 0) {
                pageElements = a4Elements;
                detectedSelector = 'A4-sized-element';
            }
        }

        // Strategy 3: body as single page
        if (pageElements.length === 0) {
            pageElements = [document.body];
            detectedSelector = 'body';
        }

        // Score a measurement: 0 for out-of-range sizes, otherwise closeness
        // to the A4 aspect ratio multiplied by a size factor.
        function calculateDimensionScore(width, height) {
            if (width <= 0 || height <= 0) return 0;
            if (width > 2000 || height > 2000) return 0; // Too large
            if (width < 50 || height < 50) return 0; // Too small

            const aspectRatio = width / height;
            const a4Ratio = 210 / 297; // 0.707
            const ratioScore = 1 - Math.abs(aspectRatio - a4Ratio) / a4Ratio;
            const sizeScore = Math.min(width / 800, height / 1200);

            return ratioScore * sizeScore;
        }

        // Dimension analysis with multiple measurement methods
        const dimensionResults = [];

        pageElements.forEach((element, index) => {
            const measurements = {};

            // Method 1: CSS computed style
            const computedStyle = window.getComputedStyle(element);
            const cssWidth = parseFloat(computedStyle.width);
            const cssHeight = parseFloat(computedStyle.height);
            if (cssWidth > 0 && cssHeight > 0) {
                measurements.css = { width: cssWidth, height: cssHeight };
            }

            // Method 2: bounding client rect
            const rect = element.getBoundingClientRect();
            if (rect.width > 0 && rect.height > 0) {
                measurements.bounding = { width: rect.width, height: rect.height };
            }

            // Method 3: offset dimensions
            if (element.offsetWidth > 0 && element.offsetHeight > 0) {
                measurements.offset = { width: element.offsetWidth, height: element.offsetHeight };
            }

            // Method 4: scroll dimensions
            if (element.scrollWidth > 0 && element.scrollHeight > 0) {
                measurements.scroll = { width: element.scrollWidth, height: element.scrollHeight };
            }

            // Method 5: client dimensions
            if (element.clientWidth > 0 && element.clientHeight > 0) {
                measurements.client = { width: element.clientWidth, height: element.clientHeight };
            }

            // Keep the measurement with the highest strictly-positive score
            let bestMeasurement = null;
            let bestScore = 0;
            Object.entries(measurements).forEach(([method, dims]) => {
                const score = calculateDimensionScore(dims.width, dims.height);
                if (score > bestScore) {
                    bestScore = score;
                    bestMeasurement = { method, ...dims };
                }
            });

            if (bestMeasurement) {
                // Read the class attribute instead of element.className: on SVG
                // elements className is an SVGAnimatedString without split().
                const classAttr = element.getAttribute('class') || '';
                const firstClass = classAttr.split(' ').filter(Boolean)[0];
                dimensionResults.push({
                    index,
                    element: element.tagName + (firstClass ? '.' + firstClass : ''),
                    ...bestMeasurement
                });
            }
        });

        // Aggregate the per-element results
        let maxWidth = 0;
        let maxHeight = 0;
        let totalWidth = 0;
        let totalHeight = 0;
        let validCount = 0;

        dimensionResults.forEach(result => {
            if (result.width > 0 && result.height > 0) {
                maxWidth = Math.max(maxWidth, result.width);
                maxHeight = Math.max(maxHeight, result.height);
                totalWidth += result.width;
                totalHeight += result.height;
                validCount++;
            }
        });

        // Fallback to standard A4 (px at 96 dpi) if nothing scored
        if (validCount === 0) {
            maxWidth = 794;
            maxHeight = 1123;
            console.warn('No valid dimensions detected, using standard A4');
        }

        // Output is always A4; the aspect ratio is reported for diagnostics only.
        const format = 'a4';
        const aspectRatio = maxWidth / maxHeight;

        return {
            pageCount: pageElements.length,
            format: format,
            maxWidth: Math.round(maxWidth),
            maxHeight: Math.round(maxHeight),
            totalWidth: Math.round(totalWidth),
            totalHeight: Math.round(totalHeight),
            aspectRatio: aspectRatio,
            detectedSelector: detectedSelector,
            validDimensions: validCount,
            averageWidth: validCount > 0 ? Math.round(totalWidth / validCount) : 0,
            averageHeight: validCount > 0 ? Math.round(totalHeight / validCount) : 0,
            dimensionResults: dimensionResults,
            hasReasonableDimensions: maxWidth >= 200 && maxHeight >= 200,
            measurementMethods: dimensionResults.map(r => r.method)
        };
    }
    """

    @staticmethod
    async def detect_pages_and_format(page: Page) -> dict:
        """Evaluate the detection script in ``page`` and return its summary.

        Detection falls back from known page selectors, to A4-sized elements,
        to ``document.body``.

        Args:
            page: Page object whose ``evaluate`` coroutine runs the script.

        Returns:
            The dict produced by the script, with keys ``pageCount``,
            ``format`` (always ``'a4'``), ``maxWidth``, ``maxHeight``,
            ``totalWidth``, ``totalHeight``, ``aspectRatio``,
            ``detectedSelector``, ``validDimensions``, ``averageWidth``,
            ``averageHeight``, ``dimensionResults``,
            ``hasReasonableDimensions`` and ``measurementMethods``.
        """
        return await page.evaluate(PageDetector._DETECTION_SCRIPT)
|
|
|
|
# ISO A4 paper size in inches (210mm x 297mm) and the CSS reference resolution.
_A4_WIDTH_IN = 8.27
_A4_HEIGHT_IN = 11.69
_CSS_PX_PER_INCH = 96
# Lower bound for the PDF scale so text stays readable.
_MIN_PDF_SCALE = 0.3


def _plan_a4_layout(page_info: dict) -> dict:
    """Derive orientation, margins and scale for an A4 PDF from ``page_info``.

    Reads ``maxWidth``/``maxHeight`` (CSS px) and the optional ``hasTables`` /
    ``hasImages`` flags. Margins are chosen first so that the scale is computed
    against the printable area that is actually used.

    Returns:
        A dict with ``landscape``, ``pdf_width``, ``pdf_height``, ``margins``,
        ``scale``, ``content_width_in`` and ``content_height_in``.
    """
    content_width_in = page_info['maxWidth'] / _CSS_PX_PER_INCH
    content_height_in = page_info['maxHeight'] / _CSS_PX_PER_INCH
    has_tables = page_info.get('hasTables', False)

    # Landscape when content is 20% wider than tall (10% when tables are present).
    landscape = content_width_in > content_height_in * 1.2 or (
        has_tables and content_width_in > content_height_in * 1.1
    )

    if landscape:
        pdf_width, pdf_height = _A4_HEIGHT_IN, _A4_WIDTH_IN
    else:
        pdf_width, pdf_height = _A4_WIDTH_IN, _A4_HEIGHT_IN

    # Margin by content type, then overridden by content size.
    if has_tables:
        margin_in = 0.75
    elif page_info.get('hasImages', False):
        margin_in = 0.6
    else:
        margin_in = 0.5
    if content_width_in < 6.0 and content_height_in < 8.0:
        margin_in = 0.4  # small content: maximize usable space
    if content_width_in > 10.0 or content_height_in > 12.0:
        margin_in = 0.8  # large content: more breathing room

    available_width = pdf_width - 2 * margin_in
    available_height = pdf_height - 2 * margin_in
    width_scale = available_width / content_width_in if content_width_in > 0 else 1.0
    height_scale = available_height / content_height_in if content_height_in > 0 else 1.0

    # Fit both dimensions, never enlarge, never go below the readability floor.
    scale = max(min(width_scale, height_scale, 1.0), _MIN_PDF_SCALE)

    margin = f"{margin_in}in"
    return {
        'landscape': landscape,
        'pdf_width': pdf_width,
        'pdf_height': pdf_height,
        'margins': {'top': margin, 'right': margin, 'bottom': margin, 'left': margin},
        'scale': scale,
        'content_width_in': content_width_in,
        'content_height_in': content_height_in,
    }


async def generate_single_pdf(input_content: str, output_pdf: str, is_url: bool = False, is_file: bool = False, is_html_string: bool = False):
    """Render one input to an A4 PDF written at ``output_pdf``.

    Exactly one of the three flags selects how ``input_content`` is read; they
    are checked in the order ``is_html_string``, ``is_url``, ``is_file``.

    Args:
        input_content: HTML text, an http(s) URL, or a path to an HTML file.
        output_pdf: Destination path of the generated PDF.
        is_url: Treat ``input_content`` as a URL to navigate to.
        is_file: Treat ``input_content`` as a local file path.
        is_html_string: Treat ``input_content`` as HTML; it is preprocessed and
            written to a temporary file that is removed afterwards.

    Raises:
        ValueError: If none of the flags is set.
        Exception: ``"Timeout: Page took too long to load."`` when Playwright
            times out; any other error is re-raised after being printed.
    """
    from pathlib import Path  # local import: only needed for file:// URIs

    temp_file = None
    try:
        async with async_playwright() as p:
            # NOTE(review): --no-sandbox and --disable-web-security weaken
            # browser isolation while rendering caller-supplied HTML/URLs;
            # confirm this is acceptable for the deployment.
            browser = await p.chromium.launch(
                headless=True,
                args=[
                    '--disable-dev-shm-usage',
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-gpu',
                    '--disable-web-security',
                    '--disable-features=VizDisplayCompositor',
                    '--enable-font-antialiasing',
                    '--font-render-hinting=none',
                    '--disable-background-timer-throttling',
                    '--disable-backgrounding-occluded-windows',
                    '--disable-renderer-backgrounding',
                    '--allow-running-insecure-content',
                    '--disable-extensions',
                    '--disable-plugins',
                    '--disable-images=false',
                    '--enable-javascript',
                    '--enable-css',
                    '--enable-fonts',
                ],
            )

            try:
                page = await browser.new_page()
                await page.set_viewport_size({'width': 1920, 'height': 1080})
                page.set_default_timeout(120000)

                if is_html_string:
                    processed_html = HTMLPreprocessor.preprocess_html(input_content)

                    # Persist the processed HTML so it can be loaded via file://.
                    with tempfile.NamedTemporaryFile(delete=False, suffix='.html', dir=TEMP_FOLDER) as tmp:
                        tmp.write(processed_html.encode('utf-8'))
                        temp_file = tmp.name
                    # as_uri() percent-encodes the path and is valid on Windows too.
                    await page.goto(Path(temp_file).resolve().as_uri(), wait_until="load")
                elif is_url:
                    await page.goto(input_content, wait_until="domcontentloaded")
                elif is_file:
                    await page.goto(Path(input_content).resolve().as_uri(), wait_until="load")
                else:
                    raise ValueError("Invalid input type")

                # Give the content a moment to load and stabilize.
                await page.wait_for_timeout(3000)

                # Best effort: continue if the network never becomes idle.
                try:
                    await page.wait_for_load_state('networkidle', timeout=10000)
                except Exception:
                    pass

                print("🔄 Loading external resources...")
                await _ensure_resources_loaded(page)

                print("🔍 Analyzing page structure...")
                page_info = await PageDetector.detect_pages_and_format(page)

                print(f"📄 Detected {page_info['pageCount']} pages")
                print(f"📐 Max dimensions: {page_info['maxWidth']}x{page_info['maxHeight']}px")
                print(f"🎯 Recommended format: {page_info['format']}")
                print(f"🔍 Detected selector: {page_info['detectedSelector']}")
                print(f"✅ Valid dimensions: {page_info['validDimensions']}")
                print(f"📏 Average dimensions: {page_info['averageWidth']}x{page_info['averageHeight']}px")
                print(f"📊 Aspect ratio: {page_info['aspectRatio']:.3f}")
                print(f"🔧 Measurement methods: {', '.join(page_info['measurementMethods'])}")

                layout = _plan_a4_layout(page_info)

                # 'format' takes priority over explicit width/height, so only the
                # format is passed; orientation comes from 'landscape'.
                pdf_options = {
                    'path': output_pdf,
                    'print_background': True,
                    'margin': layout['margins'],
                    'scale': layout['scale'],
                    'landscape': layout['landscape'],
                    'prefer_css_page_size': False,  # ignore CSS @page size to ensure A4
                    'format': 'A4',
                }

                await page.pdf(**pdf_options)
            finally:
                # Close the browser on both success and failure paths.
                await browser.close()

        print(f"✅ PDF generated: {output_pdf}")
        print(f"📏 A4 Size: {layout['pdf_width']}in x {layout['pdf_height']}in ({'Landscape' if layout['landscape'] else 'Portrait'})")
        print(f"📐 Content: {layout['content_width_in']:.2f}in x {layout['content_height_in']:.2f}in")
        print(f"🔍 Scale: {layout['scale']:.2f} (optimized for A4 fit)")
        print("📄 Format: A4 Standard")

    except TimeoutError as exc:
        raise Exception("Timeout: Page took too long to load.") from exc
    except Exception as e:
        print(f"❌ PDF generation error: {str(e)}")
        raise
    finally:
        if temp_file and os.path.exists(temp_file):
            os.remove(temp_file)
|
|
|
|
async def _ensure_resources_loaded(page: Page, resource_timeout_ms: int = 15000):
    """Wait for fonts, ``<img>`` elements and CSS background images to settle.

    Every in-page wait is capped by ``resource_timeout_ms`` so that a resource
    that never fires ``load``/``error`` (for example an off-screen
    ``loading="lazy"`` image) cannot stall PDF generation indefinitely.

    Args:
        page: Page object whose ``evaluate`` / ``wait_for_timeout`` coroutines
            are used.
        resource_timeout_ms: Upper bound, in milliseconds, for each individual
            font/image wait inside the page.
    """
    # Fonts: resolve when document.fonts.ready settles or the cap elapses.
    await page.evaluate(
        """
        (timeoutMs) => Promise.race([
            document.fonts.ready.then(() => true),
            new Promise(resolve => setTimeout(() => resolve(false), timeoutMs))
        ])
        """,
        resource_timeout_ms,
    )

    # <img> elements that have not finished loading yet.
    await page.evaluate(
        """
        (timeoutMs) => {
            const pending = Array.from(document.images).filter(img => !img.complete);
            return Promise.all(pending.map(img => new Promise(resolve => {
                const timer = setTimeout(resolve, timeoutMs);
                const done = () => { clearTimeout(timer); resolve(); };
                // addEventListener leaves any handlers set by the page intact.
                img.addEventListener('load', done, { once: true });
                img.addEventListener('error', done, { once: true });
                // Ask lazy images to start loading instead of waiting for scroll.
                if (img.loading === 'lazy') img.loading = 'eager';
            }))).then(() => pending.length);
        }
        """,
        resource_timeout_ms,
    )

    # CSS background images: probe every url(...) found in the computed style.
    await page.evaluate(
        r"""
        (timeoutMs) => {
            const candidates = document.querySelectorAll('[style*="background-image"], [class*="image"]');
            const sources = new Set();
            for (const el of candidates) {
                const bgImage = window.getComputedStyle(el).backgroundImage;
                if (!bgImage || bgImage === 'none') continue;
                // A value may hold several layers and gradients; keep only URLs.
                for (const match of bgImage.matchAll(/url\((['"]?)(.*?)\1\)/g)) {
                    sources.add(match[2]);
                }
            }
            return Promise.all(Array.from(sources).map(src => new Promise(resolve => {
                const probe = new Image();
                const timer = setTimeout(resolve, timeoutMs);
                const done = () => { clearTimeout(timer); resolve(); };
                probe.onload = probe.onerror = done;
                probe.src = src;
            }))).then(() => sources.size);
        }
        """,
        resource_timeout_ms,
    )

    # Extra settle time for styles to be applied.
    await page.wait_for_timeout(2000)
|
|
|
|
def process_input(input_content: str, output_name: str = None):
    """Classify the input as URL or HTML text and generate a PDF for it.

    Input that (ignoring surrounding whitespace) starts with ``http://`` or
    ``https://`` is rendered as a URL; everything else is treated as an HTML
    string.

    Args:
        input_content: URL or HTML text.
        output_name: Optional file name for the PDF. It is sanitized with
            ``secure_filename``; when it is missing or sanitizes to an empty
            string, a random ``single_output_<hex>.pdf`` name is used.

    Returns:
        A ``(pdf_path, mime_type)`` tuple; ``mime_type`` is always
        ``'application/pdf'``.
    """
    candidate_url = input_content.strip()
    is_url = candidate_url.startswith(('http://', 'https://'))

    # secure_filename() can return '' (e.g. for '../' or non-ASCII names),
    # which would otherwise make the output path point at the folder itself.
    safe_name = secure_filename(output_name) if output_name else ''
    if not safe_name:
        safe_name = f'single_output_{uuid.uuid4().hex[:8]}.pdf'

    pdf_path = os.path.join(TEMP_FOLDER, safe_name)

    if is_url:
        asyncio.run(generate_single_pdf(candidate_url, pdf_path, is_url=True))
    else:
        asyncio.run(generate_single_pdf(input_content, pdf_path, is_html_string=True))

    return pdf_path, 'application/pdf'
|
|
|
|
@app.route('/')
def root():
    """Describe the service: version, status, endpoints, features and usage."""
    endpoints = {
        "generate-pdf": "/generate-pdf",
        "health": "/health",
    }
    features = [
        "HTML string to PDF",
        "URL to PDF",
        "HTML file to PDF",
        "Batch HTML files to ZIP",
        "Standard A4 format",
        "Consistent page sizing",
    ]
    usage = {
        "method": "POST",
        "endpoint": "/generate-pdf",
        "body": {
            "input": "HTML string, URL, or file path",
            "output": "Optional output filename",
        },
    }

    payload = {
        "message": "PDF Generator API",
        "version": "2.0.0",
        "status": "running",
        "timestamp": datetime.now().isoformat(),
        "endpoints": endpoints,
        "features": features,
        "usage": usage,
    }
    return jsonify(payload)
|
|
|
|
@app.route('/health')
def health_check():
    """Report liveness plus whether the temp folder currently exists."""
    status_report = dict(
        status="healthy",
        timestamp=datetime.now().isoformat(),
        service="Advanced HTML to PDF Generator",
        temp_folder=TEMP_FOLDER,
        temp_folder_exists=os.path.exists(TEMP_FOLDER),
        uptime="running",
    )
    return jsonify(status_report)
|
|
|
|
@app.route('/generate-pdf', methods=['POST'])
def generate_pdf_api():
    """Generate a PDF from the posted input and return it as a download.

    Accepts either a JSON body (``{"input": ..., "output": ...}``) or form
    data with ``input`` / ``output`` fields, where ``input`` may also be an
    uploaded UTF-8 HTML file.

    Returns:
        The PDF as an attachment on success; otherwise a JSON ``{"error": ...}``
        body with status 400 (bad input) or 500 (generation failure). The
        generated file is deleted once the response is closed.
    """
    try:
        input_content = None
        output_name = None

        if request.is_json:
            try:
                data = request.get_json()
            except Exception as json_error:
                print(f"❌ JSON parsing error: {json_error}")
                return jsonify({'error': f'Invalid JSON format: {str(json_error)}'}), 400
            # Only a JSON object can carry the expected keys.
            if isinstance(data, dict) and 'input' in data:
                input_content = data['input']
                output_name = data.get('output', None)
        else:
            input_content = request.form.get('input')
            output_name = request.form.get('output')

            # An uploaded file replaces any 'input' form field.
            upload = request.files.get('input')
            if upload and upload.filename:
                try:
                    input_content = upload.read().decode('utf-8')
                except UnicodeDecodeError:
                    return jsonify({'error': 'File encoding error. Please ensure the file is UTF-8 encoded.'}), 400
                if not output_name:
                    # Swap only the final extension; str.replace would rewrite
                    # every '.html' occurrence in the name.
                    stem, ext = os.path.splitext(upload.filename)
                    output_name = stem + '.pdf' if ext.lower() in ('.html', '.htm') else upload.filename

        # Reject missing, non-text or blank input as a client error.
        if not isinstance(input_content, str) or input_content.strip() == '':
            return jsonify({'error': 'Input cannot be empty. Please provide HTML content.'}), 400
        if output_name is not None and not isinstance(output_name, str):
            return jsonify({'error': 'Output filename must be a string.'}), 400

        # Remove control characters (tab, LF and CR are kept).
        input_content = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', input_content)

        file_path, mime_type = process_input(input_content, output_name)
        # send_file resolves relative paths against the application root, not
        # the working directory used to create the file, so make it absolute.
        file_path = os.path.abspath(file_path)

        if not os.path.exists(file_path):
            return jsonify({'error': 'Failed to generate output file'}), 500

        response = send_file(
            file_path,
            as_attachment=True,
            download_name=os.path.basename(file_path),
            mimetype=mime_type
        )

        # Delete the generated file after the response has been sent.
        @response.call_on_close
        def cleanup():
            try:
                if os.path.exists(file_path):
                    os.remove(file_path)
                    print(f"🧹 Cleaned up: {file_path}")
            except Exception as e:
                print(f"❌ Cleanup error: {e}")

        return response

    except Exception as e:
        print(f"❌ API Error: {str(e)}")
        return jsonify({'error': str(e)}), 500
|
|
|
|
@app.after_request
def cleanup_temp_files(response):
    """Delete files in ``TEMP_FOLDER`` older than one hour after each request.

    Failures are reported with ``print`` and never affect the response. Each
    file is handled independently so one failure does not stop the sweep.

    Args:
        response: The outgoing response, returned unchanged.
    """
    import time

    current_time = time.time()
    try:
        filenames = os.listdir(TEMP_FOLDER)
    except OSError as e:
        print(f"❌ Auto-cleanup error: {e}")
        return response

    for filename in filenames:
        filepath = os.path.join(TEMP_FOLDER, filename)
        try:
            if os.path.isfile(filepath) and current_time - os.path.getmtime(filepath) > 3600:  # 1 hour
                os.remove(filepath)
                print(f"🧹 Auto-cleanup: {filename}")
        except FileNotFoundError:
            # Another request removed the file between listing and deletion.
            continue
        except OSError as e:
            print(f"❌ Auto-cleanup error: {e}")
    return response
|
|
|
|
if __name__ == '__main__':
    # Startup banner.
    print("🚀 Starting Advanced HTML to PDF Generator API...")
    print("📝 Endpoints available:")
    print(" GET / - API information")
    print(" GET /health - Health check")
    print(" POST /generate-pdf - Generate PDF from HTML/URL/file")
    print("")
    print("✨ Features:")
    print(" • HTML string to PDF")
    print(" • URL to PDF")
    print(" • HTML file to PDF")
    print(" • Batch HTML files to ZIP")
    print(" • Standard A4 format")
    print(" • Consistent page sizing")

    # The interactive debugger allows arbitrary code execution, so it must not
    # be on by default while listening on all interfaces. Opt in explicitly
    # with FLASK_DEBUG=1 for local development.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
    app.run(host='0.0.0.0', port=8000, debug=debug_enabled)