PDF_Generation_and_Automation/python-pdf-generator/app.py
2025-08-30 17:07:35 +05:30

861 lines
34 KiB
Python

#!/usr/bin/env python3
"""
Advanced HTML to PDF Generator API with Intelligent Content Analysis
Supports URLs, HTML files, HTML strings, and batch processing
Always uses A4 size for consistent output
"""
from flask import Flask, request, send_file, jsonify
import os
import asyncio
import tempfile
import zipfile
from playwright.async_api import async_playwright, TimeoutError, Page
from werkzeug.utils import secure_filename
import uuid
from datetime import datetime
import re
import logging
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = Flask(__name__)

# Working directory for intermediate HTML files and generated PDFs.
TEMP_FOLDER = 'temp'
# exist_ok=True avoids the check-then-create race when several processes
# start at the same time and both see the folder as missing.
os.makedirs(TEMP_FOLDER, exist_ok=True)
class HTMLPreprocessor:
    """Regex-based HTML preprocessor that prepares markup for PDF rendering.

    The pipeline (see ``preprocess_html``) inspects the raw HTML text, strips
    padding/margin/gap declarations from ``body`` and ``html`` CSS rules, and
    injects two stylesheets that force zero document-level spacing and tune
    page-break / colour behaviour. All work is done on the text with regular
    expressions; the HTML is never parsed into a DOM.
    """

    # Lookbehind that stops `tbody {`, `.body {` or `#html {` from being
    # mistaken for the root `body` / `html` selectors.
    _ROOT_GUARD = r'(?<![\w.#-])'

    @staticmethod
    def preprocess_html(html_content: str) -> str:
        """Analyze *html_content*, remove document-level spacing and add PDF CSS.

        Returns the rewritten HTML string. Progress is reported on stdout.
        """
        print("🔧 Preprocessing HTML for optimal PDF generation...")
        # Step 1: Detect page elements and their structure
        page_info = HTMLPreprocessor._analyze_page_structure(html_content)
        # Step 2: Remove problematic spacing
        html_content = HTMLPreprocessor._remove_spacing_issues(html_content, page_info)
        # Step 3: Optimize for PDF generation
        html_content = HTMLPreprocessor._optimize_for_pdf(html_content, page_info)
        print(f"✅ HTML preprocessing completed - {page_info['page_count']} pages optimized")
        return html_content

    @staticmethod
    def _analyze_page_structure(html_content: str) -> dict:
        """Collect page-count and spacing facts about *html_content*.

        Returns a dict with ``page_count``, ``page_elements`` (the matched
        ``class="..."`` attribute strings), ``spacing_issues`` (see
        ``_detect_spacing_issues``) and the substring flags ``has_flexbox``,
        ``has_grid``, ``has_padding``, ``has_margin`` and ``has_gap``.
        """
        # Tried in priority order; the first pattern with any match wins.
        page_selectors = [
            r'class="[^"]*brochure-page[^"]*"',
            r'class="[^"]*page[^"]*"',
            r'class="[^"]*pdf-page[^"]*"',
            r'class="[^"]*slide[^"]*"',
            r'class="[^"]*section[^"]*"'
        ]
        page_count = 0
        page_elements = []
        for selector in page_selectors:
            matches = re.findall(selector, html_content, re.IGNORECASE)
            if matches:
                page_count = len(matches)
                page_elements = matches
                break

        # If no page-like class was found, treat any A4-like CSS dimension as
        # evidence of a single page.
        if page_count == 0:
            a4_patterns = [
                r'width:\s*210mm',
                r'height:\s*297mm',
                r'width:\s*794px',
                r'height:\s*1123px',
                r'width:\s*8\.27in',
                r'height:\s*11\.7in'
            ]
            if any(re.search(pattern, html_content, re.IGNORECASE) for pattern in a4_patterns):
                page_count = 1

        spacing_issues = HTMLPreprocessor._detect_spacing_issues(html_content)
        return {
            'page_count': page_count,
            'page_elements': page_elements,
            'spacing_issues': spacing_issues,
            'has_flexbox': 'display: flex' in html_content,
            'has_grid': 'display: grid' in html_content,
            'has_padding': 'padding:' in html_content,
            'has_margin': 'margin:' in html_content,
            'has_gap': 'gap:' in html_content
        }

    @staticmethod
    def _rule_declares(html_content: str, selector: str, prop: str) -> bool:
        """Return True if a ``selector { ... }`` rule mentions *prop*.

        *selector* is a regex fragment; *prop* is matched as a plain substring
        of the rule body.
        """
        pattern = selector + r'\s*\{[^}]*' + prop + r'[^}]*\}'
        return re.search(pattern, html_content, re.IGNORECASE) is not None

    @staticmethod
    def _detect_spacing_issues(html_content: str) -> dict:
        """Flag which document-level rules carry padding, margin or gap.

        Returns a dict of booleans keyed by ``body_padding``, ``body_margin``,
        ``body_gap``, ``document_level_spacing`` (``html`` padding or margin)
        and ``container_spacing`` (``.container`` / ``.wrapper`` padding).
        """
        declares = HTMLPreprocessor._rule_declares
        body = HTMLPreprocessor._ROOT_GUARD + 'body'
        html = HTMLPreprocessor._ROOT_GUARD + 'html'
        return {
            'body_padding': declares(html_content, body, 'padding'),
            'body_margin': declares(html_content, body, 'margin'),
            'body_gap': declares(html_content, body, 'gap'),
            'document_level_spacing': (
                declares(html_content, html, 'padding')
                or declares(html_content, html, 'margin')
            ),
            'container_spacing': (
                declares(html_content, r'\.container', 'padding')
                or declares(html_content, r'\.wrapper', 'padding')
            ),
        }

    @staticmethod
    def _strip_declaration(html_content: str, selector: str, prop: str) -> str:
        """Drop the first *prop* declaration from each ``selector { ... }`` rule.

        The value match stops at ``;`` *or* ``}``: stopping only at ``;`` lets a
        final declaration without a trailing semicolon swallow the closing
        brace and the rule that follows it. The ``(?<![\\w-])`` guard keeps
        ``gap`` from matching inside ``row-gap`` (which would leave a dangling
        ``row-`` that corrupts the next declaration).
        """
        pattern = (
            r'(' + selector + r'\s*\{[^}]*?)'
            + r'(?<![\w-])' + prop + r'[^;}]*;?'
            + r'([^}]*\})'
        )
        return re.sub(pattern, r'\1\2', html_content, flags=re.IGNORECASE)

    @staticmethod
    def _inject_css(html_content: str, css: str) -> str:
        """Append *css* to every ``<style>`` block, creating one if none exists.

        When the document has no closing style tag, a new ``<style>`` element
        is placed just before ``</head>``; failing that it is appended to the
        end of the document (prepending could push the doctype off the top).
        """
        closing_style = re.compile(r'</style\s*>', re.IGNORECASE)
        if closing_style.search(html_content):
            # A callable replacement keeps backslashes in the CSS literal.
            return closing_style.sub(lambda m: css + '\n ' + m.group(0), html_content)

        style_block = '<style>' + css + '\n </style>'
        head_close = re.search(r'</head\s*>', html_content, re.IGNORECASE)
        if head_close:
            cut = head_close.start()
            return html_content[:cut] + style_block + html_content[cut:]
        return html_content + style_block

    @staticmethod
    def _remove_spacing_issues(html_content: str, page_info: dict) -> str:
        """Strip flagged body/html spacing declarations and add continuous-flow CSS.

        *page_info* is the dict produced by ``_analyze_page_structure``; only
        its ``spacing_issues`` entry is read.
        """
        issues = page_info['spacing_issues']
        strip = HTMLPreprocessor._strip_declaration
        body = HTMLPreprocessor._ROOT_GUARD + 'body'
        html = HTMLPreprocessor._ROOT_GUARD + 'html'

        # Only document-level rules are touched; rules for inner elements are
        # left alone so internal page spacing survives.
        if issues['body_padding']:
            html_content = strip(html_content, body, 'padding')
        if issues['body_margin']:
            html_content = strip(html_content, body, 'margin')
        if issues['body_gap']:
            html_content = strip(html_content, body, 'gap')
        if issues['document_level_spacing']:
            html_content = strip(html_content, html, 'padding')
            html_content = strip(html_content, html, 'margin')

        # Add CSS to ensure continuous flow
        continuous_flow_css = '''
/* Ensure continuous flow for PDF generation */
body {
padding: 0 !important;
margin: 0 !important;
gap: 0 !important;
}
/* Preserve all internal page spacing and margins */
.page-layout, .p1-content-side, .p2-grid, .p3-main-content, .p4-info-grid {
/* Keep all internal spacing intact */
}
/* Ensure no page breaks within content */
.brochure-page, .page, .pdf-page, .slide, .section {
page-break-after: auto;
page-break-inside: avoid;
break-inside: avoid;
}
/* Preserve internal margins and padding */
* {
page-break-inside: avoid;
break-inside: avoid;
}
'''
        return HTMLPreprocessor._inject_css(html_content, continuous_flow_css)

    @staticmethod
    def _optimize_for_pdf(html_content: str, page_info: dict) -> str:
        """Inject print-media and colour-fidelity CSS into *html_content*.

        *page_info* is accepted for signature symmetry with the other pipeline
        steps; it is not read here.
        """
        pdf_optimizations = '''
/* PDF-specific optimizations - preserve internal spacing */
@media print {
/* Only remove document-level spacing, preserve internal spacing */
body {
padding: 0 !important;
margin: 0 !important;
gap: 0 !important;
}
/* Preserve all internal page spacing and margins */
.page-layout {
padding: 70px !important; /* Keep internal page padding */
}
.p1-content-side {
padding: 70px 60px !important; /* Keep content padding */
}
/* Ensure no page breaks within content */
.brochure-page, .page, .pdf-page {
page-break-after: auto !important;
page-break-inside: avoid !important;
}
}
/* Ensure exact color rendering */
* {
-webkit-print-color-adjust: exact !important;
color-adjust: exact !important;
}
'''
        return HTMLPreprocessor._inject_css(html_content, pdf_optimizations)
class PageDetector:
    """Detects page elements and their dimensions in a loaded HTML document."""

    @staticmethod
    async def detect_pages_and_format(page: Page) -> dict:
        """Run an in-browser script that locates "page" elements and measures them.

        Detection falls through three strategies: known page-like selectors
        (first selector with any match wins), then any element whose computed
        size is approximately A4 at 96 CSS px per inch, then ``document.body``.
        Each element is measured five ways and the measurement with the best
        A4-likeness score is kept.

        Args:
            page: Playwright page with the target document already loaded.

        Returns:
            The dict produced by the script, with keys ``pageCount``,
            ``format`` (always ``'a4'``), ``maxWidth``, ``maxHeight``,
            ``totalWidth``, ``totalHeight``, ``aspectRatio``,
            ``detectedSelector``, ``validDimensions``, ``averageWidth``,
            ``averageHeight``, ``dimensionResults``,
            ``hasReasonableDimensions`` and ``measurementMethods``.
        """
        return await page.evaluate("""
            () => {
                // Score a measurement: 0 for implausible sizes, otherwise
                // higher for A4-like aspect ratios and larger boxes.
                function calculateDimensionScore(width, height) {
                    if (width <= 0 || height <= 0) return 0;
                    if (width > 2000 || height > 2000) return 0; // Too large
                    if (width < 50 || height < 50) return 0; // Too small
                    // Prefer A4-like dimensions
                    const aspectRatio = width / height;
                    const a4Ratio = 210 / 297; // 0.707
                    const ratioScore = 1 - Math.abs(aspectRatio - a4Ratio) / a4Ratio;
                    // Prefer reasonable sizes
                    const sizeScore = Math.min(width / 800, height / 1200);
                    return ratioScore * sizeScore;
                }

                // Strategy 1: Direct page element detection (priority order)
                const pageSelectors = [
                    '.brochure-page',
                    '.brochure',
                    '.page',
                    '[class*="page"]',
                    '.pdf-page',
                    '.slide',
                    '.section'
                ];
                let pageElements = [];
                let detectedSelector = '';
                for (const selector of pageSelectors) {
                    const elements = document.querySelectorAll(selector);
                    if (elements.length > 0) {
                        pageElements = Array.from(elements);
                        detectedSelector = selector;
                        break;
                    }
                }

                // Strategy 2: A4-sized container detection
                if (pageElements.length === 0) {
                    const a4Elements = Array.from(document.querySelectorAll('*')).filter(el => {
                        const style = window.getComputedStyle(el);
                        const width = parseFloat(style.width);
                        const height = parseFloat(style.height);
                        // Computed lengths are reported in CSS px, so a
                        // 210mm x 297mm box arrives as about 793.7 x 1122.5;
                        // compare in px only, with a small tolerance.
                        const isA4Width = width >= 790 && width <= 800;
                        const isA4Height = height >= 1118 && height <= 1130;
                        return isA4Width && isA4Height;
                    });
                    if (a4Elements.length > 0) {
                        pageElements = a4Elements;
                        detectedSelector = 'A4-sized-element';
                    }
                }

                // Strategy 3: Body as single page
                if (pageElements.length === 0) {
                    pageElements = [document.body];
                    detectedSelector = 'body';
                }

                // Measure every candidate with several DOM APIs and keep the
                // best-scoring measurement per element.
                const dimensionResults = [];
                pageElements.forEach((element, index) => {
                    const measurements = {};

                    // Method 1: CSS Computed Style
                    const computedStyle = window.getComputedStyle(element);
                    const cssWidth = parseFloat(computedStyle.width);
                    const cssHeight = parseFloat(computedStyle.height);
                    if (cssWidth > 0 && cssHeight > 0) {
                        measurements.css = { width: cssWidth, height: cssHeight };
                    }
                    // Method 2: Bounding Client Rect
                    const rect = element.getBoundingClientRect();
                    if (rect.width > 0 && rect.height > 0) {
                        measurements.bounding = { width: rect.width, height: rect.height };
                    }
                    // Method 3: Offset Dimensions
                    if (element.offsetWidth > 0 && element.offsetHeight > 0) {
                        measurements.offset = { width: element.offsetWidth, height: element.offsetHeight };
                    }
                    // Method 4: Scroll Dimensions
                    if (element.scrollWidth > 0 && element.scrollHeight > 0) {
                        measurements.scroll = { width: element.scrollWidth, height: element.scrollHeight };
                    }
                    // Method 5: Client Dimensions
                    if (element.clientWidth > 0 && element.clientHeight > 0) {
                        measurements.client = { width: element.clientWidth, height: element.clientHeight };
                    }

                    let bestMeasurement = null;
                    let bestScore = 0;
                    Object.entries(measurements).forEach(([method, dims]) => {
                        const score = calculateDimensionScore(dims.width, dims.height);
                        if (score > bestScore) {
                            bestScore = score;
                            bestMeasurement = { method, ...dims };
                        }
                    });

                    if (bestMeasurement) {
                        // Read the class attribute rather than className:
                        // on SVG elements className is not a string and has
                        // no split().
                        const firstClass = (element.getAttribute('class') || '').trim().split(' ')[0];
                        dimensionResults.push({
                            index,
                            element: element.tagName + (firstClass ? '.' + firstClass : ''),
                            ...bestMeasurement
                        });
                    }
                });

                // Aggregate the per-element results
                let maxWidth = 0;
                let maxHeight = 0;
                let totalWidth = 0;
                let totalHeight = 0;
                let validCount = 0;
                dimensionResults.forEach(result => {
                    if (result.width > 0 && result.height > 0) {
                        maxWidth = Math.max(maxWidth, result.width);
                        maxHeight = Math.max(maxHeight, result.height);
                        totalWidth += result.width;
                        totalHeight += result.height;
                        validCount++;
                    }
                });

                // Fallback to standard A4 if no valid dimensions
                if (validCount === 0) {
                    maxWidth = 794;
                    maxHeight = 1123;
                    console.warn('No valid dimensions detected, using standard A4');
                }

                // Output is always A4 regardless of the measured aspect ratio.
                const format = 'a4';
                const aspectRatio = maxWidth / maxHeight;

                return {
                    pageCount: pageElements.length,
                    format: format,
                    maxWidth: Math.round(maxWidth),
                    maxHeight: Math.round(maxHeight),
                    totalWidth: Math.round(totalWidth),
                    totalHeight: Math.round(totalHeight),
                    aspectRatio: aspectRatio,
                    detectedSelector: detectedSelector,
                    validDimensions: validCount,
                    averageWidth: validCount > 0 ? Math.round(totalWidth / validCount) : 0,
                    averageHeight: validCount > 0 ? Math.round(totalHeight / validCount) : 0,
                    dimensionResults: dimensionResults,
                    hasReasonableDimensions: maxWidth >= 200 && maxHeight >= 200,
                    measurementMethods: dimensionResults.map(r => r.method)
                };
            }
        """)
# A4 sheet size in inches (210mm x 297mm) and the CSS reference resolution.
_A4_WIDTH_IN = 8.27
_A4_HEIGHT_IN = 11.69
_CSS_PX_PER_INCH = 96
# Lower bound on the print scale so heavily oversized content stays readable.
_MIN_PDF_SCALE = 0.3


def _plan_a4_layout(page_info: dict) -> dict:
    """Derive orientation, margins and print scale from detected content size.

    Args:
        page_info: Detection result; ``maxWidth`` and ``maxHeight`` (CSS px)
            are required, ``hasTables`` and ``hasImages`` are optional flags.

    Returns:
        Dict with ``landscape``, ``pdf_width``/``pdf_height`` (inches),
        ``content_width_in``/``content_height_in``, ``margins`` (Playwright
        margin dict) and ``scale``.
    """
    content_width_in = page_info['maxWidth'] / _CSS_PX_PER_INCH
    content_height_in = page_info['maxHeight'] / _CSS_PX_PER_INCH
    # NOTE(review): the detector in this file does not return hasTables or
    # hasImages, so these flags are False unless a caller supplies them.
    has_tables = bool(page_info.get('hasTables', False))
    has_images = bool(page_info.get('hasImages', False))

    # Landscape when clearly wider than tall (tables get a lower threshold).
    landscape = content_width_in > content_height_in * 1.2 or (
        has_tables and content_width_in > content_height_in * 1.1
    )
    if landscape:
        pdf_width, pdf_height = _A4_HEIGHT_IN, _A4_WIDTH_IN
    else:
        pdf_width, pdf_height = _A4_WIDTH_IN, _A4_HEIGHT_IN

    # Margin by content type, then overridden for very small / very large content.
    if has_tables:
        margin_in = 0.75
    elif has_images:
        margin_in = 0.6
    else:
        margin_in = 0.5
    if content_width_in < 6.0 and content_height_in < 8.0:
        margin_in = 0.4
    if content_width_in > 10.0 or content_height_in > 12.0:
        margin_in = 0.8

    # Fit the content inside the area that remains after the margin that is
    # actually applied; never enlarge beyond 100%.
    available_width = pdf_width - 2 * margin_in
    available_height = pdf_height - 2 * margin_in
    width_scale = available_width / content_width_in if content_width_in > 0 else 1.0
    height_scale = available_height / content_height_in if content_height_in > 0 else 1.0
    scale = max(min(width_scale, height_scale, 1.0), _MIN_PDF_SCALE)

    margin = f"{margin_in}in"
    return {
        'landscape': landscape,
        'pdf_width': pdf_width,
        'pdf_height': pdf_height,
        'content_width_in': content_width_in,
        'content_height_in': content_height_in,
        'margins': {'top': margin, 'right': margin, 'bottom': margin, 'left': margin},
        'scale': scale,
    }


async def generate_single_pdf(input_content: str, output_pdf: str, is_url: bool = False, is_file: bool = False, is_html_string: bool = False):
    """Render one input to an A4 PDF at *output_pdf* using headless Chromium.

    Exactly one of the flags selects how *input_content* is interpreted; when
    several are set, ``is_html_string`` wins over ``is_url`` over ``is_file``.

    Args:
        input_content: HTML markup, a URL, or a path to an HTML file.
        output_pdf: Destination path of the generated PDF.
        is_url: Treat *input_content* as a URL to navigate to.
        is_file: Treat *input_content* as a local HTML file path.
        is_html_string: Treat *input_content* as HTML; it is preprocessed and
            written to a temporary file that is removed afterwards.

    Raises:
        ValueError: If no input flag is set.
        Exception: On a Playwright timeout (message "Timeout: Page took too
            long to load.") or any other rendering failure.
    """
    # Local import, matching the file's existing use of function-scope imports.
    from pathlib import Path

    temp_file = None
    try:
        if not (is_html_string or is_url or is_file):
            raise ValueError("Invalid input type")

        async with async_playwright() as p:
            # NOTE(review): '--disable-web-security' and
            # '--allow-running-insecure-content' relax browser protections
            # while rendering caller-supplied HTML/URLs - confirm this is
            # acceptable for the deployment before exposing the service.
            browser = await p.chromium.launch(
                headless=True,
                args=[
                    '--disable-dev-shm-usage',
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-gpu',
                    '--disable-web-security',
                    '--disable-features=VizDisplayCompositor',
                    '--enable-font-antialiasing',
                    '--font-render-hinting=none',
                    '--disable-background-timer-throttling',
                    '--disable-backgrounding-occluded-windows',
                    '--disable-renderer-backgrounding',
                    '--allow-running-insecure-content',
                    '--disable-extensions',
                    '--disable-plugins',
                    '--disable-images=false',
                    '--enable-javascript',
                    '--enable-css',
                    '--enable-fonts'
                ]
            )
            try:
                page = await browser.new_page()
                await page.set_viewport_size({'width': 1920, 'height': 1080})
                page.set_default_timeout(120000)

                if is_html_string:
                    processed_html = HTMLPreprocessor.preprocess_html(input_content)
                    with tempfile.NamedTemporaryFile(delete=False, suffix='.html', dir=TEMP_FOLDER) as tmp:
                        tmp.write(processed_html.encode('utf-8'))
                        temp_file = tmp.name
                    # as_uri() percent-encodes the path and is correct on
                    # Windows, unlike "file://" + path.
                    await page.goto(Path(os.path.abspath(temp_file)).as_uri(), wait_until="load")
                elif is_url:
                    await page.goto(input_content, wait_until="domcontentloaded")
                else:
                    await page.goto(Path(os.path.abspath(input_content)).as_uri(), wait_until="load")

                # Wait for content to load and stabilize
                await page.wait_for_timeout(3000)
                # Best effort: pages that keep connections open never go idle.
                try:
                    await page.wait_for_load_state('networkidle', timeout=10000)
                except TimeoutError:
                    pass

                print("🔄 Loading external resources...")
                await _ensure_resources_loaded(page)

                print("🔍 Analyzing page structure...")
                page_info = await PageDetector.detect_pages_and_format(page)
                print(f"📄 Detected {page_info['pageCount']} pages")
                print(f"📐 Max dimensions: {page_info['maxWidth']}x{page_info['maxHeight']}px")
                print(f"🎯 Recommended format: {page_info['format']}")
                print(f"🔍 Detected selector: {page_info['detectedSelector']}")
                print(f"✅ Valid dimensions: {page_info['validDimensions']}")
                print(f"📏 Average dimensions: {page_info['averageWidth']}x{page_info['averageHeight']}px")
                print(f"📊 Aspect ratio: {page_info['aspectRatio']:.3f}")
                print(f"🔧 Measurement methods: {', '.join(page_info['measurementMethods'])}")

                layout = _plan_a4_layout(page_info)
                landscape = layout['landscape']
                pdf_width = layout['pdf_width']
                pdf_height = layout['pdf_height']
                content_width_in = layout['content_width_in']
                content_height_in = layout['content_height_in']
                optimal_scale = layout['scale']

                # 'format' decides the paper size; explicit width/height are
                # not passed because they would contradict it.
                await page.pdf(
                    path=output_pdf,
                    print_background=True,
                    margin=layout['margins'],
                    scale=optimal_scale,
                    landscape=landscape,
                    prefer_css_page_size=False,  # Disable CSS page size to ensure A4
                    format='A4',
                )
            finally:
                # Close the browser on failure as well as success.
                await browser.close()

        print(f"✅ PDF generated: {output_pdf}")
        print(f"📏 A4 Size: {pdf_width}in x {pdf_height}in ({'Landscape' if landscape else 'Portrait'})")
        print(f"📐 Content: {content_width_in:.2f}in x {content_height_in:.2f}in")
        print(f"🔍 Scale: {optimal_scale:.2f} (optimized for A4 fit)")
        print("📄 Format: A4 Standard")
    except TimeoutError as exc:
        raise Exception("Timeout: Page took too long to load.") from exc
    except Exception as e:
        print(f"❌ PDF generation error: {str(e)}")
        raise
    finally:
        if temp_file and os.path.exists(temp_file):
            os.remove(temp_file)
async def _ensure_resources_loaded(page: Page, timeout_ms: int = 15000):
    """Wait (bounded) for fonts, images and CSS background images to load.

    Each in-page wait races against a timer so a resource that never fires
    ``load``/``error`` (for example a deferred ``loading="lazy"`` image)
    cannot stall PDF generation indefinitely.

    Args:
        page: Playwright page with the document already loaded.
        timeout_ms: Upper bound, in milliseconds, for each of the three waits.
    """
    # Wait for fonts to load
    await page.evaluate("""
        (timeoutMs) => Promise.race([
            document.fonts.ready,
            new Promise(resolve => setTimeout(resolve, timeoutMs)),
        ]).then(() => true)
    """, timeout_ms)

    # Wait for all <img> elements that have not finished loading.
    # addEventListener is used so the page's own onload/onerror handlers
    # are left in place.
    await page.evaluate("""
        (timeoutMs) => {
            const pending = Array.from(document.images)
                .filter(img => !img.complete)
                .map(img => new Promise(resolve => {
                    img.addEventListener('load', resolve, { once: true });
                    img.addEventListener('error', resolve, { once: true });
                }));
            return Promise.race([
                Promise.all(pending),
                new Promise(resolve => setTimeout(resolve, timeoutMs)),
            ]).then(() => true);
        }
    """, timeout_ms)

    # Preload every url(...) layer of computed background images.
    await page.evaluate("""
        (timeoutMs) => {
            const elementsWithBg = document.querySelectorAll('[style*="background-image"], [class*="image"]');
            const urlPattern = /url\\((['"]?)(.*?)\\1\\)/g;
            const pending = [];
            Array.from(elementsWithBg).forEach(el => {
                const bgImage = window.getComputedStyle(el).backgroundImage;
                if (!bgImage || bgImage === 'none') {
                    return;
                }
                // A background may list several layers (and gradients);
                // load each url() separately.
                for (const match of bgImage.matchAll(urlPattern)) {
                    pending.push(new Promise(resolve => {
                        const img = new Image();
                        img.onload = img.onerror = resolve;
                        img.src = match[2];
                    }));
                }
            });
            return Promise.race([
                Promise.all(pending),
                new Promise(resolve => setTimeout(resolve, timeoutMs)),
            ]).then(() => true);
        }
    """, timeout_ms)

    # Give the renderer a moment to apply late-arriving CSS.
    await page.wait_for_timeout(2000)
def process_input(input_content: str, output_name: str = None):
    """Generate a PDF from *input_content* and return where it was written.

    Input starting with ``http://`` or ``https://`` is rendered as a URL;
    anything else is rendered as an HTML string.

    Args:
        input_content: URL or HTML markup.
        output_name: Desired output filename. It is sanitized with
            ``secure_filename``; when missing, or when sanitizing leaves
            nothing, a random ``single_output_<hex>.pdf`` name is used.

    Returns:
        Tuple ``(pdf_path, 'application/pdf')``.
    """
    # NOTE(review): URLs are fetched server-side as given; confirm whether
    # internal addresses need to be blocked for this deployment.
    is_url = input_content.startswith(('http://', 'https://'))

    # secure_filename() can reduce a name such as '../' to an empty string,
    # which would make the output path the temp directory itself.
    safe_name = secure_filename(output_name) if output_name else ''
    if not safe_name:
        safe_name = f'single_output_{uuid.uuid4().hex[:8]}.pdf'
    pdf_path = os.path.join(TEMP_FOLDER, safe_name)

    if is_url:
        asyncio.run(generate_single_pdf(input_content, pdf_path, is_url=True))
    else:
        asyncio.run(generate_single_pdf(input_content, pdf_path, is_html_string=True))
    return pdf_path, 'application/pdf'
@app.route('/')
def root():
    """Return service metadata: version, status, endpoints, features and usage."""
    endpoints = {
        "generate-pdf": "/generate-pdf",
        "health": "/health"
    }
    features = [
        "HTML string to PDF",
        "URL to PDF",
        "HTML file to PDF",
        "Batch HTML files to ZIP",
        "Standard A4 format",
        "Consistent page sizing"
    ]
    usage = {
        "method": "POST",
        "endpoint": "/generate-pdf",
        "body": {
            "input": "HTML string, URL, or file path",
            "output": "Optional output filename"
        }
    }
    payload = {
        "message": "PDF Generator API",
        "version": "2.0.0",
        "status": "running",
        "timestamp": datetime.now().isoformat(),
        "endpoints": endpoints,
        "features": features,
        "usage": usage
    }
    return jsonify(payload)
@app.route('/health')
def health_check():
    """Report liveness together with the state of the temp folder."""
    report = {
        "status": "healthy",
        "timestamp": datetime.now().isoformat(),
        "service": "Advanced HTML to PDF Generator",
        "temp_folder": TEMP_FOLDER,
        "temp_folder_exists": os.path.exists(TEMP_FOLDER),
        "uptime": "running"
    }
    return jsonify(report)
@app.route('/generate-pdf', methods=['POST'])
def generate_pdf_api():
    """Main PDF generation endpoint.

    Accepts a JSON body (``{"input": ..., "output": ...}``), form fields
    ``input``/``output``, or an uploaded file in the ``input`` field (which
    takes precedence and must be UTF-8). Responds with the generated PDF as
    an attachment; the file is deleted once the response is closed.

    Returns 400 for malformed JSON, undecodable uploads, non-string fields or
    empty input, and 500 with the error message for any other failure.
    """
    try:
        input_content = None
        output_name = None

        if request.is_json:
            try:
                data = request.get_json()
            except Exception as json_error:
                print(f"❌ JSON parsing error: {json_error}")
                return jsonify({'error': f'Invalid JSON format: {str(json_error)}'}), 400
            # Only a JSON object can carry the expected fields.
            if isinstance(data, dict):
                input_content = data.get('input')
                output_name = data.get('output')
        else:
            # Handle form data
            input_content = request.form.get('input')
            output_name = request.form.get('output')

        # An uploaded file overrides any textual input.
        upload = request.files.get('input')
        if upload and upload.filename:
            try:
                input_content = upload.read().decode('utf-8')
            except UnicodeDecodeError:
                return jsonify({'error': 'File encoding error. Please ensure the file is UTF-8 encoded.'}), 400
            if not output_name:
                # Swap only the final extension, so 'page.htm' and
                # 'a.html.bak.html' both end up with a single '.pdf'.
                output_name = os.path.splitext(upload.filename)[0] + '.pdf'

        # Reject non-string JSON values up front instead of failing later
        # with an AttributeError (and a 500).
        if input_content is not None and not isinstance(input_content, str):
            return jsonify({'error': 'Input must be a string containing HTML content or a URL.'}), 400
        if output_name is not None and not isinstance(output_name, str):
            return jsonify({'error': 'Output filename must be a string.'}), 400

        # Validate input
        if not input_content or input_content.strip() == '':
            return jsonify({'error': 'Input cannot be empty. Please provide HTML content.'}), 400

        # Clean the HTML content - remove problematic control characters
        input_content = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', input_content)

        file_path, mime_type = process_input(input_content, output_name)

        # Check if file was created
        if not os.path.exists(file_path):
            return jsonify({'error': 'Failed to generate output file'}), 500

        response = send_file(
            file_path,
            as_attachment=True,
            download_name=os.path.basename(file_path),
            mimetype=mime_type
        )

        # Remove the generated file once the response has been sent.
        @response.call_on_close
        def cleanup():
            try:
                if os.path.exists(file_path):
                    os.remove(file_path)
                    print(f"🧹 Cleaned up: {file_path}")
            except Exception as e:
                print(f"❌ Cleanup error: {e}")

        return response
    except Exception as e:
        print(f"❌ API Error: {str(e)}")
        return jsonify({'error': str(e)}), 500
@app.after_request
def cleanup_temp_files(response):
    """Delete files in TEMP_FOLDER older than one hour, then pass *response* on.

    Runs after every request. Failures are reported on stdout and never
    propagate, so cleanup cannot break the response being returned.
    """
    import time

    cutoff = time.time() - 3600  # 1 hour
    try:
        with os.scandir(TEMP_FOLDER) as entries:
            for entry in entries:
                # Handle errors per file: another request may delete the same
                # file concurrently, and that must not abort the whole sweep.
                try:
                    if entry.is_file() and entry.stat().st_mtime < cutoff:
                        os.remove(entry.path)
                        print(f"🧹 Auto-cleanup: {entry.name}")
                except OSError as e:
                    print(f"❌ Auto-cleanup error: {e}")
    except Exception as e:
        print(f"❌ Auto-cleanup error: {e}")
    return response
if __name__ == '__main__':
    print("🚀 Starting Advanced HTML to PDF Generator API...")
    print("📝 Endpoints available:")
    print(" GET / - API information")
    print(" GET /health - Health check")
    print(" POST /generate-pdf - Generate PDF from HTML/URL/file")
    print("")
    print("✨ Features:")
    print(" • HTML string to PDF")
    print(" • URL to PDF")
    print(" • HTML file to PDF")
    print(" • Batch HTML files to ZIP")
    print(" • Standard A4 format")
    print(" • Consistent page sizing")
    # The server listens on all interfaces, and the Werkzeug debugger lets
    # anyone who can reach it run arbitrary code. Debug mode is therefore
    # opt-in: set FLASK_DEBUG=1 for local development.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
    app.run(host='0.0.0.0', port=8000, debug=debug_enabled)