# Downloads LEGO instruction scans from peeron.com for a given set and
# assembles them into a single PDF.
import os
import sys
import requests
from bs4 import BeautifulSoup
from PIL import Image
from fpdf import FPDF
def download_peeron_images(set_number, version_number):
    """Download all instruction-page scans for a LEGO set from peeron.com
    and assemble them into a single PDF.

    Args:
        set_number: LEGO set number as a string (e.g. "6075").
        version_number: Peeron version suffix as a string (e.g. "1").

    Side effects: writes <set>-<version>_peeron.pdf to the current
    directory; uses a temporary image folder that is removed on success.
    Prints progress/diagnostics to stdout and returns None in all cases.
    """
    base_url = f"http://peeron.com/scans/{set_number}-{version_number}"
    scan_base_url = f"http://belay.peeron.com/scans/{set_number}-{version_number}/"
    output_folder = f"{set_number}-{version_number}_images"
    pdf_filename = f"{set_number}-{version_number}_peeron.pdf"

    # Skip all work if the PDF was already generated on a previous run.
    if os.path.exists(pdf_filename):
        print(f"PDF already exists: {pdf_filename}")
        return

    os.makedirs(output_folder, exist_ok=True)

    # One session so headers/cookies persist across every request.
    session = requests.Session()
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
    })

    # Fetch the index page listing thumbnail links for every scan page.
    # timeout prevents the script from hanging forever on a stalled server.
    response = session.get(base_url, timeout=30)
    if response.status_code != 200:
        print(f"Failed to download page: {base_url} (status code {response.status_code})")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # Peeron serves its generic browse page (with this header) for unknown sets.
    if soup.find('h1', string="Browse instruction library"):
        print(f"Set number {set_number} not found on Peeron.")
        return

    # Thumbnails live inside a cellspacing="5" table; match on the thumb host.
    thumbnails = soup.select('table[cellspacing="5"] a img[src^="http://belay.peeron.com/thumbs/"]')
    if not thumbnails:
        print("No thumbnails found.")
        return

    image_files = []  # Paths of successfully downloaded full-size images.
    for img in thumbnails:
        thumb_url = img['src']
        # Thumbnail URLs end .../<page>/<file>; the second-to-last path
        # segment identifies the page, which keys the full-size scan URL.
        page_number = thumb_url.split('/')[-2]
        image_url = f"{scan_base_url}{page_number}/"

        img_response = session.get(image_url, stream=True, timeout=30)
        if img_response.status_code != 200:
            print(f"Failed to download image: {image_url} (status code {img_response.status_code})")
            continue

        img_path = os.path.join(output_folder, f"{page_number}.jpg")
        # Stream to disk in chunks so large scans never sit fully in memory.
        with open(img_path, 'wb') as f:
            for chunk in img_response.iter_content(1024):
                f.write(chunk)
        image_files.append(img_path)
        print(f"Downloaded: {image_url}")

    # Nothing downloaded: bail out rather than emit an empty PDF.
    if not image_files:
        print("No images downloaded.")
        return

    # Build one PDF page per image, sized to the image's pixel dimensions.
    pdf = FPDF()
    px_to_mm = 0.264583  # 1 px at 96 dpi = 0.264583 mm
    for img_file in image_files:
        # Context manager closes the file handle promptly so the os.remove
        # cleanup below also works on Windows (open files can't be deleted).
        with Image.open(img_file) as image:
            width, height = image.size
        pdf.add_page(format=(width * px_to_mm, height * px_to_mm))
        pdf.image(img_file, x=0, y=0, w=width * px_to_mm, h=height * px_to_mm)

    pdf.output(pdf_filename)
    print(f"PDF created: {pdf_filename}")

    # Remove the per-page images and their folder now that the PDF exists.
    for img_file in image_files:
        os.remove(img_file)
    os.rmdir(output_folder)
    print("Cleaned up temporary files.")
if __name__ == "__main__":
    # Expect exactly one "<set>-<version>" argument, e.g. "6075-1".
    if len(sys.argv) != 2:
        print("Usage: python script.py <set_number>-<version_number>")
        sys.exit(1)
    parts = sys.argv[1].split('-')
    # Guard against an IndexError when the "-<version>" suffix is missing.
    if len(parts) < 2:
        print("Usage: python script.py <set_number>-<version_number>")
        sys.exit(1)
    set_number = parts[0]
    version_number = parts[1]
    download_peeron_images(set_number, version_number)