# Downloads LEGO instruction scans from peeron.com for a given set and
# assembles them into a single PDF.
import os
import sys
import requests
from bs4 import BeautifulSoup
from PIL import Image
from fpdf import FPDF
def download_peeron_images(set_number, version_number):
    """Download all instruction-page scans for a LEGO set from peeron.com
    and assemble them into a single PDF.

    Args:
        set_number: LEGO set number as a string (e.g. "6075").
        version_number: Peeron version suffix as a string (e.g. "1").

    Side effects: writes <set>-<version>_peeron.pdf to the current
    directory; uses a temporary image folder that is removed on success.
    Prints progress/diagnostics to stdout and returns None in all cases.
    """
    base_url = f"http://peeron.com/scans/{set_number}-{version_number}"
    scan_base_url = f"http://belay.peeron.com/scans/{set_number}-{version_number}/"
    output_folder = f"{set_number}-{version_number}_images"
    pdf_filename = f"{set_number}-{version_number}_peeron.pdf"

    # Skip all work if the PDF was already generated on a previous run.
    if os.path.exists(pdf_filename):
        print(f"PDF already exists: {pdf_filename}")
        return

    os.makedirs(output_folder, exist_ok=True)

    # One session so headers/cookies persist across every request.
    session = requests.Session()
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
    })

    # Fetch the index page listing thumbnail links for every scan page.
    # timeout prevents the script from hanging forever on a stalled server.
    response = session.get(base_url, timeout=30)
    if response.status_code != 200:
        print(f"Failed to download page: {base_url} (status code {response.status_code})")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # Peeron serves its generic browse page (with this header) for unknown sets.
    if soup.find('h1', string="Browse instruction library"):
        print(f"Set number {set_number} not found on Peeron.")
        return

    # Thumbnails live inside a cellspacing="5" table; match on the thumb host.
    thumbnails = soup.select('table[cellspacing="5"] a img[src^="http://belay.peeron.com/thumbs/"]')
    if not thumbnails:
        print("No thumbnails found.")
        return

    image_files = []  # Paths of successfully downloaded full-size images.
    for img in thumbnails:
        thumb_url = img['src']
        # Thumbnail URLs end .../<page>/<file>; the second-to-last path
        # segment identifies the page, which keys the full-size scan URL.
        page_number = thumb_url.split('/')[-2]
        image_url = f"{scan_base_url}{page_number}/"

        img_response = session.get(image_url, stream=True, timeout=30)
        if img_response.status_code != 200:
            print(f"Failed to download image: {image_url} (status code {img_response.status_code})")
            continue

        img_path = os.path.join(output_folder, f"{page_number}.jpg")
        # Stream to disk in chunks so large scans never sit fully in memory.
        with open(img_path, 'wb') as f:
            for chunk in img_response.iter_content(1024):
                f.write(chunk)
        image_files.append(img_path)
        print(f"Downloaded: {image_url}")

    # Nothing downloaded: bail out rather than emit an empty PDF.
    if not image_files:
        print("No images downloaded.")
        return

    # Build one PDF page per image, sized to the image's pixel dimensions.
    pdf = FPDF()
    px_to_mm = 0.264583  # 1 px at 96 dpi = 0.264583 mm
    for img_file in image_files:
        # Context manager closes the file handle promptly so the os.remove
        # cleanup below also works on Windows (open files can't be deleted).
        with Image.open(img_file) as image:
            width, height = image.size
        pdf.add_page(format=(width * px_to_mm, height * px_to_mm))
        pdf.image(img_file, x=0, y=0, w=width * px_to_mm, h=height * px_to_mm)

    pdf.output(pdf_filename)
    print(f"PDF created: {pdf_filename}")

    # Remove the per-page images and their folder now that the PDF exists.
    for img_file in image_files:
        os.remove(img_file)
    os.rmdir(output_folder)
    print("Cleaned up temporary files.")
if __name__ == "__main__":
    # Expect exactly one "<set>-<version>" argument, e.g. "6075-1".
    if len(sys.argv) != 2:
        print("Usage: python script.py <set_number>-<version_number>")
        sys.exit(1)
    parts = sys.argv[1].split('-')
    # Guard against an IndexError when the "-<version>" suffix is missing.
    if len(parts) < 2:
        print("Usage: python script.py <set_number>-<version_number>")
        sys.exit(1)
    set_number = parts[0]
    version_number = parts[1]
    download_peeron_images(set_number, version_number)