Added peeron downloader

2025-01-26 22:13:46 +01:00 · 2025-01-26 22:13:46 +01:00 · c8651027eb
commit c8651027eb
parent 60a61cf6ad
1 changed files with 110 additions and 0 deletions
--- a/dl_peeron.py
+++ b/dl_peeron.py
@ -0,0 +1,110 @@
 import os
 import sys
 import requests
 from bs4 import BeautifulSoup
 from PIL import Image
 from fpdf import FPDF
 def _save_scan(session, image_url, img_path, timeout):
    """Stream one full-size scan from ``image_url`` into ``img_path``.

    Returns True when the file was written completely, False otherwise.
    Failures are reported on stdout and never raised, so one bad page does
    not abort the whole set.
    """
    try:
        # The context manager releases the streamed connection even when the
        # body is never read (e.g. for non-200 responses).
        with session.get(image_url, stream=True, timeout=timeout) as img_response:
            if img_response.status_code != 200:
                print(f"Failed to download image: {image_url} (status code {img_response.status_code})")
                return False
            # Save the image locally in chunks to handle large files efficiently
            with open(img_path, 'wb') as f:
                for chunk in img_response.iter_content(1024):
                    f.write(chunk)
    except requests.RequestException as exc:
        print(f"Failed to download image: {image_url} ({exc})")
        # A transfer that died midway leaves a truncated file; drop it so it
        # neither ends up in the PDF nor blocks removal of the temp folder.
        if os.path.exists(img_path):
            os.remove(img_path)
        return False
    print(f"Downloaded: {image_url}")
    return True


 def _images_to_pdf(image_files, pdf_filename):
    """Write ``pdf_filename`` with one page per image, sized to that image."""
    px_to_mm = 0.264583  # 1 px = 0.264583 mm, i.e. 25.4 mm / 96 px
    pdf = FPDF()
    for img_file in image_files:
        # Only the dimensions are needed here; the ``with`` block closes the
        # file handle that Image.open would otherwise keep open.
        with Image.open(img_file) as image:
            width, height = image.size
        page_w = width * px_to_mm
        page_h = height * px_to_mm
        # Add a page exactly as large as the image and draw the image over it
        pdf.add_page(format=(page_w, page_h))
        pdf.image(img_file, x=0, y=0, w=page_w, h=page_h)
    pdf.output(pdf_filename)


 def _remove_dir_if_empty(path):
    """Remove ``path``; keep it (with a notice) if it cannot be removed."""
    try:
        os.rmdir(path)
    except OSError:
        # Typically the folder still holds files from an earlier run; those
        # are not ours to delete, so leave the folder in place.
        print(f"Temporary folder not removed: {path}")


 def download_peeron_images(set_number, version_number, timeout=30):
    """Download the Peeron instruction scans of a set and bundle them in a PDF.

    The scan index page for ``{set_number}-{version_number}`` is fetched, the
    full-size scan behind every thumbnail is downloaded into a temporary
    folder, and all downloaded scans are combined into
    ``{set_number}-{version_number}_peeron.pdf`` in the current directory.
    The temporary images are deleted afterwards. Progress and failures are
    reported on stdout.

    Args:
        set_number: Set number part of the Peeron identifier.
        version_number: Version part of the Peeron identifier.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        None. Nothing is downloaded if the PDF already exists.
    """
    # Define URLs and paths
    base_url = f"http://peeron.com/scans/{set_number}-{version_number}"
    scan_base_url = f"http://belay.peeron.com/scans/{set_number}-{version_number}/"
    output_folder = f"{set_number}-{version_number}_images"
    pdf_filename = f"{set_number}-{version_number}_peeron.pdf"
    # Check if PDF already exists to avoid re-downloading and processing
    if os.path.exists(pdf_filename):
        print(f"PDF already exists: {pdf_filename}")
        return
    image_files = []  # To store paths of successfully downloaded images
    # Set up a session to manage cookies and headers for consistent requests;
    # the ``with`` block closes its pooled connections when we are done.
    with requests.Session() as session:
        session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
        })
        # Download the main HTML page containing the thumbnails
        try:
            response = session.get(base_url, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Failed to download page: {base_url} ({exc})")
            return
        if response.status_code != 200:
            print(f"Failed to download page: {base_url} (status code {response.status_code})")
            return
        # Parse HTML to locate the relevant content using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')
        # Check for a specific header indicating the set does not exist on Peeron
        if soup.find('h1', string="Browse instruction library"):
            print(f"Set number {set_number} not found on Peeron.")
            return
        # Locate all thumbnail images in the expected table structure
        thumbnails = soup.select('table[cellspacing="5"] a img[src^="http://belay.peeron.com/thumbs/"]')
        # Exit early if no thumbnails are found to avoid unnecessary processing
        if not thumbnails:
            print("No thumbnails found.")
            return
        # Create the output folder only now that there is something to store,
        # so the early exits above do not leave an empty folder behind.
        os.makedirs(output_folder, exist_ok=True)
        for img in thumbnails:
            # Extract the page number from the thumbnail URL to build the full image URL
            page_number = img['src'].split('/')[-2]
            image_url = f"{scan_base_url}{page_number}/"
            img_path = os.path.join(output_folder, f"{page_number}.jpg")
            if _save_scan(session, image_url, img_path, timeout):
                image_files.append(img_path)
    # If no images were downloaded, exit to avoid creating an empty PDF
    if not image_files:
        print("No images downloaded.")
        _remove_dir_if_empty(output_folder)
        return
    # Create a PDF from the downloaded images using FPDF
    _images_to_pdf(image_files, pdf_filename)
    print(f"PDF created: {pdf_filename}")
    # Cleanup: remove downloaded images and the temporary folder to save space
    for img_file in image_files:
        os.remove(img_file)
    _remove_dir_if_empty(output_folder)
    print("Cleaned up temporary files.")
 if __name__ == "__main__":
    # The single argument must look like <set_number>-<version_number>;
    # both parts are needed to build the Peeron URLs.
    usage = "Usage: python script.py <set_number>-<version_number>"
    if len(sys.argv) != 2:
        print(usage)
        sys.exit(1)
    set_version_number = sys.argv[1].split('-')
    # Reject input without a dash or with an empty part instead of failing
    # with an IndexError on the version lookup below.
    if len(set_version_number) < 2 or not all(set_version_number[:2]):
        print(usage)
        sys.exit(1)
    set_number = set_version_number[0]
    version_number = set_version_number[1]
    download_peeron_images(set_number, version_number)