# Origin: git patch c8651027eb4d401481349a90f3b2737711d4249f
#   From: FrederikBaerentsen
#   Date: Sun, 26 Jan 2025 22:13:46 +0100
#   Subject: [PATCH] Added peeron downloader
# The patch creates this module as dl_peeron.py.
"""Download the instruction scans of a set from peeron.com and bundle them into a PDF.

Usage:
    python script.py <set_number>-<version_number>
"""

import os
import sys
import requests
from bs4 import BeautifulSoup
from PIL import Image
from fpdf import FPDF

# Factor used to size PDF pages from image pixel dimensions
# (1 px = 0.264583 mm, i.e. 25.4 mm / 96).
_PX_TO_MM = 0.264583

# Seconds to wait for the server before giving up on a request; without a
# timeout a stalled connection would block the script forever.
_REQUEST_TIMEOUT = 30

_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
)

_USAGE = "Usage: python script.py <set_number>-<version_number>"


def _find_page_numbers(session, base_url, set_number):
    """Return the page identifiers listed on the set's scan index page.

    Fetches ``base_url``, looks for thumbnail images inside the expected
    table and extracts the second-to-last path segment of every thumbnail
    URL. Prints a message and returns an empty list when the page cannot be
    fetched, the set is unknown, or no thumbnails are present.
    """
    try:
        response = session.get(base_url, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as err:
        print(f"Failed to download page: {base_url} ({err})")
        return []

    if response.status_code != 200:
        print(f"Failed to download page: {base_url} (status code {response.status_code})")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # This header is what the site shows instead of the scans when the set
    # is not available.
    if soup.find('h1', string="Browse instruction library"):
        print(f"Set number {set_number} not found on Peeron.")
        return []

    thumbnails = soup.select('table[cellspacing="5"] a img[src^="http://belay.peeron.com/thumbs/"]')
    if not thumbnails:
        print("No thumbnails found.")
        return []

    return [img['src'].split('/')[-2] for img in thumbnails]


def _download_scan(session, image_url, img_path):
    """Stream one full-size scan to ``img_path``; return True on success.

    A non-200 answer or a network error is reported and yields False. A
    partially written file is removed so it never ends up in the PDF or
    blocks removal of the temporary folder.
    """
    try:
        # The context manager releases the streamed connection back to the pool.
        with session.get(image_url, stream=True, timeout=_REQUEST_TIMEOUT) as img_response:
            if img_response.status_code != 200:
                print(f"Failed to download image: {image_url} (status code {img_response.status_code})")
                return False
            # Write in chunks so a large scan is never held in memory at once.
            with open(img_path, 'wb') as f:
                for chunk in img_response.iter_content(1024):
                    f.write(chunk)
    except requests.RequestException as err:
        print(f"Failed to download image: {image_url} ({err})")
        try:
            os.remove(img_path)
        except OSError:
            pass  # Nothing was written yet.
        return False

    print(f"Downloaded: {image_url}")
    return True


def _build_pdf(image_files, pdf_filename):
    """Write a PDF with one page per image, each page sized to its image."""
    pdf = FPDF()
    for img_file in image_files:
        # Only the dimensions are needed; close the file right away so the
        # handle does not leak (and does not block os.remove on Windows).
        with Image.open(img_file) as image:
            width, height = image.size
        page_w = width * _PX_TO_MM
        page_h = height * _PX_TO_MM
        pdf.add_page(format=(page_w, page_h))
        pdf.image(img_file, x=0, y=0, w=page_w, h=page_h)
    pdf.output(pdf_filename)


def _cleanup(image_files, output_folder):
    """Remove the downloaded images and then the temporary folder."""
    for img_file in image_files:
        try:
            os.remove(img_file)
        except OSError as err:
            print(f"Could not remove {img_file}: {err}")
    try:
        os.rmdir(output_folder)
    except OSError as err:
        # Most likely the folder holds files this run did not create;
        # leave it alone rather than crash.
        print(f"Could not remove folder {output_folder}: {err}")
        return
    print("Cleaned up temporary files.")


def download_peeron_images(set_number, version_number):
    """Download all scans of a set from Peeron and combine them into a PDF.

    The PDF is written to ``<set_number>-<version_number>_peeron.pdf`` in the
    current directory. Images are stored temporarily in
    ``<set_number>-<version_number>_images`` and removed afterwards, also
    when PDF creation fails. Progress and problems are reported with
    ``print``; nothing is returned.

    Args:
        set_number: Set identifier used in the Peeron URL.
        version_number: Version suffix of the set used in the Peeron URL.
    """
    set_id = f"{set_number}-{version_number}"
    base_url = f"http://peeron.com/scans/{set_id}"
    scan_base_url = f"http://belay.peeron.com/scans/{set_id}/"
    output_folder = f"{set_id}_images"
    pdf_filename = f"{set_id}_peeron.pdf"

    # Skip everything when a previous run already produced the PDF.
    if os.path.exists(pdf_filename):
        print(f"PDF already exists: {pdf_filename}")
        return

    # One session keeps headers and cookies consistent across requests.
    with requests.Session() as session:
        session.headers.update({"User-Agent": _USER_AGENT})

        page_numbers = _find_page_numbers(session, base_url, set_number)
        if not page_numbers:
            return

        # Create the folder only once there is something to download, so
        # early exits above leave no empty folder behind.
        os.makedirs(output_folder, exist_ok=True)

        image_files = []  # Paths of successfully downloaded images
        try:
            for page_number in page_numbers:
                image_url = f"{scan_base_url}{page_number}/"
                img_path = os.path.join(output_folder, f"{page_number}.jpg")
                if _download_scan(session, image_url, img_path):
                    image_files.append(img_path)

            # Avoid creating an empty PDF.
            if not image_files:
                print("No images downloaded.")
                return

            _build_pdf(image_files, pdf_filename)
            print(f"PDF created: {pdf_filename}")
        finally:
            _cleanup(image_files, output_folder)


def main():
    """Parse ``<set_number>-<version_number>`` from argv and run the download."""
    if len(sys.argv) != 2:
        print(_USAGE)
        sys.exit(1)

    set_version_number = sys.argv[1].split('-')
    # Both parts are required; a missing dash used to raise IndexError.
    if len(set_version_number) < 2 or not set_version_number[0] or not set_version_number[1]:
        print(_USAGE)
        sys.exit(1)

    download_peeron_images(set_version_number[0], set_version_number[1])


if __name__ == "__main__":
    main()