Added peeron downloader
This commit is contained in:
parent
60a61cf6ad
commit
c8651027eb
110
dl_peeron.py
Normal file
110
dl_peeron.py
Normal file
@ -0,0 +1,110 @@
|
|||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from PIL import Image
|
||||||
|
from fpdf import FPDF
|
||||||
|
|
||||||
|
def download_peeron_images(set_number, version_number):
    """Download the instruction scans of a Peeron set and bundle them into a PDF.

    The scan index page for ``<set_number>-<version_number>`` is fetched,
    every thumbnail found on it is mapped to its full-size scan, the scans
    are saved into a temporary ``<set>-<version>_images`` folder, combined
    into ``<set>-<version>_peeron.pdf`` in the current working directory,
    and the temporary images are removed afterwards.

    Args:
        set_number: Set identifier used in the Peeron URLs.
        version_number: Version suffix used in the Peeron URLs.

    Returns:
        None. Progress and failures are reported with ``print``; the
        function returns early when the PDF already exists, the page cannot
        be fetched, the set is not found, or no image could be downloaded.
    """
    # Without an explicit timeout requests waits forever on a stalled server.
    request_timeout = 30
    # Pixel -> millimetre conversion factor (1 px = 0.264583 mm).
    mm_per_px = 0.264583

    # Define URLs and paths
    base_url = f"http://peeron.com/scans/{set_number}-{version_number}"
    scan_base_url = f"http://belay.peeron.com/scans/{set_number}-{version_number}/"
    output_folder = f"{set_number}-{version_number}_images"
    pdf_filename = f"{set_number}-{version_number}_peeron.pdf"

    # Check if PDF already exists to avoid re-downloading and processing
    if os.path.exists(pdf_filename):
        print(f"PDF already exists: {pdf_filename}")
        return

    image_files = []  # Paths of successfully downloaded images

    # The session is used as a context manager so its connections are
    # released even when we return early.
    with requests.Session() as session:
        session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
        })

        # Download the main HTML page containing the thumbnails
        try:
            response = session.get(base_url, timeout=request_timeout)
        except requests.RequestException as exc:
            print(f"Failed to download page: {base_url} ({exc})")
            return
        if response.status_code != 200:
            print(f"Failed to download page: {base_url} (status code {response.status_code})")
            return

        # Parse HTML to locate the relevant content using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # This header is shown instead of the scans when the set is unknown
        if soup.find('h1', string="Browse instruction library"):
            print(f"Set number {set_number} not found on Peeron.")
            return

        # Locate all thumbnail images in the expected table structure
        thumbnails = soup.select('table[cellspacing="5"] a img[src^="http://belay.peeron.com/thumbs/"]')
        if not thumbnails:
            print("No thumbnails found.")
            return

        # Create the folder only once there is something to store, so the
        # early returns above do not leave an empty directory behind.
        os.makedirs(output_folder, exist_ok=True)

        for img in thumbnails:
            thumb_url = img['src']
            # The page number is taken from the second-to-last path segment.
            # NOTE(review): assumes thumbnail URLs end with "/<page>/" - confirm.
            page_number = thumb_url.split('/')[-2]
            image_url = f"{scan_base_url}{page_number}/"
            img_path = os.path.join(output_folder, f"{page_number}.jpg")

            try:
                # "with" guarantees the streamed connection is released.
                with session.get(image_url, stream=True, timeout=request_timeout) as img_response:
                    if img_response.status_code != 200:
                        print(f"Failed to download image: {image_url} (status code {img_response.status_code})")
                        continue
                    # Save the image in chunks to keep memory usage low
                    with open(img_path, 'wb') as f:
                        for chunk in img_response.iter_content(8192):
                            f.write(chunk)
            except requests.RequestException as exc:
                print(f"Failed to download image: {image_url} ({exc})")
                # Drop a partially written file so it never reaches the PDF
                if os.path.exists(img_path):
                    os.remove(img_path)
                continue

            image_files.append(img_path)
            print(f"Downloaded: {image_url}")

    # If no images were downloaded, exit to avoid creating an empty PDF
    if not image_files:
        print("No images downloaded.")
        try:
            os.rmdir(output_folder)
        except OSError:
            # Folder holds files from an earlier run; leave it untouched.
            pass
        return

    # Create a PDF from the downloaded images using FPDF
    pdf = FPDF()
    for img_file in image_files:
        # Only the dimensions are needed; close the file right away so the
        # handle is not leaked and the file can be deleted during cleanup.
        with Image.open(img_file) as image:
            width, height = image.size
        page_width = width * mm_per_px
        page_height = height * mm_per_px
        # One page per scan, sized to the image so it fills the page exactly
        pdf.add_page(format=(page_width, page_height))
        pdf.image(img_file, x=0, y=0, w=page_width, h=page_height)

    # Save the generated PDF to disk
    pdf.output(pdf_filename)
    print(f"PDF created: {pdf_filename}")

    # Cleanup: remove downloaded images and the temporary folder to save space
    for img_file in image_files:
        os.remove(img_file)
    try:
        os.rmdir(output_folder)
    except OSError as exc:
        # rmdir only removes empty directories; report instead of crashing
        # after the PDF has already been written successfully.
        print(f"Could not remove temporary folder {output_folder}: {exc}")
        return
    print("Cleaned up temporary files.")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Expect exactly one argument of the form <set_number>-<version>.
    if len(sys.argv) != 2:
        print("Usage: python script.py <set_number>-<version>")
        sys.exit(1)

    set_version_number = sys.argv[1].split('-')

    # Both parts are required to build the Peeron URLs; indexing the split
    # result blindly raised IndexError when the "-<version>" part was missing.
    if len(set_version_number) < 2 or not set_version_number[0] or not set_version_number[1]:
        print("Usage: python script.py <set_number>-<version>")
        sys.exit(1)

    set_number = set_version_number[0]
    version_number = set_version_number[1]

    download_peeron_images(set_number, version_number)
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user