diff --git a/bricktracker/peeron_instructions.py b/bricktracker/peeron_instructions.py
new file mode 100644
index 0000000..2a3e302
--- /dev/null
+++ b/bricktracker/peeron_instructions.py
@@ -0,0 +1,198 @@
+import logging
+from typing import Any, NamedTuple, TYPE_CHECKING
+from urllib.parse import urljoin
+
+from bs4 import BeautifulSoup
+import cloudscraper
+from flask import current_app
+import requests
+
+from .exceptions import ErrorException
+if TYPE_CHECKING:
+    from .socket import BrickSocket
+
+logger = logging.getLogger(__name__)
+
+
+def get_peeron_user_agent():
+    """Get the User-Agent string for Peeron requests from config"""
+    return current_app.config.get('REBRICKABLE_USER_AGENT',
+                                  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
+
+
+def get_peeron_download_delay():
+    """Get the delay in milliseconds between Peeron page downloads from config"""
+    return current_app.config.get('PEERON_DOWNLOAD_DELAY', 1000)
+
+
+def get_peeron_request_timeout():
+    """Get the timeout in seconds applied to every Peeron HTTP request from config"""
+    return current_app.config.get('PEERON_REQUEST_TIMEOUT', 30)
+
+
+def get_min_image_size():
+    """Get the minimum image size for valid Peeron instruction pages from config"""
+    return current_app.config.get('PEERON_MIN_IMAGE_SIZE', 100)
+
+
+def get_peeron_instruction_url(set_number: str, version_number: str):
+    """Get the Peeron instruction page URL using the configured pattern"""
+    pattern = current_app.config.get('PEERON_INSTRUCTION_PATTERN', 'http://peeron.com/scans/{set_number}-{version_number}')
+    return pattern.format(set_number=set_number, version_number=version_number)
+
+
+def get_peeron_thumbnail_url(set_number: str, version_number: str):
+    """Get the Peeron thumbnail base URL using the configured pattern"""
+    pattern = current_app.config.get('PEERON_THUMBNAIL_PATTERN', 'http://belay.peeron.com/thumbs/{set_number}-{version_number}/')
+    return pattern.format(set_number=set_number, version_number=version_number)
+
+
+def get_peeron_scan_url(set_number: str, version_number: str):
+    """Get the Peeron scan base URL using the configured pattern"""
+    pattern = current_app.config.get('PEERON_SCAN_PATTERN', 'http://belay.peeron.com/scans/{set_number}-{version_number}/')
+    return pattern.format(set_number=set_number, version_number=version_number)
+
+
+def create_peeron_scraper():
+    """Create a cloudscraper instance configured for Peeron"""
+    scraper = cloudscraper.create_scraper()
+    scraper.headers.update({
+        "User-Agent": get_peeron_user_agent()
+    })
+    return scraper
+
+
+class PeeronPage(NamedTuple):
+    """Represents a single instruction page from Peeron"""
+    page_number: str
+    thumbnail_url: str
+    image_url: str
+    alt_text: str
+
+
+# Peeron instruction scraper
+class PeeronInstructions(object):
+    socket: 'BrickSocket | None'
+    set_number: str
+    version_number: str
+    pages: list[PeeronPage]
+
+    def __init__(
+        self,
+        set_number: str,
+        version_number: str = '1',
+        /,
+        *,
+        socket: 'BrickSocket | None' = None,
+    ):
+        # Save the socket
+        self.socket = socket
+
+        # Parse set number (handle both "4011" and "4011-1" formats); an empty
+        # suffix such as "4011-" falls back to the version_number argument
+        number, _, version = set_number.partition('-')
+        self.set_number = number
+        self.version_number = version or version_number
+
+        # Placeholder for pages
+        self.pages = []
+
+    # Check if instructions exist on Peeron
+    def exists(self, /) -> bool:
+        """Check if the set has instruction pages on Peeron (fetches and parses the index page)"""
+        try:
+            pages = self.find_pages()
+            return len(pages) > 0
+        except ErrorException:
+            return False
+
+    # Find all available instruction pages on Peeron
+    def find_pages(self, /) -> list[PeeronPage]:
+        """
+        Scrape Peeron's HTML and return a list of available instruction pages.
+        Raises ErrorException on HTTP/connection failure or when no page is found.
+        """
+        base_url = get_peeron_instruction_url(self.set_number, self.version_number)
+        scan_base_url = get_peeron_scan_url(self.set_number, self.version_number)
+
+        logger.debug(f"[find_pages] fetching HTML from {base_url!r}")
+
+        # Set up cloudscraper with cookies enabled for Peeron
+        scraper = create_peeron_scraper()
+
+        # Download the main HTML page (bounded by a timeout so a stalled server cannot hang the request)
+        try:
+            response = scraper.get(base_url, timeout=get_peeron_request_timeout())
+            if response.status_code != 200:
+                raise ErrorException(f'Failed to load Peeron page for {self.set_number}-{self.version_number}. HTTP {response.status_code}')
+        except requests.exceptions.RequestException as e:
+            raise ErrorException(f'Failed to connect to Peeron: {e}')
+
+        # Parse HTML to locate instruction pages
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        # Check for "Browse instruction library" header (set not found)
+        if soup.find('h1', string="Browse instruction library"):
+            raise ErrorException(f'Set {self.set_number}-{self.version_number} not found on Peeron')
+
+        # Locate all thumbnail images in the expected table structure
+        thumbnails = soup.select('table[cellspacing="5"] a img[src^="http://belay.peeron.com/thumbs/"]')
+
+        if not thumbnails:
+            raise ErrorException(f'No instruction pages found for {self.set_number}-{self.version_number} on Peeron')
+
+        pages: list[PeeronPage] = []
+        for img in thumbnails:
+            thumb_url = img['src']
+
+            # Extract the page number from the thumbnail URL
+            page_number = thumb_url.split('/')[-2]
+
+            # Build the full-size image URL
+            image_url = f"{scan_base_url}{page_number}/"
+
+            logger.debug(f"[find_pages] Page {page_number}: thumb={thumb_url}, image={image_url}")
+
+            # Create alt text for the page
+            alt_text = f"LEGO Instructions {self.set_number}-{self.version_number} Page {page_number}"
+
+            page = PeeronPage(
+                page_number=page_number,
+                thumbnail_url=thumb_url,
+                image_url=image_url,
+                alt_text=alt_text
+            )
+            pages.append(page)
+
+        # Cache the pages for later use
+        self.pages = pages
+
+        logger.debug(f"[find_pages] found {len(pages)} pages for {self.set_number}-{self.version_number}")
+        return pages
+
+    # Find instructions with fallback to Peeron
+    @staticmethod
+    def find_instructions_with_peeron_fallback(set: str, /) -> tuple[list[tuple[str, str]], list[PeeronPage] | None]:
+        """
+        Enhanced version of BrickInstructions.find_instructions() that falls back to Peeron.
+        Returns (rebrickable_instructions, peeron_pages).
+        If rebrickable_instructions is empty, peeron_pages will contain Peeron data.
+        """
+        from .instructions import BrickInstructions
+
+        # First try Rebrickable
+        try:
+            rebrickable_instructions = BrickInstructions.find_instructions(set)
+            return rebrickable_instructions, None
+        except ErrorException as e:
+            logger.info(f"Rebrickable failed for {set}: {e}. Trying Peeron fallback...")
+
+            # Fallback to Peeron (must stay inside this handler: `e` is unbound once it exits)
+            try:
+                peeron = PeeronInstructions(set)
+                peeron_pages = peeron.find_pages()
+                return [], peeron_pages
+            except ErrorException as peeron_error:
+                # Both failed, re-raise original Rebrickable error
+                logger.info(f"Peeron also failed for {set}: {peeron_error}")
+                raise e from peeron_error
\ No newline at end of file
diff --git a/bricktracker/peeron_pdf.py b/bricktracker/peeron_pdf.py
new file mode 100644
index 0000000..75d5d08
--- /dev/null
+++ b/bricktracker/peeron_pdf.py
@@ -0,0 +1,270 @@
+import logging
+import os
+import tempfile
+import time
+from typing import Any, TYPE_CHECKING
+
+import cloudscraper
+from flask import current_app
+from PIL import Image
+
+from .exceptions import DownloadException, ErrorException
+from .instructions import BrickInstructions
+from .peeron_instructions import PeeronPage, get_min_image_size, get_peeron_download_delay, get_peeron_instruction_url, get_peeron_request_timeout, create_peeron_scraper
+if TYPE_CHECKING:
+    from .socket import BrickSocket
+
+logger = logging.getLogger(__name__)
+
+
+# PDF generator for Peeron instruction pages
+class PeeronPDF(object):
+    socket: 'BrickSocket'
+    set_number: str
+    version_number: str
+    pages: list[PeeronPage]
+    filename: str
+
+    def __init__(
+        self,
+        set_number: str,
+        version_number: str,
+        pages: list[PeeronPage],
+        /,
+        *,
+        socket: 'BrickSocket',
+    ):
+        # Save the socket
+        self.socket = socket
+
+        # Save set information
+        self.set_number = set_number
+        self.version_number = version_number
+        self.pages = pages
+
+        # Generate filename following BrickTracker conventions
+        self.filename = f"{set_number}-{version_number}_peeron.pdf"
+
+    # Download pages and create PDF
+    def create_pdf(self, /) -> None:
+        """
+        Downloads selected Peeron pages and merges them into a PDF.
+        Reports progress, completion and failure through the socket; never raises.
+        """
+        try:
+            target_path = self._get_target_path()
+
+            # Skip if we already have it
+            if os.path.isfile(target_path):
+                return self.socket.complete(
+                    message=f"File {self.filename} already exists, skipped"
+                )
+
+            # Set up progress tracking
+            total_pages = len(self.pages)
+            self.socket.update_total(total_pages)
+            self.socket.progress_count = 0
+            self.socket.progress(message=f"Starting download of {total_pages} pages")
+
+            # Set up cloudscraper session for all downloads
+            scraper = create_peeron_scraper()
+
+            # First visit the main instruction page to establish session with Peeron
+            try:
+                main_page_url = get_peeron_instruction_url(self.set_number, self.version_number)
+                logger.debug(f"Establishing session by visiting: {main_page_url}")
+                main_response = scraper.get(main_page_url, timeout=get_peeron_request_timeout())
+                logger.debug(f"Main page visit: HTTP {main_response.status_code}")
+            except Exception as e:
+                logger.warning(f"Failed to visit main page: {e}")
+
+            # Download images to temporary files
+            temp_files = []
+            failed_pages = []
+
+            try:
+                for i, page in enumerate(self.pages):
+                    # Add delay between requests to avoid being blocked
+                    if i > 0:
+                        delay_ms = get_peeron_download_delay()
+                        time.sleep(delay_ms / 1000.0)  # Convert milliseconds to seconds
+
+                    temp_file = self._download_page_image(page, i + 1, scraper)
+                    if temp_file:
+                        temp_files.append(temp_file)
+                    else:
+                        failed_pages.append(page.page_number)
+
+                if not temp_files:
+                    # Collect detailed error information
+                    error_msg = f"Failed to download any instruction pages for set {self.set_number}-{self.version_number}."
+
+                    # Check if it's a bot protection issue by trying to access the main page
+                    try:
+                        test_response = scraper.get(get_peeron_instruction_url(self.set_number, self.version_number), timeout=get_peeron_request_timeout())
+                        if test_response.status_code == 403:
+                            error_msg += " Peeron blocked the request (HTTP 403) - bot protection is active."
+                        elif test_response.status_code == 404:
+                            error_msg += " Set not found on Peeron (HTTP 404)."
+                        elif "Browse instruction library" in test_response.text:
+                            error_msg += " Set exists on Peeron but has no instruction scans available."
+                        else:
+                            min_size = get_min_image_size()
+                            error_msg += f" All pages returned small error images (smaller than {min_size}x{min_size}) - likely bot protection."
+                    except Exception:
+                        error_msg += " Could not connect to Peeron - check internet connection."
+
+                    raise DownloadException(error_msg)
+
+                elif len(temp_files) < total_pages:
+                    # Partial success
+                    error_msg = f"Only downloaded {len(temp_files)}/{total_pages} pages successfully."
+                    if failed_pages:
+                        error_msg += f" Failed pages: {', '.join(failed_pages)}."
+                    logger.warning(error_msg)
+
+                # Create PDF from downloaded images
+                self._create_pdf_from_images(temp_files, target_path)
+
+                # Success
+                logger.info(f"Created PDF {self.filename} with {len(temp_files)} pages")
+                self.socket.complete(
+                    message=f"PDF {self.filename} created with {len(temp_files)} pages"
+                )
+
+            finally:
+                # Cleanup temporary files
+                for temp_file in temp_files:
+                    try:
+                        os.remove(temp_file)
+                    except Exception as e:
+                        logger.warning(f"Failed to remove temp file {temp_file}: {e}")
+
+        except Exception as e:
+            logger.error(f"Error creating PDF {self.filename}: {e}")
+            self.socket.fail(
+                message=f"Error creating PDF {self.filename}: {e}"
+            )
+
+    # Download a single page image
+    def _download_page_image(self, page: PeeronPage, page_num: int, scraper, /) -> str | None:
+        """Download one page image to a temporary file; return its path, or None on any failure"""
+        try:
+            logger.debug(f"Attempting to download page {page.page_number} from: {page.image_url}")
+
+            # Download the image using the shared scraper session
+            response = scraper.get(page.image_url, stream=True, timeout=get_peeron_request_timeout())
+            logger.debug(f"Page {page.page_number}: HTTP {response.status_code}, Content-Type: {response.headers.get('content-type', 'unknown')}")
+
+            if not response.ok:
+                logger.warning(f"Failed to download page {page.page_number}: HTTP {response.status_code}")
+                return None
+
+            # Check if response is actually an image (not an error page)
+            content_type = response.headers.get('content-type', '')
+            if not content_type.startswith('image/'):
+                # Log first 500 chars of response for debugging
+                try:
+                    response_text = response.text[:500]
+                    logger.warning(f"Page {page.page_number}: Response is not an image (content-type: {content_type}). Response preview: {response_text}")
+                except Exception:
+                    logger.warning(f"Page {page.page_number}: Response is not an image (content-type: {content_type})")
+                return None
+
+            # Create temporary file
+            temp_fd, temp_path = tempfile.mkstemp(suffix='.jpg', prefix=f'peeron_{page.page_number}_')
+
+            try:
+                with os.fdopen(temp_fd, 'wb') as f:
+                    for chunk in response.iter_content(chunk_size=8192):
+                        if chunk:
+                            f.write(chunk)
+
+                # Validate that we actually got an image (not an HTML error page)
+                try:
+                    with Image.open(temp_path) as test_img:
+                        width, height = test_img.size
+                except Exception as img_error:
+                    logger.warning(f"Page {page.page_number}: Invalid image file - {img_error}")
+                    os.remove(temp_path)
+                    return None
+
+                # Check the size only once the image is closed again, so the
+                # file is never removed while Pillow still holds it open
+                min_size = get_min_image_size()
+                if width < min_size or height < min_size:  # Too small to be a real instruction page
+                    logger.warning(f"Page {page.page_number}: Image too small ({width}x{height}) - likely an error page")
+                    os.remove(temp_path)
+                    return None
+
+                # Update progress
+                self.socket.progress_count += 1
+                self.socket.progress(
+                    message=f"Downloaded page {page.page_number} ({page_num}/{len(self.pages)})"
+                )
+
+                return temp_path
+
+            except Exception:
+                # The file object from os.fdopen() owns (and has closed) the
+                # descriptor; closing temp_fd again could hit a reused fd, so
+                # only the partial file is removed before re-raising
+                try:
+                    os.remove(temp_path)
+                except OSError:
+                    pass
+                raise
+
+        except Exception as e:
+            logger.warning(f"Failed to download page {page.page_number}: {e}")
+            return None
+
+    # Create PDF from downloaded images
+    def _create_pdf_from_images(self, image_paths: list[str], output_path: str, /) -> None:
+        """Create a PDF from a list of image files, one page per image; unreadable images are skipped"""
+        try:
+            # Import FPDF (should be available from requirements)
+            from fpdf import FPDF
+        except ImportError:
+            raise ErrorException("FPDF library not available. Install with: pip install fpdf2")
+
+        pdf = FPDF()
+
+        for i, img_path in enumerate(image_paths):
+            try:
+                # Open image to get dimensions
+                with Image.open(img_path) as image:
+                    width, height = image.size
+
+                # Add page with image dimensions (convert pixels to mm)
+                # 1 pixel = 0.264583 mm (assuming 96 DPI)
+                page_width = width * 0.264583
+                page_height = height * 0.264583
+
+                pdf.add_page(format=(page_width, page_height))
+                pdf.image(img_path, x=0, y=0, w=page_width, h=page_height)
+
+                # Update progress
+                progress_msg = f"Processing page {i + 1}/{len(image_paths)} into PDF"
+                self.socket.progress(message=progress_msg)
+
+            except Exception as e:
+                logger.warning(f"Failed to add image {img_path} to PDF: {e}")
+                continue
+
+        # Save the PDF
+        pdf.output(output_path)
+
+    # Get target file path
+    def _get_target_path(self, /) -> str:
+        """Get the full path where the PDF should be saved"""
+        instructions_folder = os.path.join(
+            current_app.static_folder,  # type: ignore
+            current_app.config['INSTRUCTIONS_FOLDER']
+        )
+        return os.path.join(instructions_folder, self.filename)
+
+    # Create BrickInstructions instance for the generated PDF
+    def get_instructions(self, /) -> BrickInstructions:
+        """Return a BrickInstructions instance for the generated PDF"""
+        return BrickInstructions(self.filename)
\ No newline at end of file
diff --git a/bricktracker/views/instructions.py b/bricktracker/views/instructions.py
index 2c2138a..dc44e34 100644
--- a/bricktracker/views/instructions.py
+++ b/bricktracker/views/instructions.py
@@ -14,6 +14,7 @@ from .exceptions import exception_handler
 from ..instructions import BrickInstructions
 from ..instructions_list import BrickInstructionsList
 from ..parser import parse_set
+from ..peeron_instructions import PeeronInstructions
 from ..socket import MESSAGES
 from .upload import upload_helper
 
@@ -160,12 +161,41 @@ def do_download() -> str:
     except Exception:
         set = ''
 
-    return render_template(
-        'instructions.html',
-        download=True,
-        instructions=BrickInstructions.find_instructions(set),
-        set=set,
-        path=current_app.config['SOCKET_PATH'],
-        namespace=current_app.config['SOCKET_NAMESPACE'],
-        messages=MESSAGES
-    )
+    # Try Rebrickable first, fallback to Peeron if it fails
+    rebrickable_instructions, peeron_pages = PeeronInstructions.find_instructions_with_peeron_fallback(set)
+
+    # Determine which template to render based on what we found
+    if rebrickable_instructions:
+        # Standard Rebrickable instructions found
+        return render_template(
+            'instructions.html',
+            download=True,
+            instructions=rebrickable_instructions,
+            set=set,
+            path=current_app.config['SOCKET_PATH'],
+            namespace=current_app.config['SOCKET_NAMESPACE'],
+            messages=MESSAGES
+        )
+    elif peeron_pages:
+        # Peeron pages found - show page selection interface
+        return render_template(
+            'peeron_select.html',
+            download=True,
+            pages=peeron_pages,
+            set=set,
+            path=current_app.config['SOCKET_PATH'],
+            namespace=current_app.config['SOCKET_NAMESPACE'],
+            messages=MESSAGES
+        )
+    else:
+        # This shouldn't happen as the fallback method re-raises the original error
+        return render_template(
+            'instructions.html',
+            download=True,
+            instructions=[],
+            set=set,
+            error='No instructions found on Rebrickable or Peeron',
+            path=current_app.config['SOCKET_PATH'],
+            namespace=current_app.config['SOCKET_NAMESPACE'],
+            messages=MESSAGES
+        )
diff --git a/templates/instructions/peeron_socket.html b/templates/instructions/peeron_socket.html
new file mode 100644
index 0000000..6fecd0c
--- /dev/null
+++ b/templates/instructions/peeron_socket.html
@@ -0,0 +1,148 @@
+
\ No newline at end of file
diff --git a/templates/peeron_select.html b/templates/peeron_select.html
new file mode 100644
index 0000000..9ee8657
--- /dev/null
+++ b/templates/peeron_select.html
@@ -0,0 +1,88 @@
+{% extends 'base.html' %}
+
+{% block title %} - Download instructions from Peeron{% endblock %}
+
+{% block main %}
+
+ Progress
+
+
+ Loading...
+
+
+