Feat(peeron): Initial upload

This commit is contained in:
2025-09-23 17:30:30 +02:00
parent 90c0c20d75
commit eddf4311d3
5 changed files with 741 additions and 9 deletions
+197
View File
@@ -0,0 +1,197 @@
import logging
from typing import Any, NamedTuple, TYPE_CHECKING
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import cloudscraper
from flask import current_app
import requests
from .exceptions import ErrorException
if TYPE_CHECKING:
from .socket import BrickSocket
logger = logging.getLogger(__name__)
def get_peeron_user_agent():
    """Return the User-Agent header value used for Peeron requests.

    The value is read from the ``REBRICKABLE_USER_AGENT`` config key; a
    desktop Chrome User-Agent string is used when that key is not set.
    """
    fallback_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    )
    return current_app.config.get('REBRICKABLE_USER_AGENT', fallback_agent)
def get_peeron_download_delay():
    """Return the pause between two Peeron page downloads, in milliseconds.

    Read from the ``PEERON_DOWNLOAD_DELAY`` config key; defaults to 1000.
    """
    settings = current_app.config
    return settings.get('PEERON_DOWNLOAD_DELAY', 1000)
def get_min_image_size():
    """Return the smallest side length accepted for a Peeron instruction image.

    Read from the ``PEERON_MIN_IMAGE_SIZE`` config key; defaults to 100.
    """
    settings = current_app.config
    return settings.get('PEERON_MIN_IMAGE_SIZE', 100)
def get_peeron_instruction_url(set_number: str, version_number: str):
    """Build the URL of the Peeron instruction index page for a set.

    The ``PEERON_INSTRUCTION_PATTERN`` config key supplies the template; its
    ``{set_number}`` and ``{version_number}`` placeholders are filled in here.
    """
    template = current_app.config.get(
        'PEERON_INSTRUCTION_PATTERN',
        'http://peeron.com/scans/{set_number}-{version_number}',
    )
    return template.format(
        set_number=set_number,
        version_number=version_number,
    )
def get_peeron_thumbnail_url(set_number: str, version_number: str):
    """Build the base URL under which Peeron serves thumbnails for a set.

    The ``PEERON_THUMBNAIL_PATTERN`` config key supplies the template; its
    ``{set_number}`` and ``{version_number}`` placeholders are filled in here.
    """
    template = current_app.config.get(
        'PEERON_THUMBNAIL_PATTERN',
        'http://belay.peeron.com/thumbs/{set_number}-{version_number}/',
    )
    return template.format(
        set_number=set_number,
        version_number=version_number,
    )
def get_peeron_scan_url(set_number: str, version_number: str):
    """Build the base URL under which Peeron serves full-size scans for a set.

    The ``PEERON_SCAN_PATTERN`` config key supplies the template; its
    ``{set_number}`` and ``{version_number}`` placeholders are filled in here.
    """
    template = current_app.config.get(
        'PEERON_SCAN_PATTERN',
        'http://belay.peeron.com/scans/{set_number}-{version_number}/',
    )
    return template.format(
        set_number=set_number,
        version_number=version_number,
    )
def create_peeron_scraper():
    """Return a new cloudscraper session carrying the Peeron User-Agent header."""
    session = cloudscraper.create_scraper()
    session.headers["User-Agent"] = get_peeron_user_agent()
    return session
class PeeronPage(NamedTuple):
    """One instruction page listed on Peeron, as an immutable record."""

    # Page identifier, taken from the thumbnail URL path
    page_number: str
    # URL of the small preview image
    thumbnail_url: str
    # URL of the full-size scan
    image_url: str
    # Human-readable description of the page
    alt_text: str
# Peeron instruction scraper
class PeeronInstructions(object):
    """Locate the instruction scans that Peeron hosts for one set.

    ``pages`` starts out empty and is replaced by the result of the most
    recent successful ``find_pages()`` call.
    """

    socket: 'BrickSocket | None'
    set_number: str
    version_number: str
    pages: list[PeeronPage]

    def __init__(
        self,
        set_number: str,
        version_number: str = '1',
        /,
        *,
        socket: 'BrickSocket | None' = None,
    ):
        """Store the set identity.

        ``set_number`` may be a bare number ("4011") or carry its version
        ("4011-1"). A version embedded in ``set_number`` takes precedence over
        ``version_number``; an empty one ("4011-") falls back to
        ``version_number``.
        """
        # Save the socket
        self.socket = socket

        # Split on the first dash only: "4011-1" -> ("4011", "1"), "4011" -> ("4011", "")
        base, _, suffix = set_number.partition('-')
        self.set_number = base
        self.version_number = suffix or version_number

        # Placeholder for pages
        self.pages = []

    # Check if instructions exist on Peeron
    def exists(self, /) -> bool:
        """Return True when ``find_pages()`` yields at least one page.

        An ``ErrorException`` raised by ``find_pages()`` is reported as False.
        """
        try:
            return bool(self.find_pages())
        except ErrorException:
            return False

    # Find all available instruction pages on Peeron
    def find_pages(self, /) -> list[PeeronPage]:
        """Scrape the Peeron index page of the set and list its instruction pages.

        The result is also stored in ``self.pages``.

        Raises:
            ErrorException: the request failed, the response status was not
                200, Peeron showed its "Browse instruction library" page, or
                no thumbnail matched the expected markup.
        """
        set_id = f'{self.set_number}-{self.version_number}'
        base_url = get_peeron_instruction_url(self.set_number, self.version_number)
        scan_base_url = get_peeron_scan_url(self.set_number, self.version_number)

        logger.debug(f"[find_pages] fetching HTML from {base_url!r}")

        scraper = create_peeron_scraper()

        # Download the main HTML page
        try:
            response = scraper.get(base_url)
        except requests.exceptions.RequestException as e:
            # Chain the cause so the transport error stays visible in tracebacks
            raise ErrorException(f'Failed to connect to Peeron: {e}') from e

        if response.status_code != 200:
            raise ErrorException(f'Failed to load Peeron page for {set_id}. HTTP {response.status_code}')

        # Parse HTML to locate instruction pages
        soup = BeautifulSoup(response.text, 'html.parser')

        # Peeron answers unknown sets with its library browser instead of an error status
        if soup.find('h1', string="Browse instruction library"):
            raise ErrorException(f'Set {set_id} not found on Peeron')

        # Locate all thumbnail images in the expected table structure
        thumbnails = soup.select('table[cellspacing="5"] a img[src^="http://belay.peeron.com/thumbs/"]')
        if not thumbnails:
            raise ErrorException(f'No instruction pages found for {set_id} on Peeron')

        pages: list[PeeronPage] = []
        for img in thumbnails:
            thumb_url = img['src']

            # The page number is the last path segment, with or without a trailing slash
            page_number = thumb_url.rstrip('/').rsplit('/', 1)[-1]

            # Build the full-size image URL
            image_url = f"{scan_base_url}{page_number}/"

            logger.debug(f"[find_pages] Page {page_number}: thumb={thumb_url}, image={image_url}")

            pages.append(PeeronPage(
                page_number=page_number,
                thumbnail_url=thumb_url,
                image_url=image_url,
                alt_text=f"LEGO Instructions {set_id} Page {page_number}",
            ))

        # Cache the pages for later use
        self.pages = pages

        logger.debug(f"[find_pages] found {len(pages)} pages for {set_id}")
        return pages

    # Find instructions with fallback to Peeron
    @staticmethod
    def find_instructions_with_peeron_fallback(set: str, /) -> tuple[list[tuple[str, str]], list[PeeronPage] | None]:
        """Look up instructions on Rebrickable and fall back to Peeron on failure.

        Returns:
            ``(rebrickable_instructions, None)`` when
            ``BrickInstructions.find_instructions()`` succeeds, or
            ``([], peeron_pages)`` when it raises ``ErrorException`` and the
            Peeron lookup succeeds.

        Raises:
            ErrorException: both lookups failed. The Rebrickable error is
                re-raised with the Peeron error attached as its cause.
        """
        from .instructions import BrickInstructions

        # First try Rebrickable
        try:
            return BrickInstructions.find_instructions(set), None
        except ErrorException as e:
            # The name bound by "except ... as" is deleted when the clause ends,
            # so keep our own reference for the re-raise below
            rebrickable_error = e
            logger.info(f"Rebrickable failed for {set}: {e}. Trying Peeron fallback...")

        # Fallback to Peeron
        try:
            return [], PeeronInstructions(set).find_pages()
        except ErrorException as peeron_error:
            # Both failed, re-raise original Rebrickable error
            logger.info(f"Peeron also failed for {set}: {peeron_error}")
            raise rebrickable_error from peeron_error
+269
View File
@@ -0,0 +1,269 @@
import logging
import os
import tempfile
import time
from typing import Any, TYPE_CHECKING
import cloudscraper
from flask import current_app
from PIL import Image
from .exceptions import DownloadException, ErrorException
from .instructions import BrickInstructions
from .peeron_instructions import PeeronPage, get_min_image_size, get_peeron_download_delay, get_peeron_instruction_url, create_peeron_scraper
if TYPE_CHECKING:
from .socket import BrickSocket
logger = logging.getLogger(__name__)
# PDF generator for Peeron instruction pages
class PeeronPDF(object):
    """Download a selection of Peeron instruction pages and merge them into a PDF.

    Progress, completion and failure are reported through ``socket``.
    """

    socket: 'BrickSocket'
    set_number: str
    version_number: str
    pages: list[PeeronPage]
    filename: str

    def __init__(
        self,
        set_number: str,
        version_number: str,
        pages: list[PeeronPage],
        /,
        *,
        socket: 'BrickSocket',
    ):
        """Store the set identity, the pages to fetch and the output filename."""
        # Save the socket
        self.socket = socket

        # Save set information
        self.set_number = set_number
        self.version_number = version_number
        self.pages = pages

        # Generate filename following BrickTracker conventions
        self.filename = f"{set_number}-{version_number}_peeron.pdf"

    # Download pages and create PDF
    def create_pdf(self, /) -> None:
        """Download every page in ``self.pages`` and write them out as one PDF.

        Nothing is downloaded when the target file already exists. Pages that
        fail to download are skipped with a warning as long as at least one
        page succeeds. No exception leaves this method: any failure is logged
        and reported through ``socket.fail()``.
        """
        try:
            target_path = self._get_target_path()

            # Skip if we already have it
            if os.path.isfile(target_path):
                return self.socket.complete(
                    message=f"File {self.filename} already exists, skipped"
                )

            # Set up progress tracking
            total_pages = len(self.pages)
            self.socket.update_total(total_pages)
            self.socket.progress_count = 0
            self.socket.progress(message=f"Starting download of {total_pages} pages")

            # One scraper session is shared by all downloads
            scraper = create_peeron_scraper()

            # First visit the main instruction page to establish session with Peeron
            try:
                main_page_url = get_peeron_instruction_url(self.set_number, self.version_number)
                logger.debug(f"Establishing session by visiting: {main_page_url}")
                main_response = scraper.get(main_page_url)
                logger.debug(f"Main page visit: HTTP {main_response.status_code}")
            except Exception as e:
                # Best effort only: the page downloads are still attempted
                logger.warning(f"Failed to visit main page: {e}")

            # Download images to temporary files
            temp_files = []
            failed_pages = []

            try:
                for i, page in enumerate(self.pages):
                    # Add delay between requests to avoid being blocked
                    if i > 0:
                        time.sleep(get_peeron_download_delay() / 1000.0)  # milliseconds -> seconds

                    temp_file = self._download_page_image(page, i + 1, scraper)
                    if temp_file:
                        temp_files.append(temp_file)
                    else:
                        failed_pages.append(page.page_number)

                if not temp_files:
                    # Collect detailed error information
                    error_msg = f"Failed to download any instruction pages for set {self.set_number}-{self.version_number}."

                    # Probe the main page again to guess why nothing could be fetched
                    try:
                        test_response = scraper.get(get_peeron_instruction_url(self.set_number, self.version_number))
                        if test_response.status_code == 403:
                            error_msg += " Peeron blocked the request (HTTP 403) - bot protection is active."
                        elif test_response.status_code == 404:
                            error_msg += " Set not found on Peeron (HTTP 404)."
                        elif "Browse instruction library" in test_response.text:
                            error_msg += " Set exists on Peeron but has no instruction scans available."
                        else:
                            min_size = get_min_image_size()
                            error_msg += f" All pages returned small error images (smaller than {min_size}x{min_size}) - likely bot protection."
                    except Exception:
                        error_msg += " Could not connect to Peeron - check internet connection."

                    raise DownloadException(error_msg)

                if len(temp_files) < total_pages:
                    # Partial success: warn, then build the PDF from what we have
                    error_msg = f"Only downloaded {len(temp_files)}/{total_pages} pages successfully."
                    if failed_pages:
                        error_msg += f" Failed pages: {', '.join(failed_pages)}."
                    logger.warning(error_msg)

                # Create PDF from downloaded images
                self._create_pdf_from_images(temp_files, target_path)

                # Success
                logger.info(f"Created PDF {self.filename} with {len(temp_files)} pages")
                self.socket.complete(
                    message=f"PDF {self.filename} created with {len(temp_files)} pages"
                )

            finally:
                # Cleanup temporary files
                for temp_file in temp_files:
                    try:
                        os.remove(temp_file)
                    except OSError as e:
                        logger.warning(f"Failed to remove temp file {temp_file}: {e}")

        except Exception as e:
            logger.error(f"Error creating PDF {self.filename}: {e}")
            self.socket.fail(
                message=f"Error creating PDF {self.filename}: {e}"
            )

    # Download a single page image
    def _download_page_image(self, page: PeeronPage, page_num: int, scraper, /) -> str | None:
        """Download one page into a temporary file using the shared scraper session.

        Returns the path of the temporary file, or None when the page could
        not be fetched or is not a usable image. Never raises: every failure
        is logged as a warning.
        """
        try:
            logger.debug(f"Attempting to download page {page.page_number} from: {page.image_url}")

            # Download the image using the shared scraper session
            response = scraper.get(page.image_url, stream=True)
            try:
                return self._store_page_image(page, page_num, response)
            finally:
                # A streamed response holds its connection until it is closed
                response.close()

        except Exception as e:
            logger.warning(f"Failed to download page {page.page_number}: {e}")
            return None

    # Validate a response and save it as a temporary image file
    def _store_page_image(self, page: PeeronPage, page_num: int, response, /) -> str | None:
        """Write a page response to a temporary file and check it is a real page.

        Returns the temporary file path, or None when the response is not a
        successful image response or the image is smaller than the configured
        minimum. The temporary file is removed on every path that does not
        return its path.
        """
        logger.debug(f"Page {page.page_number}: HTTP {response.status_code}, Content-Type: {response.headers.get('content-type', 'unknown')}")

        if not response.ok:
            logger.warning(f"Failed to download page {page.page_number}: HTTP {response.status_code}")
            return None

        # Check if response is actually an image (not an error page)
        content_type = response.headers.get('content-type', '')
        if not content_type.startswith('image/'):
            # Log first 500 chars of response for debugging
            try:
                response_text = response.text[:500]
                logger.warning(f"Page {page.page_number}: Response is not an image (content-type: {content_type}). Response preview: {response_text}")
            except Exception:
                logger.warning(f"Page {page.page_number}: Response is not an image (content-type: {content_type})")
            return None

        # Create temporary file
        temp_fd, temp_path = tempfile.mkstemp(suffix='.jpg', prefix=f'peeron_{page.page_number}_')

        try:
            # os.fdopen takes ownership of the descriptor and the with block closes
            # it exactly once; it must never be passed to os.close() afterwards
            with os.fdopen(temp_fd, 'wb') as image_file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        image_file.write(chunk)

            # Validate that we actually got an image (not an HTML error page)
            try:
                with Image.open(temp_path) as test_img:
                    width, height = test_img.size
            except Exception as img_error:
                logger.warning(f"Page {page.page_number}: Invalid image file - {img_error}")
                os.remove(temp_path)
                return None

            # Checked after the image is closed so the file can be removed safely
            min_size = get_min_image_size()
            if width < min_size or height < min_size:  # Too small to be a real instruction page
                logger.warning(f"Page {page.page_number}: Image too small ({width}x{height}) - likely an error page")
                os.remove(temp_path)
                return None

            # Update progress
            self.socket.progress_count += 1
            self.socket.progress(
                message=f"Downloaded page {page.page_number} ({page_num}/{len(self.pages)})"
            )

            return temp_path

        except Exception:
            # Do not leave a partial or rejected file behind
            try:
                os.remove(temp_path)
            except OSError as cleanup_error:
                logger.debug(f"Could not remove temp file {temp_path}: {cleanup_error}")
            raise

    # Create PDF from downloaded images
    def _create_pdf_from_images(self, image_paths: list[str], output_path: str, /) -> None:
        """Write a PDF to ``output_path`` with one page per image, sized to the image.

        Images that cannot be added are skipped with a warning.

        Raises:
            ErrorException: the FPDF library is missing, or none of the images
                could be added (so no empty PDF is written).
        """
        try:
            # Import FPDF (should be available from requirements)
            from fpdf import FPDF
        except ImportError as e:
            raise ErrorException("FPDF library not available. Install with: pip install fpdf2") from e

        # 1 pixel = 0.264583 mm (assuming 96 DPI)
        mm_per_pixel = 0.264583

        pdf = FPDF()
        added_pages = 0

        for i, img_path in enumerate(image_paths):
            try:
                # Open image to get dimensions
                with Image.open(img_path) as image:
                    width, height = image.size

                # Add page with image dimensions (convert pixels to mm)
                page_width = width * mm_per_pixel
                page_height = height * mm_per_pixel

                pdf.add_page(format=(page_width, page_height))
                pdf.image(img_path, x=0, y=0, w=page_width, h=page_height)
                added_pages += 1

                # Update progress
                progress_msg = f"Processing page {i + 1}/{len(image_paths)} into PDF"
                self.socket.progress(message=progress_msg)

            except Exception as e:
                logger.warning(f"Failed to add image {img_path} to PDF: {e}")
                continue

        # An empty PDF would be reported as success and block later retries
        if added_pages == 0:
            raise ErrorException("None of the downloaded images could be added to the PDF")

        # Save the PDF
        pdf.output(output_path)

    # Get target file path
    def _get_target_path(self, /) -> str:
        """Return the full path where the PDF should be saved.

        The path is the app's static folder joined with the
        ``INSTRUCTIONS_FOLDER`` config value and ``self.filename``.
        """
        instructions_folder = os.path.join(
            current_app.static_folder,  # type: ignore
            current_app.config['INSTRUCTIONS_FOLDER']
        )
        return os.path.join(instructions_folder, self.filename)

    # Create BrickInstructions instance for the generated PDF
    def get_instructions(self, /) -> BrickInstructions:
        """Return a BrickInstructions instance built from ``self.filename``."""
        return BrickInstructions(self.filename)
+39 -9
View File
@@ -14,6 +14,7 @@ from .exceptions import exception_handler
from ..instructions import BrickInstructions
from ..instructions_list import BrickInstructionsList
from ..parser import parse_set
from ..peeron_instructions import PeeronInstructions
from ..socket import MESSAGES
from .upload import upload_helper
@@ -160,12 +161,41 @@ def do_download() -> str:
except Exception:
set = ''
return render_template(
'instructions.html',
download=True,
instructions=BrickInstructions.find_instructions(set),
set=set,
path=current_app.config['SOCKET_PATH'],
namespace=current_app.config['SOCKET_NAMESPACE'],
messages=MESSAGES
)
# Try Rebrickable first, fallback to Peeron if it fails
rebrickable_instructions, peeron_pages = PeeronInstructions.find_instructions_with_peeron_fallback(set)
# Determine which template to render based on what we found
if rebrickable_instructions:
# Standard Rebrickable instructions found
return render_template(
'instructions.html',
download=True,
instructions=rebrickable_instructions,
set=set,
path=current_app.config['SOCKET_PATH'],
namespace=current_app.config['SOCKET_NAMESPACE'],
messages=MESSAGES
)
elif peeron_pages:
# Peeron pages found - show page selection interface
return render_template(
'peeron_select.html',
download=True,
pages=peeron_pages,
set=set,
path=current_app.config['SOCKET_PATH'],
namespace=current_app.config['SOCKET_NAMESPACE'],
messages=MESSAGES
)
else:
# This shouldn't happen as the fallback method re-raises the original error
return render_template(
'instructions.html',
download=True,
instructions=[],
set=set,
error='No instructions found on Rebrickable or Peeron',
path=current_app.config['SOCKET_PATH'],
namespace=current_app.config['SOCKET_NAMESPACE'],
messages=MESSAGES
)
+148
View File
@@ -0,0 +1,148 @@
<script type="text/javascript">
// Wait for all scripts to load, then initialize Peeron socket
window.addEventListener('load', () => {
// Double-check that BrickSocket is available
if (typeof BrickSocket === 'undefined') {
console.error('BrickSocket is not available after page load');
return;
}
// Define Peeron socket class after BrickSocket is guaranteed to be available
class BrickPeeronSocket extends BrickSocket {
constructor(id, path, namespace, messages) {
super(id, path, namespace, messages, true);
// Form elements
this.html_button = document.getElementById(id);
this.html_files = document.getElementById(`${id}-files`);
if (this.html_button) {
this.html_button.addEventListener("click", (e) => {
this.execute();
});
}
// Add select all button
this.add_select_all_button();
// Setup the socket
this.setup();
}
add_select_all_button() {
if (this.html_button) {
const selectAllButton = document.createElement('button');
selectAllButton.type = 'button';
selectAllButton.className = 'btn btn-sm btn-outline-secondary me-2';
selectAllButton.innerHTML = '<i class="ri-checkbox-multiple-line"></i> Select All';
selectAllButton.addEventListener('click', () => {
const checkboxes = this.get_files();
const allChecked = checkboxes.every(cb => cb.checked);
checkboxes.forEach(cb => cb.checked = !allChecked);
selectAllButton.innerHTML = allChecked ?
'<i class="ri-checkbox-multiple-line"></i> Select All' :
'<i class="ri-checkbox-blank-line"></i> Deselect All';
});
this.html_button.parentNode.insertBefore(selectAllButton, this.html_button);
}
}
complete(data) {
super.complete(data);
// Clear progress display after completion
if (this.html_progress_message) {
this.html_progress_message.classList.add("d-none");
this.html_progress_message.textContent = "";
}
if (this.html_count) {
this.html_count.classList.add("d-none");
this.html_count.textContent = "";
}
// Ensure spinner is hidden
this.spinner(false);
this.toggle(true);
}
execute() {
if (!this.disabled && this.socket !== undefined && this.socket.connected) {
this.toggle(false);
this.download_peeron_pages();
}
}
get_files(checked=false) {
let files = [];
if (this.html_files) {
files = [...this.html_files.querySelectorAll('input[type="checkbox"]')];
if (checked) {
files = files.filter(file => file.checked);
}
}
return files;
}
download_peeron_pages() {
if (this.html_files) {
const selectedFiles = this.get_files(true);
if (selectedFiles.length === 0) {
this.fail({message: "Please select at least one page to download."});
this.toggle(true);
return;
}
const pages = selectedFiles.map(checkbox => ({
page_number: checkbox.getAttribute('data-page-number'),
thumbnail_url: checkbox.getAttribute('data-thumbnail-url'),
image_url: checkbox.getAttribute('data-image-url'),
alt_text: checkbox.getAttribute('data-alt-text')
}));
this.clear();
this.spinner(true);
const setElement = document.querySelector('input[name="download-set"]');
const set = setElement ? setElement.value : '';
this.socket.emit(this.messages.DOWNLOAD_PEERON_PAGES, {
set: set,
pages: pages,
total: pages.length,
current: 0
});
} else {
this.fail({message: "Could not find the list of pages to download"});
}
}
toggle(enabled) {
super.toggle(enabled);
if (this.html_files) {
this.get_files().forEach(el => el.disabled = !enabled);
}
if (this.html_button) {
this.html_button.disabled = !enabled;
}
}
}
// Initialize the socket
try {
new BrickPeeronSocket('peeron-download', '{{ path }}', '{{ namespace }}', {
COMPLETE: '{{ messages['COMPLETE'] }}',
DOWNLOAD_PEERON_PAGES: 'download_peeron_pages',
FAIL: '{{ messages['FAIL'] }}',
PROGRESS: '{{ messages['PROGRESS'] }}',
});
} catch (error) {
console.error('Error initializing BrickPeeronSocket:', error);
}
});
</script>
+88
View File
@@ -0,0 +1,88 @@
{% extends 'base.html' %}
{% block title %} - Download instructions from Peeron{% endblock %}
{% block main %}
<div class="container">
{% if error %}<div class="alert alert-danger" role="alert"><strong>Error:</strong> {{ error }}.</div>{% endif %}
<div class="row">
<div class="col-12">
<form method="POST" action="{{ url_for('instructions.do_download') }}">
<div class="card mb-3">
<div class="card-header">
<h5 class="mb-0"><i class="ri-download-line"></i> Download instructions from Rebrickable</h5>
</div>
<div class="card-body">
<div class="mb-3">
<label for="download-set" class="form-label">Set number (only one)</label>
<input type="text" class="form-control" id="download-set" name="download-set" placeholder="107-1 or 1642-1 or ..." value="{{ set }}">
</div>
</div>
<div class="card-footer text-end">
<button type="submit" class="btn btn-primary"><i class="ri-search-line"></i> Search</button>
</div>
</div>
</form>
{% if pages %}
<div class="alert alert-info" role="alert">
<i class="ri-information-line"></i> <strong>Instructions found on Peeron:</strong> {{ set }} was not available on Rebrickable, but {{ pages|length }} instruction pages were found on Peeron.
</div>
<div class="card mb-3">
<div class="card-header">
<h5 class="mb-0"><i class="ri-checkbox-line"></i> Select instructions to download</h5>
</div>
<div class="card-body">
<div class="mb-3">
<div id="peeron-download-fail" class="alert alert-danger d-none" role="alert"></div>
<div id="peeron-download-complete"></div>
<h5 class="border-bottom">Available Instructions</h5>
<div id="peeron-download-files" class="row g-2">
{% for page in pages %}
<div class="col-12 col-md-6 col-lg-4">
<div class="card border-0 shadow-sm">
<div class="card-body p-2">
<div class="form-check">
<input class="form-check-input" type="checkbox" id="peeron-page-{{ loop.index }}"
data-page-number="{{ page.page_number }}"
data-thumbnail-url="{{ page.thumbnail_url }}"
data-image-url="{{ page.image_url }}"
data-alt-text="{{ page.alt_text }}"
autocomplete="off">
<label class="form-check-label w-100" for="peeron-page-{{ loop.index }}">
<div class="text-center">
<img src="{{ page.thumbnail_url }}" alt="{{ page.alt_text }}" class="img-fluid mb-2 border rounded" style="max-height: 150px;">
<div class="small fw-bold">Page {{ page.page_number }}</div>
</div>
</label>
</div>
</div>
</div>
</div>
{% endfor %}
</div>
</div>
<hr>
<div class="mb-3">
<p>
Progress <span id="peeron-download-count"></span>
<span id="peeron-download-spinner" class="d-none">
<span class="spinner-border spinner-border-sm" aria-hidden="true"></span>
<span class="visually-hidden" role="status">Loading...</span>
</span>
</p>
<div id="peeron-download-progress" class="progress" role="progressbar" aria-label="Download Peeron instructions progress" aria-valuenow="0" aria-valuemin="0" aria-valuemax="100">
<div id="peeron-download-progress-bar" class="progress-bar" style="width: 0%"></div>
</div>
<p id="peeron-download-progress-message" class="text-center d-none"></p>
</div>
</div>
<div class="card-footer text-end">
<span id="peeron-download-status-icon" class="me-1"></span><span id="peeron-download-status" class="me-1"></span><button id="peeron-download" type="button" class="btn btn-primary"><i class="ri-download-line"></i> Download selected files</button>
</div>
</div>
{% include 'instructions/peeron_socket.html' %}
{% endif %}
</div>
</div>
</div>
{% endblock %}