Merge pull request 'Fixed the rebrickable scraping to deal with changes' (#81) from hiddenside/BrickTracker:fix-instructions-download into master

Reviewed-on: #81
This commit was merged in pull request #81.
2025-08-08 19:47:14 +02:00
2 changed files with 94 additions and 97 deletions

View File

@@ -1,6 +1,7 @@
 from datetime import datetime, timezone
 import logging
 import os
+from urllib.parse import urljoin
 from shutil import copyfileobj
 import traceback
 from typing import Tuple, TYPE_CHECKING
@@ -11,6 +12,8 @@ import humanize
 import requests
 from werkzeug.datastructures import FileStorage
 from werkzeug.utils import secure_filename
+import re
+import cloudscraper
 
 from .exceptions import ErrorException, DownloadException
 
 if TYPE_CHECKING:
@@ -89,91 +92,72 @@ class BrickInstructions(object):
     # Download an instruction file
     def download(self, path: str, /) -> None:
+        """
+        Streams the PDF in chunks and uses self.socket.update_total
+        + self.socket.progress_count to drive a determinate bar.
+        """
         try:
+            # Just to make sure that the progress is initiated
+            self.socket.progress(
+                message='Downloading {file}'.format(
+                    file=self.filename,
+                )
+            )
+
             target = self.path(filename=secure_filename(self.filename))
 
-            # Skipping rather than failing here
+            # Skip if we already have it
             if os.path.isfile(target):
-                self.socket.complete(
-                    message='File {file} already exists, skipped'.format(
-                        file=self.filename,
-                    )
+                return self.socket.complete(
+                    message=f"File {self.filename} already exists, skipped"
                 )
-            else:
-                url = current_app.config['REBRICKABLE_LINK_INSTRUCTIONS_PATTERN'].format(  # noqa: E501
-                    path=path
-                )
-                trimmed_url = current_app.config['REBRICKABLE_LINK_INSTRUCTIONS_PATTERN'].format(  # noqa: E501
-                    path=path.partition('/')[0]
-                )
 
-                # Request the file
-                self.socket.progress(
-                    message='Requesting {url}'.format(
-                        url=trimmed_url,
-                    )
-                )
+            # Fetch the PDF via cloudscraper (to bypass Cloudflare)
+            scraper = cloudscraper.create_scraper()
+            scraper.headers.update({
+                "User-Agent": current_app.config['REBRICKABLE_USER_AGENT']
+            })
+            resp = scraper.get(path, stream=True)
+            if not resp.ok:
+                raise DownloadException(f"Failed to download: HTTP {resp.status_code}")
 
-                response = requests.get(url, stream=True)
-                if response.ok:
-                    # Store the content header as size
-                    try:
-                        self.size = int(
-                            response.headers.get('Content-length', 0)
-                        )
-                    except Exception:
-                        self.size = 0
+            # Tell the socket how many bytes there are in total
+            total = int(resp.headers.get("Content-Length", 0))
+            self.socket.update_total(total)
 
-                    # Downloading the file
-                    self.socket.progress(
-                        message='Downloading {url} ({size})'.format(
-                            url=trimmed_url,
-                            size=self.human_size(),
-                        )
-                    )
+            # Reset the counter and kick off at 0%
+            self.socket.progress_count = 0
+            self.socket.progress(message=f"Starting download {self.filename}")
 
-                    with open(target, 'wb') as f:
-                        copyfileobj(response.raw, f)
-                else:
-                    raise DownloadException('failed to download: {code}'.format(  # noqa: E501
-                        code=response.status_code
-                    ))
+            # Write out in 8 KiB chunks and update the counter
+            with open(target, "wb") as f:
+                for chunk in resp.iter_content(chunk_size=8192):
+                    if not chunk:
+                        continue
+                    f.write(chunk)
 
-                # Info
-                logger.info('The instruction file {file} has been downloaded'.format(  # noqa: E501
-                    file=self.filename
-                ))
+                    # Bump the internal counter and emit progress
+                    self.socket.progress_count += len(chunk)
+                    self.socket.progress(
+                        message=(
+                            f"Downloading {self.filename} "
+                            f"({humanize.naturalsize(self.socket.progress_count)}/"
+                            f"{humanize.naturalsize(self.socket.progress_total)})"
+                        )
+                    )
 
-                # Complete
-                self.socket.complete(
-                    message='File {file} downloaded ({size})'.format(  # noqa: E501
-                        file=self.filename,
-                        size=self.human_size()
-                    )
-                )
+            # Done!
+            logger.info(f"Downloaded {self.filename}")
+            self.socket.complete(
+                message=f"File {self.filename} downloaded ({self.human_size()})"
+            )
+
         except Exception as e:
+            logger.debug(traceback.format_exc())
             self.socket.fail(
-                message='Error while downloading instruction {file}: {error}'.format(  # noqa: E501
-                    file=self.filename,
-                    error=e,
-                )
+                message=f"Error downloading {self.filename}: {e}"
             )
 
-            logger.debug(traceback.format_exc())
-
     # Display the size in a human format
     def human_size(self) -> str:
-        return humanize.naturalsize(self.size)
+        try:
+            size = self.size
+        except AttributeError:
+            size = os.path.getsize(self.path())
+        return humanize.naturalsize(size)
 
     # Display the time in a human format
     def human_time(self) -> str:
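
For readers who want to exercise the progress pattern above in isolation, the sketch below strips it down to a plain function with a callback in place of the socket. It is a minimal illustration, assuming only cloudscraper and the standard library; the helper name download_with_progress and the on_progress callback are hypothetical, not part of BrickTracker's API.

import cloudscraper

# Hypothetical helper mirroring the chunked download above (not BrickTracker API)
def download_with_progress(url: str, target: str, on_progress=print) -> None:
    scraper = cloudscraper.create_scraper()  # behaves like a requests.Session
    resp = scraper.get(url, stream=True)
    resp.raise_for_status()

    total = int(resp.headers.get("Content-Length", 0))
    done = 0
    with open(target, "wb") as f:
        for chunk in resp.iter_content(chunk_size=8192):
            if not chunk:  # skip keep-alive chunks
                continue
            f.write(chunk)
            done += len(chunk)
            on_progress(f"{done} of {total or 'unknown'} bytes")

Streaming with iter_content keeps memory flat regardless of PDF size, which is why the diff replaces copyfileobj(response.raw, ...) with an explicit chunk loop: the loop is what creates the hook points for per-chunk progress updates.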
@@ -250,40 +234,52 @@ class BrickInstructions(object):
     # Find the instructions for a set
     @staticmethod
     def find_instructions(set: str, /) -> list[Tuple[str, str]]:
-        response = requests.get(
-            current_app.config['REBRICKABLE_LINK_INSTRUCTIONS_PATTERN'].format(
-                path=set,
-            ),
-            headers={
-                'User-Agent': current_app.config['REBRICKABLE_USER_AGENT']
-            }
-        )
+        """
+        Scrape Rebrickable's HTML and return a list of
+        (filename_slug, download_url). Duplicate slugs get _1, _2, …
+        """
+        page_url = f"https://rebrickable.com/instructions/{set}/"
+        logger.debug(f"[find_instructions] fetching HTML from {page_url!r}")
 
-        if not response.ok:
-            raise ErrorException('Failed to load the Rebrickable instructions page. Status code: {code}'.format(  # noqa: E501
-                code=response.status_code
-            ))
+        # Solve Cloudflare's challenge
+        scraper = cloudscraper.create_scraper()
+        scraper.headers.update({'User-Agent': current_app.config['REBRICKABLE_USER_AGENT']})
+        resp = scraper.get(page_url)
+        if not resp.ok:
+            raise ErrorException(f'Failed to load instructions page for {set}. HTTP {resp.status_code}')
 
-        # Parse the HTML content
-        soup = BeautifulSoup(response.content, 'html.parser')
+        soup = BeautifulSoup(resp.content, 'html.parser')
+        link_re = re.compile(r'^/instructions/\d+/.+/download/')
+        raw: list[tuple[str, str]] = []
+        for a in soup.find_all('a', href=link_re):
+            img = a.find('img', alt=True)
+            if not img or set not in img['alt']:
+                continue
 
-        # Collect all <img> tags with "LEGO Building Instructions" in the
-        # alt attribute
-        found_tags: list[Tuple[str, str]] = []
-        for a_tag in soup.find_all('a', href=True):
-            img_tag = a_tag.find('img', alt=True)
-            if img_tag and "LEGO Building Instructions" in img_tag['alt']:
-                found_tags.append(
-                    (
-                        img_tag['alt'].removeprefix('LEGO Building Instructions for '),  # noqa: E501
-                        a_tag['href']
-                    )
-                )  # Save alt and href
+            # Turn the alt text into a slug
+            alt_text = img['alt'].removeprefix('LEGO Building Instructions for ')
+            slug = re.sub(r'[^A-Za-z0-9]+', '-', alt_text).strip('-')
 
-        # Raise an error if nothing found
-        if not len(found_tags):
-            raise ErrorException('No instruction found for set {set}'.format(
-                set=set
-            ))
+            # Build the absolute download URL
+            download_url = urljoin('https://rebrickable.com', a['href'])
+            raw.append((slug, download_url))
 
-        return found_tags
+        if not raw:
+            raise ErrorException(f'No download links found on instructions page for {set}')
+
+        # Disambiguate duplicate slugs by appending _1, _2, …
+        from collections import Counter, defaultdict
+        counts = Counter(name for name, _ in raw)
+        seen: dict[str, int] = defaultdict(int)
+        unique: list[tuple[str, str]] = []
+        for name, url in raw:
+            idx = seen[name]
+            if counts[name] > 1 and idx > 0:
+                final_name = f"{name}_{idx}"
+            else:
+                final_name = name
+            seen[name] += 1
+            unique.append((final_name, url))
+
+        return unique
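
The slug de-duplication at the end of find_instructions is self-contained and worth checking in isolation, since the first occurrence keeps its bare name and only later duplicates get a suffix. A minimal sketch of the same logic with made-up data (the set name is only an example):

from collections import Counter, defaultdict

def disambiguate(pairs: list[tuple[str, str]]) -> list[tuple[str, str]]:
    # Append _1, _2, … to repeated names; the first occurrence stays bare
    counts = Counter(name for name, _ in pairs)
    seen: dict[str, int] = defaultdict(int)
    out: list[tuple[str, str]] = []
    for name, url in pairs:
        idx = seen[name]
        final = f"{name}_{idx}" if counts[name] > 1 and idx > 0 else name
        seen[name] += 1
        out.append((final, url))
    return out

print(disambiguate([("10294-1-Titanic", "u1"), ("10294-1-Titanic", "u2")]))
# [('10294-1-Titanic', 'u1'), ('10294-1-Titanic_1', 'u2')]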

View File

@@ -9,3 +9,4 @@ rebrick
 requests
 tzdata
 bs4
+cloudscraper
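
cloudscraper installs from PyPI like the rest of the requirements. A quick smoke test after pip install -r requirements.txt, using an example instructions URL in the same shape the code builds (any set page will do):

import cloudscraper

scraper = cloudscraper.create_scraper()  # drop-in requests.Session replacement
resp = scraper.get("https://rebrickable.com/instructions/10294-1/")
print(resp.status_code)  # 200 once the Cloudflare challenge is solved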