Fixed the Rebrickable scraping to deal with site changes (Cloudflare challenge now solved via cloudscraper, and download links are scraped from the new HTML layout)

Created a common naming schema for the instructions when downloaded
	setnumber-set-name-rebrickable-name
so set 3816-1 Glove World would end up
	3816-1-Glove-World-BI-3004-32-3816-V-29-39
If there is ever a duplicate name, a numeric suffix (_1, _2, ...) is appended to later occurrences to keep filenames unique
This commit is contained in:
jl
2025-07-31 22:24:55 -07:00
committed by hiddenside
parent cb24cfc014
commit 07be7b6004
2 changed files with 74 additions and 99 deletions

View File

@@ -1,6 +1,7 @@
from datetime import datetime, timezone from datetime import datetime, timezone
import logging import logging
import os import os
from urllib.parse import urljoin
from shutil import copyfileobj from shutil import copyfileobj
import traceback import traceback
from typing import Tuple, TYPE_CHECKING from typing import Tuple, TYPE_CHECKING
@@ -11,6 +12,8 @@ import humanize
import requests import requests
from werkzeug.datastructures import FileStorage from werkzeug.datastructures import FileStorage
from werkzeug.utils import secure_filename from werkzeug.utils import secure_filename
import re
import cloudscraper
from .exceptions import ErrorException, DownloadException from .exceptions import ErrorException, DownloadException
if TYPE_CHECKING: if TYPE_CHECKING:
@@ -90,85 +93,44 @@ class BrickInstructions(object):
# Download an instruction file
def download(self, path: str, /) -> None:
    """Download the instruction file at *path* into the instructions folder.

    Args:
        path: the absolute download URL (as returned by find_instructions()).

    Never raises to the caller: progress, completion and failure are all
    reported through self.socket, and failures are logged.
    """
    try:
        # Initiate the progress display
        self.socket.progress(message=f'Downloading {self.filename}')

        target = self.path(filename=secure_filename(self.filename))

        # Skip (rather than fail) when the file already exists on disk
        if os.path.isfile(target):
            self.socket.complete(
                message=f'File {self.filename} already exists, skipped'
            )
            return

        # path is already a full URL from find_instructions()
        url = path
        self.socket.progress(message=f'Requesting {url}')

        # Use cloudscraper to pass the Cloudflare challenge here too
        scraper = cloudscraper.create_scraper()
        scraper.headers.update({
            'User-Agent': current_app.config['REBRICKABLE_USER_AGENT']
        })

        # Stream the response and make sure it is closed when done
        with scraper.get(url, stream=True) as response:
            if not response.ok:
                raise DownloadException(
                    f'Failed to download: HTTP {response.status_code}'
                )

            # Record the size if available (0 when missing or malformed)
            try:
                self.size = int(response.headers.get('Content-Length', 0))
            except ValueError:
                self.size = 0

            # Download to disk. decode_content=True makes urllib3 undo any
            # Content-Encoding (gzip/deflate) so the bytes written are the
            # actual file, not the compressed transport stream.
            self.socket.progress(
                message=f'Downloading {self.filename} ({self.human_size()})'
            )
            response.raw.decode_content = True
            with open(target, 'wb') as f:
                copyfileobj(response.raw, f)

        logger.info(f'The instruction file {self.filename} has been downloaded')
        self.socket.complete(
            message=f'File {self.filename} downloaded ({self.human_size()})'
        )
    except Exception as e:
        self.socket.fail(message=f'Error downloading {self.filename}: {e}')
        logger.debug(traceback.format_exc())
# Display the size in a human format # Display the size in a human format
@@ -250,40 +212,52 @@ class BrickInstructions(object):
# Find the instructions for a set
@staticmethod
def find_instructions(set_number: str, /) -> list[Tuple[str, str]]:
    """Scrape the Rebrickable instructions page for *set_number*.

    Returns:
        A list of (filename_slug, download_url) tuples. Duplicate slugs
        are disambiguated by appending _1, _2, ... to later occurrences.

    Raises:
        ErrorException: when the page cannot be fetched or contains no
            matching download links.
    """
    page_url = f"https://rebrickable.com/instructions/{set_number}/"
    logger.debug(f"[find_instructions] fetching HTML from {page_url!r}")

    # Solve Cloudflare's challenge before requesting the page
    scraper = cloudscraper.create_scraper()
    scraper.headers.update({
        'User-Agent': current_app.config['REBRICKABLE_USER_AGENT']
    })
    resp = scraper.get(page_url)
    if not resp.ok:
        raise ErrorException(
            f'Failed to load instructions page for {set_number}. '
            f'HTTP {resp.status_code}'
        )

    soup = BeautifulSoup(resp.content, 'html.parser')

    # Download links look like /instructions/<id>/<slug>/download/
    link_re = re.compile(r'^/instructions/\d+/.+/download/')
    raw: list[tuple[str, str]] = []
    for a in soup.find_all('a', href=link_re):
        # Only keep links whose thumbnail alt text mentions this set
        img = a.find('img', alt=True)
        if not img or set_number not in img['alt']:
            continue

        # Turn the alt text into a filesystem-friendly slug
        alt_text = img['alt'].removeprefix('LEGO Building Instructions for ')
        slug = re.sub(r'[^A-Za-z0-9]+', '-', alt_text).strip('-')

        # Build the absolute download URL
        raw.append((slug, urljoin('https://rebrickable.com', a['href'])))

    if not raw:
        raise ErrorException(
            f'No download links found on instructions page for {set_number}'
        )

    # Disambiguate duplicate slugs in a single pass: the first occurrence
    # keeps its name, later ones get _1, _2, ... appended. (No pre-count
    # needed — a name with a prior occurrence is by definition duplicated.)
    seen: dict[str, int] = {}
    unique: list[tuple[str, str]] = []
    for name, url in raw:
        n = seen.get(name, 0)
        unique.append((f"{name}_{n}" if n else name, url))
        seen[name] = n + 1
    return unique

View File

@@ -9,3 +9,4 @@ rebrick
requests requests
tzdata tzdata
bs4 bs4
cloudscraper