Merge pull request 'Fixed the rebrickable scraping to deal with changes' (#81) from hiddenside/BrickTracker:fix-instructions-download into master
Reviewed-on: #81
This commit was merged in pull request #81.
This commit is contained in:
@@ -1,6 +1,7 @@
|
|||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
from urllib.parse import urljoin
|
||||||
from shutil import copyfileobj
|
from shutil import copyfileobj
|
||||||
import traceback
|
import traceback
|
||||||
from typing import Tuple, TYPE_CHECKING
|
from typing import Tuple, TYPE_CHECKING
|
||||||
@@ -11,6 +12,8 @@ import humanize
|
|||||||
import requests
|
import requests
|
||||||
from werkzeug.datastructures import FileStorage
|
from werkzeug.datastructures import FileStorage
|
||||||
from werkzeug.utils import secure_filename
|
from werkzeug.utils import secure_filename
|
||||||
|
import re
|
||||||
|
import cloudscraper
|
||||||
|
|
||||||
from .exceptions import ErrorException, DownloadException
|
from .exceptions import ErrorException, DownloadException
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
@@ -89,91 +92,72 @@ class BrickInstructions(object):
|
|||||||
|
|
||||||
# Download an instruction file
|
# Download an instruction file
|
||||||
def download(self, path: str, /) -> None:
|
def download(self, path: str, /) -> None:
|
||||||
|
"""
|
||||||
|
Streams the PDF in chunks and uses self.socket.update_total
|
||||||
|
+ self.socket.progress_count to drive a determinate bar.
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
# Just to make sure that the progress is initiated
|
|
||||||
self.socket.progress(
|
|
||||||
message='Downloading {file}'.format(
|
|
||||||
file=self.filename,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
target = self.path(filename=secure_filename(self.filename))
|
target = self.path(filename=secure_filename(self.filename))
|
||||||
|
|
||||||
# Skipping rather than failing here
|
# Skip if we already have it
|
||||||
if os.path.isfile(target):
|
if os.path.isfile(target):
|
||||||
self.socket.complete(
|
return self.socket.complete(
|
||||||
message='File {file} already exists, skipped'.format(
|
message=f"File {self.filename} already exists, skipped"
|
||||||
file=self.filename,
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
else:
|
# Fetch PDF via cloudscraper (to bypass Cloudflare)
|
||||||
url = current_app.config['REBRICKABLE_LINK_INSTRUCTIONS_PATTERN'].format( # noqa: E501
|
scraper = cloudscraper.create_scraper()
|
||||||
path=path
|
scraper.headers.update({
|
||||||
)
|
"User-Agent": current_app.config['REBRICKABLE_USER_AGENT']
|
||||||
trimmed_url = current_app.config['REBRICKABLE_LINK_INSTRUCTIONS_PATTERN'].format( # noqa: E501
|
})
|
||||||
path=path.partition('/')[0]
|
resp = scraper.get(path, stream=True)
|
||||||
)
|
if not resp.ok:
|
||||||
|
raise DownloadException(f"Failed to download: HTTP {resp.status_code}")
|
||||||
|
|
||||||
# Request the file
|
# Tell the socket how many bytes in total
|
||||||
self.socket.progress(
|
total = int(resp.headers.get("Content-Length", 0))
|
||||||
message='Requesting {url}'.format(
|
self.socket.update_total(total)
|
||||||
url=trimmed_url,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
response = requests.get(url, stream=True)
|
# Reset the counter and kick off at 0%
|
||||||
if response.ok:
|
self.socket.progress_count = 0
|
||||||
|
self.socket.progress(message=f"Starting download {self.filename}")
|
||||||
|
|
||||||
# Store the content header as size
|
# Write out in 8 KiB chunks and update the counter
|
||||||
try:
|
with open(target, "wb") as f:
|
||||||
self.size = int(
|
for chunk in resp.iter_content(chunk_size=8192):
|
||||||
response.headers.get('Content-length', 0)
|
if not chunk:
|
||||||
)
|
continue
|
||||||
except Exception:
|
f.write(chunk)
|
||||||
self.size = 0
|
|
||||||
|
|
||||||
# Downloading the file
|
# Bump the internal counter and emit
|
||||||
|
self.socket.progress_count += len(chunk)
|
||||||
self.socket.progress(
|
self.socket.progress(
|
||||||
message='Downloading {url} ({size})'.format(
|
message=(
|
||||||
url=trimmed_url,
|
f"Downloading {self.filename} "
|
||||||
size=self.human_size(),
|
f"({humanize.naturalsize(self.socket.progress_count)}/"
|
||||||
|
f"{humanize.naturalsize(self.socket.progress_total)})"
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
with open(target, 'wb') as f:
|
# Done!
|
||||||
copyfileobj(response.raw, f)
|
logger.info(f"Downloaded {self.filename}")
|
||||||
else:
|
self.socket.complete(
|
||||||
raise DownloadException('failed to download: {code}'.format( # noqa: E501
|
message=f"File {self.filename} downloaded ({self.human_size()})"
|
||||||
code=response.status_code
|
|
||||||
))
|
|
||||||
|
|
||||||
# Info
|
|
||||||
logger.info('The instruction file {file} has been downloaded'.format( # noqa: E501
|
|
||||||
file=self.filename
|
|
||||||
))
|
|
||||||
|
|
||||||
# Complete
|
|
||||||
self.socket.complete(
|
|
||||||
message='File {file} downloaded ({size})'.format( # noqa: E501
|
|
||||||
file=self.filename,
|
|
||||||
size=self.human_size()
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
self.socket.fail(
|
|
||||||
message='Error while downloading instruction {file}: {error}'.format( # noqa: E501
|
|
||||||
file=self.filename,
|
|
||||||
error=e,
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
logger.debug(traceback.format_exc())
|
logger.debug(traceback.format_exc())
|
||||||
|
self.socket.fail(
|
||||||
|
message=f"Error downloading {self.filename}: {e}"
|
||||||
|
)
|
||||||
|
|
||||||
# Display the size in a human format
|
# Display the size in a human format
|
||||||
def human_size(self) -> str:
|
def human_size(self) -> str:
|
||||||
return humanize.naturalsize(self.size)
|
try:
|
||||||
|
size = self.size
|
||||||
|
except AttributeError:
|
||||||
|
size = os.path.getsize(self.path())
|
||||||
|
return humanize.naturalsize(size)
|
||||||
|
|
||||||
# Display the time in a human format
|
# Display the time in a human format
|
||||||
def human_time(self) -> str:
|
def human_time(self) -> str:
|
||||||
@@ -250,40 +234,52 @@ class BrickInstructions(object):
|
|||||||
# Find the instructions for a set
|
# Find the instructions for a set
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def find_instructions(set: str, /) -> list[Tuple[str, str]]:
|
def find_instructions(set: str, /) -> list[Tuple[str, str]]:
|
||||||
response = requests.get(
|
"""
|
||||||
current_app.config['REBRICKABLE_LINK_INSTRUCTIONS_PATTERN'].format(
|
Scrape Rebrickable’s HTML and return a list of
|
||||||
path=set,
|
(filename_slug, download_url). Duplicate slugs get _1, _2, …
|
||||||
),
|
"""
|
||||||
headers={
|
page_url = f"https://rebrickable.com/instructions/{set}/"
|
||||||
'User-Agent': current_app.config['REBRICKABLE_USER_AGENT']
|
logger.debug(f"[find_instructions] fetching HTML from {page_url!r}")
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
if not response.ok:
|
# Solve Cloudflare’s challenge
|
||||||
raise ErrorException('Failed to load the Rebrickable instructions page. Status code: {code}'.format( # noqa: E501
|
scraper = cloudscraper.create_scraper()
|
||||||
code=response.status_code
|
scraper.headers.update({'User-Agent': current_app.config['REBRICKABLE_USER_AGENT']})
|
||||||
))
|
resp = scraper.get(page_url)
|
||||||
|
if not resp.ok:
|
||||||
|
raise ErrorException(f'Failed to load instructions page for {set}. HTTP {resp.status_code}')
|
||||||
|
|
||||||
# Parse the HTML content
|
soup = BeautifulSoup(resp.content, 'html.parser')
|
||||||
soup = BeautifulSoup(response.content, 'html.parser')
|
link_re = re.compile(r'^/instructions/\d+/.+/download/')
|
||||||
|
|
||||||
# Collect all <img> tags with "LEGO Building Instructions" in the
|
raw: list[tuple[str, str]] = []
|
||||||
# alt attribute
|
for a in soup.find_all('a', href=link_re):
|
||||||
found_tags: list[Tuple[str, str]] = []
|
img = a.find('img', alt=True)
|
||||||
for a_tag in soup.find_all('a', href=True):
|
if not img or set not in img['alt']:
|
||||||
img_tag = a_tag.find('img', alt=True)
|
continue
|
||||||
if img_tag and "LEGO Building Instructions" in img_tag['alt']:
|
|
||||||
found_tags.append(
|
|
||||||
(
|
|
||||||
img_tag['alt'].removeprefix('LEGO Building Instructions for '), # noqa: E501
|
|
||||||
a_tag['href']
|
|
||||||
)
|
|
||||||
) # Save alt and href
|
|
||||||
|
|
||||||
# Raise an error if nothing found
|
# Turn the alt text into a slug
|
||||||
if not len(found_tags):
|
alt_text = img['alt'].removeprefix('LEGO Building Instructions for ')
|
||||||
raise ErrorException('No instruction found for set {set}'.format(
|
slug = re.sub(r'[^A-Za-z0-9]+', '-', alt_text).strip('-')
|
||||||
set=set
|
|
||||||
))
|
|
||||||
|
|
||||||
return found_tags
|
# Build the absolute download URL
|
||||||
|
download_url = urljoin('https://rebrickable.com', a['href'])
|
||||||
|
raw.append((slug, download_url))
|
||||||
|
|
||||||
|
if not raw:
|
||||||
|
raise ErrorException(f'No download links found on instructions page for {set}')
|
||||||
|
|
||||||
|
# Disambiguate duplicate slugs by appending _1, _2, …
|
||||||
|
from collections import Counter, defaultdict
|
||||||
|
counts = Counter(name for name, _ in raw)
|
||||||
|
seen: dict[str, int] = defaultdict(int)
|
||||||
|
unique: list[tuple[str, str]] = []
|
||||||
|
for name, url in raw:
|
||||||
|
idx = seen[name]
|
||||||
|
if counts[name] > 1 and idx > 0:
|
||||||
|
final_name = f"{name}_{idx}"
|
||||||
|
else:
|
||||||
|
final_name = name
|
||||||
|
seen[name] += 1
|
||||||
|
unique.append((final_name, url))
|
||||||
|
|
||||||
|
return unique
|
||||||
|
|||||||
@@ -9,3 +9,4 @@ rebrick
|
|||||||
requests
|
requests
|
||||||
tzdata
|
tzdata
|
||||||
bs4
|
bs4
|
||||||
|
cloudscraper
|
||||||
|
|||||||
Reference in New Issue
Block a user