InstructionsDownloader/instDownloader.py

119 lines
4.7 KiB
Python
Raw Permalink Normal View History

2025-01-20 22:41:22 +01:00
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
def get_instruction_links(set_id):
    """Scrape the Rebrickable instructions page of a set for download links.

    Every ``<a href>`` that wraps an ``<img>`` whose ``alt`` text contains
    "LEGO Building Instructions" is collected. Entries whose alt text
    mentions "V29" (or "V 29") are preferred over the rest.

    Args:
        set_id: Set identifier inserted into the page URL (e.g. "10313-1").

    Returns:
        * A list of ``(x, y, href)`` tuples sorted by ``x`` when at least one
          V29 entry was found. ``y`` is ``None`` for entries whose alt text
          has no "x/y" marker; those get a sequential ``x``.
        * Otherwise the list of all ``(alt_text, href)`` tuples that were
          found (possibly empty).
        * An empty list when the page could not be loaded.
    """
    # Construct the URL
    url = f"https://rebrickable.com/instructions/{set_id}"

    # Load the page with a browser-like User-Agent
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        # The timeout keeps the script from hanging forever on a stalled
        # connection; transport errors are reported like a bad status code.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to load page: {exc}")
        return []
    if response.status_code != 200:
        print(f"Failed to load page. Status code: {response.status_code}")
        return []

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Collect (alt, href) for every link wrapping an instructions image
    found_tags = []
    for a_tag in soup.find_all('a', href=True):
        img_tag = a_tag.find('img', alt=True)
        if img_tag and "LEGO Building Instructions" in img_tag['alt']:
            found_tags.append((img_tag['alt'], a_tag['href']))

    # Keep only the entries that mention "V29"
    v29_links = []
    for alt_text, href in found_tags:
        if "V29" not in alt_text and "V 29" not in alt_text:
            continue
        print(alt_text)

        if '/' not in alt_text:
            # Assign sequential numbers if no x/y format is found
            v29_links.append((len(v29_links) + 1, None, href))
            continue

        # Only the single character on either side of the first '/' is read,
        # so x and y are always one-digit values here.
        parts = alt_text.split('/')
        try:
            x = int(parts[0][-1])
            y = int(parts[1][0])
        except (ValueError, IndexError):
            # IndexError: nothing before/after the slash (e.g. "... 1/").
            print(f"Failed to parse x/y in alt text: {alt_text}")
            continue
        v29_links.append((x, y, href))

    # Sort the V29 links by x value
    v29_links.sort(key=lambda link: link[0])

    # If no V29 links found, return all found tags
    if not v29_links:
        print("No instructions with 'V29' found. Found the following tags:")
        for alt_text, href in found_tags:
            print(f"ALT: {alt_text}, HREF: {href}")
        return found_tags  # Return all links for downloading

    return v29_links
def normalize_alt(alt_text):
    """Turn an image alt text into a file-name-friendly string.

    Slashes and spaces become hyphens; commas and periods are removed.
    All other characters are kept unchanged.
    """
    # One pass: map "/" and " " to "-", delete "," and "."
    table = str.maketrans("/ ", "--", ",.")
    return alt_text.translate(table)
def download_instructions(set_id, links, is_v29=True):
    """Download every linked instruction PDF into the current directory.

    Args:
        set_id: Set identifier used as the file-name prefix.
        links: ``(x, y, href)`` tuples when ``is_v29`` is true, otherwise
            ``(alt_text, href)`` tuples.
        is_v29: Selects the tuple layout and the naming scheme.

    File names:
        * V29, single link: ``<set_id>.pdf``
        * V29, several links: ``<set_id>+<n>.pdf`` where ``n`` is the
          1-based position of the link in ``links``
        * non-V29: ``<set_id>-<normalized alt text>.pdf``

    A failed download is reported on stdout and the remaining links are
    still processed.
    """
    base_url = "https://rebrickable.com"

    for index, link in enumerate(links):
        if is_v29:
            # V29 links have three elements: (x, y, href)
            href = link[2]
            if len(links) == 1:
                # Single link, use the set ID only
                file_name = f"{set_id}.pdf"
            else:
                # Multiple links, number them by position in the list
                file_name = f"{set_id}+{index + 1}.pdf"
        else:
            # Non-V29 links have two elements: (alt_text, href)
            alt_text, href = link
            normalized_name = normalize_alt(alt_text)
            file_name = f"{set_id}-{normalized_name}.pdf"

        # urljoin gives the same result as plain concatenation for
        # root-relative hrefs and also copes with absolute URLs.
        download_url = urljoin(base_url, href)
        print(f"Downloading: {download_url} -> {file_name}")

        # Fetch the file; the context manager releases the streamed
        # connection even when the body is never read (non-200 responses).
        try:
            with requests.get(download_url, stream=True, timeout=30) as response:
                if response.status_code != 200:
                    print(f"Failed to download {download_url}. Status code: {response.status_code}")
                    continue
                with open(file_name, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=1024):
                        file.write(chunk)
        except requests.RequestException as exc:
            print(f"Failed to download {download_url}: {exc}")
            continue
        print(f"Saved: {file_name}")
# Command-line entry point: download the instructions for one set ID
if __name__ == "__main__":
    import argparse

    # Read the set ID from the command line
    cli = argparse.ArgumentParser(description="Download LEGO building instructions for a given set ID.")
    cli.add_argument("set_id", type=str, help="The LEGO set ID (e.g., 10313-1)")
    set_id = cli.parse_args().set_id

    instruction_links = get_instruction_links(set_id)

    if not instruction_links:
        print("No instruction links found.")
    else:
        first_link = instruction_links[0]
        # V29 results are 3-tuples; anything else is treated as the
        # (alt_text, href) fallback list.
        if isinstance(first_link, tuple) and len(first_link) == 3:
            print(f"Found {len(instruction_links)} V29 instruction links.")
            download_instructions(set_id, instruction_links)
        else:
            print(f"Found {len(instruction_links)} non-V29 instruction links.")
            download_instructions(set_id, instruction_links, is_v29=False)