Compare commits

...

2 Commits

Author SHA1 Message Date
a882dfc743 Updated readme 2025-01-20 22:41:37 +01:00
2232f289f0 Initial upload of working copy 2025-01-20 22:41:22 +01:00
2 changed files with 193 additions and 2 deletions

View File

@ -1,3 +1,78 @@
# InstructionsDownloader
# Instruction Downloader Script
Script to download LEGO instructions
This Python script is designed to download LEGO building instructions for a given set ID from the Rebrickable website. It will search for all available instruction links related to a set, and based on the presence of a "V29" (or "V 29") tag in the alt text, it will download specific versions of the instructions.
## Features
- **Download Specific Versions**: Automatically handles downloading of instructions based on the `V29` version.
- **Handle Complex Alt Text**: The script recognizes both the `V29` and `V 29` spellings in the alt text, parses `x/y` part numbering, and ignores entries whose pattern is invalid.
- **Fallback for Missing or Invalid V29 Versions**: If no valid V29 instruction links are found, the script will download all available instructions.
- **Customizable Download Filenames**: Instructions are saved with filenames based on the set ID and the instruction variant.
- **User-Agent Spoofing**: The script mimics a browser request to avoid being blocked as a bot.
## Requirements
- Python 3.x
- `requests` library
- `beautifulsoup4` library
You can install the required dependencies by running:
```bash
python3 -m pip install requests beautifulsoup4
```
## Usage
### Running the Script
To use this script, you need to provide the LEGO set ID as an argument. The set ID is typically in the format `####-1` (e.g., `10313-1`).
### Example Command
```bash
python3 instDownloader.py 10313-1
```
This will download the LEGO building instructions for the set `10313-1` (Wildflower Bouquet).
### How It Works
1. The script will visit the Rebrickable page for the provided set ID.
2. It will search for all available instruction links containing "LEGO Building Instructions" in the `alt` text.
3. If links with "V29" are found, the script will download them in order, naming them `{set_id}+1.pdf`, `{set_id}+2.pdf`, etc. (or simply `{set_id}.pdf` when there is only one link).
4. If the `x/y` values in the `alt` text are invalid (i.e., `x` or `y` greater than 10), those links will be ignored.
5. If no valid `V29` links are found, the script will fall back to downloading all available instruction links, with filenames based on the alt text.
## Script Output
The script will print status messages during the execution, including:
- URLs of the files being downloaded.
- The filenames that the instructions will be saved as.
- Any errors encountered while downloading.
## Example Output
```bash
Downloading: https://rebrickable.com/instructions/10313-1/123456/download/?expire=1737409263 -> 10313-1+1.pdf
Saved: 10313-1+1.pdf
Downloading: https://rebrickable.com/instructions/10313-1/789012/download/?expire=1737409263 -> 10313-1+2.pdf
Saved: 10313-1+2.pdf
```
If no valid `V29` links are found, the script will print out the available instruction links.
## Error Handling
- If the page cannot be loaded (e.g., invalid set ID), the script will print an error message and stop.
- If an instruction download fails, the script will print an error message with the HTTP status code.
## Customization
- **User-Agent**: The script uses a browser-like User-Agent string to avoid being detected as a bot. You can customize the `headers` variable if needed.
- **Download Filename Format**: The filenames for the downloaded instructions are automatically generated. You can modify the `download_instructions` function to change the naming convention.
## License
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.

116
instDownloader.py Normal file
View File

@ -0,0 +1,116 @@
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
def get_instruction_links(set_id, timeout=30):
    """Collect instruction download links for a set from Rebrickable.

    Fetches ``https://rebrickable.com/instructions/{set_id}`` and gathers every
    ``<a href=...>`` that wraps an ``<img>`` whose ``alt`` text contains
    "LEGO Building Instructions". Links whose alt text mentions "V29" or
    "V 29" are preferred.

    Args:
        set_id: The set ID inserted into the page URL (e.g. "10313-1").
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        A list in one of two shapes:

        * ``(x, y, href)`` tuples, sorted by ``x``, when at least one V29 link
          was accepted. ``y`` is ``None`` for links that were numbered
          sequentially because their alt text contains no "/".
        * ``(alt_text, href)`` tuples for every instruction link found, when
          no V29 link was accepted (may be empty).

        An empty list is also returned when the page cannot be loaded.
    """
    url = f"https://rebrickable.com/instructions/{set_id}"

    # Browser-like User-Agent so the request is not rejected as a bot
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Network failures (DNS, refused connection, timeout) are reported the
    # same way as a bad status code instead of surfacing as a traceback.
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to load page: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to load page. Status code: {response.status_code}")
        return []

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Collect (alt, href) for every link wrapping an instruction image
    found_tags = []
    for a_tag in soup.find_all('a', href=True):
        img_tag = a_tag.find('img', alt=True)
        if img_tag and "LEGO Building Instructions" in img_tag['alt']:
            found_tags.append((img_tag['alt'], a_tag['href']))

    # Filter links containing "V29"
    v29_links = []
    for alt_text, href in found_tags:
        if "V29" not in alt_text and "V 29" not in alt_text:
            continue

        if '/' not in alt_text:
            # No x/y format: number the link by its position among accepted links
            v29_links.append((len(v29_links) + 1, None, href))
            continue

        # NOTE(review): only the text BEFORE the first "/" is inspected, so the
        # number after the slash is never read; x and y are the two standalone
        # integer tokens found in that leading text. Confirm this matches the
        # real alt-text format.
        leading = alt_text.split('/')[0]
        try:
            numbers = [int(tok) for tok in leading.replace("-", " ").split() if tok.isdigit()]
        except ValueError:
            # isdigit() accepts some characters (e.g. superscripts) that int() rejects
            print(f"Failed to parse x/y in alt text: {alt_text}")
            continue

        if len(numbers) == 2:
            x, y = numbers
            if x <= 10 and y <= 10:  # Only consider valid x/y pairs
                v29_links.append((x, y, href))

    # Sort the V29 links by x value (stable, so ties keep page order)
    v29_links.sort(key=lambda link: link[0])

    # If no V29 links were accepted, report and return everything that was found
    if not v29_links:
        print("No instructions with 'V29' found. Found the following tags:")
        for alt_text, href in found_tags:
            print(f"ALT: {alt_text}, HREF: {href}")
        return found_tags  # Return all links for downloading

    return v29_links
def normalize_alt(alt_text):
    """Turn an image alt text into a file-name-friendly string.

    Slashes and spaces become hyphens; commas and periods are removed.
    All other characters are kept as they are.
    """
    # One translation table: "/" and " " map to "-", "," and "." are deleted
    table = str.maketrans("/ ", "--", ",.")
    return alt_text.translate(table)
def download_instructions(set_id, links, is_v29=True):
    """Download every instruction link to a PDF file in the current directory.

    Args:
        set_id: The set ID; used as the prefix of each output file name.
        links: The links to download. ``(x, y, href)`` tuples when ``is_v29``
            is true, ``(alt_text, href)`` tuples otherwise.
        is_v29: Selects which tuple shape and naming scheme is used.

    File names:
        * V29, single link: ``{set_id}.pdf``
        * V29, several links: ``{set_id}+1.pdf``, ``{set_id}+2.pdf``, ...
          numbered by position in ``links``
        * Non-V29: ``{set_id}-{normalized alt text}.pdf``

    Failures are printed and the loop moves on to the next link.
    """
    # Same browser-like User-Agent the page request uses, so the download
    # request is not treated differently from the page load.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    for index, link in enumerate(links):
        if is_v29:
            # V29 links have three elements: (x, y, href)
            href = link[2]
            if len(links) == 1:
                # Single link, use the set ID only
                file_name = f"{set_id}.pdf"
            else:
                # Multiple links, number them by their position in the list
                file_name = f"{set_id}+{index + 1}.pdf"
        else:
            # Non-V29 links have two elements: (alt_text, href)
            alt_text, href = link
            file_name = f"{set_id}-{normalize_alt(alt_text)}.pdf"

        # urljoin handles site-relative hrefs and leaves absolute URLs intact
        download_url = urljoin("https://rebrickable.com", href)
        print(f"Downloading: {download_url} -> {file_name}")

        # Fetch the file; the context manager releases the streamed connection
        try:
            with requests.get(download_url, headers=headers, stream=True, timeout=30) as response:
                if response.status_code != 200:
                    print(f"Failed to download {download_url}. Status code: {response.status_code}")
                    continue
                with open(file_name, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=8192):
                        file.write(chunk)
        except requests.RequestException as exc:
            print(f"Failed to download {download_url}: {exc}")
            continue

        print(f"Saved: {file_name}")
# Script entry point: read the set ID from the command line and download
# whatever instruction links are found for it.
if __name__ == "__main__":
    import argparse

    # Command-line interface: one positional argument, the set ID
    parser = argparse.ArgumentParser(description="Download LEGO building instructions for a given set ID.")
    parser.add_argument("set_id", type=str, help="The LEGO set ID (e.g., 10313-1)")
    args = parser.parse_args()
    set_id = args.set_id

    instruction_links = get_instruction_links(set_id)

    if not instruction_links:
        print("No instruction links found.")
    else:
        first_link = instruction_links[0]
        # V29 results are 3-tuples (x, y, href); the fallback result is
        # 2-tuples (alt_text, href). The tuple length tells them apart.
        found_v29 = isinstance(first_link, tuple) and len(first_link) == 3
        if found_v29:
            print(f"Found {len(instruction_links)} V29 instruction links.")
            download_instructions(set_id, instruction_links)
        else:
            print(f"Found {len(instruction_links)} non-V29 instruction links.")
            download_instructions(set_id, instruction_links, is_v29=False)