# Provenance (from the git patch this script was delivered in):
#   Commit:  8c314927797bf42b5f780646b426d513eb446e9b
#   Author:  Frederik Baerentsen
#   Date:    Wed, 24 Jun 2020 13:06:31 +0200
#   Subject: Added Python3 script to download catalogues from brickset
#   File:    catalog.py (new file, blob 35612c7)
"""Download the catalogue PDFs linked from the Brickset catalogue library page.

The page at ``url`` is fetched and parsed. Every ``<h3>`` heading except the
first is treated as a year and is paired with the ``<p>`` element at the same
index, whose first child is used as the title and whose fourth child supplies
the download link. Each catalogue is stored as
``../Catalogues/<year>-<title>.pdf``; files that already exist are skipped.
"""
import os
import re
import time
import urllib.request

import requests
from bs4 import BeautifulSoup

url = 'https://brickset.com/library/catalogues'

OUTPUT_DIR = "../Catalogues/"
REQUEST_TIMEOUT = 30  # seconds before a stalled HTTP request is abandoned
DOWNLOAD_PAUSE = 1    # seconds to wait after each download request
CHUNK_SIZE = 65536    # bytes written to disk per streamed chunk


def _target_path(year, title):
    """Return the local path of the PDF for the given year and title.

    Spaces in the title become underscores and slashes become dashes so the
    title cannot introduce extra path components.
    """
    safe_title = title.strip().replace(' ', '_').replace('/', '-')
    return OUTPUT_DIR + year + "-" + safe_title + ".pdf"


def _download(link, dpath):
    """Stream ``link`` to ``dpath``.

    The body is written to ``dpath + ".part"`` first and renamed only once the
    transfer has finished, so an interrupted run never leaves a truncated file
    that a later run would mistake for a finished download.

    Raises:
        requests.RequestException: on connection errors, timeouts or a
            non-success HTTP status.
        OSError: if the file cannot be written or renamed.
    """
    partial = dpath + ".part"
    with requests.get(link, stream=True, timeout=REQUEST_TIMEOUT) as r:
        # Without this an HTML error page would be saved under a .pdf name.
        r.raise_for_status()
        with open(partial, 'wb') as f:
            for chunk in r.iter_content(chunk_size=CHUNK_SIZE):
                f.write(chunk)
    os.replace(partial, dpath)


def main():
    """Fetch the catalogue index and download every catalogue not yet on disk."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Collected once: the paragraph list does not change while iterating.
    paragraphs = soup.find_all('p')

    for idx, val in enumerate(soup.find_all('h3')):
        if idx == 0:
            # The first heading is skipped, as in the original script.
            continue

        year = val.string
        if year is None:
            # .string is None when the heading has more than one child node;
            # there is then no plain year text to build a file name from.
            print("skipping heading", idx, "without plain text")
            continue

        title = paragraphs[idx].contents[0]
        link = paragraphs[idx].contents[3].get('href')
        print(link," ",title," ",year)

        dpath = _target_path(year, title)
        print(dpath)
        if os.path.exists(dpath):
            continue

        try:
            _download(link, dpath)
        except requests.RequestException as err:
            # Keep going: one unavailable catalogue should not stop the rest.
            print("download failed:", link, err)
        time.sleep(DOWNLOAD_PAUSE)  # pause after each download request


if __name__ == "__main__":
    main()