Updated LEGO downloader script

This commit is contained in:
Frederik Baerentsen 2021-11-15 20:35:48 +01:00
parent 9086981aef
commit 8e8cd5d135

View File

@ -1,38 +1,70 @@
#!/bin/bash #!/bin/bash
############################## ##############################
# #
# This script uses the brickset instructions file to get links and set numbers. # This script uses the brickset instructions file to get links and set numbers.
# In order to get themes, we get themes from rebrickable. # In order to get themes, we get themes from rebrickable.
# If a booklet isn't available on lego.com, then brickinstructions.com is tested. # If a booklet isn't available on lego.com, then brickinstructions.com is tested.
# #
# Files are saved as: # Files are saved as:
# $tID""$setDESC""${setNAME// /_}"_("$setTHEME"_"$setYEAR").pdf" # $tID""$setDESC""${setNAME// /_}"_("$setTHEME"_"$setYEAR").pdf"
# 1190-1_Retro_Buggy_(Town_1999).pdf # 1190-1_Retro_Buggy_(Town_1999).pdf
# #
# If a set contains multiple files, the file name will be: # If a set contains multiple files, the file name will be:
# 2520-1_(1_of_2)_Battle_Arena_(Ninjago_2011).pdf # 2520-1_(1_of_2)_Battle_Arena_(Ninjago_2011).pdf
# 2520-1_(2_of_2)_Battle_Arena_(Ninjago_2011).pdf # 2520-1_(2_of_2)_Battle_Arena_(Ninjago_2011).pdf
# #
# Req: # Req:
# bash, awk, grep, sed, curl, wget # bash, awk, grep, sed, curl, wget
# #
# Get themes.csv and sets.csv from https://rebrickable.com/downloads # Get themes.csv and sets.csv from https://rebrickable.com/downloads
# #
# error_level: # error_level:
# 0 no errors reported # 0 no errors reported
# 1 Download errors reported # 1 Download errors reported
# 2 Existing files and download errors reported # 2 Existing files and download errors reported
# #
############################## ##############################
echo "----> Starting Download of all LEGO instructions from https://brickset.com/exportscripts/instructions" echo "----> Starting Download of all LEGO instructions from https://brickset.com/exportscripts/instructions"
firstline=0 firstline=0
##### CHANGE HERE #####
logfile="lego_errors.log"
downloadFolder="../Instructions"
error_level=2
delay=1 # set to 1 to disable delays; set to 0 for a random delay of 1-5 seconds after each download.
DEBUG=false # set to true to stop all downloads and just do file checks.
##### STOP CHANGE #####
#####
#
# NOTES: Try other sites (e.g. Peeron or Brickset) if LEGO or brickinstructions aren't working.
#
#####
#deTe=0
#firstline=0
#while IFS=',' read -r ID NAME YEAR THEME_ID NUM_PARTS
#do
# if [ "$firstline" = 0 ]; then
# firstline=1
# else
#
# if [[ ${ID#0} -gt 1000 && "$YEAR" -gt 1990 && "$NUM_PARTS" -gt 50 ]]; then
# deTe=$((deTe+1))
# if [ "$deTe" -gt 5 ]; then
# exit
# fi
# echo "$ID, $YEAR, $NAME, $NUM_PARTS"
# awk -F '|' '$1 ~ /"^'$D'"/' instructions.csv
# fi
# fi
#done < sets.csv
#exit
#
#
#####
##### CHANGE HERE #####
logfile="lego_errors.log"
downloadFolder="../Instructions"
error_level=1
##### STOP CHANGE #####
echo "" > $logfile echo "" > $logfile
@ -48,6 +80,7 @@
echo "Error... Exiting!" echo "Error... Exiting!"
exit; exit;
else else
mv "Brickset-instructions.csv" "instructions"
echo "Done!" echo "Done!"
fi fi
fi fi
@ -68,7 +101,7 @@
fi fi
} }
while IFS='|' read -r ID LINK NAME DESC ADDED RETRIVED while IFS='|' read -r ID LINK DESC ADDED RETRIVED
do do
#$PC=$((200*$CURRENT/$LINES % 2 + 100*$CURRENT/$LINES)) #$PC=$((200*$CURRENT/$LINES % 2 + 100*$CURRENT/$LINES))
@ -76,20 +109,31 @@
if [ "$firstline" = 0 ]; then if [ "$firstline" = 0 ]; then
firstline=1 firstline=1
else else
if [ "$DEBUG" = true ]; then
echo "$ID"
echo "$LINK"
#echo "$NAME"
echo "$DESC"
echo "$ADDED"
echo "$RETRIVED"
fi
tID=$(sed -e 's/^"//' -e 's/"$//' <<<"$ID") tID=$(sed -e 's/^"//' -e 's/"$//' <<<"$ID")
tLINK=$(sed -e 's/^"//' -e 's/"$//' <<<"$LINK") tLINK=$(sed -e 's/^"//' -e 's/"$//' <<<"$LINK")
#tNAME=$(sed -e 's/^"//' -e 's/"$//' <<<"$NAME") #tNAME=$(sed -e 's/^"//' -e 's/"$//' <<<"$NAME")
ttNAME=$(cut -d, -f1-2 sets.csv | grep -w $tID | cut -d, -f2) ttNAME=$(cut -d, -f1-2 "$SETS" | grep -w "$tID" | cut -d, -f2)
setYEAR=$(grep -w $tID sets.csv | cut -d, -f3) #echo "ttNAME: $ttNAME"
setNAME=$(sed -e 's/[^A-Za-z0-9._-]/_/g' <<< $ttNAME) setYEAR=$(grep -w "$tID" "$SETS" | cut -d, -f3)
themeID=$(grep -w $tID sets.csv | cut -d, -f4) setNAME=$(sed -e 's/[^A-Za-z0-9._-]/_/g' <<< "$ttNAME")
themeName=$(awk -F',' -v id="$themeID" '$1 == id' themes.csv) #echo "setNAME: $setNAME"
themeID=$(grep -w "$tID" "$SETS" | cut -d, -f4)
themeName=$(awk -F',' -v id="$themeID" '$1 == id' "$THEMES")
IFS=',' read -r -a array <<< "$themeName" IFS=',' read -r -a array <<< "$themeName"
tempID=${array[2]} tempID=${array[2]}
if [[ $tempID != "" ]]; then if [[ $tempID != "" ]]; then
while [[ $tempID != "" ]] while [[ $tempID != "" ]]
do do
tthemeName=$(awk -F',' -v id="$tempID" '$1 == id' themes.csv) tthemeName=$(awk -F',' -v id="$tempID" '$1 == id' "$THEMES")
IFS=',' read -r -a tArray <<< "$tthemeName" IFS=',' read -r -a tArray <<< "$tthemeName"
tempID=${tArray[2]} tempID=${tArray[2]}
themeName=${tArray[1]} themeName=${tArray[1]}
@ -98,24 +142,49 @@
themeName=${array[1]} themeName=${array[1]}
fi fi
setTHEME=$(sed -e 's/[^A-Za-z0-9._-]/_/g' <<< $themeName) setTHEME=$(sed -e 's/[^A-Za-z0-9._-]/_/g' <<< "$themeName")
tADDED=$(sed -e 's/^"//' -e 's/"$//' <<<"$ADDED") tADDED=$(sed -e 's/^"//' -e 's/"$//' <<<"$ADDED")
tDESC=$(sed -e 's/^"//' -e 's/"$//' <<<"$DESC") tDESC=$(sed -e 's/^"//' -e 's/"$//' <<<"$DESC")
#echo "DESC: $DESC"
ttDESC=$(echo $tDESC | grep -Eo '[^0-9][0-9]{1}\s?\/\s?[0-9]{1,2}' | sed 's/[^A-Za-z0-9/]//g' | sed 's/\//_of_/g')
#echo $tDESC
ttDESC=$(echo "$tDESC" | grep -Eo '[^0-9][0-9]{1}\s?\/\s?[0-9]{1,2}(\s|$)' | sed 's/[^A-Za-z0-9/]//g' | sed 's/\//_of_/g')
#echo "TEST"$(echo "$tDESC" | grep -Eo '[^0-9][0-9]{1}\s?\/\s?[0-9]{1,2}(\s|$)')
#echo "--->ttDESC: $ttDESC"
#ttDESC=$(echo $tDESC | grep -Eo '\s[0-9]{1}\s?\/\s?[0-9]{1,2}' | sed 's/ //g' | sed 's/\//_of_/g') #ttDESC=$(echo $tDESC | grep -Eo '\s[0-9]{1}\s?\/\s?[0-9]{1,2}' | sed 's/ //g' | sed 's/\//_of_/g')
if [ -z "$ttDESC" ]; then if [ -z "$ttDESC" ]; then
etDESC="_" tDESC="_"
else if [ "$DEBUG" = true ]; then
tDESC="_("$ttDESC")_" echo "ttDESC is empty"
fi fi
else
tDESC="_($ttDESC)_"
if [ "$DEBUG" = true ]; then
echo "ttDESC is not empty"
fi
fi
#echo "tDESC: $tDESC"
#echo "setDESC: $setDESC"
#PC=$(echo $CURRENT $LINES | awk '{print 100*$1/$2}') #PC=$(echo $CURRENT $LINES | awk '{print 100*$1/$2}')
tFilename=""$tID"_"$setDESC""${setNAME// /_}"_("$setTHEME"_"$setYEAR").pdf" tFilename="$tID$tDESC${setNAME// /_}_($setTHEME""_""$setYEAR).pdf"
filename=$tFilename filename=$tFilename
#echo $filename
if [ "$DEBUG" = true ]; then
echo "--->FILENAME: $filename"
#echo "-->================================================<--"
CURRENT=$((CURRENT+1))
if [ "$CURRENT" -gt 30 ]; then
exit
fi
fi
if [ -f "$downloadFolder/$filename" ]; then if [ -f "$downloadFolder/$filename" ]; then
#echo "$downloadFolder/$filename"
if [[ $error_level = 2 ]]; then if [[ $error_level = 2 ]]; then
echo "-> $tID exists. Skipping..." echo "-> $tID exists. Skipping..."
echo "$filename exists." >> $logfile echo "$filename exists." >> $logfile
@ -123,9 +192,13 @@
else else
if [[ "$tDESC" = "{No longer listed at LEGO.com}" ]] ; then if [[ "$tDESC" = "{No longer listed at LEGO.com}" ]] ; then
echo -ne "-> $tID testing links..." echo -ne "-> $tID testing links..."
if validate_url $tLINK; then if validate_url "$tLINK"; then
echo -ne "Found on LEGO.com... Downloading..." echo -ne "Found on LEGO.com... Downloading..."
curl -H "Mozilla/5.0 (platform; rv:75.0) Gecko/20100101 Firefox/75.0" -L $tLINK --silent --output "$downloadFolder/$filename"
if [ "$DEBUG" != true ]; then
curl -H "Mozilla/5.0 (platform; rv:75.0) Gecko/20100101 Firefox/75.0" -L "$tLINK" --silent --output "$downloadFolder/$filename"
fi
if [ $(head -c 4 "$downloadFolder/$filename") = "%PDF" ]; then
if [ -f "$downloadFolder/$filename" ]; then if [ -f "$downloadFolder/$filename" ]; then
echo "Done! > $filename" echo "Done! > $filename"
else else
@ -135,16 +208,30 @@
echo "$filename was not downloaded. Check CURL" >> $logfile echo "$filename was not downloaded. Check CURL" >> $logfile
fi fi
fi fi
else
echo "ERROR!"
rm "$downloadFolder/$filename"
if [[ $error_level = 1 || $error_level = 2 ]]; then
echo "--> File is not a PDF..."
echo "$filename was not downloaded. Check CURL" >> $logfile
fi
fi
#random sleep in order to not look like a script #random sleep in order to not look like a script
if [[ $delay = 0 ]]; then
sleep $(( ( RANDOM % 5 ) + 1 )) sleep $(( ( RANDOM % 5 ) + 1 ))
fi
else else
#test #test
biID=$(sed -e 's/[^0-9_]/_/g' <<< $tID) biID=$(sed -e 's/[^0-9_]/_/g' <<< "$tID")
biLink="https://lego.brickinstructions.com/pdfdrop/" biLink="https://lego.brickinstructions.com/pdfdrop/"
if validate_url "$biLink$biID.pdf"; then if validate_url "$biLink$biID.pdf"; then
echo -ne "Found on BrickInstructions.com... Downloading..." echo -ne "Found on BrickInstructions.com... Downloading..."
if [ "$DEBUG" != true ]; then
curl -H "Mozilla/5.0 (platform; rv:75.0) Gecko/20100101 Firefox/75.0" -L "$biLink$tID.pdf" --silent --output "$downloadFolder/$filename" curl -H "Mozilla/5.0 (platform; rv:75.0) Gecko/20100101 Firefox/75.0" -L "$biLink$tID.pdf" --silent --output "$downloadFolder/$filename"
fi
if [ $(head -c 4 "$downloadFolder/$filename") = "%PDF" ]; then
if [ -f "$downloadFolder/$filename" ]; then if [ -f "$downloadFolder/$filename" ]; then
echo "Done! > $filename" echo "Done! > $filename"
else else
@ -154,8 +241,18 @@
echo "$filename was not downloaded. Check CURL" >> $logfile echo "$filename was not downloaded. Check CURL" >> $logfile
fi fi
fi fi
else
echo "ERROR!"
rm "$downloadFolder/$filename"
if [[ $error_level = 1 || $error_level = 2 ]]; then
echo "--> File is not a PDF..."
echo "$filename was not downloaded. Check CURL" >> $logfile
fi
fi
#random sleep in order to not look like a script #random sleep in order to not look like a script
if [[ $delay = 0 ]]; then
sleep $(( ( RANDOM % 5 ) + 1 )) sleep $(( ( RANDOM % 5 ) + 1 ))
fi
else else
if [[ $error_level = 1 || $error_level = 2 ]]; then if [[ $error_level = 1 || $error_level = 2 ]]; then
echo "-> $tID is not available. Skipping..." echo "-> $tID is not available. Skipping..."
@ -166,7 +263,11 @@
else else
echo -ne "--> $tID downloading now..." echo -ne "--> $tID downloading now..."
curl -H "Mozilla/5.0 (platform; rv:75.0) Gecko/20100101 Firefox/75.0" -L $tLINK --silent --output "$downloadFolder/$filename"
if [ "$DEBUG" != true ]; then
curl -H "Mozilla/5.0 (platform; rv:75.0) Gecko/20100101 Firefox/75.0" -L "$tLINK" --silent --output "$downloadFolder/$filename"
fi
if [ $(head -c 4 "$downloadFolder/$filename") = "%PDF" ]; then
if [ -f "$downloadFolder/$filename" ]; then if [ -f "$downloadFolder/$filename" ]; then
echo "Done! > $filename" echo "Done! > $filename"
else else
@ -176,10 +277,28 @@
echo "$filename was not downloaded. Check CURL" >> $logfile echo "$filename was not downloaded. Check CURL" >> $logfile
fi fi
#random sleep in order to not look like a script #random sleep in order to not look like a script
if [[ $delay = 0 ]]; then
sleep $(( ( RANDOM % 5 ) + 1 )) sleep $(( ( RANDOM % 5 ) + 1 ))
fi fi
fi fi
else
echo "ERROR!"
rm "$downloadFolder/$filename"
if [[ $error_level = 1 || $error_level = 2 ]]; then
echo "--> File is not a PDF..."
echo "$filename was not downloaded. Check CURL" >> $logfile
fi
fi
fi
fi fi
fi fi
done < instructions.csv done < instructions.csv