#!/bin/bash
##############################
#
# This script uses the Brickset instructions file to get links and set numbers.
# In order to get themes, we get themes from rebrickable.
# If a booklet isn't available on lego.com, then brickinstructions.com is tested.
#
# Files are saved as:
# $tID""$setDESC""${setNAME// /_}"_("$setTHEME"_"$setYEAR").pdf"
# 1190-1_Retro_Buggy_(Town_1999).pdf
#
# If a set contains multiple files, the file name will be:
# 2520-1_(1_of_2)_Battle_Arena_(Ninjago_2011).pdf
# 2520-1_(2_of_2)_Battle_Arena_(Ninjago_2011).pdf
#
# Req:
# bash, awk, grep, sed, curl, wget
#
# Get themes.csv and sets.csv from https://rebrickable.com/downloads
#
# error_level:
# 0 no errors reported
# 1 Download errors reported
# 2 Existing files and download errors reported
#
##############################
# Announce the run and define the user-tunable settings.
# NOTE: bash assignments must be written NAME=value with no whitespace around
# '='; "NAME = value" would try to execute a command called NAME instead.
echo "----> Starting Download of all LEGO instructions from https://brickset.com/exportscripts/instructions"

# Header-row flag for the CSV loop: stays 0 until the first line has been skipped.
firstline=0

##### CHANGE HERE #####
logfile="lego_errors.log"          # file that collects skipped/failed downloads
downloadFolder="../Instructions"   # directory the PDFs are written to
error_level=2                      # 0, 1 or 2 - see the error_level table in the file header
delay=1 # set to 1 for not making delays, set to 0 for random delays from 1-5 seconds.
DEBUG=false # set to true to stop all downloads and just do file checks.
##### STOP CHANGE #####
#####
#
# NOTES: Try and test other sites if LEGO or brickinstructions aren't working. eg. Peeron or Brickset.
#
#####
#deTe=0
#firstline=0
#while IFS=',' read -r ID NAME YEAR THEME_ID NUM_PARTS
#do
# if [ "$firstline" = 0 ]; then
# firstline=1
# else
#
# if [[ ${ID#0} -gt 1000 && "$YEAR" -gt 1990 && "$NUM_PARTS" -gt 50 ]]; then
# deTe=$((deTe+1))
# if [ "$deTe" -gt 5 ]; then
# exit
# fi
# echo "$ID, $YEAR, $NAME, $NUM_PARTS"
# awk -F '|' '$1 ~ /"^'$D'"/' instructions.csv
# fi
# fi
#done < sets.csv
#exit
#
#
#####
# --- Setup: reset the log, fetch the Brickset export, convert it for the loop ---

# Start every run with a fresh log file (it initially holds a single empty line).
echo "" > "$logfile"

# Rebrickable exports used to look up set names, years and themes.
THEMES="themes.csv"
SETS="sets.csv"

# Reuse a previously downloaded export; otherwise fetch it from Brickset.
if [ -f "instructions" ]; then
  echo "---> Using existing instructions file. Manually delete it to redownload..."
else
  echo -ne "---> Downloading csv from Brickset..."
  # Run wget in the foreground so its exit status can be checked, and write
  # straight to "instructions" instead of relying on the server-side file name.
  if wget -q -O "instructions" "https://brickset.com/exportscripts/instructions"; then
    echo "Done!"
  else
    # wget -O leaves an empty/partial file behind on failure; remove it so the
    # next run does not mistake it for a valid export.
    rm -f -- "instructions"
    echo "Error... Exiting!"
    exit 1
  fi
fi

# Rewrite the export as instructions.csv with '|' between the quoted fields.
echo "----> Converting csv with ',' to '|'..."
# Header row: every comma simply becomes a pipe.
head -1 instructions | sed 's/,/|/g' > instructions.csv
# Data rows: a comma inside a quoted field is masked as ### first, the first six
# comma-separated fields are printed space-separated, the masked commas are
# restored and finally the '" "' field boundaries are turned into '"|"'.
sed 1,1d instructions \
  | sed -r 's/("[^",]+),([^",]+")/\1###\2/g' \
  | awk -F, '{print $1,$2,$3,$4,$5,$6}' \
  | sed 's/###/,/g' \
  | sed 's/" "/"|"/g' >> instructions.csv

# Row count of the raw export (only referenced by the commented-out progress
# calculation) and the processed-row counter used in DEBUG mode.
LINES=$(wc -l instructions | awk '{ print $1 }')
CURRENT=0
#######################################
# Probe a URL without downloading it.
# Runs wget in spider mode with server headers printed and looks for a
# "200" status line of any HTTP version in that output.
# Arguments:
#   $1 - URL to probe
# Returns:
#   0 if a 200 status line was seen, 1 otherwise
#######################################
function validate_url() {
  # wget -S writes the response headers to stderr, hence the 2>&1.
  if wget -S --spider "$1" 2>&1 | grep -Eq 'HTTP/[0-9.]+ 200'; then
    return 0
  else
    return 1
  fi
}
# --- Main loop: one instructions.csv row -> one PDF in $downloadFolder ---

# Fall back to the defaults documented in the configuration section when a
# setting has not been assigned, so the loop never runs with empty values.
: "${firstline:=0}" "${logfile:=lego_errors.log}" "${downloadFolder:=../Instructions}"
: "${error_level:=2}" "${delay:=1}" "${DEBUG:=false}"
: "${SETS:=sets.csv}" "${THEMES:=themes.csv}"

#######################################
# Report a failure on the console and in the log file.
# Does nothing unless error_level is 1 or 2.
# Globals:   error_level (read), logfile (appended to)
# Arguments: $1 - console message
#            $2 - log file message
#######################################
function report_error() {
  if [[ $error_level = 1 || $error_level = 2 ]]; then
    echo "$1"
    echo "$2" >> "$logfile"
  fi
}

#######################################
# Sleep 1-5 seconds when random delays are enabled (delay=0), so consecutive
# requests do not look like a script.
# Globals: delay (read)
#######################################
function random_delay() {
  if [[ $delay = 0 ]]; then
    sleep $(( (RANDOM % 5) + 1 ))
  fi
}

#######################################
# Download one booklet and keep it only if it really is a PDF.
# In DEBUG mode nothing is downloaded, verified or deleted.
# Globals:   DEBUG, downloadFolder (read)
# Arguments: $1 - URL to download
#            $2 - target file name inside downloadFolder
# Outputs:   progress/result text on stdout; failures via report_error
#######################################
function fetch_pdf() {
  local url=$1
  local name=$2
  local target="$downloadFolder/$name"
  local magic

  if [ "$DEBUG" = true ]; then
    echo " DEBUG: download skipped."
    return 0
  fi

  [[ -d $downloadFolder ]] || mkdir -p -- "$downloadFolder"

  # -A sets the User-Agent; a bare UA string passed via -H is not a valid header.
  curl -A "Mozilla/5.0 (platform; rv:75.0) Gecko/20100101 Firefox/75.0" -L "$url" --silent --output "$target"

  if [ ! -f "$target" ]; then
    echo "ERROR!"
    report_error "--> Not downloaded. Try again manually..." "$name was not downloaded. Check CURL"
  else
    # A PDF starts with the four bytes "%PDF"; anything else (typically an
    # HTML error page) is discarded. NUL bytes are dropped before the compare.
    magic=$(head -c 4 -- "$target" | tr -d '\0')
    if [ "$magic" = "%PDF" ]; then
      echo " Done! > $name"
    else
      echo "ERROR!"
      rm -f -- "$target"
      report_error "--> File is not a PDF..." "$name was not downloaded. Check CURL"
    fi
  fi

  random_delay
}

while IFS='|' read -r ID LINK DESC ADDED RETRIVED; do
  #$PC=$((200*$CURRENT/$LINES % 2 + 100*$CURRENT/$LINES))
  #CURRENT=$((CURRENT + 1))

  # The first row of instructions.csv is the column header: skip it once.
  if [ "$firstline" = 0 ]; then
    firstline=1
    continue
  fi

  if [ "$DEBUG" = true ]; then
    echo "$ID"
    echo "$LINK"
    #echo "$NAME"
    echo "$DESC"
    echo "$ADDED"
    echo "$RETRIVED"
  fi

  # Strip the surrounding double quotes from the CSV fields.
  tID=${ID#\"};       tID=${tID%\"}
  tLINK=${LINK#\"};   tLINK=${tLINK%\"}
  tADDED=${ADDED#\"}; tADDED=${tADDED%\"}
  rawDESC=${DESC#\"}; rawDESC=${rawDESC%\"}

  # Look the set up in sets.csv by exact match on the first column (a word
  # match anywhere on the line could also hit ids mentioned in other fields).
  # Columns 2-4 are name, year and theme id.
  setRow=$(awk -F',' -v id="$tID" '$1 == id { print; exit }' "$SETS")
  IFS=',' read -r _ ttNAME setYEAR themeID _ <<<"$setRow"
  setNAME=$(sed -e 's/[^A-Za-z0-9._-]/_/g' <<<"$ttNAME")

  # Resolve the theme: themes.csv rows are id,name,parent. Follow the parent
  # column until it is empty so the top-level theme name is used.
  themeRow=$(awk -F',' -v id="$themeID" '$1 == id' "$THEMES")
  IFS=',' read -r -a array <<<"$themeRow"
  themeName=${array[1]}
  tempID=${array[2]}
  while [[ $tempID != "" ]]; do
    tthemeName=$(awk -F',' -v id="$tempID" '$1 == id' "$THEMES")
    IFS=',' read -r -a tArray <<<"$tthemeName"
    tempID=${tArray[2]}
    themeName=${tArray[1]}
  done
  setTHEME=$(sed -e 's/[^A-Za-z0-9._-]/_/g' <<<"$themeName")

  # Extract a "N/M" booklet marker from the description and turn it into
  # "N_of_M"; it becomes the "(N_of_M)" part of the file name.
  ttDESC=$(echo "$rawDESC" | grep -Eo '[^0-9][0-9]{1}\s?\/\s?[0-9]{1,2}(\s|$)' | sed 's/[^A-Za-z0-9/]//g' | sed 's/\//_of_/g')
  if [ -z "$ttDESC" ]; then
    tDESC="_"
    if [ "$DEBUG" = true ]; then
      echo "ttDESC is empty"
    fi
  else
    tDESC="_(${ttDESC})_"
    if [ "$DEBUG" = true ]; then
      echo "ttDESC is not empty"
    fi
  fi

  #PC=$(echo $CURRENT $LINES | awk '{print 100*$1/$2}')
  # e.g. 2520-1_(1_of_2)_Battle_Arena_(Ninjago_2011).pdf
  tFilename="${tID}${tDESC}${setNAME// /_}_(${setTHEME}_${setYEAR}).pdf"
  filename=$tFilename

  if [ "$DEBUG" = true ]; then
    echo "--->FILENAME: $filename"
    # DEBUG runs stop after 30 processed rows.
    CURRENT=$((CURRENT + 1))
    if [ "$CURRENT" -gt 30 ]; then
      exit
    fi
  fi

  # Already downloaded: only mention it at the most verbose error level.
  if [ -f "$downloadFolder/$filename" ]; then
    if [[ $error_level = 2 ]]; then
      echo "-> $tID exists. Skipping..."
      echo "$filename exists." >> "$logfile"
    fi
    continue
  fi

  # Compare the ORIGINAL description here: tDESC has been rewritten into the
  # file-name fragment above and can never equal this marker.
  if [[ "$rawDESC" = "{No longer listed at LEGO.com}" ]]; then
    echo -ne "-> $tID testing links... "
    biID=$(sed -e 's/[^0-9_]/_/g' <<<"$tID")
    biLink="https://lego.brickinstructions.com/pdfdrop/"
    if validate_url "$tLINK"; then
      echo -ne "Found on LEGO.com... Downloading..."
      fetch_pdf "$tLINK" "$filename"
    elif validate_url "${biLink}${biID}.pdf"; then
      echo -ne "Found on BrickInstructions.com... Downloading..."
      # Download exactly the URL that was just validated.
      fetch_pdf "${biLink}${biID}.pdf" "$filename"
    else
      report_error "-> $tID is not available. Skipping..." "$filename is not available."
    fi
  else
    echo -ne "--> $tID downloading now... "
    fetch_pdf "$tLINK" "$filename"
  fi
done < instructions.csv