#!/bin/bash
##############################
#
# This script uses the Brickset instructions export to get links and set numbers.
# Set names, years and themes are looked up in the Rebrickable CSV files.
# If a booklet isn't available on lego.com, then brickinstructions.com is tested.
#
# Files are saved as:
#   <set id><part marker><set name>_(<theme>_<year>).pdf
#   1190-1_Retro_Buggy_(Town_1999).pdf
#
# If a set contains multiple files, the file names will be:
#   2520-1_(1_of_2)_Battle_Arena_(Ninjago_2011).pdf
#   2520-1_(2_of_2)_Battle_Arena_(Ninjago_2011).pdf
#
# Req:
#   bash, awk, grep, sed, curl, wget (plus head/tr from coreutils)
#
# Get themes.csv and sets.csv from https://rebrickable.com/downloads
#
# error_level:
#   0 no errors reported
#   1 Download errors reported
#   2 Existing files and download errors reported
#
##############################

echo "----> Starting Download of all LEGO instructions from https://brickset.com/exportscripts/instructions"

##### CHANGE HERE #####
logfile="lego_errors.log"
downloadFolder="../Instructions"
error_level=2
delay=1      # set to 1 for not making delays, set to 0 for random delays from 1-5 seconds.
DEBUG=false  # set to true to stop all downloads and just do file checks.
##### STOP CHANGE #####

#####
#
# NOTES: Try and test other sites if LEGO or brickinstructions aren't working. eg. Peeron or Brickset.
#
#####
#deTe=0
#firstline=0
#while IFS=',' read -r ID NAME YEAR THEME_ID NUM_PARTS
#do
#  if [ "$firstline" = 0 ]; then
#    firstline=1
#  else
#    # if [[ ${ID#0} -gt 1000 && "$YEAR" -gt 1990 && "$NUM_PARTS" -gt 50 ]]; then
#    deTe=$((deTe+1))
#    if [ "$deTe" -gt 5 ]; then
#      exit
#    fi
#    echo "$ID, $YEAR, $NAME, $NUM_PARTS"
#    awk -F '|' '$1 ~ /"^'$D'"/' instructions.csv
#    fi
#  fi
#done < sets.csv
#exit
#
#
#####

readonly EXPORT_URL="https://brickset.com/exportscripts/instructions"
readonly USER_AGENT="Mozilla/5.0 (platform; rv:75.0) Gecko/20100101 Firefox/75.0"
readonly NOT_LISTED_MARKER="{No longer listed at LEGO.com}"

THEMES="themes.csv"
SETS="sets.csv"

#######################################
# Check whether a URL answers with HTTP status 200.
# Arguments: $1 - URL to probe (no body is downloaded, wget --spider)
# Returns:   0 if any response in the redirect chain is a 200, 1 otherwise
#######################################
validate_url() {
  local headers
  headers=$(wget -S --spider -- "$1" 2>&1)
  # Accept any HTTP version; the reason phrase is optional in newer protocols.
  grep -Eq 'HTTP/[0-9.]+ 200' <<< "$headers"
}

#######################################
# Report a problem on the console and in the log file, honouring error_level.
# Globals:   error_level (read), logfile (appended)
# Arguments: $1 - console message, $2 - log file message
#######################################
report_problem() {
  if [[ $error_level = 1 || $error_level = 2 ]]; then
    echo "$1"
    echo "$2" >> "$logfile"
  fi
}

#######################################
# Sleep 1-5 seconds when random delays are enabled (delay=0),
# in order to not look like a script.
# Globals:   delay (read)
#######################################
polite_pause() {
  if [[ $delay = 0 ]]; then
    sleep $(( (RANDOM % 5) + 1 ))
  fi
}

#######################################
# Download one booklet and verify that the result is a PDF.
# Globals:   downloadFolder, DEBUG, USER_AGENT (read)
# Arguments: $1 - source URL, $2 - target file name (inside downloadFolder)
# Outputs:   "Done! > <name>" or "ERROR!" plus details via report_problem
#######################################
fetch_pdf() {
  local url=$1 name=$2
  local dest="$downloadFolder/$name"
  local magic

  if [[ "$DEBUG" != true ]]; then
    # -A sets the User-Agent; a bare string passed to -H is not a valid header.
    curl -A "$USER_AGENT" -L "$url" --silent --output "$dest"
  fi

  # Existence must be checked before the magic bytes are read, otherwise
  # a failed download surfaces as a shell error instead of a report.
  if [[ ! -f "$dest" ]]; then
    echo "ERROR!"
    report_problem "--> Not downloaded. Try again manually..." \
      "$name was not downloaded. Check CURL"
  else
    # tr drops NUL bytes so the command substitution stays warning-free.
    magic=$(head -c 4 -- "$dest" | tr -d '\0')
    if [[ "$magic" == "%PDF" ]]; then
      echo "Done! > $name"
    else
      echo "ERROR!"
      rm -f -- "$dest"
      report_problem "--> File is not a PDF..." \
        "$name was not downloaded. Check CURL"
    fi
  fi

  polite_pause
}

#######################################
# Print the sets.csv row that belongs to a set number.
# An exact match on the first column is preferred; if there is none, the
# first line containing the id as a whole word is used instead.
# Globals:   SETS (read)
# Arguments: $1 - set number, e.g. 1190-1
# Outputs:   the matching CSV row (empty when nothing matches)
#######################################
lookup_set_row() {
  local id=$1 row
  [[ -n "$id" ]] || return 1
  # Appending "" forces a string comparison even for numeric-looking ids.
  row=$(awk -F',' -v id="$id" '($1 "") == (id "") { print; exit }' "$SETS")
  if [[ -z "$row" ]]; then
    row=$(grep -w -F -m 1 -- "$id" "$SETS")
  fi
  printf '%s\n' "${row%$'\r'}"
}

#######################################
# Print the name of the top-level theme a theme id belongs to.
# themes.csv rows are read as: id,name,parent_id.
# Globals:   THEMES (read)
# Arguments: $1 - theme id
# Outputs:   name of the root theme (empty when the id is unknown)
#######################################
root_theme_name() {
  local current=$1
  local name="" row row_name parent _id _rest
  local hops=0

  # The hop limit protects against a parent cycle in a damaged themes file.
  while [[ -n "$current" ]] && (( hops < 32 )); do
    row=$(awk -F',' -v id="$current" '$1 == id { print; exit }' "$THEMES")
    [[ -n "$row" ]] || break
    row=${row%$'\r'}
    row_name="" parent=""
    IFS=',' read -r _id row_name parent _rest <<< "$row"
    name=$row_name
    current=$parent
    hops=$((hops + 1))
  done

  printf '%s\n' "$name"
}

echo "" > "$logfile"

for required in "$SETS" "$THEMES"; do
  if [[ ! -f "$required" ]]; then
    echo "Error: $required not found. Get it from https://rebrickable.com/downloads" >&2
    exit 1
  fi
done

if ! mkdir -p -- "$downloadFolder"; then
  echo "Error: cannot create $downloadFolder" >&2
  exit 1
fi

if [[ -f "instructions" ]]; then
  echo "---> Using existing instructions file. Manually delete it to redownload..."
else
  echo -ne "---> Downloading csv from Brickset..."
  # -O fixes the output name, so it does not depend on wget configuration.
  if wget -O "instructions" "$EXPORT_URL" &> /dev/null; then
    echo "Done!"
  else
    echo "Error... Exiting!"
    rm -f -- "instructions"
    exit 1
  fi
fi

echo "----> Converting csv with ',' to '|'..."
head -1 instructions | sed 's/,/|/g' > instructions.csv
# A single comma inside a quoted field is protected with ### while awk
# splits on commas, then restored; fields end up separated by "|".
sed 1,1d instructions \
  | sed -r 's/("[^",]+),([^",]+")/\1###\2/g' \
  | awk -F, '{print $1,$2,$3,$4,$5,$6}' \
  | sed 's/###/,/g' \
  | sed 's/" "/"|"/g' >> instructions.csv

CURRENT=0

# The CSV is read on fd 3 so that commands inside the loop can never
# swallow part of the input through stdin.
exec 3< instructions.csv
IFS= read -r -u 3 _header   # skip the header row

while IFS='|' read -r -u 3 ID LINK DESC ADDED RETRIVED || [[ -n "$ID" ]]; do
  if [[ "$DEBUG" = true ]]; then
    echo "$ID"
    echo "$LINK"
    echo "$DESC"
    echo "$ADDED"
    echo "$RETRIVED"
  fi

  # Strip the surrounding double quotes from the fields that are used.
  tID=${ID#\"}
  tID=${tID%\"}
  tLINK=${LINK#\"}
  tLINK=${tLINK%\"}
  rawDESC=${DESC#\"}
  rawDESC=${rawDESC%\"}

  # Set name, year and theme id come from the Rebrickable sets file.
  ttNAME="" setYEAR="" themeID=""
  setRow=$(lookup_set_row "$tID")
  if [[ -n "$setRow" ]]; then
    IFS=',' read -r _setnum ttNAME setYEAR themeID _rest <<< "$setRow"
  fi
  setNAME=${ttNAME//[^A-Za-z0-9._-]/_}

  themeName=$(root_theme_name "$themeID")
  setTHEME=${themeName//[^A-Za-z0-9._-]/_}

  # A "1/2"-style marker in the description becomes "1_of_2".
  ttDESC=$(printf '%s\n' "$rawDESC" \
    | grep -Eo '[^0-9][0-9]{1}\s?\/\s?[0-9]{1,2}(\s|$)' \
    | sed 's/[^A-Za-z0-9/]//g' \
    | sed 's/\//_of_/g')

  if [[ -z "$ttDESC" ]]; then
    tDESC="_"
    if [[ "$DEBUG" = true ]]; then
      echo "ttDESC is empty"
    fi
  else
    tDESC="_($ttDESC)_"
    if [[ "$DEBUG" = true ]]; then
      echo "ttDESC is not empty"
    fi
  fi

  filename="$tID$tDESC${setNAME}_(${setTHEME}_$setYEAR).pdf"

  if [[ "$DEBUG" = true ]]; then
    echo "--->FILENAME: $filename"
    CURRENT=$((CURRENT + 1))
    if [[ "$CURRENT" -gt 30 ]]; then
      exit
    fi
  fi

  if [[ -f "$downloadFolder/$filename" ]]; then
    if [[ $error_level = 2 ]]; then
      echo "-> $tID exists. Skipping..."
      echo "$filename exists." >> "$logfile"
    fi
    continue
  fi

  # The raw description is compared here; tDESC only holds the part marker
  # used in the file name and can never equal the "not listed" text.
  if [[ "$rawDESC" == "$NOT_LISTED_MARKER" ]]; then
    echo -ne "-> $tID testing links..."
    if validate_url "$tLINK"; then
      echo -ne "Found on LEGO.com... Downloading..."
      fetch_pdf "$tLINK" "$filename"
    else
      biID=${tID//[^0-9_]/_}
      biLink="https://lego.brickinstructions.com/pdfdrop/"
      if validate_url "$biLink$biID.pdf"; then
        echo -ne "Found on BrickInstructions.com... Downloading..."
        # Download exactly the URL that was just validated.
        fetch_pdf "$biLink$biID.pdf" "$filename"
      else
        report_problem "-> $tID is not available. Skipping..." \
          "$filename is not available."
      fi
    fi
  else
    echo -ne "--> $tID downloading now..."
    fetch_pdf "$tLINK" "$filename"
  fi
done

exec 3<&-