DataHoarder_scripts/lego_dl.sh

 #!/bin/bash
 ##############################
 #
 # NOTE: There are still some small errors with sets that have multiple booklets
 # 
 # This script uses the Brickset instructions file to get links and set numbers.
 # Themes are looked up in the CSV files from Rebrickable.
 # 
 # Files are saved as:
 # $tID""$setDESC""${setNAME// /_}"_("$setTHEME"_"$setYEAR").pdf"
 # 1190-1_Retro_Buggy_(Town_1999).pdf
 # 
 # If a set contains multiple files, the file names will be:
 # 2520-1_(1_of_2)_Battle_Arena_(Ninjago_2011).pdf
 # 2520-1_(2_of_2)_Battle_Arena_(Ninjago_2011).pdf
 # 
 # Req:
 # bash, awk, grep, sed, curl, wget
 #
 # Get themes.csv and sets.csv from https://rebrickable.com/downloads
 # 
 # error_level: 
 #	0 no errors reported
 #	1 Download errors reported
 #	2 Existing files and download errors reported
 #
 ##############################

 echo "----> Starting Download of all LEGO instructions from https://brickset.com/exportscripts/instructions"
 # Flag read by the main loop: 0 until the CSV header row has been skipped.
 firstline=0

 ##### CHANGE HERE #####
 logfile="lego_errors.log"
 downloadFolder="../Instructions"
 error_level=1
 ##### STOP CHANGE #####

 # Start every run with a fresh log file (it contains one empty line).
 echo "" > "$logfile"

 # curl --output does not create missing directories, so without the target
 # folder every single download would fail. Stop early with a clear message.
 if [ ! -d "$downloadFolder" ]; then
 	echo "Error: download folder '$downloadFolder' does not exist. Exiting!" >&2
 	exit 1
 fi

 # Names of the Rebrickable CSV files that sit next to this script.
 THEMES="themes.csv"
 SETS="sets.csv"

 if [ -f "instructions" ]; then
 	echo "---> Using existing instructions file. Manually delete it to redownload..."
 else
 	echo -ne "---> Downloading csv from Brickset..."
 	if wget https://brickset.com/exportscripts/instructions &> /dev/null; then
 		echo "Done!"
 	else
 		echo "Error... Exiting!"
 		# Do not leave a partial file behind: the next run would reuse it.
 		rm -f -- instructions
 		# A bare `exit` here would return the status of the echo above (0).
 		exit 1
 	fi
 fi


 echo "----> Converting csv with ',' to '|'..."
 # Header row: every comma becomes a pipe.
 head -n 1 instructions | sed 's/,/|/g' > instructions.csv
 # Data rows: protect one comma inside a quoted field with a ### marker, let
 # awk split on the remaining commas and print the first six fields separated
 # by spaces, restore the protected comma, then turn the `" "` field
 # boundaries into `"|"`.
 sed 1,1d instructions | sed -r 's/("[^",]+),([^",]+")/\1###\2/g' | awk -F, '{print $1,$2,$3,$4,$5,$6}' | sed 's/###/,/g' | sed 's/" "/"|"/g' >> instructions.csv

 # Row count and counter meant for progress reporting; no active code in this
 # script reads them at the moment.
 LINES=$(wc -l instructions | awk '{ print $1 }')
 CURRENT=0

 #######################################
 # Check whether a URL currently answers with HTTP status 200.
 # Uses a wget spider request, so the document itself is not downloaded.
 # Arguments: $1 - URL to probe
 # Returns:   0 if a 200 status line appears in wget's header dump,
 #            1 otherwise (including an empty URL)
 #######################################
 function validate_url(){
 	local url=$1

 	# Nothing to probe; wget would only complain about a missing URL.
 	if [[ -z "$url" ]]; then
 		return 1
 	fi

 	# -S prints the server response headers on stderr, hence 2>&1.
 	# Accept any protocol version and reason phrase, but only status 200.
 	if wget -S --spider "$url" 2>&1 | grep -Eq 'HTTP/[0-9.]+ 200([^0-9]|$)'; then
 		return 0
 	fi
 	return 1
 }

 # strip_quotes VALUE
 # Print VALUE with one leading and one trailing double quote removed.
 strip_quotes() {
 	local value=$1
 	value=${value#\"}
 	value=${value%\"}
 	printf '%s\n' "$value"
 }

 # sanitize_for_filename TEXT
 # Print TEXT with every character outside A-Z a-z 0-9 . _ - replaced by "_".
 sanitize_for_filename() {
 	sed -e 's/[^A-Za-z0-9._-]/_/g' <<< "$1"
 }

 # booklet_suffix DESCRIPTION
 # Print the "N/M" booklet marker found in DESCRIPTION as "N_of_M"; prints
 # nothing when there is no marker. N must be a single digit that directly
 # follows a non-digit character, M has one or two digits.
 booklet_suffix() {
 	printf '%s\n' "$1" \
 		| grep -Eo '[^0-9][0-9]\s*/\s*[0-9]{1,2}' \
 		| sed -e 's/[^A-Za-z0-9/]//g' -e 's/\//_of_/g'
 }

 # csv_row_by_id ID FILE
 # Print the first row of the comma separated FILE whose first column is ID.
 csv_row_by_id() {
 	awk -F',' -v id="$1" '$1 == id { print; exit }' "$2"
 }

 # download_pdf URL DEST
 # Fetch URL into DEST. Returns 0 only when curl succeeded and DEST is a
 # non-empty file; otherwise DEST is removed and 1 is returned.
 download_pdf() {
 	local url=$1
 	local dest=$2

 	# --fail makes curl report HTTP errors through its exit status instead of
 	# saving the error page as if it were the PDF. The browser string is sent
 	# as User-Agent; a bare string given to -H is not a valid header line.
 	if curl --fail --location --silent \
 		--user-agent "Mozilla/5.0 (platform; rv:75.0) Gecko/20100101 Firefox/75.0" \
 		--output "$dest" "$url" && [[ -s "$dest" ]]; then
 		return 0
 	fi

 	# Remove partial or empty output so a later run retries this file.
 	rm -f -- "$dest"
 	return 1
 }

 # Main loop: one row of instructions.csv per instruction booklet.
 # NAME, ADDED and RETRIVED are only read to keep the columns aligned.
 # (Progress reporting via CURRENT/LINES is currently disabled.)
 while IFS='|' read -r ID LINK NAME DESC ADDED RETRIVED
 do
 	# The first row of instructions.csv is the column header.
 	if [ "$firstline" = 0 ]; then
 		firstline=1
 		continue
 	fi

 	tID=$(strip_quotes "$ID")
 	tLINK=$(strip_quotes "$LINK")
 	tDESC=$(strip_quotes "$DESC")

 	# Blank rows carry no set number; there is nothing to look up or name.
 	if [[ -z "$tID" ]]; then
 		continue
 	fi

 	# Look the set up once; columns used are: set number, name, year, theme id.
 	# NOTE(review): a quoted set name that contains a comma shifts the columns,
 	# exactly as it did with the former cut-based lookup.
 	setRow=$(csv_row_by_id "$tID" "${SETS:-sets.csv}")
 	IFS=',' read -r _ ttNAME setYEAR themeID _ <<< "$setRow"
 	setNAME=$(sanitize_for_filename "$ttNAME")

 	# Follow the parent column of themes.csv (id, name, parent id) upwards, so
 	# themeName ends up being the name of the top-level theme. The hop limit
 	# guards against a parent cycle in the data.
 	themeName=""
 	nextID=$themeID
 	hops=0
 	while [[ -n "$nextID" ]] && (( hops < 20 )); do
 		themeRow=$(csv_row_by_id "$nextID" "${THEMES:-themes.csv}")
 		IFS=',' read -r _ themeName nextID _ <<< "$themeRow"
 		hops=$((hops + 1))
 	done
 	setTHEME=$(sanitize_for_filename "$themeName")

 	# "_(1_of_2)_" for multi-booklet sets, a plain "_" otherwise.
 	ttDESC=$(booklet_suffix "$tDESC")
 	if [ -z "$ttDESC" ]; then
 		setDESC="_"
 	else
 		setDESC="_(${ttDESC})_"
 	fi

 	filename="${tID}${setDESC}${setNAME}_(${setTHEME}_${setYEAR}).pdf"

 	# Already downloaded: only mention it at the most verbose error level.
 	if [ -f "$downloadFolder/$filename" ]; then
 		if [[ $error_level = 2 ]]; then
 			echo "-> $tID exists. Skipping..."
 			echo "$filename exists." >> "$logfile"
 		fi
 		continue
 	fi

 	if [[ "$tDESC" = "{No longer listed at LEGO.com}" ]]; then
 		# Brickset marks the file as gone; probe the link before downloading.
 		printf '%s' "-> $tID testing link."
 		if ! validate_url "$tLINK"; then
 			if [[ $error_level = 1 || $error_level = 2 ]]; then
 				echo "-> $tID is not available. Skipping..."
 				echo "$filename is not available." >> "$logfile"
 			fi
 			continue
 		fi
 		printf '%s' "Found... Downloading..."
 	else
 		printf '%s' "--> $tID downloading now..."
 	fi

 	if download_pdf "$tLINK" "$downloadFolder/$filename"; then
 		echo "Done! > $filename"
 	else
 		echo "ERROR!"
 		if [[ $error_level = 1 || $error_level = 2 ]]; then
 			echo "--> Not downloaded. Try again manually..."
 			echo "$filename was not downloaded. Check CURL" >> "$logfile"
 		fi
 		# random sleep in order to not look like a script
 		# NOTE(review): as before, this pause only happens after a failed
 		# download - confirm whether it was meant to follow every download.
 		sleep $(( ( RANDOM % 5 ) + 1 ))
 	fi

 done < instructions.csv
Fixed errors 2020-06-24 10:18:54 +02:00			`#!/bin/bash`
			`##############################`
			`#`
			`# NOTE: There are still some small errors with sets that have multiple booklets`
			`#`
			`# This script uses the Brickset instructions file to get links and set numbers.`
			`# Themes are looked up in the CSV files from Rebrickable.`
			`#`
			`# Files are saved as:`
			`# $tID""$setDESC""${setNAME// /_}"_("$setTHEME"_"$setYEAR").pdf"`
			`# 1190-1_Retro_Buggy_(Town_1999).pdf`
			`#`
			`# If a set contains multiple files, the file names will be:`
			`# 2520-1_(1_of_2)_Battle_Arena_(Ninjago_2011).pdf`
			`# 2520-1_(2_of_2)_Battle_Arena_(Ninjago_2011).pdf`
			`#`
			`# Req:`
			`# bash, awk, grep, sed, curl, wget`
			`#`
			`# Get themes.csv and sets.csv from https://rebrickable.com/downloads`
			`#`
			`# error_level:`
			`# 0 no errors reported`
			`# 1 Download errors reported`
			`# 2 Existing files and download errors reported`
			`#`
			`##############################`
Fixed errors 2020-06-24 08:41:55 +02:00
Fixed errors 2020-06-24 10:18:54 +02:00			`echo "----> Starting Download of all LEGO instructions from https://brickset.com/exportscripts/instructions"`
			`firstline=0`
Added commnets 2020-06-24 08:51:26 +02:00
Fixed errors 2020-06-24 10:18:54 +02:00			`##### CHANGE HERE #####`
			`logfile="lego_errors.log"`
			`downloadFolder="../Instructions"`
			`error_level=1`
			`##### STOP CHANGE #####`
Added commnets 2020-06-24 08:51:26 +02:00
Fixed errors 2020-06-24 10:18:54 +02:00			`echo "" > $logfile`
first upload 2020-06-23 15:02:27 +02:00
Fixed errors 2020-06-24 10:18:54 +02:00			`THEMES="themes.csv"`
			`SETS="sets.csv"`
first upload 2020-06-23 15:02:27 +02:00
Fixed errors 2020-06-24 10:18:54 +02:00			`if [ -f "instructions" ]; then`
			`echo "---> Using existing instructions file. Manually delete it to redownload..."`
			`else`
			`echo -ne "---> Downloading csv from Brickset..."`
			`wget https://brickset.com/exportscripts/instructions &> /dev/null`
			`if [[ "$?" != 0 ]]; then`
			`echo "Error... Exiting!"`
			`exit;`
			`else`
			`echo "Done!"`
			`fi`
			`fi`
Fixed errors 2020-06-24 08:41:55 +02:00

Fixed errors 2020-06-24 10:18:54 +02:00			`echo "----> Converting csv with ',' to '\|'..."`
			`head -1 instructions \| sed 's/,/\|/g' > instructions.csv`
			`sed 1,1d instructions \| sed -r 's/("[^",]+),([^",]+")/\1###\2/g' \| awk -F, '{print $1,$2,$3,$4,$5,$6}' \| sed 's/###/,/g' \| sed 's/" "/"\|"/g' >> instructions.csv`
Fixed errors 2020-06-24 08:41:55 +02:00
Fixed errors 2020-06-24 10:18:54 +02:00			`LINES=$(wc -l instructions \| awk '{ print $1 }')`
			`CURRENT=0`
Fixed errors 2020-06-24 08:41:55 +02:00
Added check if link exists if brickset says it isnt available 2020-06-24 13:05:22 +02:00			`function validate_url(){`
			if [[ `wget -S --spider $1 2>&1 \| grep 'HTTP/1.1 200 OK'` ]]; then
			`return 0`
			`else`
			`return 1`
			`fi`
			`}`

Fixed errors 2020-06-24 10:18:54 +02:00			`while IFS='\|' read -r ID LINK NAME DESC ADDED RETRIVED`
			`do`
Fixed errors 2020-06-24 08:41:55 +02:00
Fixed errors 2020-06-24 10:18:54 +02:00			`#$PC=$((200$CURRENT/$LINES % 2 + 100$CURRENT/$LINES))`
			`#CURRENT=$((CURRENT + 1))`
			`if [ "$firstline" = 0 ]; then`
			`firstline=1`
			`else`
			`tID=$(sed -e 's/^"//' -e 's/"$//' <<<"$ID")`
			`tLINK=$(sed -e 's/^"//' -e 's/"$//' <<<"$LINK")`
			`#tNAME=$(sed -e 's/^"//' -e 's/"$//' <<<"$NAME")`
			`ttNAME=$(cut -d, -f1-2 sets.csv \| grep -w $tID \| cut -d, -f2)`
			`setYEAR=$(grep -w $tID sets.csv \| cut -d, -f3)`
			`setNAME=$(sed -e 's/[^A-Za-z0-9._-]/_/g' <<< $ttNAME)`
			`themeID=$(grep -w $tID sets.csv \| cut -d, -f4)`
			`themeName=$(awk -F',' -v id="$themeID" '$1 == id' themes.csv)`
			`IFS=',' read -r -a array <<< "$themeName"`
			`tempID=${array[2]}`
			`if [[ $tempID != "" ]]; then`
			`while [[ $tempID != "" ]]`
			`do`
			`tthemeName=$(awk -F',' -v id="$tempID" '$1 == id' themes.csv)`
			`IFS=',' read -r -a tArray <<< "$tthemeName"`
			`tempID=${tArray[2]}`
			`themeName=${tArray[1]}`
			`done`
			`else`
			`themeName=${array[1]}`
			`fi`
Fixed errors 2020-06-24 08:41:55 +02:00
Fixed errors 2020-06-24 10:18:54 +02:00			`setTHEME=$(sed -e 's/[^A-Za-z0-9._-]/_/g' <<< $themeName)`

			`tADDED=$(sed -e 's/^"//' -e 's/"$//' <<<"$ADDED")`
			`tDESC=$(sed -e 's/^"//' -e 's/"$//' <<<"$DESC")`

			`ttDESC=$(echo $tDESC \| grep -Eo '[^0-9][0-9]{1}\s?\/\s?[0-9]{1,2}' \| sed 's/[^A-Za-z0-9/]//g' \| sed 's/\//_of_/g')`
			`#ttDESC=$(echo $tDESC \| grep -Eo '\s[0-9]{1}\s?\/\s?[0-9]{1,2}' \| sed 's/ //g' \| sed 's/\//_of_/g')`

			`if [ -z "$ttDESC" ]; then`
			`setDESC="_"`
			`else`
			`setDESC="_("$ttDESC")_"`
			`fi`
			`#PC=$(echo $CURRENT $LINES \| awk '{print 100*$1/$2}')`
			`tFilename=""$tID""$setDESC""${setNAME// /_}"_("$setTHEME"_"$setYEAR").pdf"`
			`filename=$tFilename`

			`if [ -f "$downloadFolder/$filename" ]; then`
			`if [[ $error_level = 2 ]]; then`
			`echo "-> $tID exists. Skipping..."`
			`echo "$filename exists." >> $logfile`
			`fi`
			`else`
			`if [[ "$tDESC" = "{No longer listed at LEGO.com}" ]] ; then`
Added check if link exists if brickset says it isnt available 2020-06-24 13:05:22 +02:00			`echo -ne "-> $tID testing link."`

			`if validate_url $tLINK; then`
			`echo -ne "Found... Downloading..."`

			`curl -H "Mozilla/5.0 (platform; rv:75.0) Gecko/20100101 Firefox/75.0" -L $tLINK --silent --output "$downloadFolder/$filename"`
			`if [ -f "$downloadFolder/$filename" ]; then`
			`echo "Done! > $filename"`
			`else`
			`echo "ERROR!"`
			`if [[ $error_level = 1 \|\| $error_level = 2 ]]; then`
			`echo "--> Not downloaded. Try again manually..."`
			`echo "$filename was not downloaded. Check CURL" >> $logfile`
			`fi`
			`#random sleep in order to not look like a script`
			`sleep $(( ( RANDOM % 5 ) + 1 ))`
			`fi`

			`else`
			`if [[ $error_level = 1 \|\| $error_level = 2 ]]; then`
			`echo "-> $tID is not available. Skipping..."`
			`echo "$filename is not available." >> $logfile`
			`fi`
Fixed errors 2020-06-24 10:18:54 +02:00			`fi`
			`else`
			`echo -ne "--> $tID downloading now..."`
			`curl -H "Mozilla/5.0 (platform; rv:75.0) Gecko/20100101 Firefox/75.0" -L $tLINK --silent --output "$downloadFolder/$filename"`
			`if [ -f "$downloadFolder/$filename" ]; then`
			`echo "Done! > $filename"`
			`else`
			`echo "ERROR!"`
			`if [[ $error_level = 1 \|\| $error_level = 2 ]]; then`
			`echo "--> Not downloaded. Try again manually..."`
first upload 2020-06-23 15:02:27 +02:00			`echo "$filename was not downloaded. Check CURL" >> $logfile`
			`fi`
Fixed errors 2020-06-24 10:18:54 +02:00			`#random sleep in order to not look like a script`
			`sleep $(( ( RANDOM % 5 ) + 1 ))`
first upload 2020-06-23 15:02:27 +02:00			`fi`
			`fi`
			`fi`
			`fi`

Fixed errors 2020-06-24 08:41:55 +02:00			`done < instructions.csv`
first upload 2020-06-23 15:02:27 +02:00