First get the appropriate index.
9706 wget http://www.thebricktestament.com/joshua/index.html

Divide it up into chunks
9715 csplit index.html '/.*/' '{*}'

Find the ones that are junk rather than content
9716 ls
9717 less xx00
9718 rm xx00
9719 less xx01
9720 rm xx01
...
9726 rm xx0{2,3,4,5}
9727 ls
9728 less xx{0{6,7,8,9},1{0,1,2,3,4,5,6,7,8,9},2{0,1,2}}
9729 rm xx2{0,1,2}

Edit out any other junk from the remainder
9730 zed xx19
9731 zed xx13
9732 zed xx12

Delete index.html
9734 rm index.html

Demunge line endings (dos2unix)
9736 perl -pi.bak -e 's/\r//g' *
9740 rm *.bak

Change relative anchors into absolute ones
9741 grep 'a href=' *
9742 perl -pi.bak -e 's-a href="-a href="http://www.thebricktestament.com/joshua/-g' *
9744 rm *.bak

Change the ratings images to point to the ones on chiark
9752 grep -h rating * | sort | uniq
9755 perl -pi.bak -e 's=../website_images/parchment_bkg/rating=http://www.chiark.greenend.org.uk/~jdamery/bt-thumbs/rating=g' *
9757 rm *.bak

Watching out for ones which don't match the common scheme
9766 zed xx12

Get thumbnails and rename them so they've got unique leafnames
9772 for x in $(perl -ne 'm/img src="([^"]*)"/; print "$1\n";' xx* | grep -v http | sort| uniq); do y=$(echo $x | sed 's-/-_-'); wget -O $y http://www.thebricktestament.com/joshua/"$x"; done

Edit the image URLs in the fragments
[Note: this copy is damaged at this point. The rest of command 9775 after 's#' and the opening of the loop that follows it are missing, so only the tail of that loop survives below. The empty echo "" strings here and in 9785 presumably held markup that was also lost.]
9775 perl -pi.bak -e 's#" > $(( $x + 126 )).html.frag; cat xx$(( $x + 5 )) >> $(( $x + 126 )).html.frag; echo "" >> $(( $x + 126 )).html.frag; done
9785 for x in $(seq 1 4); do echo "" > $(( $x + 126 )).html.frag; cat xx0$(( $x + 5 )) >> $(( $x + 126 )).html.frag; echo "
" >> $(( $x + 126 )).html.frag; done

Put the fragments in the hopper.
9789 cp *.frag ~/var/lib/bricktestament/frags/