#! /usr/bin/perl -w
#
# This is basically Moonshadow's code, with a bit of hacking by Pallando
#
# Usage: pass an ISBN-10 or a 978-prefixed ISBN-13 as the first command-line
# argument.  On success a single line of tab-separated KEY=VALUE pairs is
# printed (see makestr); on failure "RESULT=NULL3" is printed.
#
# TODO: split each book info source into a separate subfunc and customise
# to scrape any of the additional fields available, that are listed
# at the bottom of the file

use strict;
use FileHandle;
use CGI;
require '/home/douglasr/public-cgi/cgi-lib.pl';

# eancheckdigit($string)
#
# Returns the EAN-13 check digit (0-9) for a string of digits, using
# alternating weights 1,3,1,3,... from the left.
sub eancheckdigit {
    my ($string) = @_;

    my $weight = 1;
    my $sum    = 0;
    foreach my $digit (split(//, $string)) {
        $sum += $digit * $weight;
        $weight = 4 - $weight;    # toggles 1 <-> 3
    }
    return (10 - ($sum % 10)) % 10;
}

# isbncheckdigit($string)
#
# Returns the ISBN-10 check character for a string of digits, using weights
# 10, 9, 8, ... from the left.  The result is 0-9, or 'X' when the check
# value is 10.
sub isbncheckdigit {
    my ($string) = @_;

    my $weight = 10;
    my $sum    = 0;
    foreach my $digit (split(//, $string)) {
        $sum += $digit * $weight;
        $weight--;
    }
    my $check = (11 - ($sum % 11)) % 11;
    return $check == 10 ? 'X' : $check;
}

# isbnhyphenate($isbn)
#
# Splits an unhyphenated ISBN-10 into "group-publisher-item-check" using the
# range tables below.
#
# Each table lists the *first* value of every range in ascending scan order;
# the number of digits in an entry is the length of the group/publisher
# prefix for ISBNs falling in that range.  The comparison is therefore
# inclusive: an ISBN that starts exactly on a range boundary belongs to that
# range.
#
# If the group cannot be found in the table the ISBN is returned unchanged.
sub isbnhyphenate {
    my ($isbn) = @_;

    # source: http://usin.org/software/servers/ISBN-ISSN.phps
    my @country_group_partition = ( 0, 80, 950, 9960, 99900 );
    my %country_group_map = (
        0     => [ '00', 200, 7000, 85000, 900000, 9500000 ],
        1     => [ '00000000', 55000, 869800, 9999900 ],
        2     => [ '00', 200, 40000000, 500, 7000, 84000, 900000, 9500000 ],
        3     => [ '00', 200, 7000, 85000, 900000, 9500000 ],
        4     => [ '00', 200, 7000, 85000, 900000, 9500000 ],
        5     => [ '00', 200, 7000, 85000, 900000, 9500000 ],
        7     => [ '00', 100, 5000, 80000, 900000 ],
        80    => [ '00', 200, 7000, 85000, 900000 ],
        81    => [ '00', 200, 7000, 85000, 900000 ],
        82    => [ '00', 200, 7000, 90000, 990000 ],
        83    => [ '00', 200, 7000, 85000, 900000 ],
        84    => [ '00', 200, 7000, 85000, 900000, 95000, 9700 ],
        85    => [ '00', 200, 7000, 85000, 900000 ],
        86    => [ '00', 300, 7000, 80000, 900000 ],
        87    => [ '00', 400, 7000, 85000, 970000 ],
        88    => [ '00', 200, 7000, 85000, 900000 ],
        89    => [ '00', 300, 7000, 85000, 950000 ],
        90    => [ '00', 200, 5000, 70000, 800000, 9000000 ],
        91    => [ '0', 20, 500, 6500000, 7000, 8000000, 85000, 9500000, 970000 ],
        92    => [ '0', 60, 800, 9000 ],
        93    => [ '0000000' ],
        950   => [ '00', 500, 9000, 99000 ],
        951   => [ '0', 20, 550, 8900, 95000 ],
        952   => [ '00', 200, 5000, 89, 9500, 99000 ],
        953   => [ '0', 10, 150, 6000, 96000 ],
        954   => [ '00', 400, 8000, 90000 ],
        955   => [ '0', 20, 550, 800000, 9000, 95000 ],
        956   => [ '00', 200, 7000 ],
        957   => [ '00', 440, 8500, 97000 ],
        958   => [ '0', 600, 9000, 95000 ],
        959   => [ '00', 200, 7000 ],
        960   => [ '00', 200, 7000, 85000 ],
        961   => [ '00', 200, 6000, 90000 ],
        962   => [ '00', 200, 7000, 85000 ],
        963   => [ '00', 200, 7000, 85000 ],
        964   => [ '00', 300, 5500, 90000 ],
        965   => [ '00', 200, 7000, 90000 ],
        966   => [ '00', 500, 7000, 90000 ],
        967   => [ '0', 60, 900, 9900, 99900 ],
        968   => [ '000000', 10, 400, 500000, 6000, 800, 900000 ],
        969   => [ '0', 20, 400, 8000 ],
        970   => [ '00', 600, 9000, 91000 ],
        971   => [ '00', 500, 8500, 91000 ],
        972   => [ '0', 20, 550, 8000, 95000 ],
        973   => [ '0', 20, 550, 9000, 95000 ],
        974   => [ '00', 200, 7000, 85000, 900000 ],
        975   => [ '00', 300, 6000, 92000, 980000 ],
        976   => [ '0', 40, 600, 8000, 95000 ],
        977   => [ '00', 200, 5000, 70000 ],
        978   => [ '000', 2000, 30000 ],
        979   => [ '0', 20, 300000, 400, 700000, 8000, 95000 ],
        980   => [ '00', 200, 6000 ],
        981   => [ '00', 200, 3000 ],
        982   => [ '00', 100, 500000 ],
        983   => [ '000', 2000, 300000, 50, 800, 9000, 99000 ],
        984   => [ '00', 400, 8000, 90000 ],
        985   => [ '00', 400, 6000, 90000 ],
        986   => [ '000000' ],
        987   => [ '00', 500, 9000, 99000 ],
        9952  => [ '00000' ],
        9953  => [ '0', 20, 9000 ],
        9954  => [ '00', 8000 ],
        9955  => [ '00', 400 ],
        9956  => [ '00000' ],
        9957  => [ '00', 8000 ],
        9958  => [ '0', 10, 500, 7000, 9000 ],
        9959  => [ '00' ],
        9960  => [ '00', 600, 9000 ],
        9961  => [ '0', 50, 800, 9500 ],
        9962  => [ '00000' ],
        9963  => [ '0', 30, 550, 7500 ],
        9964  => [ '0', 70, 950 ],
        9965  => [ '00', 400, 9000 ],
        9966  => [ '00', 70000, 800, 9600 ],
        9967  => [ '00000' ],
        9968  => [ '0', 10, 700, 9700 ],
        9970  => [ '00', 400, 9000 ],
        9971  => [ '0', 60, 900, 9900 ],
        9972  => [ '0', 40, 600, 9000 ],
        9973  => [ '0', 10, 700, 9700 ],
        9974  => [ '0', 30, 550, 7500 ],
        9975  => [ '0', 50, 900, 9500 ],
        9976  => [ '0', 60, 900, 99000, 9990 ],
        9977  => [ '00', 900, 9900 ],
        9978  => [ '00', 950, 9900 ],
        9979  => [ '0', 50, 800, 9000 ],
        9980  => [ '0', 40, 900, 9900 ],
        9981  => [ '0', 20, 800, 9500 ],
        9982  => [ '00', 40000, 800, 9900 ],
        9983  => [ '00', 500, 80, 950, 9900 ],
        9984  => [ '00', 500, 9000 ],
        9985  => [ '0', 50, 800, 9000 ],
        9986  => [ '00', 400, 9000 ],
        9987  => [ '00', 400, 8800 ],
        9988  => [ '0', 30, 550, 7500 ],
        9989  => [ '0', 30, 600, 9700 ],
        99901 => [ '00' ],
        99903 => [ '0', 20, 900 ],
        99904 => [ '0', 60, 900 ],
        99905 => [ '0', 60, 900 ],
        99906 => [ '0', 60, 900 ],
        99908 => [ '0', 10, 900 ],
        99909 => [ '0', 40, 950 ],
        99910 => [ '0000' ],
        99911 => [ '00', 600 ],
        99912 => [ '0', 60, 900 ],
        99913 => [ '0', 30, 600 ],
        99914 => [ '0', 50, 900 ],
        99915 => [ '0', 50, 800 ],
        99916 => [ '0', 30, 700 ],
        99917 => [ '0', 30 ],
        99918 => [ '0', 40, 900 ],
        99919 => [ '0', 40, 900 ],
        99920 => [ '0', 50, 900 ],
        99921 => [ '0', 20, 700 ],
        99922 => [ '0', 50 ],
        99923 => [ '0', 20, 800 ],
        99924 => [ '0', 30 ],
        99925 => [ '0', 40, 800 ],
        99926 => [ '0000', 600 ],
        99927 => [ '0', 30, 600 ],
        99928 => [ '0', 50, 800 ],
        99929 => [ '0000' ],
        99930 => [ '0', 50, 800 ],
        99931 => [ '0000' ],
        99932 => [ '0', 10 ],
        99933 => [ '00', 300 ],
        99934 => [ '0' ],
        99935 => [ '0000' ],
        99936 => [ '0000' ],
        99937 => [ '0', 20 ],
    );

    # determine country group: keep widening the group while the ISBN is at
    # or beyond the next partition boundary (string comparison, as the
    # candidates are equal-length digit strings)
    my $group = substr($isbn, 0, length($country_group_partition[0]));
    foreach my $boundary (@country_group_partition[ 1 .. $#country_group_partition ]) {
        my $candidate = substr($isbn, 0, length($boundary));
        last if $boundary gt $candidate;
        $group = $candidate;
    }

    # unknown group: nothing sensible to hyphenate with (and do not
    # autovivify a map entry)
    return $isbn unless exists $country_group_map{$group};
    my @range_starts = @{ $country_group_map{$group} };

    # determine publisher prefix, scanning the range starts the same way
    my $offset = length($group);
    my $prefix = substr($isbn, $offset, length($range_starts[0]));
    foreach my $start (@range_starts[ 1 .. $#range_starts ]) {
        my $candidate = substr($isbn, $offset, length($start));
        last if $start gt $candidate;
        $prefix = $candidate;
    }

    # whatever remains, minus the final check character, is the item number
    my $itemstart  = $offset + length($prefix);
    my $itemlength = length($isbn) - $itemstart - 1;
    return join('-',
        $group,
        $prefix,
        substr($isbn, $itemstart, $itemlength),
        substr($isbn, $itemstart + $itemlength, 1));
}

# retrieve($url)
#
# Fetches $url with LWP::UserAgent and returns the response body.  If the
# request is not successful, falls back to the external /usr/local/bin/GET
# program.  Returns '' when neither yields any content.
sub retrieve {
    my ($url) = @_;

    require LWP::UserAgent;
    my $ua = LWP::UserAgent->new(
        timeout    => 30,
        env_proxy  => 1,
        agent      => 'Mozilla/4.0; compatible; MSIE 6.0; perl bot',
        keep_alive => 1
    );
    my $request = HTTP::Request->new(GET => $url);
    $request->protocol('HTTP/1.1');
    my $response = $ua->request($request);
    return $response->content if $response->is_success;

    # Fallback.  List-form pipe open runs the program directly, so the URL
    # (which contains ?, &, = and friends) never passes through a shell.
    my $content = '';
    if (open(my $pipe, '-|', '/usr/local/bin/GET', $url)) {
        local $/;    # slurp
        my $output = <$pipe>;
        $content = $output if defined $output;
        close($pipe);
    }
    return $content;
}

# IsErrorCode($code)
#
# Returns 1 if $code (a scraped candidate title) is one of the known
# "nothing found" / boilerplate strings, 0 otherwise.
sub IsErrorCode {
    my ($code) = @_;
    $code = '' unless defined $code;

    return 1 if $code =~ /^\s*Not found\.?\s*$/i;
    return 1 if $code =~ /^\s*Book Search\s*$/i;
    return 1 if $code =~ /^\s*Advanced Book Search\s*$/i;
    return 1 if $code =~ /^\s*\*\*\* Not found\. \*\*\*?\s*$/i;
    return 1 if $code =~ /^\s*No Title Found\s*$/i;
    return 1 if $code =~ /^\s*Books.+Used.+Out of Print.+DVDs.+Toys\s*$/i;
    return 1 if $code =~ /^\s*by\s*$/i;
    return 0;
}

# forceISBN($isbn)
#
# Converts a 13-digit, 978-prefixed ISBN into the equivalent ISBN-10 by
# dropping the prefix and recomputing the check character.  Anything else is
# returned unchanged.
sub forceISBN {
    my ($isbn) = @_;

    # Anchored: only a string that is exactly an ISBN-13 is converted, not a
    # 13-digit run that happens to sit inside a longer string.
    if ($isbn =~ /^978([0-9]{9})[0-9]$/) {
        # todo: compare eancheckdigit ('978'.$1) to the last digit, bail if not equal
        my $body = $1;
        $isbn = $body . isbncheckdigit($body);
    }
    return $isbn;
}

# _usable_title($title)
#
# Private helper for lookup: true when a scraped title is defined, non-empty
# and not one of the known error strings.
sub _usable_title {
    my ($title) = @_;
    return 0 unless defined $title and length $title;
    return IsErrorCode($title) ? 0 : 1;
}

# lookup($isbn)
#
# Tries a series of web sites in turn and scrapes a title string for the
# given ISBN.
#
# Returns the four-element list (raw title, source name, source URL, ISBN-10)
# from the first site that yields a usable title, or a one-element list
# holding "RESULT=NULL1" (not a valid ISBN) or "RESULT=NULL2" (no site had
# it).
sub lookup {
    my ($isbn) = @_;

    $isbn = forceISBN($isbn);
    $isbn =~ /^([0-9]{9})([0-9xX])$/ or return "RESULT=NULL1";
    # todo: compare isbncheckdigit($1) to $2, bail if not equal
    $isbn = $1 . $2;

    my $url;
    my $html;
    my $title;

    $url  = 'http://www.amazon.co.uk/exec/obidos/ASIN/' . $isbn;
    $html = retrieve($url);
    # NOTE(review): this pattern looks truncated (it begins mid character
    # class and the original condition had unbalanced parentheses) - the
    # HTML tag text appears to have been lost.  Restore from a known-good
    # copy of the script.
    if ($html =~ /]+>(.+?)<\/font>/) {
        $title = $1;
        return $title, "Nu", $url, $isbn if _usable_title($title);
    }

    $url  = 'http://www.ozon.ru/?context=advsearch_book&isbn=' . isbnhyphenate($isbn);
    $html = retrieve($url);
    if ($html =~ /class="big1">(.+?)<\//) {
        $title = $1;
        return $title, "Ozon", $url, $isbn if _usable_title($title);
    }

    $url  = 'http://www.amazon.com/exec/obidos/ASIN/' . $isbn;
    $html = retrieve($url);
    # NOTE(review): the pattern here is empty - its text appears to have
    # been lost.  An empty pattern makes Perl reuse the last successfully
    # matched pattern, so this branch is unreliable until the real pattern
    # is restored; _usable_title at least rejects an undefined capture.
    if ($html =~ //i) {
        $title = $1;
        return $title, "Amazon", $url, $isbn if _usable_title($title);
    }

    # $url = 'http://www.amazon.co.jp/exec/obidos/ASIN/' . $isbn;
    # $html = retrieve($url);
    # if( ($html =~ //i))
    # {
    #   if(!IsErrorCode($1)) { return $1.' (source)'; }
    # }

    $url  = 'http://www.biblio.com/isbn/' . $isbn . '.html';
    $html = retrieve($url);
    if ($html =~ /Biblio: \(ISBN: .+?\) (.+?)<\//) {
        $title = $1;
        return $title, "Biblio", $url, $isbn if _usable_title($title);
    }

    $url  = 'http://search.barnesandnoble.com/booksearch/isbninquiry.asp?ISBN=' . $isbn;
    $html = retrieve($url);
    if ($html =~ /<title>Barnes\ \;\&\;\ \;Noble.com - (.+?)<\//) {
        $title = $1;
        return $title, "Barnes and Noble", $url, $isbn if _usable_title($title);
    }

    $url  = 'http://my.linkbaton.com/isbn/' . $isbn;
    $html = retrieve($url);
    if ($html =~ /content='ISBN, book, author, (.+?, )[0-9xX]+, (.+?)'/) {
        $title = $1;
        return $title, "Link Baton", $url, $isbn if _usable_title($title);
    }

    $url  = 'http://www.google.co.uk/search?btnI=1&q=inurl:ffbooks+' . $isbn;
    $html = retrieve($url);
    if ($html =~ /<title>(.+?by.+?)<\//) {
        $title = $1;
        return $title, "Google", $url, $isbn if _usable_title($title);
    }

    $url  = 'http://wuz.librialice.it/scheda.aspx?isbn=' . $isbn;
    $html = retrieve($url);
    if ($html =~ /<title>(.+? - .+?) *<\//) {
        $title = $1;
        return $title, "Libri Alice", $url, $isbn if _usable_title($title);
    }

    # $url = 'http://www.centraldellibro.com/web/ES/APL/resultados_busqueda.asp?cad_busq=' . $isbn;
    # $html = retrieve($url);
    # if($html =~ /<title>[^<]+?: ([<]+) *<\/.+<meta name/)
    # {
    #   if(!IsErrorCode($1)) { return $1.' (<a href="'.$url.'">source</a>)'; }
    # }

    return "RESULT=NULL2";
}

# _trim($text)
#
# Private helper for makestr: returns $text without leading/trailing
# whitespace ('' for undef).
sub _trim {
    my ($text) = @_;
    return '' unless defined $text;
    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    return $text;
}

# makestr($raw, $sourcename, $sourceurl, $isbn)
#
# Breaks a colon-separated scraped title string into fields and returns them
# as one line of tab-separated KEY=VALUE pairs, sorted by key.
#
# $raw is split on ':'; the second-to-last segment is taken as the author
# and everything before it as the full title.  Keys produced: AUTHOR,
# AUTHORNAMELAST, AUTHORNAMEOTHER, ISBN, SOURCENAME, SOURCEURL, TITLE,
# TITLEFULL and, when the title contains a parenthesised part, TITLEEXTRA.
sub makestr {
    my ($raw, $SOURCENAME, $SOURCEURL, $ISBN) = @_;
    $raw = '' unless defined $raw;

    my @parts = split(':', $raw);
    my $len   = $#parts;

    my %info;
    $info{"SOURCENAME"} = $SOURCENAME;
    $info{"SOURCEURL"}  = $SOURCEURL;
    $info{"ISBN"}       = $ISBN;

    # * AUTHOR - full name in order supplied by source
    my $AUTHOR = _trim($parts[$len - 1]);
    $info{"AUTHOR"} = $AUTHOR;

    # * TITLEFULL - foo (bar)
    # Every segment before the author belongs to the title, so take the
    # whole range 0 .. $len-2 (a two-element slice would drop the middle).
    my $TITLEFULL = ($len >= 2) ? join(":", @parts[ 0 .. $len - 2 ]) : $parts[0];
    $TITLEFULL = _trim($TITLEFULL);
    $info{"TITLEFULL"} = $TITLEFULL;

    # * AUTHORNAMELAST - Author's last name
    # last whitespace-separated word, then the last dot-separated piece of
    # that word (so "J.Smith" gives "Smith")
    my $lastname = '';
    my @words = split(/\s+/, $AUTHOR);
    if (@words) {
        my @pieces = split(/\./, $words[-1]);
        $lastname = $pieces[-1] if @pieces;
    }
    $info{"AUTHORNAMELAST"} = $lastname;

    # * AUTHORNAMEOTHER - Author's other names, including co-authors
    $info{"AUTHORNAMEOTHER"} = substr($AUTHOR, 0, length($AUTHOR) - length($lastname));

    # * TITLEEXTRA - bar, hopefully series name and number
    # * TITLE - foo
    # Both are taken from the full title in a single match: the text before
    # the last "(" and the text inside the parentheses.
    if ($TITLEFULL =~ /^(.*)\((.*)\)/) {
        my ($before, $inside) = ($1, $2);
        $info{"TITLEEXTRA"} = $inside;
        $info{"TITLE"}      = _trim($before);
    }
    else {
        $info{"TITLE"} = $TITLEFULL;
    }

    return join("\t", map { $_ . "=" . $info{$_} } sort keys(%info));
}

# Main script: sanitise the ISBN given on the command line, look it up and
# print the result line.
{
    my $code = $ARGV[0];
    if ($code) {
        $code =~ s/[^0-9xX]//g;    # keep only ISBN characters
    }

    my @ret = ();
    if ($code and length($code)) {
        @ret = lookup($code);
    }

    # lookup returns four values on success, a single RESULT=NULLn otherwise
    if (@ret >= 4) {
        print makestr($ret[0], $ret[1], $ret[2], $ret[3]);
    }
    else {
        print "RESULT=NULL3";
    }
}

# * ISBN - defaults to "AUTHOR-TITLE-FORMAT-PUBDATE" if no ISBN
# * AUTHOR - full name in order supplied by source
# * AUTHORNAMELAST - Author's last name
# * AUTHORNAMEOTHER - Author's other names, including co-authors
# * SOURCENAME
# * SOURCEURL
# * TITLEFULL - foo (bar)
# * TITLE - foo
# * TITLEEXTRA - bar, hopefully series name and number
# * DATEPUB - listed publication date
# * FORMAT - Hardback, Paperback, Trade, Manga, Graphic Novel, Other
# * PRICE
# * PUBLISHER
# * DIMENSIONS
# * SOURCERATING - if the info source rated it