#! /usr/bin/perl -w
#
# This is basically Moonshadow's code, with a bit of hacking by Pallando
# TODO: split each book info source into a separate subfunc and customise
# to scrape any of the additional fields available, that are listed
# at the bottom of the file
use strict;
use FileHandle;
require '/home/douglasr/public-cgi/cgi-lib.pl';
# Compute an EAN-style check digit for a string of digits.
#
# The digits are weighted alternately 1, 3, 1, 3, ... counting from the
# leftmost character; the check digit is whatever must be added to the
# weighted sum to reach the next multiple of ten.
#
#   $string - the digits to protect (e.g. the first 12 digits of an EAN-13)
#
# Returns the check digit as a number in the range 0..9.
sub eancheckdigit {
    my ($string) = @_;

    my @digits = split('', $string);
    my $total  = 0;
    for my $pos (0 .. $#digits) {
        # even offsets carry weight 1, odd offsets carry weight 3
        $total += $digits[$pos] * ($pos % 2 ? 3 : 1);
    }

    my $remainder = $total % 10;
    return $remainder ? 10 - $remainder : 0;
}
# Compute the ISBN-10 check character for a string of digits.
#
# The leftmost digit is weighted 10, the next 9, and so on; the check value
# is the amount that brings the weighted sum up to a multiple of eleven.
#
#   $string - the digits to protect (the first 9 digits of an ISBN-10)
#
# Returns a number 0..9, or the letter 'X' when the check value is ten.
sub isbncheckdigit {
    my ($string) = @_;

    my @digits = split('', $string);
    my $total  = 0;
    for my $pos (0 .. $#digits) {
        $total += $digits[$pos] * (10 - $pos);
    }

    my $check = (11 - ($total % 11)) % 11;
    return $check == 10 ? 'X' : $check;
}
# Insert hyphens into a bare ISBN-10: GROUP-PUBLISHER-ITEM-CHECK.
#
#   $isbn - ISBN without separators (digits, optionally a trailing X)
#
# Returns the hyphenated ISBN.  If the country group is not present in the
# table below, the ISBN is returned unchanged rather than guessing.
#
# How the tables work:
#   * @country_group_partition holds the lowest group number of each group
#     length (1, 2, 3, 4 and 5 digits).  A bound applies when the leading
#     digits of the ISBN are >= that bound, compared as strings.
#   * %country_group_map maps a group to the lower bounds of its publisher
#     prefix ranges, in ascending *string* order.  The length of a bound is
#     the length of the publisher prefix from that bound up to the next one;
#     the first (quoted, zero-padded) entry is the default range.
#     NOTE(review): bounds that are much longer than their neighbours (e.g.
#     40000000 under group 2) presumably mark unassigned ranges - confirm
#     against the source table.
#
# All bounds are inclusive: an ISBN whose digits are exactly equal to a
# bound belongs to the range that starts at that bound.
sub isbnhyphenate {
    my ($isbn) = @_;
    # source: http://usin.org/software/servers/ISBN-ISSN.phps
    # Four-digit groups start at 9900 (the table itself lists 9952 upwards,
    # which a bound of 9960 would make unreachable).
    my @country_group_partition = ( 0, 80, 950, 9900, 99900 );
    my %country_group_map = (
        0 => [ '00',200,7000,85000,900000,9500000 ],
        1 => [ '00000000',55000,869800,9999900 ],
        2 => [ '00',200,40000000,500,7000,84000,900000,9500000 ],
        3 => [ '00',200,7000,85000,900000,9500000 ],
        4 => ['00',200,7000,85000,900000,9500000],
        5 => ['00',200,7000,85000,900000,9500000],
        7 => ['00',100,5000,80000,900000],
        80 => ['00',200,7000,85000,900000],
        81 => ['00',200,7000,85000,900000],
        82 => ['00',200,7000,90000,990000],
        83 => ['00',200,7000,85000,900000],
        84 => ['00',200,7000,85000,900000,95000,9700],
        85 => ['00',200,7000,85000,900000],
        86 => ['00',300,7000,80000,900000],
        87 => ['00',400,7000,85000,970000],
        88 => ['00',200,7000,85000,900000],
        89 => ['00',300,7000,85000,950000],
        90 => ['00',200,5000,70000,800000,9000000],
        91 => ['0',20,500,6500000,7000,8000000,85000,9500000,970000],
        92 => ['0',60,800,9000],
        93 => ['0000000'],
        950 => ['00',500,9000,99000],
        951 => ['0',20,550,8900,95000],
        952 => ['00',200,5000,89,9500,99000],
        953 => ['0',10,150,6000,96000],
        954 => ['00',400,8000,90000],
        955 => ['0',20,550,800000,9000,95000],
        956 => ['00',200,7000],
        957 => ['00',440,8500,97000],
        958 => ['0',600,9000,95000],
        959 => ['00',200,7000],
        960 => ['00',200,7000,85000],
        961 => ['00',200,6000,90000],
        962 => ['00',200,7000,85000],
        963 => ['00',200,7000,85000],
        964 => ['00',300,5500,90000],
        965 => ['00',200,7000,90000],
        966 => ['00',500,7000,90000],
        967 => ['0',60,900,9900,99900],
        968 => ['000000',10,400,500000,6000,800,900000],
        969 => ['0',20,400,8000],
        970 => ['00',600,9000,91000],
        971 => ['00',500,8500,91000],
        972 => ['0',20,550,8000,95000],
        973 => ['0',20,550,9000,95000],
        974 => ['00',200,7000,85000,900000],
        975 => ['00',300,6000,92000,980000],
        976 => ['0',40,600,8000,95000],
        977 => ['00',200,5000,70000],
        978 => ['000',2000,30000],
        979 => ['0',20,300000,400,700000,8000,95000],
        980 => ['00',200,6000],
        981 => ['00',200,3000],
        982 => ['00',100,500000],
        983 => ['000',2000,300000,50,800,9000,99000],
        984 => ['00',400,8000,90000],
        985 => ['00',400,6000,90000],
        986 => ['000000'],
        987 => ['00',500,9000,99000],
        9952 => ['00000'],
        9953 => ['0',20,9000],
        9954 => ['00',8000],
        9955 => ['00',400],
        9956 => ['00000'],
        9957 => ['00',8000],
        9958 => ['0',10,500,7000,9000],
        9959 => ['00'],
        9960 => ['00',600,9000],
        9961 => ['0',50,800,9500],
        9962 => ['00000'],
        9963 => ['0',30,550,7500],
        9964 => ['0',70,950],
        9965 => ['00',400,9000],
        9966 => ['00',70000,800,9600],
        9967 => ['00000'],
        9968 => ['0',10,700,9700],
        9970 => ['00',400,9000],
        9971 => ['0',60,900,9900],
        9972 => ['0',40,600,9000],
        9973 => ['0',10,700,9700],
        9974 => ['0',30,550,7500],
        9975 => ['0',50,900,9500],
        9976 => ['0',60,900,99000,9990],
        9977 => ['00',900,9900],
        9978 => ['00',950,9900],
        9979 => ['0',50,800,9000],
        9980 => ['0',40,900,9900],
        9981 => ['0',20,800,9500],
        9982 => ['00',40000,800,9900],
        9983 => ['00',500,80,950,9900],
        9984 => ['00',500,9000],
        9985 => ['0',50,800,9000],
        9986 => ['00',400,9000],
        9987 => ['00',400,8800],
        9988 => ['0',30,550,7500],
        9989 => ['0',30,600,9700],
        99901 => ['00'],
        99903 => ['0',20,900],
        99904 => ['0',60,900],
        99905 => ['0',60,900],
        99906 => ['0',60,900],
        99908 => ['0',10,900],
        99909 => ['0',40,950],
        99910 => ['0000'],
        99911 => ['00',600],
        99912 => ['0',60,900],
        99913 => ['0',30,600],
        99914 => ['0',50,900],
        99915 => ['0',50,800],
        99916 => ['0',30,700],
        99917 => ['0',30],
        99918 => ['0',40,900],
        99919 => ['0',40,900],
        99920 => ['0',50,900],
        99921 => ['0',20,700],
        99922 => ['0',50],
        99923 => ['0',20,800],
        99924 => ['0',30],
        99925 => ['0',40,800],
        99926 => ['0000',600],
        99927 => ['0',30,600],
        99928 => ['0',50,800],
        99929 => ['0000'],
        99930 => ['0',50,800],
        99931 => ['0000'],
        99932 => ['0',10],
        99933 => ['00',300],
        99934 => ['0'],
        99935 => ['0000'],
        99936 => ['0000'],
        99937 => ['0',20]
    );

    # determine country group: keep lengthening the group while the ISBN's
    # leading digits are at or above the next partition bound
    my $group = substr($isbn, 0, length($country_group_partition[0]));
    foreach my $bound (@country_group_partition[1 .. $#country_group_partition]) {
        my $candidate = substr($isbn, 0, length($bound));
        last if ($bound cmp $candidate) > 0;
        $group = $candidate;
    }

    # Unknown group: leave the ISBN alone.  (Testing with exists also avoids
    # autovivifying an empty entry in the table.)
    return $isbn unless exists $country_group_map{$group};

    # determine publisher prefix, using the same inclusive-bound walk
    my @ranges = @{ $country_group_map{$group} };
    my $prefix = substr($isbn, length($group), length($ranges[0]));
    foreach my $bound (@ranges[1 .. $#ranges]) {
        my $candidate = substr($isbn, length($group), length($bound));
        last if ($bound cmp $candidate) > 0;
        $prefix = $candidate;
    }

    # whatever is left, minus the final check character, is the item number
    my $itemstart  = length($group) + length($prefix);
    my $itemlength = length($isbn) - $itemstart - 1;
    return join('-',
        $group,
        $prefix,
        substr($isbn, $itemstart, $itemlength),
        substr($isbn, $itemstart + $itemlength, 1));
}
# Fetch a URL and return the response body.
#
#   $url - absolute URL to GET
#
# The request is first made with LWP::UserAgent (30 second timeout, proxy
# settings taken from the environment).  If that does not succeed, the
# external helper /usr/local/bin/GET is run with the URL as its only
# argument and its standard output is returned instead.
#
# Returns the body as a string; always defined, '' when nothing could be
# fetched.
sub retrieve {
    my ($url) = @_;

    require LWP::UserAgent;
    my $ua = LWP::UserAgent->new( timeout => 30, env_proxy => 1, agent => 'Mozilla/4.0; compatible; MSIE 6.0; perl bot', keep_alive => 1 );
    my $request = HTTP::Request->new(GET => $url);
    $request->protocol('HTTP/1.1');
    my $response = $ua->request($request);
    return $response->content if $response->is_success;

    # Fallback.  The list form of a pipe open runs the helper directly,
    # without a shell, so characters such as & ; ? in the URL need no
    # escaping and cannot be interpreted as shell syntax.
    my $content = '';
    if (open(my $pipe, '-|', '/usr/local/bin/GET', $url)) {
        local $/;    # slurp the whole output in one read
        my $output = <$pipe>;
        $content = $output if defined $output;
        close($pipe);
    }
    return $content;
}
# Decide whether a scraped "title" is really a site's error / placeholder
# text rather than a book title.
#
#   $code - the text captured from a results page
#
# Returns 1 when the whole string (ignoring surrounding whitespace and
# letter case) is one of the known placeholder texts, 0 otherwise.
sub IsErrorCode {
    my ($code) = @_;

    my $is_placeholder =
           $code =~ /^\s*Not found\.?\s*$/gi
        || $code =~ /^\s*Book Search\s*$/gi
        || $code =~ /^\s*Advanced Book Search\s*$/gi
        || $code =~ /^\s*\*\*\* Not found\. \*\*\*?\s*$/gi
        || $code =~ /^\s*No Title Found\s*$/gi
        || $code =~ /^\s*Books.+Used.+Out of Print.+DVDs.+Toys\s*$/gi
        || $code =~ /^\s*by\s*$/gi;

    return $is_placeholder ? 1 : 0;
}
# Convert a 978-prefixed 13-digit code to the equivalent ISBN-10.
#
#   $isbn - candidate code, digits only
#
# When the input contains "978" followed by ten digits, the nine digits after
# the prefix are kept and a freshly computed ISBN-10 check character is
# appended.  Any other input is returned untouched.
sub forceISBN {
    my ($isbn) = @_;

    # todo: compare eancheckdigit ($1.$2) to $3, bail if not equal
    return $isbn unless $isbn =~ /(978)([0-9]{9}?)([0-9])/;

    my $body = $2;
    return $body . isbncheckdigit($body);
}
# Look an ISBN up on a series of book sites and return the first usable hit.
#
#   $isbn - ISBN-10, or a 978-prefixed 13-digit code (converted by forceISBN)
#
# Returns, on success, the four-element list
#     ( scraped title text, source name, URL that was fetched, ISBN-10 )
# and on failure a single string: "RESULT=NULL1" when the input is not a
# well-formed ISBN-10, "RESULT=NULL2" when no source produced a title.
#
# Sources are tried in the order listed.  Each entry names the source, the
# URL to fetch (a code ref when the URL is only worth building once the
# source is actually reached) and the pattern whose first capture group is
# the title text.  A hit is rejected when IsErrorCode() recognises it as a
# site's placeholder text.
sub lookup {
    my ($isbn) = @_;

    $isbn = forceISBN($isbn);
    $isbn =~ /^([0-9]{9}?)([0-9xX])$/ or return "RESULT=NULL1";
    # todo: compare isbncheckdigit($1) to $2, bail if not equal
    $isbn = $1.$2;

    my @sources = (
        {
            name    => "Nu",
            url     => 'http://www.amazon.co.uk/exec/obidos/ASIN/' . $isbn,
            # NOTE(review): only the tail "]+>(.+?)<\/font>" of this pattern
            # survived in the file (and the original if() was unbalanced).
            # The "<font[^>" prefix is reconstructed from the closing
            # </font> - confirm against a live page.
            pattern => qr/<font[^>]+>(.+?)<\/font>/,
        },
        {
            name    => "Ozon",
            url     => sub { 'http://www.ozon.ru/?context=advsearch_book&isbn=' . isbnhyphenate($isbn) },
            pattern => qr/class="big1">(.+?)<\//,
        },
        {
            name    => "Amazon",
            url     => 'http://www.amazon.com/exec/obidos/ASIN/' . $isbn,
            # TODO: the pattern for this source has been lost - the file
            # held an empty //i, which in Perl re-runs the last successful
            # pattern instead of matching anything meaningful.  The source
            # is skipped until a real pattern is restored.
            pattern => undef,
        },
        # Disabled in the original as well (its pattern was also empty):
        #   http://www.amazon.co.jp/exec/obidos/ASIN/<isbn>
        {
            name    => "Biblio",
            url     => 'http://www.biblio.com/isbn/' . $isbn . '.html',
            pattern => qr/Biblio: \(ISBN: .+?\) (.+?)<\//,
        },
        {
            name    => "Barnes and Noble",
            url     => 'http://search.barnesandnoble.com/booksearch/isbninquiry.asp?ISBN=' . $isbn,
            # NOTE(review): the "\ \;\&\;\ \;" run looks like HTML entities
            # whose names were stripped - verify against a live page.
            pattern => qr/Barnes\ \;\&\;\ \;Noble.com - (.+?)<\//,
        },
        {
            name    => "Link Baton",
            url     => 'http://my.linkbaton.com/isbn/' . $isbn,
            pattern => qr/content='ISBN, book, author, (.+?, )[0-9xX]+, (.+?)'/,
        },
        {
            name    => "Google",
            url     => 'http://www.google.co.uk/search?btnI=1&q=inurl:ffbooks+' . $isbn,
            # NOTE(review): probably lost a leading tag anchor - verify.
            pattern => qr/(.+?by.+?)<\//,
        },
        {
            name    => "Libri Alice",
            url     => 'http://wuz.librialice.it/scheda.aspx?isbn=' . $isbn,
            # NOTE(review): probably lost a leading tag anchor - verify.
            pattern => qr/(.+? - .+?) *<\//,
        },
        # Disabled in the original (its pattern is no longer recoverable):
        #   http://www.centraldellibro.com/web/ES/APL/resultados_busqueda.asp?cad_busq=<isbn>
    );

    foreach my $source (@sources) {
        my $pattern = $source->{pattern};
        next unless defined $pattern;    # no fetch for sources we cannot parse

        my $url = $source->{url};
        $url = $url->() if ref($url) eq 'CODE';

        my $html = retrieve($url);
        next unless defined $html;

        if ($html =~ $pattern) {
            # copy the capture before calling anything else that matches
            my $title = $1;
            if (!IsErrorCode($title)) { return $title, $source->{name}, $url, $isbn; }
        }
    }

    return "RESULT=NULL2";
}
# Turn a scraped "Title: Author: Category" string into a tab-separated
# KEY=value record.
#
#   $_[0] - raw text, colon-separated; the second-to-last field is taken as
#           the author and everything before it as the title
#   $_[1] - SOURCENAME value
#   $_[2] - SOURCEURL value
#   $_[3] - ISBN value
#
# Returns one string of "KEY=value" pairs, sorted by key and joined with
# tabs.  Keys: AUTHOR, AUTHORNAMELAST, AUTHORNAMEOTHER, ISBN, SOURCENAME,
# SOURCEURL, TITLE, TITLEFULL and - only when the title contains a
# parenthesised part - TITLEEXTRA.
sub makestr {
    my @args = @_;
    my %info;
    my $raw = defined $args[0] ? $args[0] : '';
    $info{"SOURCENAME"} = $args[1];
    $info{"SOURCEURL"}  = $args[2];
    $info{"ISBN"}       = $args[3];

    my @parts = split(':', $raw);
    my $len = $#parts;

    # * AUTHOR - full name in order supplied by source
    # (with a single field, index -1 wraps round to that same field)
    my $AUTHOR = @parts ? $parts[$len - 1] : '';
    $AUTHOR =~ s/^\s+//;
    $AUTHOR =~ s/\s+$//;
    $info{"AUTHOR"} = $AUTHOR;

    # * TITLEFULL - foo (bar)
    # Every field before the author belongs to the title, so a title that
    # itself contains colons is put back together from the whole range.
    my $TITLEFULL;
    if ($len > 2) {
        $TITLEFULL = join(":", @parts[0 .. $len - 2]);
    } else {
        $TITLEFULL = @parts ? $parts[0] : '';
    }
    $TITLEFULL =~ s/^\s+//;
    $TITLEFULL =~ s/\s+$//;
    $info{"TITLEFULL"} = $TITLEFULL;

    # * AUTHORNAMELAST - Author's last name: the last whitespace-separated
    #   word, and within it the text after the last full stop
    my @words = split(/\s+/, $AUTHOR);
    my $last_word = @words ? $words[-1] : '';
    my @pieces = split(/\./, $last_word);
    my $last_name = @pieces ? $pieces[-1] : '';
    $info{"AUTHORNAMELAST"} = $last_name;

    # * AUTHORNAMEOTHER - Author's other names, including co-authors:
    #   everything in AUTHOR that precedes the last name
    $info{"AUTHORNAMEOTHER"} = substr($AUTHOR, 0, length($AUTHOR) - length($last_name));

    # * TITLEEXTRA - bar, hopefully series name and number
    # * TITLE - foo
    # Both are taken from the same match so that TITLE is the text in front
    # of the bracket, not a copy of the bracketed text.
    if ($TITLEFULL =~ /^(.*)\((.*)\)/) {
        my ($main, $extra) = ($1, $2);
        $main =~ s/\s+$//;
        $info{"TITLEEXTRA"} = $extra;
        $info{"TITLE"} = $main;
    } else {
        $info{"TITLE"} = $info{"TITLEFULL"};
    }

    return join("\t",
        map { $_ . "=" . (defined $info{$_} ? $info{$_} : '') } sort keys(%info));
}
# Script entry point: take the code from the first command-line argument,
# look it up and print a single result record (no trailing newline).
# The constant condition is a manual on/off switch for the lookup.
if (1) {
    use CGI;

    # keep only the characters that can appear in an ISBN
    my $query = $ARGV[0];
    $query =~ s/[^0-9xX]//g if $query;

    my @found = ($query && length($query)) ? lookup($query) : ();

    # lookup() returns a single status string on failure, so only a list of
    # at least three elements counts as a hit
    if (scalar(@found) > 2) {
        print makestr( $found[0], $found[1], $found[2], $found[3] );
    }
    else {
        print "RESULT=NULL3";
    }
}
else {
    print "RESULT=NULL4";
}
# * ISBN - defaults to "AUTHOR-TITLE-FORMAT-PUBDATE" if no ISBN
# * AUTHOR - full name in order supplied by source
# * AUTHORNAMELAST - Author's last name
# * AUTHORNAMEOTHER - Author's other names, including co-authors
# * SOURCENAME
# * SOURCEURL
# * TITLEFULL - foo (bar)
# * TITLE - foo
# * TITLEEXTRA - bar, hopefully series name and number
# * DATEPUB - listed publication date
# * FORMAT - Hardback, Paperback, Trade, Manga, Graphic Novel, Other
# * PRICE
# * PUBLISHER
# * DIMENSIONS
# * SOURCERATING - if the info source rated it