#!/usr/bin/perl -w use strict; use POSIX; use IO::Handle; use Carp; use TOML::Parser; use Data::Dumper; use Time::HiRes; use Digest::MD5 qw(md5_hex); use File::Compare; our $max_rate = 2; # per second # todo: allow to read size details out of svg (maybe in daemon-otter?) # todo: allow scraper method none to handle item vs filename mismatch #print Dumper($libinfo); open DEBUG, ">&STDERR" or die $!; autoflush DEBUG 1; sub run_curl { my ($datalog, $output, $url, @xopts) = @_; my @curl = (qw(curl -Ssf -L --proto-redir -all), @xopts); push @curl, '-o', "$output.tmp", $url; our $last_curl; $last_curl //= 0.; my $now = Time::HiRes::time; my $delay = 1./$max_rate - ($now - $last_curl); Time::HiRes::sleep $delay if $delay > 0; $last_curl = $now; # print DEBUG "+ @curl\n"; $!=$?=0; my $r = system @curl; die "curl failed ($? $!): @curl" if $r; my $logtime = strftime "%F %T UTC", gmtime time; print $datalog "$logtime: downloaded into $output from $url\n" or die $!; rename "$output.tmp", "$output" or die "install $output: $!"; } sub cfg_lookup_1 ($@) { my ($dict, @keys) = @_; #print DEBUG "cfg_lookup_1 ".Dumper($dict, \@keys); foreach my $k (@keys) { confess unless ref $dict eq 'HASH'; $dict = $dict->{$k}; return undef if !defined $dict; } #print DEBUG "cfg_lookup_1 (@keys) => $dict\n"; return $dict; } sub mk_cfg_lookup ($$@) { my ($groups, $thisgroup, @keysprefix) = @_; #print DEBUG "mk_cfg_lookup >@keysprefix< ".Dumper($thisgroup); return sub { #print DEBUG "from mk_cfg_lookup >@keysprefix< >@_<\n"; my $cgroup = $thisgroup; for (;;) { my $got = cfg_lookup_1($cgroup, @keysprefix, @_); return $got if defined $got; my $inherit = cfg_lookup_1($cgroup, qw(inherit)); return undef unless $inherit; $cgroup = $groups->{$inherit}; confess "$inherit".Dumper($groups,$inherit)."?" unless $cgroup; } } } sub cfg_affixes ($$$) { my ($cfg, $keybase, $middle) = @_; return ($cfg->("${keybase}_prefix") // '') .$middle .($cfg->("${keybase}_suffix") // ''); } sub method_none { return sub { } } sub methodlic_none { undef } sub methodlic_wikimedia ($) { my ($scraper) = @_; return <('site-title') } They are all available under at least, and distributed here under, ${ \ $scraper->('spdx') } as well as possibly other licences. There is NO WARRANTY. See .download-log for the original URL and download timestamp. The wikitext of the File: page on the wiki is in .wikitext, and contains the authorship and derivation information, as well as information about any alternative licence terms. [ This LICENCE file was generated by media-scraper and should not be manually edited ] END } sub method_wikimedia ($$$) { my ($scraper, $methname) = @_; #print DEBUG "METHOD $methname...\n"; return sub { my ($lbase, $ldest, $rstem) = @_; my $rfilename = cfg_affixes $scraper, 'filename', $rstem; my $url = cfg_affixes $scraper, 'url', $rfilename; my $wt = "$lbase.wikitext"; #print DEBUG "rfilename=$rfilename url=$url .\n"; my $datalog = new IO::File "$lbase.download-log", '>>' or die $!; print $datalog "\n" or die $!; run_curl $datalog, $wt, $url; open WT, "$wt" or die $!; my (@lics) = @{ $scraper->('licences') }; s/\W/\\$&/g foreach @lics; my $lic1_re = '(?:'.(join '|', @lics).')'; my $ok; while () { s/\s+$//; if (m{^ \{\{ ($lic1_re) \}\} $}xi || m{^ \{\{ self\| (?:[^{}]*\|)? ($lic1_re) (?:\|[^{}]*)? \}\} $}xi) { print "licence=$1 "; $ok = 1; last; } } if (!$ok) { die "\nfile $wt from $url no appropriate licence $lic1_re"; } my $hash_prefix = ''; if ($scraper->('data_url_hashprefix')) { # https://www.mediawiki.org/wiki/Special:MyLanguage/Manual:$wgHashedUploadDirectory md5_hex($rfilename) =~ m{^((.).)} or die; $hash_prefix .= "$2/$1/"; } my $data_url = cfg_affixes $scraper, 'data_url', $hash_prefix.$rfilename; run_curl $datalog, $ldest, $data_url; close $datalog or die $!; }; } my $input = $ARGV[0] // die; $input =~ m/\.toml$/ or die "$input ?"; my $basename = $`; mkdir $basename or $!==EEXIST or die "mkdir $basename: $!"; my $parser = TOML::Parser->new(); my $libinfo = $parser->parse_file($input); my $groups = $libinfo->{group}; my $scraper = sub { $libinfo->{scraper}{$_[0]} }; my $method = $scraper->('method'); my $method_fn = ${*::}{"method_$method"}; my $methodlic_fn = ${*::}{"methodlic_$method"}; my $licpath = "$basename/LICENCE"; my $method_lictext = $methodlic_fn->($scraper); if (defined $method_lictext) { my $licfile = new IO::File "$licpath.tmp", '>' or die $!; print $licfile <('spdx') } (applies to the contents of this directory unless otherwise stated) END The download was done by media-scraper, controlled by $input. END close $licfile or die $!; } my $makepath = "$basename/files.make"; my $makefile = new IO::File "$makepath.tmp", '>' or die $!; foreach my $groupname (sort keys %$groups) { my $group_dict = $groups->{$groupname}; my $gcfg = mk_cfg_lookup($groups, $group_dict); my $method_impl = $method_fn->($scraper, $method); foreach (split(/\n/, $gcfg->('files'))) { s/^\s+//; next if m/^\#/ || m/^$/; m/^(\S+)\s+(\S+)/ or die; my $lministem = $1; my $rministem = $2; my $lstem = cfg_affixes $gcfg, 'item', $lministem; my $rstem = cfg_affixes $gcfg, 'stem', $rministem; my $lbase = "$basename/$lstem"; my $lupstream = "$lbase.svg"; my $lprocessed = "$lbase.usvg"; print DEBUG "file $lstem "; print $makefile <($lbase, $lupstream, $rstem); print DEBUG "done.\n"; } } close $makefile or die $!; if (defined($method_lictext)) { my $cmp = compare("$licpath.tmp", $licpath); die if $cmp < 0; if ($cmp) { rename "$licpath.tmp", $licpath or die $!; } else { remove "$licpath.tmp"; } } rename "$makepath.tmp", $makepath or die $!;