From 7670d75be84c73425b39fce0e1768dbf33e5299d Mon Sep 17 00:00:00 2001 From: Ian Jackson Date: Thu, 10 Sep 2020 00:54:14 +0100 Subject: [PATCH] media-scraper: wip Signed-off-by: Ian Jackson --- library/wikimedia.toml | 9 +++++++-- media-scraper | 16 +++++++++++++--- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/library/wikimedia.toml b/library/wikimedia.toml index af29d138..97a27185 100644 --- a/library/wikimedia.toml +++ b/library/wikimedia.toml @@ -18,5 +18,10 @@ blt45 a white bishop method = "wikimedia" licences = [ "Cc-by-sa-3.0", "GFDL|migration=relicense" ] -url_prefix = "https://commons.wikimedia.org/wiki/File:Chess_" -url_suffix = ".svg?action=raw" +url_prefix = "https://commons.wikimedia.org/wiki/File:" +filename_prefix = "Chess_" +filename_suffix = ".svg" +url_suffix = "?action=raw" +data_url_prefix = "https://upload.wikimedia.org/wikipedia/commons/" +data_url_hashprefix = true +data_url_suffix = "" diff --git a/media-scraper b/media-scraper index 6642cd09..cbed0ee3 100755 --- a/media-scraper +++ b/media-scraper @@ -6,6 +6,7 @@ use IO::Handle; use TOML::Parser; use Data::Dumper; use Time::HiRes; +use Digest::MD5 qw(md5_hex); our $max_rate = 2; # per second @@ -39,12 +40,13 @@ sub method_wikimedia ($$) { print DEBUG "METHOD $methname...\n"; return sub { my ($filespec, $base) = @_; - my $url = $cfg->{url_prefix}.$filespec.$cfg->{url_suffix}; + my $filename = $cfg->{filename_prefix}.$filespec.$cfg->{filename_suffix}; + print DEBUG "file $filespec $filename "; + my $url = $cfg->{url_prefix}.$filename.$cfg->{url_suffix}; my $wt = "$base.wikitext"; my $datalog = new IO::File "$base.download-log", '>>' or die $!; print $datalog "\n" or die $!; run_curl $datalog, $wt, $url; - print DEBUG "file $wt "; open WT, "$wt" or die $!; my (@lics) = @{ $cfg->{licences} }; s/\W/\\$&/g foreach @lics; @@ -62,7 +64,15 @@ sub method_wikimedia ($$) { if (!$ok) { die "\nfile $wt from $url no appropriate licence $lic1_re"; } -# my $data_url = + my $data_url = $cfg->{data_url_prefix}; + if ($cfg->{data_url_hashprefix}) { + # https://www.mediawiki.org/wiki/Special:MyLanguage/Manual:$wgHashedUploadDirectory + md5_hex($filename) =~ m{^((.).)} or die; + $data_url .= "$2/$1/"; + } + $data_url .= $filename.$cfg->{data_url_suffix}; + my $ups = "$base.upstream.svg"; + run_curl $datalog, $ups, $data_url; close $datalog or die $!; }; } -- 2.30.2