media-scraper: Licence handling

author Ian Jackson <ijackson@chiark.greenend.org.uk>

Sat, 12 Sep 2020 12:46:53 +0000 (13:46 +0100)

committer Ian Jackson <ijackson@chiark.greenend.org.uk>

Sat, 12 Sep 2020 12:48:03 +0000 (13:48 +0100)
author Ian Jackson <ijackson@chiark.greenend.org.uk>
Sat, 12 Sep 2020 12:46:53 +0000 (13:46 +0100)
committer Ian Jackson <ijackson@chiark.greenend.org.uk>
Sat, 12 Sep 2020 12:48:03 +0000 (13:48 +0100)
diff --git a/LICENCE b/LICENCE

index fcdc3181b7348656565a926c39860a68eea8efef..261ab96136a7c09898b2e4a1ebe4e5e10b11b96b 100644 (file)
--- a/LICENCE
+++ b/LICENCE
@@ -30,8 +30,27 @@ Attribution-Share Alike 3.0 Unported License, or the Creative Commons
  Attribution-ShareAlike 4.0 International Licence.
  There is NO WARRANTY.
  
-For each file in the shape library, there is a corresponding file
-".licence" giving licence, provenance and authorship information.
+Many of these files were downloaded from public sources using the
+./media-scraper script; if they are to be edited, it would be best to
+edit them at the public source.
+
+For each file in the shape library, there is either a corresponding
+file ".licence" (in the git source tree), or a file LICENCE in the
+relevant directory, giving licence, provenance and authorship
+information.
+
+The individual shapes must be sent by the server to the client as part
+of HTML/XML documents and as part of the client/server protocol; they
+become part of the dynamic HTML in the page in the client web browser.
+
+These versions of the files have been processed by usvg and are saved
+as ".usvg" in the built version of the overall git tree.  The SPDX
+licence identifier and the provenance of the file are recorded in an
+XML comment like this: <!-- $SPDX $LIBNAME/$BASENAME --> where $SPDX is
+the SPDX licence identifier for the primary licence we are using; and
+$LIBNAME/$BASENAME is relative to the library/ directory in the
+source tree, where the full authorship and licence information, and
+source URL etc., can be found.
  
  You can find a copy of the actual licences in the files
  CC-BY-SA-3.0.txt and CC-BY-SA-4.0.txt.  If not, see
diff --git a/library/wikimedia.toml b/library/wikimedia.toml

index 97a271851d85b4340f2e7d4afa0649c5a0524a9a..6392dbc66fcaad0531ea7ee370e5499b5642da21 100644 (file)
--- a/library/wikimedia.toml
+++ b/library/wikimedia.toml
@@ -16,11 +16,13 @@ blt45       a white bishop
  """
  [chess.scraper]
  
+spdx = "CC-BY-SA-3.0"
+filename_prefix = "Chess_"
+filename_suffix = ".svg"
+
  method = "wikimedia"
  licences = [ "Cc-by-sa-3.0", "GFDL|migration=relicense" ]
  url_prefix = "https://commons.wikimedia.org/wiki/File:"
-filename_prefix = "Chess_"
-filename_suffix = ".svg"
  url_suffix = "?action=raw"
  data_url_prefix = "https://upload.wikimedia.org/wikipedia/commons/"
  data_url_hashprefix = true
diff --git a/media-scraper b/media-scraper

index f25f2faace1be6f1dbf183ee180c6a3c85e0242c..8ba5aeb3f1ce744f26f79e7468a304f189998d02 100755 (executable)
--- a/media-scraper
+++ b/media-scraper
@@ -21,8 +21,6 @@ autoflush DEBUG 1;
  
  sub run_curl {
    my ($datalog, $output, $url, @xopts) = @_;
-  return if stat $output;
-  die "$output $!" unless $!==ENOENT;
    my @curl = (qw(curl -Ssf -L --proto-redir -all), @xopts);
    push @curl, '-o', "$output.tmp", $url;
    our $last_curl;
@@ -31,7 +29,7 @@ sub run_curl {
    my $delay = 1./$max_rate - ($now - $last_curl);
    Time::HiRes::sleep $delay if $delay > 0;
    $last_curl = $now;
-  print DEBUG "+ @curl\n";
+#  print DEBUG "+ @curl\n";
    $!=$?=0; my $r = system @curl; die "curl failed ($? $!): @curl" if $r;
    my $logtime = strftime "%F %T UTC", gmtime time;
    print $datalog "$logtime: downloaded into $output from $url\n"
@@ -39,13 +37,12 @@ sub run_curl {
    rename "$output.tmp", "$output" or die "install $output: $!";
  }
  
-sub method_wikimedia ($$) {
+sub method_wikimedia ($$$) {
    my ($cfg, $methname) = @_;
    print DEBUG "METHOD $methname...\n";
    return sub {
      my ($filespec, $base) = @_;
      my $filename = $cfg->{filename_prefix}.$filespec.$cfg->{filename_suffix};
-    print DEBUG "file $filespec $filename ";
      my $url = $cfg->{url_prefix}.$filename.$cfg->{url_suffix};
      my $wt = "$base.wikitext";
      my $datalog = new IO::File "$base.download-log", '>>' or die $!;
@@ -75,9 +72,15 @@ sub method_wikimedia ($$) {
        $data_url .= "$2/$1/";
      }
      $data_url .= $filename.$cfg->{data_url_suffix};
-    my $ups = "$base.upstream.svg";
+    my $ups = "$base.svg";
      run_curl $datalog, $ups, $data_url;
      close $datalog or die $!;
+    return <<END;
+This file was downloaded from a wikimedia/mediawiki installation.
+See .download-log for the original URL and download timestamp.
+The wikitext of the File: page on the wiki is in .wikitext, and
+contains the authorship and derivation information.
+END
    };
  }
  
@@ -101,6 +104,19 @@ foreach my $sect (values %$libinfo) {
      m/^\S+/;
      my $filespec = $&;
      my $base = "$basename/$filespec";
-    $method_fn->($filespec, $base);
+    my $licpath = "$base.licence";
+    print DEBUG "file $base ";
+    if (stat $licpath) {
+      print DEBUG "already.\n";
+      next;
+    }
+    die "$base $!" unless $!==ENOENT;
+    my $lictext = $method_fn->($filespec, $base);
+    $lictext = "SPDX-License-Identifier: $scraper->{spdx}\n\n".$lictext;
+    my $licfile = new IO::File "$licpath.tmp", '>' or die $!;
+    print $licfile $lictext or die $!;
+    close $licfile or die $!;
+    rename "$licpath.tmp", "$licpath" or die $!;
+    print DEBUG "done.\n";
    }
  }
author	Ian Jackson <ijackson@chiark.greenend.org.uk>
	Sat, 12 Sep 2020 12:46:53 +0000 (13:46 +0100)
committer	Ian Jackson <ijackson@chiark.greenend.org.uk>
	Sat, 12 Sep 2020 12:48:03 +0000 (13:48 +0100)
LICENCE		patch | blob | history
library/wikimedia.toml		patch | blob | history
media-scraper		patch | blob | history