contrib/tunefeed.in

   1 #!/usr/bin/perl
   2 $version = q$Id: tunefeed.in 4329 2001-01-14 13:47:52Z rra $;
   3 #
   4 # tunefeed -- Compare active files with a remote site to tune a feed.
   5 #             Copyright 1998 by Russ Allbery <rra@stanford.edu>
   6 #
   7 # This program is free software; you can redistribute it and/or modify it
   8 # under the same terms as Perl itself.
   9
  10 ############################################################################
  11 # Site configuration
  12 ############################################################################
  13
  14 # A list of hierarchies in the Big Eight.
  15 %big8 = map { $_ => 1 } qw(comp humanities misc news rec sci soc talk);
  16
  17 # A list of hierarchies that are considered global and not language
  18 # hierarchies.
  19 %global = map { $_ => 1 } qw(bionet bit biz borland ddn gnu gov ieee info
  20                              linux k12 microsoft netscape tnn vmsnet);
  21
  22 # The pattern matching local-only hierarchies (that we should disregard when
  23 # doing feed matching).
  24 %ignore = map { $_ => 1 } qw(clari control junk);
  25
  26
  27 ############################################################################
  28 # Modules and declarations
  29 ############################################################################
  30
  31 require 5.003;
  32
  33 use Getopt::Long qw(GetOptions);
  34
  35 use strict;
  36 use vars qw(%big8 $days %global %ignore $threshold %traffic $version);
  37
  38
  39 ############################################################################
  40 # Active file hashing and analysis
  41 ############################################################################
  42
  43 # Read in an active file, putting those groups into a hash where the key is
  44 # the name of the group and the value is always 1.  If the optional third
  45 # argument is true, exclude any groups in the hierarchies listed in %local
  46 # and use this active file to store traffic information (in a rather
  47 # simple-minded fashion).
  48 sub hash {
  49     my ($file, $hash, $local) = @_;
  50     open (ACTIVE, $file) or die "$0: cannot open $file: $!\n";
  51     local $_;
  52     while (<ACTIVE>) {
  53         my ($group, $high, $low, $flags) = split;
  54         next if ($flags =~ /^=|^x/);
  55         my $hierarchy = (split (/\./, $group, 2))[0];
  56         next if ($local && $ignore{$hierarchy});
  57         $$hash{$group} = 1;
  58         $traffic{$group} = ($high - $low) / $days if $local;
  59     }
  60     close ACTIVE;
  61 }
  62
  63 # Read in a file that gives traffic statistics.  We assume it's in the form
  64 # group, whitespace, number of articles per day, and we just read it
  65 # directly into the %traffic hash.
  66 sub traffic {
  67     my ($file) = @_;
  68     open (TRAFFIC, $file) or die "$0: cannot open $file: $!\n";
  69     local $_;
  70     while (<TRAFFIC>) {
  71         my ($group, $traffic) = split;
  72         $traffic{$group} = $traffic;
  73     }
  74     close TRAFFIC;
  75 }
  76
  77 # Pull off the first X nodes of a group name.
  78 sub prefix {
  79     my ($group, $count) = @_;
  80     my @group = split (/\./, $group);
  81     splice (@group, $count);
  82     join ('.', @group);
  83 }
  84
  85 # Find the common hierarchical prefix of a list.
  86 sub common {
  87     my (@list) = @_;
  88     my @prefix = split (/\./, shift @list);
  89     local $_;
  90     while (defined ($_ = shift @list)) {
  91         my @group = split /\./;
  92         my $i;
  93         $i++ while ($prefix[$i] && $prefix[$i] eq $group[$i]);
  94         if ($i <= $#prefix) { splice (@prefix, $i) }
  95     }
  96     join ('.', @prefix);
  97 }
  98
  99 # Given two lists, a list of groups that the remote site does have and a
 100 # list of groups that the remote site doesn't have, in a single hierarchy,
 101 # perform a smash.  The object is to find the minimal pattern that expresses
 102 # just the groups they want.  We're also given the common prefix of all the
 103 # groups in the have and exclude lists, and a flag indicating whether we're
 104 # coming in with a positive assumption (all groups sent unless excluded) or
 105 # a negative assumption (no groups sent unless added).
 106 sub smash {
 107     my ($have, $exclude, $top, $positive) = @_;
 108     my (@positive, @negative);
 109     my $level = ($top =~ tr/././) + 1;
 110
 111     # Start with the positive assumption.  We make copies of our @have and
 112     # @exclude arrays since we're going to be needing the virgin ones again
 113     # later for the negative assumption.  If we're coming in with the
 114     # negative assumption, we have to add a wildcarded entry to switch
 115     # assumptions, and we also have to deal with the cases where there is a
 116     # real group at the head of the hierarchy.
 117     my @have = @$have;
 118     my @exclude = @$exclude;
 119     if ($top eq $have[0]) {
 120         shift @have;
 121         push (@positive, "$top*") unless $positive;
 122     } else {
 123         if ($top eq $exclude[0]) {
 124             if ($positive && $traffic{$top} > $threshold) {
 125                 push (@positive, "!$top");
 126             }
 127             shift @exclude;
 128         }
 129         push (@positive, "$top.*") unless $positive;
 130     }
 131
 132     # Now that we've got things started, keep in mind that we're set up so
 133     # that every group will be sent *unless* it's excluded.  So we step
 134     # through the list of exclusions.  The idea here is to pull together all
 135     # of the exclusions with the same prefix (going one level deeper into
 136     # the newsgroup names than we're currently at), and then find all the
 137     # groups with the same prefix that the remote site *does* want.  If
 138     # there aren't any, then we can just exclude that whole prefix provided
 139     # that we're saving enough traffic to make it worthwhile (checked
 140     # against the threshold).  If there are, and if the threshold still
 141     # makes it worthwhile to worry about this, we call this sub recursively
 142     # to compute the best pattern for that prefix.
 143     while (defined ($_ = shift @exclude)) {
 144         my ($prefix) = prefix ($_, $level + 1);
 145         my @drop = ($_);
 146         my @keep;
 147         my $traffic = $traffic{$_};
 148         while ($exclude[0] =~ /^\Q$prefix./) {
 149             $traffic += $traffic{$exclude[0]};
 150             push (@drop, shift @exclude);
 151         }
 152         $prefix = common (@drop);
 153         my $saved = $traffic;
 154         while (@have && $have[0] le $prefix) { shift @have }
 155         while ($have[0] =~ /^\Q$prefix./) {
 156             $traffic += $traffic{$have[0]};
 157             push (@keep, shift @have);
 158         }
 159         next unless $saved > $threshold;
 160         if (@keep) {
 161             $traffic{"$prefix*"} = $traffic;
 162             push (@positive, smash (\@keep, \@drop, $prefix, 1));
 163         } elsif (@drop == 1) {
 164             push (@positive, "!$_");
 165         } elsif ($prefix eq $_) {
 166             push (@positive, "!$prefix*");
 167         } else {
 168             push (@positive, "!$prefix.*");
 169         }
 170     }
 171
 172     # Now we do essentially the same thing, but from the negative
 173     # perspective (adding a wildcard pattern as necessary to make sure that
 174     # we're not sending all groups and then finding the groups we are
 175     # sending and trying to smash them into minimal wildcard patterns).
 176     @have = @$have;
 177     @exclude = @$exclude;
 178     if ($top eq $exclude[0]) {
 179         shift @exclude;
 180         push (@negative, "!$top*") if $positive;
 181     } else {
 182         if ($top eq $have[0]) {
 183             push (@negative, $top) unless $positive;
 184             shift @have;
 185         }
 186         push (@negative, "!$top.*") if $positive;
 187     }
 188
 189     # This again looks pretty much the same as what we do for the positive
 190     # case; the primary difference is that we have to make sure that we send
 191     # them every group that they want, so we still err on the side of
 192     # sending too much, rather than too little.
 193     while (defined ($_ = shift @have)) {
 194         my ($prefix) = prefix ($_, $level + 1);
 195         my @keep = ($_);
 196         my @drop;
 197         my $traffic = $traffic{$_};
 198         while ($have[0] =~ /^\Q$prefix./) {
 199             $traffic += $traffic{$have[0]};
 200             push (@keep, shift @have);
 201         }
 202         $prefix = common (@keep);
 203         while (@exclude && $exclude[0] le $prefix) { shift @exclude }
 204         my $saved = 0;
 205         while ($exclude[0] =~ /^\Q$prefix./) {
 206             $saved += $traffic{$exclude[0]};
 207             push (@drop, shift @exclude);
 208         }
 209         if (@drop && $saved > $threshold) {
 210             $traffic{"$prefix*"} = $traffic + $saved;
 211             push (@negative, smash (\@keep, \@drop, $prefix, 0));
 212         } elsif (@keep == 1) {
 213             push (@negative, $_);
 214         } elsif ($prefix eq $_) {
 215             push (@negative, "$prefix*");
 216         } else {
 217             push (@negative, "$prefix.*");
 218         }
 219     }
 220
 221     # Now that we've built both the positive and negative case, we decide
 222     # which to return.  We want the one that's the most succinct, and if
 223     # both descriptions are equally succinct, we return the negative case on
 224     # the grounds that it's likely to send less of what they don't want.
 225     (@positive < @negative) ? @positive : @negative;
 226 }
 227
 228
 229 ############################################################################
 230 # Output
 231 ############################################################################
 232
 233 # We want to sort Big Eight ahead of alt.* ahead of global non-language
 234 # hierarchies ahead of regionals and language hierarchies.
 235 sub score {
 236     my ($hierarchy) = @_;
 237     if ($big8{$hierarchy})      { return 1 }
 238     elsif ($hierarchy eq 'alt') { return 2 }
 239     elsif ($global{$hierarchy}) { return 3 }
 240     else                        { return 4 }
 241 }
 242
 243 # Our special sort routine for hierarchies.  It calls score to get a
 244 # hierarchy score and sorts on that first.
 245 sub by_hierarchy {
 246     (score $a) <=> (score $b) || $a cmp $b;
 247 }
 248
 249 # Given a reference to a list of patterns, output it in some reasonable
 250 # form.  Currently, this is lines prefixed by a tab, with continuation lines
 251 # like INN likes to have in newsfeeds, 76 column margin, and with a line
 252 # break each time the hierarchy score changes.
 253 sub output {
 254     my ($patterns) = @_;
 255     my ($last, $line);
 256     for (@$patterns) {
 257         my ($hierarchy) = /^!?([^.]+)/;
 258         my $score = score $hierarchy;
 259         $line += 1 + length $_;
 260         if (($last && $score > $last) || $line > 76) {
 261             print ",\\\n\t";
 262             $line = 8 + length $_;
 263         } elsif ($last) {
 264             print ',';
 265         } else {
 266             print "\t";
 267             $line += 8;
 268         }
 269         print;
 270         $last = $score;
 271     }
 272     print "\n";
 273 }
 274
 275
 276 ############################################################################
 277 # Main routine
 278 ############################################################################
 279
 280 # Clean up the name of this program for error messages.
 281 my $fullpath = $0;
 282 $0 =~ s%.*/%%;
 283
 284 # Parse the command line.  Our argument is the path to an active file (we
 285 # tell the difference by seeing if it contains a /).
 286 my ($help, $print_version);
 287 Getopt::Long::config ('bundling');
 288 GetOptions ('help|h'        => \$help,
 289             'days|d=i'      => \$days,
 290             'threshold|t=i' => \$threshold,
 291             'version|v'     => \$print_version) or exit 1;
 292
 293 # Set a default for the minimum threshold traffic required to retain an
 294 # exclusion, and assume that active file differences represent one day of
 295 # traffic unless told otherwise.
 296 $threshold = (defined $threshold) ? $threshold : 250;
 297 $days ||= 1;
 298
 299 # If they asked for our version number, abort and just print that.
 300 if ($print_version) {
 301     my ($program, $ver) = (split (' ', $version))[1,2];
 302     $program =~ s/,v$//;
 303     die "$program $ver\n";
 304 }
 305
 306 # If they asked for help, give them the documentation.
 307 if ($help) {
 308     print "Feeding myself to perldoc, please wait....\n";
 309     exec ('perldoc', '-t', $fullpath) or die "$0: can't fork: $!\n";
 310 }
 311
 312 # Hash the active files, skipping groups we ignore in the local one.  Make
 313 # sure we have our two files listed first.
 314 unless (@ARGV == 2 || @ARGV == 3) {
 315     die "Usage: $0 [-hv] [-t <threshold>] <local> <remote> [<traffic>]\n";
 316 }
 317 my (%local, %remote);
 318 hash (shift, \%local, 1);
 319 hash (shift, \%remote);
 320 traffic (shift) if @ARGV;
 321
 322 # Now, we analyze the differences between the two feeds.  We're trying to
 323 # build a pattern of what *we* should send *them*, so stuff that's in
 324 # %remote and not in %local doesn't concern us.  Rather, we're looking for
 325 # stuff that we carry that they don't, since that's what we'll want to
 326 # exclude from a full feed.
 327 my (%have, %exclude, %count, $have, $exclude, $positive);
 328 for (sort keys %local) {
 329     my ($hierarchy) = (split /\./);
 330     $count{$hierarchy}++;
 331     $traffic{"$hierarchy*"} += $traffic{$_};
 332     if ($remote{$_}) { push (@{$have{$hierarchy}}, $_); $have++       }
 333     else             { push (@{$exclude{$hierarchy}}, $_); $exclude++ }
 334 }
 335 my @patterns;
 336 if ($have > $exclude * 4) {
 337     push (@patterns, "*");
 338     $positive = 1;
 339 }
 340 for (sort by_hierarchy keys %count)  {
 341     if ($have{$_} && !$exclude{$_}) {
 342         push (@patterns, "$_.*") unless $positive;
 343     } elsif ($exclude{$_} && !$have{$_}) {
 344         push (@patterns, "!$_.*") if $positive;
 345     } else {
 346         push (@patterns, smash ($have{$_}, $exclude{$_}, $_, $positive));
 347     }
 348 }
 349 output (\@patterns);
 350 __END__
 351
 352
 353 ############################################################################
 354 # Documentation
 355 ############################################################################
 356
 357 =head1 NAME
 358
 359 tunefeed - Build a newsgroups pattern for a remote feed
 360
 361 =head1 SYNOPSIS
 362
 363 B<tunefeed> [B<-hv>] [B<-t> I<threshold>] [B<-d> I<days>] I<local>
 364 I<remote> [I<traffic>]
 365
 366 =head1 DESCRIPTION
 367
 368 Given two active files, B<tunefeed> generates an INN newsfeeds pattern for
 369 a feed from the first site to the second, that sends the second site
 370 everything in its active file carried by the first site but tries to
 371 minimize the number of rejected articles.  It does this by noting
 372 differences between the two active files and then trying to generate
 373 wildcard patterns that cover the similarities without including much (or
 374 any) unwanted traffic.
 375
 376 I<local> and I<remote> should be standard active files.  You can probably
 377 get the active file of a site that you feed (provided they're running INN)
 378 by connecting to their NNTP port and typing C<LIST ACTIVE>.
 379
 380 B<tunefeed> makes an effort to avoid complex patterns when they're of
 381 minimal gain.  I<threshold> is the number of messages per day at which to
 382 worry about excluding a group; if a group the remote site doesn't want to
 383 receive gets below that number of messages per day, then that group is
 384 either sent or not sent depending on which choice results in the simplest
 385 (shortest) wildcard pattern.  If you want a pattern that exactly matches
 386 what the remote site wants, use C<-t 0>.
 387
 388 Ideally, B<tunefeed> likes to be given the optional third argument,
 389 I<traffic>, which points at a file listing traffic numbers for each group.
 390 The format of this file is a group name, whitespace, and then the number
 391 of messages per day it receives.  Without such a file, B<tunefeed> will
 392 attempt to guess traffic by taking the difference between the high and low
 393 numbers in the active file as the amount of traffic in that group per day.
 394 This will almost always not be accurate, but it should at least be a
 395 ballpark figure.  If you know approximately how many days of traffic the
 396 active file numbers represent, you can tell B<tunefeed> this information
 397 using the B<-d> flag.
 398
 399 B<tunefeed>'s output will look something like:
 400
 401         comp.*,humanities.classics,misc.*,news.*,rec.*,sci.*,soc.*,talk.*,\
 402         alt.*,!alt.atheism,!alt.binaries.*,!alt.nocem.misc,!alt.punk*,\
 403         !alt.sex*,!alt.video.dvd,\
 404         bionet.*,biz.*,gnu.*,vmsnet.*,\
 405         ba.*,!ba.jobs.agency,ca.*,sbay.*
 406
 407 (with each line prefixed by a tab, and with standard INN newsfeeds
 408 continuation syntax).  Due to the preferences of the author, it will also
 409 be sorted as Big Eight, then alt.*, then global non-language hierarchies,
 410 then regional and language hierarchies.
 411
 412 =head1 OPTIONS
 413
 414 =over 4
 415
 416 =item B<-h>, B<--help>
 417
 418 Print out this documentation (which is done simply by feeding the script
 419 to C<perldoc -t>.
 420
 421 =item B<-v>, B<--version>
 422
 423 Print out the version of B<tunefeed> and exit.
 424
 425 =item B<-d> I<days>, B<--days>=I<days>
 426
 427 Assume that the difference between the high and low numbers in the active
 428 file represent I<days> days of traffic.
 429
 430 =item B<-t> I<threshold>, B<--threshold>=I<threshold>
 431
 432 Allow any group with less than I<threshold> articles per day in traffic to
 433 be either sent or not sent depending on which choice makes the wildcard
 434 patterns simpler.  If a threshold isn't specified, the default value is
 435 250.
 436
 437 =back
 438
 439 =head1 BUGS
 440
 441 This program takes a long time to run, not to mention being a nasty memory
 442 hog.  The algorithm is thorough, but definitely not very optimized, and
 443 isn't all that friendly.
 444
 445 Guessing traffic from active file numbers is going to produce very skewed
 446 results on sites with expiration policies that vary widely by group.
 447
 448 There is no way to optimize for size in avoiding rejections, only quantity
 449 of articles.
 450
 451 There should be a way to turn off the author's idiosyncratic ordering of
 452 hierarchies, or to specify a different ordering, without editing this
 453 script.
 454
 455 This script should attempt to retrieve the active file from the remote
 456 site automatically if so desired.
 457
 458 This script should be able to be given some existing wildcard patterns and
 459 take them into account when generating new ones.
 460
 461 =head1 CAVEATS
 462
 463 Please be aware that your neighbor's active file may not accurately
 464 represent the groups they wish to receive from you.  As with everything,
 465 choices made by automated programs like this one should be reviewed by a
 466 human and the remote site should be notified, and if they have sent
 467 explicit patterns, those should be honored instead.  I definitely do *not*
 468 recommend running this program on any sort of automated basis.
 469
 470 =head1 AUTHOR
 471
 472 Russ Allbery E<lt>rra@stanford.eduE<gt>
 473
 474 =cut