2 $version = q$Id: tunefeed.in 4329 2001-01-14 13:47:52Z rra $;
4 # tunefeed -- Compare active files with a remote site to tune a feed.
5 # Copyright 1998 by Russ Allbery <rra@stanford.edu>
7 # This program is free software; you can redistribute it and/or modify it
8 # under the same terms as Perl itself.
10 ############################################################################
12 ############################################################################
14 # A list of hierarchies in the Big Eight.
15 %big8 = map { $_ => 1 } qw(comp humanities misc news rec sci soc talk);
17 # A list of hierarchies that are considered global and not language
19 %global = map { $_ => 1 } qw(bionet bit biz borland ddn gnu gov ieee info
20 linux k12 microsoft netscape tnn vmsnet);
22 # The pattern matching local-only hierarchies (that we should disregard when
23 # doing feed matching).
24 %ignore = map { $_ => 1 } qw(clari control junk);
27 ############################################################################
28 # Modules and declarations
29 ############################################################################
33 use Getopt::Long qw(GetOptions);
36 use vars qw(%big8 $days %global %ignore $threshold %traffic $version);
39 ############################################################################
40 # Active file hashing and analysis
41 ############################################################################
43 # Read in an active file, putting those groups into a hash where the key is
44 # the name of the group and the value is always 1. If the optional third
45 # argument is true, exclude any groups in the hierarchies listed in %local
46 # and use this active file to store traffic information (in a rather
47 # simple-minded fashion).
49 my ($file, $hash, $local) = @_;
50 open (ACTIVE, $file) or die "$0: cannot open $file: $!\n";
53 my ($group, $high, $low, $flags) = split;
54 next if ($flags =~ /^=|^x/);
55 my $hierarchy = (split (/\./, $group, 2))[0];
56 next if ($local && $ignore{$hierarchy});
58 $traffic{$group} = ($high - $low) / $days if $local;
63 # Read in a file that gives traffic statistics. We assume it's in the form
64 # group, whitespace, number of articles per day, and we just read it
65 # directly into the %traffic hash.
68 open (TRAFFIC, $file) or die "$0: cannot open $file: $!\n";
71 my ($group, $traffic) = split;
72 $traffic{$group} = $traffic;
77 # Pull off the first X nodes of a group name.
79 my ($group, $count) = @_;
80 my @group = split (/\./, $group);
81 splice (@group, $count);
85 # Find the common hierarchical prefix of a list.
88 my @prefix = split (/\./, shift @list);
90 while (defined ($_ = shift @list)) {
91 my @group = split /\./;
93 $i++ while ($prefix[$i] && $prefix[$i] eq $group[$i]);
94 if ($i <= $#prefix) { splice (@prefix, $i) }
99 # Given two lists, a list of groups that the remote site does have and a
100 # list of groups that the remote site doesn't have, in a single hierarchy,
101 # perform a smash. The object is to find the minimal pattern that expresses
102 # just the groups they want. We're also given the common prefix of all the
103 # groups in the have and exclude lists, and a flag indicating whether we're
104 # coming in with a positive assumption (all groups sent unless excluded) or
105 # a negative assumption (no groups sent unless added).
107 my ($have, $exclude, $top, $positive) = @_;
108 my (@positive, @negative);
109 my $level = ($top =~ tr/././) + 1;
111 # Start with the positive assumption. We make copies of our @have and
112 # @exclude arrays since we're going to be needing the virgin ones again
113 # later for the negative assumption. If we're coming in with the
114 # negative assumption, we have to add a wildcarded entry to switch
115 # assumptions, and we also have to deal with the cases where there is a
116 # real group at the head of the hierarchy.
118 my @exclude = @$exclude;
119 if ($top eq $have[0]) {
121 push (@positive, "$top*") unless $positive;
123 if ($top eq $exclude[0]) {
124 if ($positive && $traffic{$top} > $threshold) {
125 push (@positive, "!$top");
129 push (@positive, "$top.*") unless $positive;
132 # Now that we've got things started, keep in mind that we're set up so
133 # that every group will be sent *unless* it's excluded. So we step
134 # through the list of exclusions. The idea here is to pull together all
135 # of the exclusions with the same prefix (going one level deeper into
136 # the newsgroup names than we're currently at), and then find all the
137 # groups with the same prefix that the remote site *does* want. If
138 # there aren't any, then we can just exclude that whole prefix provided
139 # that we're saving enough traffic to make it worthwhile (checked
140 # against the threshold). If there are, and if the threshold still
141 # makes it worthwhile to worry about this, we call this sub recursively
142 # to compute the best pattern for that prefix.
143 while (defined ($_ = shift @exclude)) {
144 my ($prefix) = prefix ($_, $level + 1);
147 my $traffic = $traffic{$_};
148 while ($exclude[0] =~ /^\Q$prefix./) {
149 $traffic += $traffic{$exclude[0]};
150 push (@drop, shift @exclude);
152 $prefix = common (@drop);
153 my $saved = $traffic;
154 while (@have && $have[0] le $prefix) { shift @have }
155 while ($have[0] =~ /^\Q$prefix./) {
156 $traffic += $traffic{$have[0]};
157 push (@keep, shift @have);
159 next unless $saved > $threshold;
161 $traffic{"$prefix*"} = $traffic;
162 push (@positive, smash (\@keep, \@drop, $prefix, 1));
163 } elsif (@drop == 1) {
164 push (@positive, "!$_");
165 } elsif ($prefix eq $_) {
166 push (@positive, "!$prefix*");
168 push (@positive, "!$prefix.*");
172 # Now we do essentially the same thing, but from the negative
173 # perspective (adding a wildcard pattern as necessary to make sure that
174 # we're not sending all groups and then finding the groups we are
175 # sending and trying to smash them into minimal wildcard patterns).
177 @exclude = @$exclude;
178 if ($top eq $exclude[0]) {
180 push (@negative, "!$top*") if $positive;
182 if ($top eq $have[0]) {
183 push (@negative, $top) unless $positive;
186 push (@negative, "!$top.*") if $positive;
189 # This again looks pretty much the same as what we do for the positive
190 # case; the primary difference is that we have to make sure that we send
191 # them every group that they want, so we still err on the side of
192 # sending too much, rather than too little.
193 while (defined ($_ = shift @have)) {
194 my ($prefix) = prefix ($_, $level + 1);
197 my $traffic = $traffic{$_};
198 while ($have[0] =~ /^\Q$prefix./) {
199 $traffic += $traffic{$have[0]};
200 push (@keep, shift @have);
202 $prefix = common (@keep);
203 while (@exclude && $exclude[0] le $prefix) { shift @exclude }
205 while ($exclude[0] =~ /^\Q$prefix./) {
206 $saved += $traffic{$exclude[0]};
207 push (@drop, shift @exclude);
209 if (@drop && $saved > $threshold) {
210 $traffic{"$prefix*"} = $traffic + $saved;
211 push (@negative, smash (\@keep, \@drop, $prefix, 0));
212 } elsif (@keep == 1) {
213 push (@negative, $_);
214 } elsif ($prefix eq $_) {
215 push (@negative, "$prefix*");
217 push (@negative, "$prefix.*");
221 # Now that we've built both the positive and negative case, we decide
222 # which to return. We want the one that's the most succinct, and if
223 # both descriptions are equally succinct, we return the negative case on
224 # the grounds that it's likely to send less of what they don't want.
225 (@positive < @negative) ? @positive : @negative;
229 ############################################################################
231 ############################################################################
233 # We want to sort Big Eight ahead of alt.* ahead of global non-language
234 # hierarchies ahead of regionals and language hierarchies.
236 my ($hierarchy) = @_;
237 if ($big8{$hierarchy}) { return 1 }
238 elsif ($hierarchy eq 'alt') { return 2 }
239 elsif ($global{$hierarchy}) { return 3 }
243 # Our special sort routine for hierarchies. It calls score to get a
244 # hierarchy score and sorts on that first.
246 (score $a) <=> (score $b) || $a cmp $b;
249 # Given a reference to a list of patterns, output it in some reasonable
250 # form. Currently, this is lines prefixed by a tab, with continuation lines
251 # like INN likes to have in newsfeeds, 76 column margin, and with a line
252 # break each time the hierarchy score changes.
257 my ($hierarchy) = /^!?([^.]+)/;
258 my $score = score $hierarchy;
259 $line += 1 + length $_;
260 if (($last && $score > $last) || $line > 76) {
262 $line = 8 + length $_;
276 ############################################################################
278 ############################################################################
280 # Clean up the name of this program for error messages.
284 # Parse the command line. Our argument is the path to an active file (we
285 # tell the difference by seeing if it contains a /).
286 my ($help, $print_version);
287 Getopt::Long::config ('bundling');
288 GetOptions ('help|h' => \$help,
289 'days|d=i' => \$days,
290 'threshold|t=i' => \$threshold,
291 'version|v' => \$print_version) or exit 1;
293 # Set a default for the minimum threshold traffic required to retain an
294 # exclusion, and assume that active file differences represent one day of
295 # traffic unless told otherwise.
296 $threshold = (defined $threshold) ? $threshold : 250;
299 # If they asked for our version number, abort and just print that.
300 if ($print_version) {
301 my ($program, $ver) = (split (' ', $version))[1,2];
303 die "$program $ver\n";
306 # If they asked for help, give them the documentation.
308 print "Feeding myself to perldoc, please wait....\n";
309 exec ('perldoc', '-t', $fullpath) or die "$0: can't fork: $!\n";
312 # Hash the active files, skipping groups we ignore in the local one. Make
313 # sure we have our two files listed first.
314 unless (@ARGV == 2 || @ARGV == 3) {
315 die "Usage: $0 [-hv] [-t <threshold>] <local> <remote> [<traffic>]\n";
317 my (%local, %remote);
318 hash (shift, \%local, 1);
319 hash (shift, \%remote);
320 traffic (shift) if @ARGV;
322 # Now, we analyze the differences between the two feeds. We're trying to
323 # build a pattern of what *we* should send *them*, so stuff that's in
324 # %remote and not in %local doesn't concern us. Rather, we're looking for
325 # stuff that we carry that they don't, since that's what we'll want to
326 # exclude from a full feed.
327 my (%have, %exclude, %count, $have, $exclude, $positive);
328 for (sort keys %local) {
329 my ($hierarchy) = (split /\./);
330 $count{$hierarchy}++;
331 $traffic{"$hierarchy*"} += $traffic{$_};
332 if ($remote{$_}) { push (@{$have{$hierarchy}}, $_); $have++ }
333 else { push (@{$exclude{$hierarchy}}, $_); $exclude++ }
336 if ($have > $exclude * 4) {
337 push (@patterns, "*");
340 for (sort by_hierarchy keys %count) {
341 if ($have{$_} && !$exclude{$_}) {
342 push (@patterns, "$_.*") unless $positive;
343 } elsif ($exclude{$_} && !$have{$_}) {
344 push (@patterns, "!$_.*") if $positive;
346 push (@patterns, smash ($have{$_}, $exclude{$_}, $_, $positive));
353 ############################################################################
355 ############################################################################
359 tunefeed - Build a newsgroups pattern for a remote feed
363 B<tunefeed> [B<-hv>] [B<-t> I<threshold>] [B<-d> I<days>] I<local>
364 I<remote> [I<traffic>]
368 Given two active files, B<tunefeed> generates an INN newsfeeds pattern for
369 a feed from the first site to the second, that sends the second site
370 everything in its active file carried by the first site but tries to
371 minimize the number of rejected articles. It does this by noting
372 differences between the two active files and then trying to generate
373 wildcard patterns that cover the similarities without including much (or
374 any) unwanted traffic.
376 I<local> and I<remote> should be standard active files. You can probably
377 get the active file of a site that you feed (provided they're running INN)
378 by connecting to their NNTP port and typing C<LIST ACTIVE>.
380 B<tunefeed> makes an effort to avoid complex patterns when they're of
381 minimal gain. I<threshold> is the number of messages per day at which to
382 worry about excluding a group; if a group the remote site doesn't want to
383 receive gets below that number of messages per day, then that group is
384 either sent or not sent depending on which choice results in the simplest
385 (shortest) wildcard pattern. If you want a pattern that exactly matches
386 what the remote site wants, use C<-t 0>.
388 Ideally, B<tunefeed> likes to be given the optional third argument,
389 I<traffic>, which points at a file listing traffic numbers for each group.
390 The format of this file is a group name, whitespace, and then the number
391 of messages per day it receives. Without such a file, B<tunefeed> will
392 attempt to guess traffic by taking the difference between the high and low
393 numbers in the active file as the amount of traffic in that group per day.
394 This will almost always not be accurate, but it should at least be a
395 ballpark figure. If you know approximately how many days of traffic the
396 active file numbers represent, you can tell B<tunefeed> this information
397 using the B<-d> flag.
399 B<tunefeed>'s output will look something like:
401 comp.*,humanities.classics,misc.*,news.*,rec.*,sci.*,soc.*,talk.*,\
402 alt.*,!alt.atheism,!alt.binaries.*,!alt.nocem.misc,!alt.punk*,\
403 !alt.sex*,!alt.video.dvd,\
404 bionet.*,biz.*,gnu.*,vmsnet.*,\
405 ba.*,!ba.jobs.agency,ca.*,sbay.*
407 (with each line prefixed by a tab, and with standard INN newsfeeds
408 continuation syntax). Due to the preferences of the author, it will also
409 be sorted as Big Eight, then alt.*, then global non-language hierarchies,
410 then regional and language hierarchies.
416 =item B<-h>, B<--help>
418 Print out this documentation (which is done simply by feeding the script
421 =item B<-v>, B<--version>
423 Print out the version of B<tunefeed> and exit.
425 =item B<-d> I<days>, B<--days>=I<days>
427 Assume that the difference between the high and low numbers in the active
428 file represent I<days> days of traffic.
430 =item B<-t> I<threshold>, B<--threshold>=I<threshold>
432 Allow any group with less than I<threshold> articles per day in traffic to
433 be either sent or not sent depending on which choice makes the wildcard
434 patterns simpler. If a threshold isn't specified, the default value is
441 This program takes a long time to run, not to mention being a nasty memory
442 hog. The algorithm is thorough, but definitely not very optimized, and
443 isn't all that friendly.
445 Guessing traffic from active file numbers is going to produce very skewed
446 results on sites with expiration policies that vary widely by group.
448 There is no way to optimize for size in avoiding rejections, only quantity
451 There should be a way to turn off the author's idiosyncratic ordering of
452 hierarchies, or to specify a different ordering, without editing this
455 This script should attempt to retrieve the active file from the remote
456 site automatically if so desired.
458 This script should be able to be given some existing wildcard patterns and
459 take them into account when generating new ones.
463 Please be aware that your neighbor's active file may not accurately
464 represent the groups they wish to receive from you. As with everything,
465 choices made by automated programs like this one should be reviewed by a
466 human and the remote site should be notified, and if they have sent
467 explicit patterns, those should be honored instead. I definitely do *not*
468 recommend running this program on any sort of automated basis.
472 Russ Allbery E<lt>rra@stanford.eduE<gt>