#! /usr/bin/perl -w

# The AFE FAQ HTMLizer, faq2html.pl
# Perl version written by Kamion; based loosely on a sed script by Aquarius.
# I don't claim this to be any kind of general program. It's a special-case
# hack - actually, a big pile of special-case hacks - but it works well for
# our application.
# 
# Copyright (c) 2000, 2001 Colin Watson <cjwatson@chiark.greenend.org.uk>
# 
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
# 
# A copy of the GNU General Public License is available on the World Wide
# Web at http://www.gnu.org/copyleft/gpl.html.
# 
# faq2html.pl 1.0.x:
#   * Lower-case HTML tag names.
#   * Fix html_header invocation not to pass a previous link for the first
#     section.
#
# faq2html.pl 1.0.2 (8 July 2001):
#   * Tolerate the changes coming after the Archive-name: block.
#   * Minor cleanups.
#   * Explicitly licensed under the GPL, and added this changelog.
# 
# faq2html.pl 1.0.1 (30 April 2001):
#   * Moved the "Sephrenia the Styric" Easter-egg link to groups.google.com.
# 
# faq2html.pl 1.0.0 (dawn of time):
#   * Initial release.


use strict;
require 5.000;
use lib qw(.);
use Style qw(html_header html_footer);

sub usage ()
{
    print STDERR <<"EOF";
Usage: $0 index-filename [up-URL] < faq-document

URLs may be absolute or relative.

EOF
    exit 1;
}

sub html_escape (\$)
{
    my $line = shift;
    return unless defined $$line;
    $$line =~ s/\&/&amp;/g;
    $$line =~ s/</&lt;/g;
    $$line =~ s/>/&gt;/g;
    $$line =~ s/"/&quot;/g;
    $$line =~ s/æ/&aelig;/g;
    return $$line;
}

#my $sts_link = "http://riva.ucam.org/~kamion/archive-bin/" .
#		"article.pl?msgid=5n0qq5\$d9p\@mercury.dur.ac.uk";
my $sts_link = 'http://groups.google.com/groups?' .
	       'q=Easter+Egg&ic=1&selm=5n0qq5%24d9p%40mercury.dur.ac.uk';

# Paragraph to format and undefine, followed optionally by open and close tags.
sub print_paragraph (\$;$$)
{
    my $para = shift;
    return unless defined $$para && $$para !~ /^\s*$/;
    my $open_tags = shift;
    my $close_tags = shift;

    # Make URLs links
    $$para =~ s#(http:(?:[^\ \&<]|\&(?!gt;))*)
	       #<a target="_top" href="$1">$1</a>#gx;

    # Make all text within pairs of underscores outside URLs emphasized
    $$para =~ s[_((?:[^_.-])+\.?)_
	    	(?![^"]*">|[^<]*</a>)][<em>$1</em>]gx;

    # Similarly, strengthen bold text within asterisks
    $$para =~ s[\*((?:[^*.])+\.?)\*
	    	(?![^"]*">|[^<]*</a>)][<strong>$1</strong>]gx;

    # Easter egg
    $$para =~ s|(Sephrenia the Styric)|<a href="$sts_link">$1</a>|g;

    $$para = "$open_tags$$para" if defined $open_tags;
    $$para .= $close_tags if defined $close_tags;

    print "$$para\n";
    undef $$para;
}

my $index_file = shift or usage;
my $up_url = shift;

open CONTROL, ">control.html" or die "Can't write to control.html: $!";
select CONTROL;

html_header "Revision information", $index_file;
print "\n";

my $changes_done = 0;

print "<pre>\n";
while (<>)
{
    $changes_done = 1 if /^Archive-name:/;
    next unless $changes_done;
    chomp;
    html_escape $_;
    last if /^\s*$/;
    unless (/^([^:]+:)\s*(.+)$/)
    {
	warn "Non-header line found in control section: $_";
	next;
    }
    print "$_\n";
}
print "</pre>\n\n";

html_footer $index_file;

open INDEX, ">$index_file" or die "Can't write to $index_file: $!";
select INDEX;

html_header "Contents", $up_url;
print "\n";

print qq(<h1 align="center">\n);
while (<>)
{
    if (/^Changes:/)
    {
	# Oh, the changes are after the Archive-name: block this time, are
	# they? OK ...
	while (<>)
	{
	    last if /^\s*$/;
	}
	next;
    }
    last if /^\s*$/;
    html_escape $_;
    print;
}
print "</h1>\n\n";

print qq(<p><a href="control.html">Revision information</a></p>\n);

my $paragraph;
my $listentry = 0;
my $level = 0;
my @lastvalues = ();

while (<>)
{
    chomp;
    s/^\s*//;
    html_escape $_;
    # Stop at TOC-body separator
    if (/--------/)
    {
	print_paragraph $paragraph, "", $listentry ? "</a>" : "";
	last;
    }
    # Convert headings to links
    my $line = $_;
    if ($line =~ /([0-9]+)((?:.[0-9]+)*)[\)\s]/)
    {
	my $firstcomp = $1;
	my $heading = "$1$2";
	my @components = split /\./, $heading;
	if (@components > 0)
	{
	    $line =~ /\s+(.*)$/g;
	    my $linetail = $1;
	    print_paragraph $paragraph, "", $listentry ? "</a>" : "";

	    # Change level of ordered list if necessary
	    if (@components > $level)
	    {
		print "<ol>\n" x (@components - $level);
	    }
	    elsif (@components < $level)
	    {
		print "</ol>\n" x ($level - @components);
	    }
	    #print "</p>\n\n" if @components == 1 && $level > 0;
	    print "\n<br>\n\n" if @components == 1 && $level > 0;
	    $level = @components;
	    $#lastvalues = $level;
	    #print "<p>\n" if $level == 1;

	    # Reformat the current line
	    my $value = $components[$#components];
	    if (defined $lastvalues[$level] &&
		$lastvalues[$level] + 1 != $value)
	    {
		warn "Missing item in contents before $heading";
		print qq(<li value="$value">);
	    }
	    elsif ($value != 1)
	    {
		print qq(<li value="$value">);
	    }
	    else
	    {
		print "<li>";
	    }
	    $lastvalues[$level] = $value;
	    print qq(<a href="part$firstcomp.html#sec$heading">\n);
	    $paragraph = $linetail;
	    $listentry = 1;
	    next;
	}
    }
    elsif (/^[A-Z][^a-z]+$/)
    {
	print_paragraph $paragraph, "", $listentry ? "</a>" : "";
	if ($level == 0)
	{
	    print "\n<h2>$_</h2>\n\n";
	}
	else
	{
	    #print "</p>\n\n";
	    print "</ol>\n\n" x ($level - 1);
	    print "\n" if $level == 1;
	    print "<h3>$_</h3>\n\n";
	    print "<ol>\n" x ($level - 1);
	    #print "<p>\n";
	    $#lastvalues = 0;
	}
	$listentry = 0;
	next;
    }
    elsif (/^$/)
    {
	print_paragraph $paragraph, "", $listentry ? "</a>" : "";
	$listentry = 0;
	next;
    }
    print_paragraph $paragraph;
    $paragraph = $_;
}

# Do all the necessary end tags
#print "</p>\n" if $level > 0;
print "\n<br>\n" if $level > 0;
print "\n";
print "</ol>\n\n" x $level;

html_footer $up_url;
select STDOUT;
close INDEX;

$paragraph = <>;
my $oldsection;
my $section;
my $section_open = 0;
my $num_sections = $lastvalues[1];
my $in_heading = 0;
my $in_h2 = 0;
$level = 0;
$#lastvalues = -1;

my $heading_pat = qr/^\s*([0-9]+(?:.[0-9]+)*)\)\s+/;

while (<>)
{
    s/\s+\n/\n/;
    html_escape $_;

    if (/^\s*([0-9]+)\)\s/)
    {
	chomp $paragraph if defined $paragraph;
	if ($section_open)
	{
	    if ($in_heading)
	    {
		print_paragraph $paragraph, "", "</strong></a>";
	    }
	    else
	    {
		print_paragraph $paragraph, "<p>\n", "\n</p>";
	    }
	}
	$in_heading = $in_h2 = 0;
	$oldsection = $section;
	$section = $1;
	print "\n" if $section_open;
	print "</ol>\n\n" x $level if $section_open;
	html_footer $index_file,
		    ($oldsection > 1) ?
			("part" . ($oldsection - 1) . ".html") :
			undef,
		    "part" . ($oldsection + 1) . ".html"
	    if $section_open;
	open SECTION, ">part$section.html" or
	    die "Couldn't write to part$section.html: $!";
	select SECTION;
	html_header "section $section",
		    $index_file,
		    ($section > 1) ?
			("part" . ($section - 1) . ".html") :
			undef,
		    ($section < $num_sections) ?
			("part" . ($section + 1) . ".html") :
			undef;
	$section_open = 1;
	$level = 0;
	$#lastvalues = -1;
    }

    # Convert headings to anchors in list items
    if (/$heading_pat/)
    {
	my $heading = $1;
	my @components = split /\./, $heading;
	chomp $paragraph if defined $paragraph;
	if ($in_heading)
	{
	    print_paragraph $paragraph, "", "</strong></a>\n";
	}
	else
	{
	    print_paragraph $paragraph, "<p>\n", "\n</p>\n";
	}
	$in_h2 = 0;

	# Change level of ordered list if necessary
	if (@components > $level)
	{
	    print "<ol>\n\n" x (@components - $level);
	}
	elsif (@components < $level)
	{
	    print "</ol>\n\n" x ($level - @components);
	}
	$level = @components;
	$#lastvalues = $level;

	my $value = $components[$#components];
	if (@components == 1)
	{
	    chomp;
	    s|$heading_pat|<li><h2><a name="sec$1">|;
	    $in_h2 = 1;
	}
	elsif (defined $lastvalues[$level] &&
	       $lastvalues[$level] + 1 != $value)
	{
	    warn "Missing item in main document before $heading";
	    s|$heading_pat|<li value="$value"><a name="sec$1"><strong>|;
	}
	elsif ($value != 1)
	{
	    s|$heading_pat|<li value="$value"><a name="sec$1"><strong>|;
	}
	else
	{
	    s|$heading_pat|<li><a name="sec$1"><strong>|;
	}
	$lastvalues[$level] = $value;

	warn "Heading $heading encountered outside a section"
	    unless defined $section;
	warn "Heading $heading encountered in section $section"
	    if defined $section && $heading !~ /^$section(?:\.|$)/;

	$in_heading = 1;
    }

    # Mark unnumbered headings
    if (/^([A-Z][^a-z]*[A-Z])\n/)
    {
	chomp;
	$paragraph = $_;

	print "\n</ol>\n" x ($level - 1);
	print "\n" if $level != 1;
	print_paragraph $paragraph, "<h3>", "</h3>";
	$in_heading = $in_h2 = 0;
	print "\n<ol>\n" x ($level - 1);
	$#lastvalues = 0;

	$_ = "";
    }

    # If this is a paragraph delimiter of some kind (usually a blank line),
    # print the preceding paragraph, handling things we recognize as tables.
    if (defined $paragraph &&
	(/^\n/ || ($in_heading && $paragraph =~ /<a name[^>]*>.{0,40}$/)))
    {
	if (($paragraph =~ /ISBN +[0-9]-/ &&
	     $paragraph =~ /\((?:hc|pb|trade pb)\)/)
	    || $paragraph =~ /c\/o Del Rey Books/
	    || $paragraph =~ /esper\.net$/m)
	{
	    $paragraph =~ s/\n/<br>\n/g;
	}
	else
	{
	    # We have to split up the paragraph and work line-by-line here;
	    # multi-line regexps are really hard to get right.
	    my @parlines = split /\n/, $paragraph;
	    my $prevline;
	    foreach my $parline (@parlines)
	    {
		unless (defined $prevline)
		{
		    $prevline = \$parline;
		    next;
		}
		if ($parline =~ /^(\s*)((?:\w\)|
		    		 \[\w\]|
			    	 \w+(?:\s+\w+)?:\s*http).*|
				 \&lt;http(?:[^&]|\&(?!gt;))*\&gt;)$/x)
		{
		    $parline = "$1$2<br>";
		    $$prevline .= "<br>" if $$prevline !~ /<br>$/;
		}
		$prevline = \$parline;
	    }
	    $paragraph = (join "\n", @parlines) . "\n";
	    $paragraph =~ s/:$/:<br>/gm;
	}
	chomp $paragraph;
	$paragraph =~ s/(\S)  +/$1 /g unless /<br>/;
	if ($in_h2)
	{
	    print_paragraph $paragraph, "", "</a></h2>\n";
	    $in_h2 = 0;
	}
	else
	{
	    if ($in_heading)
	    {
		print_paragraph $paragraph, "", "</strong></a>";
	    }
	    else
	    {
		print_paragraph $paragraph, "<p>\n", "\n</p>";
	    }
	}
	$in_heading = 0;
	$paragraph = "$_" unless /^\n?$/;
    }
    elsif (defined $paragraph)
    {
	$paragraph .= "$_";
    }
    else
    {
	$paragraph = "$_" unless /^\n?$/;
    }
}

if ($section_open)
{
    chomp $paragraph if defined $paragraph;
    if ($in_heading)
    {
	print_paragraph $paragraph, "", "</strong></a>";
    }
    else
    {
	print_paragraph $paragraph, "<p>\n", "\n</p>";
    }
    print "\n";
    print "</ol>\n\n" x $level;

    html_footer $index_file,
		($section > 1) ? ("part" . ($section - 1) . ".html") : undef,
		undef;
    close SECTION if $section_open;
}
else
{
    warn "No sections encountered";
}

