#!/usr/bin/perl -w

=head1 NAME

devil2dict - preprocess The Devil's Dictionary for dictfmt(1)

=head1 SYNOPSIS

B<devil2dict> [I<INPUT_FILE>... [I<OUTPUT_BASENAME>]]

=head1 DESCRIPTION

B<devil2dict> converts the Internet Wiretap edition of The Devil's
Dictionary to CIA World Factbook format suitable for input to
L<dictfmt(1)>.  If you specify I<OUTPUT_BASENAME>, output is directly
piped into B<dictfmt>.

=head1 AUTHOR

Matej Vela <vela@debian.org>.  In the public domain.

=cut

use strict;

my $fmt_base = pop if @ARGV > 1;
open STDOUT, '|-', 'dictfmt', '-c5',
	     '--without-headword', # Necessary for multiple headwords.
	     '--headword-separator', ', ',
	     '-u', 'http://wiretap.area.com/Gopher/Library/Classic/devils.txt',
	     '-s', 'The Devil\'s Dictionary (1881-1906)',
	     $fmt_base
    or die "$0: can't open pipe to dictfmt: $!\n"
    if defined $fmt_base;

# Entries with multiple headwords, listed explicitly because they all
# use differing punctuation.
my %multi = (BABE      => 'BABE, BABY',
	     CONFIDANT => 'CONFIDANT, CONFIDANTE',
	     TZETZE    => 'TZETZE FLY, TSETSE FLY',
	     # LAUREL refers to LAUREATE with _Vide supra._
	     LAUREATE => 'LAUREATE, LAUREL');

# Hyperlinks, also listed explicitly due to a number of exceptions
# (e.g. LUNARIAN in the entry for EXECUTIVE).
my %link = (ACADEME         => '{ACADEME}',
	    HUSBAND         => '{HUSBAND}',
	    EPITAPH         => '{EPITAPH}',
	    LAUREATE        => '{LAUREATE}',
	     _Molecule_      => '{MOLECULE}',
	    LOGIC           => '{LOGIC}');

my $check = 0;
my $blank = 0;

while (<>) {
    # Expect headwords only after blank lines and lines beginning with
    # whitespace (cf. MEERSCHAUM).  This fixes several false positives
    # (for example, "II., De Clem._, ...").
    if (/^\s/) {
	$check = 1;

	# Leave blank lines for later.
	$blank++, next	if /^$/;

	# Strip letter headings, and the decoration at the end; no
	# point in having them in the previous entry.
	next		if /^\s+(?:[A-Z]|-\)\(-)$/;
    } elsif ($check) {
	$check = 0;

	# Check for headwords.  Characteristic examples:
	#
	#     I is the first letter of ...
	#     R.I.P.  A careless abbrev...  (dot included!)
	#     HABEAS CORPUS.  A writ by...  (dot left out!)
	#
	# The /g flag updates pos() for the hyperlink code below.
	if (/^([A-Z]+\b(?:\.[A-Z.]+|[- \'A-Z]+[A-Z])?)/g) {
	    $blank = 0;
	    print "\n_____\n\n", exists $multi{$1} ? $multi{$1} : $1, "\n";
	}
    }

    # Restore blank lines within entries.
    $blank--, print "\n" while $blank;

    # Search for hyperlinks; candidates are upper-case words and
    # underscored text.  s///g would look nicer, but it would
    # substitute headwords as well because it starts from the
    # beginning rather than the current pos().
     while (/([A-Z]{2,}|_.+?_)/g) {
         pos = $-[0] + length(substr($_, $-[0], $+[0] - $-[0]) = $link{$1})
             if exists $link{$1};
     }
     
    print;
}

print "\n";

close STDOUT
    or die "$0: can't close pipe to dictfmt: $!\n"
    if defined $fmt_base;
