/*
 * encodings.c: locale and encoding handling for man
 *
 * Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Colin Watson.
 *
 * This file is part of man-db.
 *
 * man-db is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * man-db is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with man-db; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */

#ifdef HAVE_CONFIG_H
#  include "config.h"
#endif /* HAVE_CONFIG_H */

#include <string.h>
#include <stdlib.h>

#include "gettext.h"
#include <locale.h>
#ifdef HAVE_LANGINFO_CODESET
#  include <langinfo.h>
#endif
#include <ctype.h>

#include "manconfig.h"

#include "pathsearch.h"
#include "pipeline.h"
#include "decompress.h"

#include "manconv.h"
#include "encodings.h"


/* Due to historical limitations in groff (which may be removed in the
 * future), there is no mechanism for a man page to specify its own
 * encoding. This means that each national language directory needs to carry
 * with it information about its encoding, and each groff device needs to
 * have a default encoding associated with it. Out of the box, groff
 * formally allows only ISO-8859-1 on input; however, patches originating
 * with Debian and imported by many other GNU/Linux distributions change
 * this somewhat.
 *
 * Eventually, groff will support proper Unicode input, and much of this
 * horror can go away.
 *
 * Do *not* confuse source encoding with groff encoding. The encoding
 * specified in this table is the encoding in which the source man pages in
 * each language directory are expected to be written. The groff encoding is
 * determined by the selected groff device and sometimes also by the user's
 * locale.
 *
 * The standard output encoding is the encoding assumed for cat pages for
 * each language directory. It must *not* be used to discover the actual
 * output encoding displayed to the user; that is determined by the locale.
 * TODO: it would be useful to be able to change the standard output
 * encoding in the configuration file.
 *
 * This table is expected to change over time, particularly as man pages
 * begin to move towards UTF-8. Feel free to patch this for your
 * distribution; send me updates for languages I've missed.
 *
 * Explicit encodings in the directory name (e.g. de_DE.UTF-8) override this
 * table.
 */
struct directory_entry {
	const char *lang_dir;
	const char *source_encoding;
	const char *standard_output_encoding;
};

static struct directory_entry directory_table[] = {
	{ "C",		"ISO-8859-1",	"ANSI_X3.4-1968"	}, /* English */
	{ "POSIX",	"ISO-8859-1",	"ANSI_X3.4-1968"	}, /* English */
	{ "da",		"ISO-8859-1",	"ISO-8859-1"		}, /* Danish */
	{ "de",		"ISO-8859-1",	"ISO-8859-1"		}, /* German */
	{ "en",		"ISO-8859-1",	"ISO-8859-1"		}, /* English */
	{ "es",		"ISO-8859-1",	"ISO-8859-1"		}, /* Spanish */
	{ "et",		"ISO-8859-1",	"ISO-8859-1"		}, /* Estonian */
	{ "fi",		"ISO-8859-1",	"ISO-8859-1"		}, /* Finnish */
	{ "fr",		"ISO-8859-1",	"ISO-8859-1"		}, /* French */
	{ "ga",		"ISO-8859-1",	"ISO-8859-1"		}, /* Irish */
	{ "gl",		"ISO-8859-1",	"ISO-8859-1"		}, /* Galician */
	{ "id",		"ISO-8859-1",	"ISO-8859-1"		}, /* Indonesian */
	{ "is",		"ISO-8859-1",	"ISO-8859-1"		}, /* Icelandic */
	{ "it",		"ISO-8859-1",	"ISO-8859-1"		}, /* Italian */
	{ "nb",		"ISO-8859-1",	"ISO-8859-1"		}, /* Norwegian Bokmål */
	{ "nl",		"ISO-8859-1",	"ISO-8859-1"		}, /* Dutch */
	{ "nn",		"ISO-8859-1",	"ISO-8859-1"		}, /* Norwegian Nynorsk */
	{ "no",		"ISO-8859-1",	"ISO-8859-1"		}, /* Norwegian */
	{ "pt",		"ISO-8859-1",	"ISO-8859-1"		}, /* Portuguese */
	{ "sv",		"ISO-8859-1",	"ISO-8859-1"		}, /* Swedish */

#ifdef MULTIBYTE_GROFF
	/* These languages require a patched version of groff with the
	 * ascii8 and nippon devices.
	 */
	{ "be",		"CP1251",	"CP1251"		}, /* Belarusian */
	{ "bg",		"CP1251",	"CP1251"		}, /* Bulgarian */
	{ "cs",		"ISO-8859-2",	"ISO-8859-2"		}, /* Czech */
	{ "el",		"ISO-8859-7",	"ISO-8859-7"		}, /* Greek */
	{ "hr",		"ISO-8859-2",	"ISO-8859-2"		}, /* Croatian */
	{ "hu",		"ISO-8859-2",	"ISO-8859-2"		}, /* Hungarian */
	{ "ja",		"EUC-JP",	"EUC-JP"		}, /* Japanese */
	{ "ko",		"EUC-KR",	"EUC-KR"		}, /* Korean */
	{ "lt",		"ISO-8859-13",	"ISO-8859-13"		}, /* Lithuanian */
	{ "lv",		"ISO-8859-13",	"ISO-8859-13"		}, /* Latvian */
	{ "mk",		"ISO-8859-5",	"ISO-8859-5"		}, /* Macedonian */
	{ "pl",		"ISO-8859-2",	"ISO-8859-2"		}, /* Polish */
	{ "ro",		"ISO-8859-2",	"ISO-8859-2"		}, /* Romanian */
	{ "ru",		"KOI8-R",	"KOI8-R"		}, /* Russian */
	{ "sk",		"ISO-8859-2",	"ISO-8859-2"		}, /* Slovak */
	{ "sl",		"ISO-8859-2",	"ISO-8859-2"		}, /* Slovenian */
	/* sr@latin must precede sr, due to top-down left-substring matching later */
	{ "sr@latin",   "ISO-8859-2",   "ISO-8859-2"            }, /* Serbian Latin */
	{ "sr",		"ISO-8859-5",	"ISO-8859-5"		}, /* Serbian */
	{ "tr",		"ISO-8859-9",	"ISO-8859-9"		}, /* Turkish */
	{ "uk",		"KOI8-U",	"KOI8-U"		}, /* Ukrainian */
	{ "vi",		"TCVN5712-1",	"TCVN5712-1"		}, /* Vietnamese */
	{ "zh_CN",	"GBK",		"GBK"			}, /* Simplified Chinese */
	{ "zh_SG",	"GBK",		"GBK"			}, /* Simplified Chinese, Singapore */
	{ "zh_HK",	"BIG5HKSCS",	"BIG5HKSCS"		}, /* Traditional Chinese, Hong Kong */
	{ "zh_TW",	"BIG5",		"BIG5"			}, /* Traditional Chinese */
#endif /* MULTIBYTE_GROFF */

	{ NULL,		NULL,		"NULL"			}
};

static const char *fallback_source_encoding = "ISO-8859-1";

/* Unfortunately, there is no portable way to inspect iconv's internal table
 * of character set aliases. We copy the most interesting ones here so that
 * we can deal with them if they appear in directory names. Note that all
 * names will be converted to upper case before looking them up in this
 * table.
 */
struct charset_alias_entry {
	const char *alias;
	const char *canonical_name;
};

static struct charset_alias_entry charset_alias_table[] = {
	/* The FHS is silly and requires numeric-only aliases that iconv
	 * does not support.
	 */
	{ "88591",		"ISO-8859-1"		},
	{ "88592",		"ISO-8859-2"		},
	{ "88593",		"ISO-8859-3"		},
	{ "88594",		"ISO-8859-4"		},
	{ "88595",		"ISO-8859-5"		},
	{ "88596",		"ISO-8859-6"		},
	{ "88597",		"ISO-8859-7"		},
	{ "88598",		"ISO-8859-8"		},
	{ "88599",		"ISO-8859-9"		},
	{ "885910",		"ISO-8859-10"		},
	{ "885911",		"ISO-8859-11"		},
	{ "885913",		"ISO-8859-13"		},
	{ "885914",		"ISO-8859-14"		},
	{ "885915",		"ISO-8859-15"		},
	{ "885916",		"ISO-8859-16"		},

	{ "ASCII",		"ANSI_X3.4-1968"	},
	{ "BIG-5",		"BIG5"			},
	{ "BIG5-HKSCS",		"BIG5HKSCS"		},
	{ "EUCCN",		"EUC-CN"		},
	{ "EUCJP",		"EUC-JP"		},
	{ "EUCKR",		"EUC-KR"		},
	{ "EUCTW",		"EUC-TW"		},
	{ "GB2312",		"EUC-CN"		},
	{ "ISO8859-1",		"ISO-8859-1"		},
	{ "ISO8859-2",		"ISO-8859-2"		},
	{ "ISO8859-3",		"ISO-8859-3"		},
	{ "ISO8859-4",		"ISO-8859-4"		},
	{ "ISO8859-5",		"ISO-8859-5"		},
	{ "ISO8859-6",		"ISO-8859-6"		},
	{ "ISO8859-7",		"ISO-8859-7"		},
	{ "ISO8859-8",		"ISO-8859-8"		},
	{ "ISO8859-9",		"ISO-8859-9"		},
	{ "ISO8859-10",		"ISO-8859-10"		},
	{ "ISO8859-11",		"ISO-8859-11"		},
	{ "ISO8859-13",		"ISO-8859-13"		},
	{ "ISO8859-14",		"ISO-8859-14"		},
	{ "ISO8859-15",		"ISO-8859-15"		},
	{ "ISO8859-16",		"ISO-8859-16"		},
	{ "KOI8R",		"KOI8-R"		},
	{ "KOI8U",		"KOI8-U"		},
	{ "UJIS",		"EUC-JP"		},
	{ "US-ASCII",		"ANSI_X3.4-1968"	},
	{ "UTF8",		"UTF-8"			},

	{ NULL,			NULL			}
};

/* The default groff terminal output device to be used is determined based
 * on nl_langinfo(CODESET), which returns the character set used by the
 * current locale.
 */
struct charset_entry {
	const char *locale_charset;
	const char *default_device;
};

static struct charset_entry charset_table[] = {
	{ "ANSI_X3.4-1968",	"ascii"		},
	{ "ISO-8859-1",		"latin1"	},
	{ "UTF-8",		"utf8"		},

#ifdef MULTIBYTE_GROFF
	{ "BIG5",		"nippon"	},
	{ "BIG5HKSCS",		"nippon"	},
	{ "EUC-CN",		"nippon"	},
	{ "EUC-JP",		"nippon"	},
	{ "EUC-TW",		"nippon"	},
	{ "GBK",		"nippon"	},
#else /* !MULTIBYTE_GROFF */
	/* If we have a smarter version of groff, this is better dealt with
	 * using either ascii8 (Debian multibyte patch) or preconv (as of
	 * groff 1.20). This is a not-quite-right stopgap in case we have
	 * neither.
	 */
	{ "ISO-8859-15",    	"latin1"	},
#endif /* MULTIBYTE_GROFF */

	{ NULL,			NULL		}
};

static const char *fallback_default_device =
#ifdef MULTIBYTE_GROFF
	"ascii8"
#else /* !MULTIBYTE_GROFF */
	"ascii"
#endif /* MULTIBYTE_GROFF */
	;

/* The encoding used for the text passed to groff is a function of the
 * selected groff device. Traditional devices expect ISO-8859-1 on input
 * (yes, even the utf8 device); devices added in the Debian multibyte patch
 * expect other encodings. The ascii8 device passes top-bit-set characters
 * straight through so is (probably ...) encoding-agnostic. If this encoding
 * does not match the source encoding, an iconv pipe is used (if available)
 * to perform recoding.
 */
struct device_entry {
	const char *roff_device;
	const char *roff_encoding;
	const char *output_encoding;
};

static struct device_entry device_table[] = {
	/* nroff devices */
	{ "ascii",	"ANSI_X3.4-1968",	"ANSI_X3.4-1968"	},
	{ "latin1",	"ISO-8859-1",		"ISO-8859-1"		},
	{ "utf8",	"ISO-8859-1",		"UTF-8"			},

#ifdef MULTIBYTE_GROFF
	{ "ascii8",	NULL,			NULL			},
	{ "nippon",	NULL,			NULL			},
#endif /* MULTIBYTE_GROFF */

	/* troff devices */
	{ "X75",	NULL,			NULL			},
	{ "X75-12",	NULL,			NULL			},
	{ "X100",	NULL,			NULL			},
	{ "X100-12",	NULL,			NULL			},
	{ "dvi",	NULL,			NULL			},
	{ "html",	NULL,			NULL			},
	{ "lbp",	NULL,			NULL			},
	{ "lj4",	NULL,			NULL			},
	{ "ps",		NULL,			NULL			},

	{ NULL,		NULL,			NULL			}
};

static const char *fallback_roff_encoding = "ISO-8859-1";

/* Setting less_charset to iso8859 tells the less pager that characters
 * between 0xA0 and 0xFF are displayable, not that its input is encoded in
 * ISO-8859-*. TODO: Perhaps using LESSCHARDEF would be better.
 *
 * Character set names compatible only with jless go in jless_charset.
 */
struct less_charset_entry {
	const char *locale_charset;
	const char *less_charset;
	const char *jless_charset;
};

static struct less_charset_entry less_charset_table[] = {
	{ "ANSI_X3.4-1968",	"ascii",	NULL		},
	{ "ISO-8859-1",		"iso8859",	NULL		},
	{ "UTF-8",		"utf-8",	NULL		},

#ifdef MULTIBYTE_GROFF
	{ "EUC-JP",		"iso8859",	"japanese-ujis"	},
	{ "KOI8-R",		"koi8-r",	NULL		},
	/* close enough? */
	{ "KOI8-U",		"koi8-r",	NULL		},
#endif /* MULTIBYTE_GROFF */

	{ NULL,			NULL,		NULL		}
};

static const char *fallback_less_charset = "iso8859";


const char *groff_preconv = NULL;

/* Is the groff "preconv" helper available? If so, return its name.
 * Otherwise, return NULL.
 */
const char *get_groff_preconv (void)
{
	if (groff_preconv) {
		if (*groff_preconv)
			return groff_preconv;
		else
			return NULL;
	}

	if (pathsearch_executable ("gpreconv"))
		groff_preconv = "gpreconv";
	else if (pathsearch_executable ("preconv"))
		groff_preconv = "preconv";
	else
		groff_preconv = "";

	if (*groff_preconv)
		return groff_preconv;
	else
		return NULL;
}

/* Return the assumed encoding of the source man page, based on the
 * directory in which it was found. The caller should attempt to recode from
 * this to whatever encoding is expected by groff.
 *
 * The caller should free the returned string when it is finished with it.
 */
char *get_page_encoding (const char *lang)
{
	const struct directory_entry *entry;
	const char *dot;

	if (!lang || !*lang) {
		/* Guess based on the locale. */
		lang = setlocale (LC_MESSAGES, NULL);
		if (!lang)
			return xstrdup (fallback_source_encoding);
	}

	dot = strchr (lang, '.');
	if (dot) {
		/* The FHS has the worst specification of what's supposed to
		 * go after the dot here that I've ever seen. To quote from
		 * version 2.1:
		 *
		 * "It is recommended that this be a numeric representation
		 * if possible (ISO standards, especially), not include
		 * additional punctuation symbols, and that any letters be
		 * in lowercase."
		 *
		 * Any sane standard would use directory names like
		 * de_DE.ISO-8859-1; the examples in the FHS recommend
		 * de_DE.88591 instead. Considering that there is no other
		 * conceivable use for encodings in directory names other
		 * than to pass them to iconv or similar, this is quite
		 * startlingly useless.
		 *
		 * While we now support this thanks to
		 * get_canonical_charset_name, the FHS specification is
		 * obviously wrong and I plan to petition to have it
		 * changed. I recommend ignoring this part of the FHS.
		 */
		char *dir_encoding =
			xstrndup (dot + 1, strcspn (dot + 1, ",@"));
		char *canonical_dir_encoding =
			xstrdup (get_canonical_charset_name (dir_encoding));
		free (dir_encoding);
		return canonical_dir_encoding;
	}

	for (entry = directory_table; entry->lang_dir; ++entry)
		if (STRNEQ (entry->lang_dir, lang, strlen (entry->lang_dir)))
			return xstrdup (entry->source_encoding);

	return xstrdup (fallback_source_encoding);
}

/* Return the canonical encoding for source man pages in the specified
 * language. This ignores any encoding specification in the language
 * directory name. The source encoding should be used as a basis for
 * determining the correct roff device to use: that is, the caller should
 * behave as if it is recoding from the page encoding to the source encoding
 * first, although in practice it should recode directly from the page
 * encoding to the roff encoding.
 *
 * You should normally only call this function if the page encoding is
 * UTF-8, in which case older versions of groff that lack preconv need to
 * have the page recoded to some legacy encoding). If the page is in a
 * legacy encoding, then attempting to recode from that to some other legacy
 * encoding will probably do more harm than good.
 *
 * Here are a few concrete examples of why these distinctions are important:
 *
 *   /usr/share/man/en_GB.UTF-8, locale C
 *     page encoding = UTF-8
 *     source encoding = ISO-8859-1
 *     roff encoding = ISO-8859-1
 *     output encoding = UTF-8
 *     UTF-8 -> iconv -> ISO-8859-1 -> groff -Tascii -> ANSI_X3.4-1968
 *
 *   /usr/share/man/pl_PL.UTF-8, locale pl_PL.UTF-8
 *     page encoding = UTF-8
 *     source encoding = ISO-8859-2
 *     roff encoding = ISO-8859-2
 *     output encoding = ISO-8859-2
 *     UTF-8 -> iconv -> ISO-8859-2 -> groff -Tascii8
 *                    -> ISO-8859-2 -> iconv -> UTF-8
 *
 *   /usr/share/man/ja_JP.EUC-JP, locale ja_JP.UTF-8
 *     page encoding = EUC-JP
 *     source encoding = EUC-JP
 *     roff encoding = UTF-8
 *     output encoding = UTF-8
 *     EUC-JP -> iconv -> UTF-8 -> groff -Tutf8 -> UTF-8
 *
 *   /usr/share/man/en_GB.ISO-8859-15, locale en_GB.UTF-8
 *     page encoding = ISO-8859-15
 *     source encoding = ISO-8859-15
 *     roff encoding = ISO-8859-15
 *     output encoding = ISO-8859-15
 *     ISO-8859-15 -> groff -Tascii8 -> ISO-8859-15 -> iconv -> UTF-8
 */
const char *get_source_encoding (const char *lang)
{
	const struct directory_entry *entry;

	if (!lang || !*lang) {
		/* Guess based on the locale. */
		lang = setlocale (LC_MESSAGES, NULL);
		if (!lang)
			return fallback_source_encoding;
	}

	for (entry = directory_table; entry->lang_dir; ++entry)
		if (STRNEQ (entry->lang_dir, lang, strlen (entry->lang_dir)))
			return entry->source_encoding;

	return fallback_source_encoding;
}

/* Return the standard output encoding for the source man page, based on the
 * directory in which it was found. This should only be used to determine
 * whether a cat page can be saved.
 *
 * The caller should free the returned string when it is finished with it.
 */
char *get_standard_output_encoding (const char *lang)
{
	const struct directory_entry *entry;
	const char *dot;

	if (!lang || !*lang) {
		/* Guess based on the locale. */
		lang = setlocale (LC_MESSAGES, NULL);
		if (!lang)
			return NULL;
	}

	dot = strchr (lang, '.');
	if (dot) {
		/* The cat directory will have a corresponding name to the
		 * man directory including an explicit character set, so the
		 * pages it contains should have that encoding.
		 */
		char *dir_encoding =
			xstrndup (dot + 1, strcspn (dot + 1, ",@"));
		char *canonical_dir_encoding =
			xstrdup (get_canonical_charset_name (dir_encoding));
		free (dir_encoding);
		return canonical_dir_encoding;
	}

	for (entry = directory_table; entry->lang_dir; ++entry)
		if (STRNEQ (entry->lang_dir, lang, strlen (entry->lang_dir)))
			return xstrdup (entry->standard_output_encoding);

	return NULL;
}

const char *get_canonical_charset_name (const char *charset)
{
	const struct charset_alias_entry *entry;
	char *charset_upper = xstrdup (charset);
	char *p;

	for (p = charset_upper; *p; ++p)
		*p = CTYPE (toupper, *p);

	for (entry = charset_alias_table; entry->alias; ++entry)
		if (STREQ (entry->alias, charset_upper)) {
			free (charset_upper);
			return entry->canonical_name;
		}

	free (charset_upper);
	return charset;
}

/* Return the current locale's character set. */
const char *get_locale_charset (void)
{
	const char *ctype;
	const char *charset;
	char *saved_locale;

	/* We need to modify LC_CTYPE temporarily in order to look at the
	 * codeset, so save it first.
	 */
	saved_locale = xstrdup (setlocale (LC_CTYPE, NULL));

	ctype = setlocale (LC_CTYPE, "");

#ifdef HAVE_LANGINFO_CODESET
	charset = nl_langinfo (CODESET);
#else
	/* If the libc can't tell us, then try to guess from LC_CTYPE. */
	charset = strchr (ctype, '.');
	if (charset)
		++charset;
#endif

	/* Restore LC_CTYPE to its value on entry to this function. */
	setlocale (LC_CTYPE, saved_locale);

	if (charset && *charset)
		return get_canonical_charset_name (charset);
	else
		return NULL;
}

/* Can we take this input encoding and produce this output encoding, perhaps
 * with the help of some iconv pipes? */
static int compatible_encodings (const char *input, const char *output)
{
	if (STREQ (input, output))
		return 1;

	/* If the input is ASCII, recoding should be easy. Try it. */
	if (STREQ (input, "ANSI_X3.4-1968"))
		return 1;

	/* If the input is UTF-8, it's either a simple recoding of whatever
	 * we want or else it probably won't work at all no matter what we
	 * do. We might as well try it for now.
	 */
	if (STREQ (input, "UTF-8"))
		return 1;

	/* If the output is ASCII, this is probably because the caller
	 * explicitly asked for it, so we have little choice but to try.
	 */
	if (STREQ (output, "ANSI_X3.4-1968"))
		return 1;

#ifdef MULTIBYTE_GROFF
	/* Special case for some CJK UTF-8 locales, which take UTF-8 input
	 * recoded from EUC-JP (etc.) and produce UTF-8 output. This is
	 * rather filthy.
	 */
	if ((STREQ (input, "BIG5") || STREQ (input, "BIG5HKSCS") ||
	     STREQ (input, "EUC-JP") ||
	     STREQ (input, "EUC-CN") || STREQ (input, "GBK") ||
	     STREQ (input, "EUC-KR") ||
	     STREQ (input, "EUC-TW")) &&
	    STREQ (output, "UTF-8"))
		return 1;
#endif /* MULTIBYTE_GROFF */

	return 0;
}

/* Return the default groff device for the given character set. This may be
 * overridden by the user. The page's source encoding is needed to ensure
 * that the device is compatible: consider ru_RU.UTF-8, which needs ascii8
 * and a trailing iconv pipe to recode to UTF-8.
 *
 * All this encoding compatibility stuff feels like a slightly nasty hack,
 * but I haven't yet come up with a cleaner way to do it.
 */
const char *get_default_device (const char *locale_charset,
				const char *source_encoding)
{
	const struct charset_entry *entry;

	if (get_groff_preconv ())
		return "utf8";

	if (!locale_charset)
		return fallback_default_device;

	for (entry = charset_table; entry->locale_charset; ++entry) {
		if (STREQ (entry->locale_charset, locale_charset)) {
			const char *roff_encoding =
				get_roff_encoding (entry->default_device,
						   source_encoding);
			if (compatible_encodings (source_encoding,
						  roff_encoding))
				return entry->default_device;
		}
	}

	return fallback_default_device;
}

/* Is this a known *roff device name? */
int is_roff_device (const char *device)
{
	const struct device_entry *entry;

	for (entry = device_table; entry->roff_device; ++entry) {
		if (STREQ (entry->roff_device, device))
			return 1;
	}

	return 0;
}

/* Find the input encoding expected by groff, and set the LESSCHARSET
 * environment variable appropriately.
 */
const char *get_roff_encoding (const char *device, const char *source_encoding)
{
	const struct device_entry *entry;
	int found = 0;
	const char *roff_encoding = NULL;

	if (device) {
		for (entry = device_table; entry->roff_device; ++entry) {
			if (STREQ (entry->roff_device, device)) {
				found = 1;
				roff_encoding = entry->roff_encoding;
				break;
			}
		}
	}

	if (!found)
		roff_encoding = fallback_roff_encoding;

#ifdef MULTIBYTE_GROFF
	/* An ugly special case is needed here. The utf8 device normally
	 * takes ISO-8859-1 input. However, with the multibyte patch, when
	 * recoding from CJK character sets it takes UTF-8 input instead.
	 * This is evil, but there's not much that can be done about it
	 * apart from waiting for groff 2.0.
	 */
	if (device && STREQ (device, "utf8") && !get_groff_preconv () &&
	    STREQ (get_locale_charset (), "UTF-8")) {
		const char *ctype = setlocale (LC_CTYPE, NULL);
		if (STRNEQ (ctype, "ja_JP", 5) ||
		    STRNEQ (ctype, "ko_KR", 5) ||
		    STRNEQ (ctype, "zh_CN", 5) ||
		    STRNEQ (ctype, "zh_HK", 5) ||
		    STRNEQ (ctype, "zh_SG", 5) ||
		    STRNEQ (ctype, "zh_TW", 5))
			roff_encoding = "UTF-8";
	}
#endif /* MULTIBYTE_GROFF */

	return roff_encoding ? roff_encoding : source_encoding;
}

/* Find the output encoding that this device will produce, or NULL if it
 * will simply pass through the input encoding.
 */
const char *get_output_encoding (const char *device)
{
	const struct device_entry *entry;

	for (entry = device_table; entry->roff_device; ++entry)
		if (STREQ (entry->roff_device, device))
			return entry->output_encoding;

	return NULL;
}

/* Return the value of LESSCHARSET appropriate for this locale. */
const char *get_less_charset (const char *locale_charset)
{
	const struct less_charset_entry *entry;

	for (entry = less_charset_table; entry->locale_charset; ++entry)
		if (STREQ (entry->locale_charset, locale_charset))
			return entry->less_charset;

	return fallback_less_charset;
}

/* Return the value of JLESSCHARSET appropriate for this locale. May return
 * NULL.
 */
const char *get_jless_charset (const char *locale_charset)
{
	const struct less_charset_entry *entry;

	for (entry = less_charset_table; entry->locale_charset; ++entry)
		if (STREQ (entry->locale_charset, locale_charset))
			return entry->jless_charset;

	return NULL;
}

/* Inspect the first line of data in a pipeline for preprocessor encoding
 * declarations.
 */
char *check_preprocessor_encoding (pipeline *p)
{
	char *pp_encoding = NULL;

#ifdef PP_COOKIE
	const char *line = pipeline_peekline (p);
	char *directive = NULL;

	/* Some people use .\" incorrectly. We allow it for encoding
	 * declarations but not for preprocessor declarations.
	 */
	if (line &&
	    (STRNEQ (line, PP_COOKIE, 4) || STRNEQ (line, ".\\\" ", 4))) {
		const char *newline = strchr (line, '\n');
		if (newline)
			directive = xstrndup (line + 4,
					      newline - (line + 4));
		else
			directive = xstrdup (line + 4);
	}

	if (directive && strstr (directive, "-*-")) {
		const char *pp_search = strstr (directive, "-*-") + 3;
		while (*pp_search == ' ')
			++pp_search;
		if (STRNEQ (pp_search, "coding:", 7)) {
			const char *pp_encoding_allow;
			size_t pp_encoding_len;
			pp_search += 7;
			while (*pp_search == ' ')
				++pp_search;
			pp_encoding_allow = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
					    "abcdefghijklmnopqrstuvwxyz"
					    "0123456789-_/:.()";
			pp_encoding_len = strspn (pp_search,
						  pp_encoding_allow);
			pp_encoding = xstrndup (pp_search, pp_encoding_len);
			debug ("preprocessor encoding: %s\n", pp_encoding);
		}
	}
#endif /* PP_COOKIE */

	return pp_encoding;
}

struct manconv_codes {
	char **from;
	char *to;
};

static void manconv_stdin (void *data)
{
	struct manconv_codes *codes = data;
	pipeline *p = decompress_fdopen (dup (STDIN_FILENO));

	pipeline_start (p);
	manconv (p, codes->from, codes->to);
	pipeline_wait (p);
}

static void free_manconv_codes (void *data)
{
	struct manconv_codes *codes = data;
	char **try_from;

	for (try_from = codes->from; *try_from; ++try_from)
		free (*try_from);
	free (codes->from);
	free (codes->to);
	free (codes);
}

void add_manconv (pipeline *p, const char *source, const char *target)
{
	struct manconv_codes *codes = xmalloc (sizeof *codes);
	char *name;
	command *cmd;

	if (STREQ (source, "UTF-8") && STREQ (target, "UTF-8"))
		return;

	/* informational only; no shell quoting concerns */
	name = appendstr (NULL, MANCONV, " -f ", NULL);
	if (STREQ (source, "UTF-8")) {
		codes->from = XNMALLOC (2, char *);
		codes->from[0] = xstrdup (source);
		codes->from[1] = NULL;
		name = appendstr (name, source, NULL);
	} else {
		codes->from = XNMALLOC (3, char *);
		codes->from[0] = xstrdup ("UTF-8");
		codes->from[1] = xstrdup (source);
		codes->from[2] = NULL;
		name = appendstr (name, "UTF-8:", source, NULL);
	}
	codes->to = appendstr (NULL, target, "//IGNORE", NULL);
	/* informational only; no shell quoting concerns */
	name = appendstr (name, " -t ", codes->to, NULL);
	if (quiet >= 2)
		name = appendstr (name, " -q", NULL);
	cmd = command_new_function (name, &manconv_stdin, &free_manconv_codes,
				    codes);
	free (name);
	pipeline_command (p, cmd);
}
