chiark - git - mdw - disorder/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* This file is part of DisOrder.
	3	* Copyright (C) 2004, 2005 Richard Kettlewell
	4	*
	5	* This program is free software; you can redistribute it and/or modify
	6	* it under the terms of the GNU General Public License as published by
	7	* the Free Software Foundation; either version 2 of the License, or
	8	* (at your option) any later version.
	9	*
	10	* This program is distributed in the hope that it will be useful, but
	11	* WITHOUT ANY WARRANTY; without even the implied warranty of
	12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	* General Public License for more details.
	14	*
	15	* You should have received a copy of the GNU General Public License
	16	* along with this program; if not, write to the Free Software
	17	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
	18	* USA
	19	*/
	20	/** @file lib/charset.c @brief Character set conversion */
	21
	22	#include <config.h>
	23	#include "types.h"
	24
	25	#include <iconv.h>
	26	#include <string.h>
	27	#include <errno.h>
	28	#include <langinfo.h>
	29
	30	#include "mem.h"
	31	#include "log.h"
	32	#include "charset.h"
	33	#include "configuration.h"
	34	#include "utf8.h"
	35	#include "vector.h"
	36
	/** @brief Low-level conversion routine
	 * @param from Source encoding
	 * @param to Destination encoding
	 * @param ptr First byte to convert
	 * @param n Number of bytes to convert
	 * @return Converted text, or NULL on error
	 *
	 * The result is 0-terminated provided the input's terminator is included in
	 * @p n, which is what every caller in this file arranges by passing
	 * strlen() + 1.
	 *
	 * Failure to open the conversion descriptor is reported via fatal();
	 * a conversion failure is reported via error() and yields NULL.
	 */
	static void *convert(const char *from, const char *to,
	                     const void *ptr, size_t n) {
	  iconv_t i;
	  size_t len;
	  char *buf = 0, *s, *d;
	  size_t bufsize = 0, sl, dl;
	  int saved_errno;

	  if((i = iconv_open(to, from)) == (iconv_t)-1)
	    fatal(errno, "error calling iconv_open");
	  do {
	    /* Start with 32 bytes and double whenever the output did not fit */
	    bufsize = bufsize ? 2 * bufsize : 32;
	    buf = xrealloc_noptr(buf, bufsize);
	    /* Each attempt converts the whole input from scratch, so put the
	     * descriptor back into its initial state first */
	    iconv(i, 0, 0, 0, 0);
	    s = (char *)ptr;
	    sl = n;
	    d = buf;
	    dl = bufsize;
	    /* (void *) to work around FreeBSD's nonstandard iconv prototype */
	    len = iconv(i, (void *)&s, &sl, &d, &dl);
	    /* Capture errno straight away: iconv_close() below may overwrite it */
	    saved_errno = errno;
	  } while(len == (size_t)-1 && saved_errno == E2BIG);
	  iconv_close(i);
	  if(len == (size_t)-1) {
	    error(saved_errno, "error converting from %s to %s", from, to);
	    return 0;
	  }
	  return buf;
	}
	71
	72	/** @brief Convert UTF-8 to UCS-4
	73	* @param mb Pointer to 0-terminated UTF-8 string
	74	* @return Pointer to 0-terminated UCS-4 string
	75	*
	76	* Not everybody's iconv supports UCS-4, and it's inconvenient to have to know
	77	* our endianness, and it's easy to convert it ourselves, so we do. See also
	78	* @ref ucs42utf8().
	79	*/
	80	uint32_t utf82ucs4(const char mb) {
	81	struct dynstr_ucs4 d;
	82	uint32_t c;
	83
	84	dynstr_ucs4_init(&d);
	85	while(*mb) {
	86	PARSE_UTF8(mb, c,
	87	error(0, "invalid UTF-8 sequence"); return 0;);
	88	dynstr_ucs4_append(&d, c);
	89	}
	90	dynstr_ucs4_terminate(&d);
	91	return d.vec;
	92	}
	93
	/** @brief Convert one UCS-4 character to UTF-8
	 * @param c Character to convert
	 * @param d Dynamic string to append UTF-8 sequence to
	 * @return 0 on success, -1 on error
	 *
	 * Values of 0x110000 and above are rejected with an error message and
	 * nothing is appended to @p d.
	 */
	int one_ucs42utf8(uint32_t c, struct dynstr *d) {
	  if(c < 0x80)
	    /* 1 byte: 0xxxxxxx */
	    dynstr_append(d, c);
	  else if(c < 0x800) {
	    /* 2 bytes: 110xxxxx 10xxxxxx */
	    dynstr_append(d, 0xC0 | (c >> 6));
	    dynstr_append(d, 0x80 | (c & 0x3F));
	  } else if(c < 0x10000) {
	    /* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
	    dynstr_append(d, 0xE0 | (c >> 12));
	    dynstr_append(d, 0x80 | ((c >> 6) & 0x3F));
	    dynstr_append(d, 0x80 | (c & 0x3F));
	  } else if(c < 0x110000) {
	    /* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
	    dynstr_append(d, 0xF0 | (c >> 18));
	    dynstr_append(d, 0x80 | ((c >> 12) & 0x3F));
	    dynstr_append(d, 0x80 | ((c >> 6) & 0x3F));
	    dynstr_append(d, 0x80 | (c & 0x3F));
	  } else {
	    error(0, "invalid UCS-4 character %#"PRIx32, c);
	    return -1;
	  }
	  return 0;
	}
	120
	121	/** @brief Convert UCS-4 to UTF-8
	122	* @param u Pointer to 0-terminated UCS-4 string
	123	* @return Pointer to 0-terminated UTF-8 string
	124	*
	125	* See @ref utf82ucs4().
	126	*/
	127	char ucs42utf8(const uint32_t u) {
	128	struct dynstr d;
	129	uint32_t c;
	130
	131	dynstr_init(&d);
	132	while((c = *u++)) {
	133	if(one_ucs42utf8(c, &d))
	134	return 0;
	135	}
	136	dynstr_terminate(&d);
	137	return d.vec;
	138	}
	139
	/** @brief Convert from the local multibyte encoding to UTF-8
	 * @param mb 0-terminated string in the encoding named by nl_langinfo(CODESET)
	 * @return Converted 0-terminated string, or NULL on error
	 */
	char *mb2utf8(const char *mb) {
	  /* +1 so that the terminator is converted along with the text */
	  return convert(nl_langinfo(CODESET), "UTF-8", mb, strlen(mb) + 1);
	}
	144
	/** @brief Convert from UTF-8 to the local multibyte encoding
	 * @param utf8 0-terminated UTF-8 string
	 * @return 0-terminated string in the encoding named by nl_langinfo(CODESET),
	 * or NULL on error
	 */
	char *utf82mb(const char *utf8) {
	  /* +1 so that the terminator is converted along with the text */
	  return convert("UTF-8", nl_langinfo(CODESET), utf8, strlen(utf8) + 1);
	}
	149
	/** @brief Convert from encoding @p from to UTF-8
	 * @param from Name of the source encoding
	 * @param any 0-terminated string in encoding @p from
	 * @return Converted 0-terminated UTF-8 string, or NULL on error
	 */
	char *any2utf8(const char *from, const char *any) {
	  /* +1 so that the terminator is converted along with the text */
	  return convert(from, "UTF-8", any, strlen(any) + 1);
	}
	154
	/** @brief Convert from encoding @p from to the local multibyte encoding
	 * @param from Name of the source encoding, or NULL
	 * @param any 0-terminated string to convert
	 * @return Converted 0-terminated string, or NULL on conversion error
	 *
	 * If @p from is NULL then no conversion is performed and a copy of @p any
	 * is returned.
	 */
	char *any2mb(const char *from, const char *any) {
	  if(from) return convert(from, nl_langinfo(CODESET), any, strlen(any) + 1);
	  else return xstrdup(any);
	}
	160
	/** @brief Convert from encoding @p from to encoding @p to
	 * @param from Name of the source encoding, or NULL
	 * @param to Name of the destination encoding, or NULL
	 * @param any 0-terminated string to convert
	 * @return Converted 0-terminated string, or NULL on conversion error
	 *
	 * If both @p from and @p to are NULL then no conversion is performed and a
	 * copy of @p any is returned.  If exactly one of them is NULL it is taken to
	 * mean the local multibyte encoding (consistent with any2mb(), which treats
	 * a NULL source encoding as "already local"), so that a NULL encoding name
	 * is never handed to iconv_open().
	 */
	char *any2any(const char *from,
	              const char *to,
	              const char *any) {
	  if(!from && !to)
	    return xstrdup(any);
	  /* At most one of the two is NULL here; substitute the locale's codeset */
	  if(!from)
	    from = nl_langinfo(CODESET);
	  else if(!to)
	    to = nl_langinfo(CODESET);
	  return convert(from, to, any, strlen(any) + 1);
	}
	168
	/** @brief strcmp workalike for UCS-4 strings
	 * @param a First 0-terminated UCS-4 string
	 * @param b Second 0-terminated UCS-4 string
	 * @return -1, 0 or 1 according to whether @p a sorts before, equal to or
	 * after @p b
	 *
	 * Characters are compared as unsigned 32-bit values.
	 * We don't rely on the local @c wchar_t being UCS-4.
	 */
	int ucs4cmp(const uint32_t *a, const uint32_t *b) {
	  /* Skip the common prefix, stopping at the first difference or at the
	   * end of either string */
	  while(*a && *b && *a == *b) ++a, ++b;
	  if(*a > *b) return 1;
	  else if(*a < *b) return -1;
	  else return 0;
	}
	179
	180	/*
	181	Local Variables:
	182	c-basic-offset:2
	183	comment-column:40
	184	End:
	185	*/