chiark - git - mdw - disorder/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* This file is part of DisOrder.
	3	* Copyright (C) 2004, 2005 Richard Kettlewell
	4	*
	5	* This program is free software; you can redistribute it and/or modify
	6	* it under the terms of the GNU General Public License as published by
	7	* the Free Software Foundation; either version 2 of the License, or
	8	* (at your option) any later version.
	9	*
	10	* This program is distributed in the hope that it will be useful, but
	11	* WITHOUT ANY WARRANTY; without even the implied warranty of
	12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	* General Public License for more details.
	14	*
	15	* You should have received a copy of the GNU General Public License
	16	* along with this program; if not, write to the Free Software
	17	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
	18	* USA
	19	*/
	20	/** @file lib/charset.c @brief Character set conversion */
	21
	22	#include <config.h>
	23	#include "types.h"
	24
	25	#include <iconv.h>
	26	#include <string.h>
	27	#include <errno.h>
	28	#include <langinfo.h>
	29
	30	#include "mem.h"
	31	#include "log.h"
	32	#include "charset.h"
	33	#include "configuration.h"
	34	#include "utf8.h"
	35	#include "vector.h"
	36	#include "unidata.h"
	37
	/** @brief Low-level conversion routine
	 * @param from Source encoding
	 * @param to Destination encoding
	 * @param ptr First byte to convert
	 * @param n Number of bytes to convert
	 * @return Converted text, or NULL on error.
	 *
	 * Exactly @p n bytes are converted, so the result is 0-terminated only
	 * if the input range includes a terminator (the callers in this file
	 * pass strlen() + 1 for that reason).
	 *
	 * Failure to open the conversion descriptor is reported via fatal();
	 * a failed conversion is reported via error() and yields NULL.
	 */
	static void *convert(const char *from, const char *to,
	                     const void *ptr, size_t n) {
	  iconv_t i;
	  size_t len;
	  char *buf = 0, *s, *d;
	  size_t bufsize = 0, sl, dl;
	  int err = 0;

	  if((i = iconv_open(to, from)) == (iconv_t)-1)
	    fatal(errno, "error calling iconv_open");
	  do {
	    /* Start with 32 bytes and double the buffer on every retry */
	    bufsize = bufsize ? 2 * bufsize : 32;
	    buf = xrealloc_noptr(buf, bufsize);
	    /* Reset the conversion state, since we restart from the beginning */
	    iconv(i, 0, 0, 0, 0);
	    s = (char *)ptr;
	    sl = n;
	    d = buf;
	    dl = bufsize;
	    /* (void *) to work around FreeBSD's nonstandard iconv prototype */
	    len = iconv(i, (void *)&s, &sl, &d, &dl);
	    /* Capture errno immediately: iconv_close() below may change it */
	    if(len == (size_t)-1)
	      err = errno;
	  } while(len == (size_t)-1 && err == E2BIG);
	  iconv_close(i);
	  if(len == (size_t)-1) {
	    error(err, "error converting from %s to %s", from, to);
	    return 0;
	  }
	  return buf;
	}
	72
	73	/** @brief Convert UTF-8 to UCS-4
	74	* @param mb Pointer to 0-terminated UTF-8 string
	75	* @return Pointer to 0-terminated UCS-4 string
	76	*
	77	* Not everybody's iconv supports UCS-4, and it's inconvenient to have to know
	78	* our endianness, and it's easy to convert it ourselves, so we do. See also
	79	* @ref ucs42utf8().
	80	*/
	81	uint32_t utf82ucs4(const char mb) {
	82	struct dynstr_ucs4 d;
	83	uint32_t c;
	84
	85	dynstr_ucs4_init(&d);
	86	while(*mb) {
	87	PARSE_UTF8(mb, c,
	88	error(0, "invalid UTF-8 sequence"); return 0;);
	89	dynstr_ucs4_append(&d, c);
	90	}
	91	dynstr_ucs4_terminate(&d);
	92	return d.vec;
	93	}
	94
	/** @brief Convert one UCS-4 character to UTF-8
	 * @param c Character to convert
	 * @param d Dynamic string to append UTF-8 sequence to
	 * @return 0 on success, -1 on error
	 *
	 * Characters at or above 0x110000 are rejected and logged via error();
	 * nothing is appended to @p d in that case.
	 */
	int one_ucs42utf8(uint32_t c, struct dynstr *d) {
	  if(c < 0x80)
	    /* 1 byte: 0xxxxxxx */
	    dynstr_append(d, c);
	  else if(c < 0x800) {
	    /* 2 bytes: 110xxxxx 10xxxxxx */
	    dynstr_append(d, 0xC0 | (c >> 6));
	    dynstr_append(d, 0x80 | (c & 0x3F));
	  } else if(c < 0x10000) {
	    /* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
	    dynstr_append(d, 0xE0 | (c >> 12));
	    dynstr_append(d, 0x80 | ((c >> 6) & 0x3F));
	    dynstr_append(d, 0x80 | (c & 0x3F));
	  } else if(c < 0x110000) {
	    /* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
	    dynstr_append(d, 0xF0 | (c >> 18));
	    dynstr_append(d, 0x80 | ((c >> 12) & 0x3F));
	    dynstr_append(d, 0x80 | ((c >> 6) & 0x3F));
	    dynstr_append(d, 0x80 | (c & 0x3F));
	  } else {
	    error(0, "invalid UCS-4 character %#"PRIx32, c);
	    return -1;
	  }
	  return 0;
	}
	121
	122	/** @brief Convert UCS-4 to UTF-8
	123	* @param u Pointer to 0-terminated UCS-4 string
	124	* @return Pointer to 0-terminated UTF-8 string
	125	*
	126	* See @ref utf82ucs4().
	127	*/
	128	char ucs42utf8(const uint32_t u) {
	129	struct dynstr d;
	130	uint32_t c;
	131
	132	dynstr_init(&d);
	133	while((c = *u++)) {
	134	if(one_ucs42utf8(c, &d))
	135	return 0;
	136	}
	137	dynstr_terminate(&d);
	138	return d.vec;
	139	}
	140
	/** @brief Convert from the local multibyte encoding to UTF-8
	 * @param mb Pointer to 0-terminated string in the locale's encoding
	 * @return Result of convert(): the UTF-8 string, or NULL on error
	 */
	char *mb2utf8(const char *mb) {
	  /* + 1 so that the terminator is converted too */
	  return convert(nl_langinfo(CODESET), "UTF-8", mb, strlen(mb) + 1);
	}
	145
	/** @brief Convert from UTF-8 to the local multibyte encoding
	 * @param utf8 Pointer to 0-terminated UTF-8 string
	 * @return Result of convert(): the string in the locale's encoding,
	 * or NULL on error
	 */
	char *utf82mb(const char *utf8) {
	  /* + 1 so that the terminator is converted too */
	  return convert("UTF-8", nl_langinfo(CODESET), utf8, strlen(utf8) + 1);
	}
	150
	/** @brief Convert from encoding @p from to UTF-8
	 * @param from Name of the source encoding
	 * @param any Pointer to 0-terminated string in encoding @p from
	 * @return Result of convert(): the UTF-8 string, or NULL on error
	 */
	char *any2utf8(const char *from, const char *any) {
	  /* + 1 so that the terminator is converted too */
	  return convert(from, "UTF-8", any, strlen(any) + 1);
	}
	155
	/** @brief Convert from encoding @p from to the local multibyte encoding
	 * @param from Name of the source encoding, or NULL
	 * @param any Pointer to 0-terminated string in encoding @p from
	 * @return Result of convert() (NULL on error); if @p from is NULL,
	 * an unconverted copy of @p any made with xstrdup()
	 */
	char *any2mb(const char *from, const char *any) {
	  if(from)
	    /* + 1 so that the terminator is converted too */
	    return convert(from, nl_langinfo(CODESET), any, strlen(any) + 1);
	  else
	    return xstrdup(any);
	}
	161
	/** @brief Convert from encoding @p from to encoding @p to
	 * @param from Name of the source encoding, or NULL for the locale's encoding
	 * @param to Name of the destination encoding, or NULL for the locale's encoding
	 * @param any Pointer to 0-terminated string in encoding @p from
	 * @return Result of convert() (NULL on error); if both @p from and
	 * @p to are NULL, an unconverted copy of @p any made with xstrdup()
	 */
	char *any2any(const char *from,
	              const char *to,
	              const char *any) {
	  if(from || to) {
	    /* Exactly one side may still be NULL here.  Never hand NULL to
	     * iconv_open(); treat it as the locale's encoding, matching what a
	     * NULL encoding means in any2mb(). */
	    if(!from)
	      from = nl_langinfo(CODESET);
	    if(!to)
	      to = nl_langinfo(CODESET);
	    /* + 1 so that the terminator is converted too */
	    return convert(from, to, any, strlen(any) + 1);
	  } else
	    return xstrdup(any);
	}
	169
	/** @brief strcmp workalike for UCS-4 strings
	 * @param a Pointer to first 0-terminated UCS-4 string
	 * @param b Pointer to second 0-terminated UCS-4 string
	 * @return -1, 0 or 1 according to whether @p a sorts before, equal to
	 * or after @p b
	 *
	 * Strings are compared by code point value.  We don't rely on the
	 * local @c wchar_t being UCS-4.
	 */
	int ucs4cmp(const uint32_t *a, const uint32_t *b) {
	  /* Skip the common prefix, stopping at either terminator */
	  while(*a && *b && *a == *b) ++a, ++b;
	  /* Now *a and *b are the first differing characters; a terminator
	   * (0) compares lower than any real character, so a proper prefix
	   * sorts first */
	  if(*a > *b) return 1;
	  else if(*a < *b) return -1;
	  else return 0;
	}
	180
	181	/** @brief Return nonzero if @p c is a combining character */
	182	static int combining(int c) {
	183	if(c < UNICODE_NCHARS) {
	184	const struct unidata *const ud = &unidata[c / 256][c % 256];
	185
	186	return ud->gc == unicode_gc_Mn \|\| ud->ccc != 0;
	187	}
	188	/* Assume unknown characters are noncombining */
	189	return 0;
	190	}
	191
	192	/** @brief Truncate a string for display purposes
	193	* @param s Pointer to UTF-8 string
	194	* @param max Maximum number of columns
	195	* @return @p or truncated string (never NULL)
	196	*
	197	* We don't correctly support bidi or double-width characters yet, nor
	198	* locate default grapheme cluster boundaries for saner truncation.
	199	*/
	200	const char truncate_for_display(const char s, long max) {
	201	const char t = s, r, *cut = 0;
	202	char *truncated;
	203	uint32_t c;
	204	long n = 0;
	205
	206	/* We need to discover two things: firstly whether the string is
	207	* longer than @p max glyphs and secondly if it is not, where to cut
	208	* the string.
	209	*
	210	* Combining characters follow their base character (unicode
	211	* standard 5.0 s2.11), so after each base character we must
	212	*/
	213	while(*t) {
	214	PARSE_UTF8(t, c, return s);
	215	if(combining(c))
	216	/* This must be an initial combining character. We just skip it. */
	217	continue;
	218	/* So c must be a base character. It may be followed by any
	219	* number of combining characters. We advance past them. */
	220	do {
	221	r = t;
	222	PARSE_UTF8(t, c, return s);
	223	} while(combining(c));
	224	/* Last character wasn't a combining character so back up */
	225	t = r;
	226	++n;
	227	/* So now there are N glyphs before position T. We might
	228	* therefore have reached the cut position. */
	229	if(n == max - 3)
	230	cut = t;
	231	}
	232	/* If the string is short enough we return it unmodified */
	233	if(n < max)
	234	return s;
	235	truncated = xmalloc_noptr(cut - s + 4);
	236	memcpy(truncated, s, cut - s);
	237	strcpy(truncated + (cut - s), "...");
	238	return truncated;
	239	}
	240
	241	/*
	242	Local Variables:
	243	c-basic-offset:2
	244	comment-column:40
	245	End:
	246	*/