[disorder] / lib / charset.c

/*
 * This file is part of DisOrder.
 * Copyright (C) 2004, 2005 Richard Kettlewell
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 */
/** @file lib/charset.c @brief Character set conversion */

#include <config.h>
#include "types.h"

#include <iconv.h>
#include <string.h>
#include <errno.h>
#include <langinfo.h>

#include "mem.h"
#include "log.h"
#include "charset.h"
#include "configuration.h"
#include "utf8.h"
#include "vector.h"
#include "unidata.h"

/** @brief Low-level conversion routine
 * @param from Source encoding
 * @param to Destination encoding
 * @param ptr First byte to convert
 * @param n Number of bytes to convert
 * @return Converted text, or NULL on error.
 *
 * Exactly @p n bytes are converted, so the result is only 0-terminated if
 * the input's terminator is included in @p n (the callers in this file pass
 * strlen() + 1 for that reason).
 *
 * Failure to open the conversion descriptor is reported with fatal(); a
 * failed conversion is reported with error() and NULL is returned.
 */
static void *convert(const char *from, const char *to,
		     const void *ptr, size_t n) {
  iconv_t i;
  size_t len;
  char *buf = 0, *s, *d;
  size_t bufsize = 0, sl, dl;
  int err = 0;

  if((i = iconv_open(to, from)) == (iconv_t)-1)
    fatal(errno, "error calling iconv_open");
  do {
    /* Start small and double the output buffer each time iconv() runs out
     * of room; every attempt restarts the conversion from the beginning. */
    bufsize = bufsize ? 2 * bufsize : 32;
    buf = xrealloc_noptr(buf, bufsize);
    /* Put the descriptor back into its initial state before each attempt */
    iconv(i, 0, 0, 0, 0);
    s = (char *)ptr;
    sl = n;
    d = buf;
    dl = bufsize;
    /* (void *) to work around FreeBSD's nonstandard iconv prototype */
    len = iconv(i, (void *)&s, &sl, &d, &dl);
    /* Capture errno immediately: iconv_close() below is allowed to change
     * it, which would make the diagnostic report the wrong error. */
    err = (len == (size_t)-1) ? errno : 0;
  } while(len == (size_t)-1 && err == E2BIG);
  iconv_close(i);
  if(len == (size_t)-1) {
    error(err, "error converting from %s to %s", from, to);
    return 0;
  }
  return buf;
}

/** @brief Convert UTF-8 to UCS-4
 * @param mb Pointer to 0-terminated UTF-8 string
 * @return Pointer to 0-terminated UCS-4 string
 *
 * Not everybody's iconv supports UCS-4, and it's inconvenient to have to know
 * our endianness, and it's easy to convert it ourselves, so we do.  See also
 * @ref ucs42utf8().
 */ 
uint32_t *utf82ucs4(const char *mb) {
  struct dynstr_ucs4 d;
  uint32_t c;

  dynstr_ucs4_init(&d);
  while(*mb) {
    PARSE_UTF8(mb, c,
	       error(0, "invalid UTF-8 sequence"); return 0;);
    dynstr_ucs4_append(&d, c);
  }
  dynstr_ucs4_terminate(&d);
  return d.vec;
}

/** @brief Convert one UCS-4 character to UTF-8
 * @param c Character to convert
 * @param d Dynamic string to append UTF-8 sequence to
 * @return 0 on success, -1 on error
 *
 * Values of 0x110000 and above, and the UTF-16 surrogate code points
 * 0xD800-0xDFFF, have no well-formed UTF-8 encoding; they are reported with
 * error() and nothing is appended to @p d.
 */
int one_ucs42utf8(uint32_t c, struct dynstr *d) {
  /* Reject everything that cannot be encoded before touching d */
  if(c >= 0x110000 || (c >= 0xD800 && c <= 0xDFFF)) {
    error(0, "invalid UCS-4 character %#"PRIx32, c);
    return -1;
  }
  if(c < 0x80)
    /* One byte: 0xxxxxxx */
    dynstr_append(d, c);
  else if(c < 0x800) {
    /* Two bytes: 110xxxxx 10xxxxxx */
    dynstr_append(d, 0xC0 | (c >> 6));
    dynstr_append(d, 0x80 | (c & 0x3F));
  } else if(c < 0x10000) {
    /* Three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
    dynstr_append(d, 0xE0 | (c >> 12));
    dynstr_append(d, 0x80 | ((c >> 6) & 0x3F));
    dynstr_append(d, 0x80 | (c & 0x3F));
  } else {
    /* Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    dynstr_append(d, 0xF0 | (c >> 18));
    dynstr_append(d, 0x80 | ((c >> 12) & 0x3F));
    dynstr_append(d, 0x80 | ((c >> 6) & 0x3F));
    dynstr_append(d, 0x80 | (c & 0x3F));
  }
  return 0;
}

/** @brief Convert UCS-4 to UTF-8
 * @param u Pointer to 0-terminated UCS-4 string
 * @return Pointer to 0-terminated UTF-8 string
 *
 * See @ref utf82ucs4().
 */
char *ucs42utf8(const uint32_t *u) {
  struct dynstr d;
  uint32_t c;

  dynstr_init(&d);
  while((c = *u++)) {
    if(one_ucs42utf8(c, &d))
      return 0;
  }
  dynstr_terminate(&d);
  return d.vec;
}

/** @brief Convert from the local multibyte encoding to UTF-8
 * @param mb Pointer to 0-terminated string in the encoding reported by
 * nl_langinfo(CODESET)
 * @return Whatever convert() returns: the converted string, or 0 on error
 */
char *mb2utf8(const char *mb) {
  /* Include the terminator in the converted byte count */
  const size_t nbytes = strlen(mb) + 1;

  return convert(nl_langinfo(CODESET), "UTF-8", mb, nbytes);
}

/** @brief Convert from UTF-8 to the local multibyte encoding
 * @param utf8 Pointer to 0-terminated UTF-8 string
 * @return Whatever convert() returns: the string in the encoding reported
 * by nl_langinfo(CODESET), or 0 on error
 */
char *utf82mb(const char *utf8) {
  /* Include the terminator in the converted byte count */
  const size_t nbytes = strlen(utf8) + 1;

  return convert("UTF-8", nl_langinfo(CODESET), utf8, nbytes);
}

/** @brief Convert from encoding @p from to UTF-8
 * @param from Name of the source encoding
 * @param any Pointer to 0-terminated string encoded in @p from
 * @return Whatever convert() returns: the UTF-8 string, or 0 on error
 */
char *any2utf8(const char *from, const char *any) {
  /* Include the terminator in the converted byte count */
  const size_t nbytes = strlen(any) + 1;

  return convert(from, "UTF-8", any, nbytes);
}

/** @brief Convert from encoding @p from to the local multibyte encoding
 * @param from Name of the source encoding, or a null pointer
 * @param any Pointer to 0-terminated string to convert
 * @return Converted string (0 if convert() fails); when @p from is null,
 * an unconverted copy of @p any
 */
char *any2mb(const char *from, const char *any) {
  /* No source encoding given: hand back a plain copy */
  if(!from)
    return xstrdup(any);
  return convert(from, nl_langinfo(CODESET), any, strlen(any) + 1);
}

/** @brief Convert from encoding @p from to encoding @p to
 * @param from Name of the source encoding, or a null pointer
 * @param to Name of the destination encoding, or a null pointer
 * @param any Pointer to 0-terminated string to convert
 * @return Converted string (0 if convert() fails); when both encodings are
 * null, an unconverted copy of @p any
 *
 * If exactly one of @p from and @p to is null it is replaced by the local
 * multibyte encoding, so that a null pointer is never handed to
 * iconv_open().  NOTE(review): this reading of a null encoding is inferred
 * from any2mb(), which treats a null source as "already in the local
 * encoding" - confirm against callers.
 */
char *any2any(const char *from,
	      const char *to,
	      const char *any) {
  if(!from && !to)
    return xstrdup(any);
  /* At most one of these fires, so a single nl_langinfo() result is live */
  if(!from)
    from = nl_langinfo(CODESET);
  if(!to)
    to = nl_langinfo(CODESET);
  return convert(from, to, any, strlen(any) + 1);
}

/** @brief strcmp workalike for UCS-4 strings
 * @param a Pointer to first 0-terminated UCS-4 string
 * @param b Pointer to second 0-terminated UCS-4 string
 * @return 0 if the strings are equal, 1 if @p a sorts after @p b and -1 if
 * it sorts before
 *
 * Characters are compared by numeric value.  We don't rely on the local
 * @c wchar_t being UCS-4.
 */
int ucs4cmp(const uint32_t *a, const uint32_t *b) {
  /* Skip the common prefix, stopping at the end of either string */
  for(; *a != 0 && *b != 0 && *a == *b; ++a, ++b)
    ;
  /* The first differing position (or the terminators) decides the result */
  if(*a == *b)
    return 0;
  return *a > *b ? 1 : -1;
}

/** @brief Return nonzero if @p c is a combining character
 * @param c Code point to test
 * @return Nonzero if the @c unidata entry for @p c has general category
 * @c unicode_gc_Mn or a nonzero @c ccc field; 0 otherwise, including for
 * code points outside the table
 */
static int combining(int c) {
  /* Callers pass uint32_t values, so anything above INT_MAX arrives here
   * negative; the lower bound keeps those from indexing before the table. */
  if(c >= 0 && c < UNICODE_NCHARS) {
    const struct unidata *const ud = &unidata[c / 256][c % 256];

    return ud->gc == unicode_gc_Mn || ud->ccc != 0;
  }
  /* Assume unknown characters are noncombining */
  return 0;
}

/** @brief Truncate a string for display purposes
 * @param s Pointer to UTF-8 string
 * @param max Maximum number of columns
 * @return @p s or truncated string (never NULL)
 *
 * A string of at most @p max glyphs is returned unmodified, as is a string
 * that fails to parse as UTF-8.  Otherwise the result is a newly allocated
 * string holding the leading glyphs of @p s followed by "...", occupying
 * @p max glyphs in total.  If @p max is less than 3 the ellipsis itself is
 * shortened to fit; if it is not positive the result is the empty string.
 *
 * We don't correctly support bidi or double-width characters yet, nor
 * locate default grapheme cluster boundaries for saner truncation.
 */
const char *truncate_for_display(const char *s, long max) {
  const char *t = s, *r, *cut = s;
  char *truncated;
  uint32_t c;
  long n = 0;
  /* How many dots fit, and how many leading glyphs survive a truncation */
  const long dots = max < 0 ? 0 : (max < 3 ? max : 3);
  const long keep = max > dots ? max - dots : 0;

  /* We need to discover two things: firstly whether the string is
   * longer than @p max glyphs and secondly if it is, where to cut
   * the string.
   *
   * Combining characters follow their base character (unicode
   * standard 5.0 s2.11), so after each base character we must also
   * step over any combining characters attached to it before counting
   * the glyph as complete.
   */
  while(*t) {
    PARSE_UTF8(t, c, return s);
    if(combining(c))
      /* This must be an initial combining character.  We just skip it. */
      continue;
    /* So c must be a base character.  It may be followed by any
     * number of combining characters.  We advance past them. */
    do {
      r = t;
      PARSE_UTF8(t, c, return s);
    } while(combining(c));
    /* Last character wasn't a combining character so back up */
    t = r;
    ++n;
    /* So now there are N glyphs before position T.  We might
     * therefore have reached the cut position.  (When no glyphs are
     * to be kept, CUT simply stays at the start of the string.) */
    if(n == keep)
      cut = t;
  }
  /* If the string fits (exactly max glyphs still fits) return it unmodified */
  if(n <= max)
    return s;
  truncated = xmalloc_noptr(cut - s + dots + 1);
  memcpy(truncated, s, cut - s);
  memcpy(truncated + (cut - s), "...", dots);
  truncated[cut - s + dots] = 0;
  return truncated;
}

/*
Local Variables:
c-basic-offset:2
comment-column:40
End:
*/
Commit	Line	Data
460b9539	1	/*
	2	* This file is part of DisOrder.
	3	* Copyright (C) 2004, 2005 Richard Kettlewell
	4	*
	5	* This program is free software; you can redistribute it and/or modify
	6	* it under the terms of the GNU General Public License as published by
	7	* the Free Software Foundation; either version 2 of the License, or
	8	* (at your option) any later version.
	9	*
	10	* This program is distributed in the hope that it will be useful, but
	11	* WITHOUT ANY WARRANTY; without even the implied warranty of
	12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	* General Public License for more details.
	14	*
	15	* You should have received a copy of the GNU General Public License
	16	* along with this program; if not, write to the Free Software
	17	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
	18	* USA
	19	*/
14ad73b9	20	/** @file lib/charset.c @brief Character set conversion */
460b9539	21
	22	#include <config.h>
	23	#include "types.h"
	24
	25	#include <iconv.h>
	26	#include <string.h>
	27	#include <errno.h>
	28	#include <langinfo.h>
	29
	30	#include "mem.h"
	31	#include "log.h"
	32	#include "charset.h"
	33	#include "configuration.h"
	34	#include "utf8.h"
	35	#include "vector.h"
61507e3c	36	#include "unidata.h"
460b9539	37
14ad73b9 RK	38	/** @brief Low-level converstion routine
	39	* @param from Source encoding
	40	* @param to Destination encoding
	41	* @param ptr First byte to convert
	42	* @param n Number of bytes to convert
	43	* @return Converted text, 0-terminated; or NULL on error.
	44	*/
460b9539	45	static void convert(const char from, const char *to,
	46	const void *ptr, size_t n) {
	47	iconv_t i;
	48	size_t len;
	49	char buf = 0, s, *d;
	50	size_t bufsize = 0, sl, dl;
	51
	52	if((i = iconv_open(to, from)) == (iconv_t)-1)
	53	fatal(errno, "error calling iconv_open");
	54	do {
	55	bufsize = bufsize ? 2 * bufsize : 32;
	56	buf = xrealloc_noptr(buf, bufsize);
	57	iconv(i, 0, 0, 0, 0);
	58	s = (char *)ptr;
	59	sl = n;
	60	d = buf;
	61	dl = bufsize;
	62	/* (void ) to work around FreeBSD's nonstandard iconv prototype /
	63	len = iconv(i, (void *)&s, &sl, &d, &dl);
	64	} while(len == (size_t)-1 && errno == E2BIG);
	65	iconv_close(i);
	66	if(len == (size_t)-1) {
	67	error(errno, "error converting from %s to %s", from, to);
	68	return 0;
	69	}
	70	return buf;
	71	}
	72
14ad73b9 RK	73	/** @brief Convert UTF-8 to UCS-4
	74	* @param mb Pointer to 0-terminated UTF-8 string
	75	* @return Pointer to 0-terminated UCS-4 string
	76	*
	77	* Not everybody's iconv supports UCS-4, and it's inconvenient to have to know
	78	* our endianness, and it's easy to convert it ourselves, so we do. See also
	79	* @ref ucs42utf8().
	80	*/
460b9539	81	uint32_t utf82ucs4(const char mb) {
	82	struct dynstr_ucs4 d;
	83	uint32_t c;
	84
	85	dynstr_ucs4_init(&d);
	86	while(*mb) {
	87	PARSE_UTF8(mb, c,
	88	error(0, "invalid UTF-8 sequence"); return 0;);
	89	dynstr_ucs4_append(&d, c);
	90	}
	91	dynstr_ucs4_terminate(&d);
	92	return d.vec;
	93	}
	94
13affe66 RK	95	/** @brief Convert one UCS-4 character to UTF-8
	96	* @param c Character to convert
	97	* @param d Dynamic string to append UTF-8 sequence to
	98	* @return 0 on success, -1 on error
	99	*/
	100	int one_ucs42utf8(uint32_t c, struct dynstr *d) {
	101	if(c < 0x80)
	102	dynstr_append(d, c);
	103	else if(c < 0x800) {
	104	dynstr_append(d, 0xC0 \| (c >> 6));
	105	dynstr_append(d, 0x80 \| (c & 0x3F));
	106	} else if(c < 0x10000) {
	107	dynstr_append(d, 0xE0 \| (c >> 12));
	108	dynstr_append(d, 0x80 \| ((c >> 6) & 0x3F));
	109	dynstr_append(d, 0x80 \| (c & 0x3F));
	110	} else if(c < 0x110000) {
	111	dynstr_append(d, 0xF0 \| (c >> 18));
	112	dynstr_append(d, 0x80 \| ((c >> 12) & 0x3F));
	113	dynstr_append(d, 0x80 \| ((c >> 6) & 0x3F));
	114	dynstr_append(d, 0x80 \| (c & 0x3F));
	115	} else {
	116	error(0, "invalid UCS-4 character %#"PRIx32, c);
	117	return -1;
	118	}
	119	return 0;
	120	}
	121
14ad73b9 RK	122	/** @brief Convert UCS-4 to UTF-8
	123	* @param u Pointer to 0-terminated UCS-4 string
	124	* @return Pointer to 0-terminated UTF-8 string
	125	*
	126	* See @ref utf82ucs4().
	127	*/
460b9539	128	char ucs42utf8(const uint32_t u) {
	129	struct dynstr d;
	130	uint32_t c;
	131
	132	dynstr_init(&d);
	133	while((c = *u++)) {
13affe66	134	if(one_ucs42utf8(c, &d))
460b9539	135	return 0;
460b9539	136	}
	137	dynstr_terminate(&d);
	138	return d.vec;
	139	}
	140
14ad73b9	141	/** @brief Convert from the local multibyte encoding to UTF-8 */
460b9539	142	char mb2utf8(const char mb) {
	143	return convert(nl_langinfo(CODESET), "UTF-8", mb, strlen(mb) + 1);
	144	}
	145
14ad73b9	146	/** @brief Convert from UTF-8 to the local multibyte encoding */
460b9539	147	char utf82mb(const char utf8) {
	148	return convert("UTF-8", nl_langinfo(CODESET), utf8, strlen(utf8) + 1);
	149	}
	150
14ad73b9	151	/** @brief Convert from encoding @p from to UTF-8 */
460b9539	152	char any2utf8(const char from, const char *any) {
	153	return convert(from, "UTF-8", any, strlen(any) + 1);
	154	}
	155
14ad73b9	156	/** @brief Convert from encoding @p from to the local multibyte encoding */
460b9539	157	char any2mb(const char from, const char *any) {
	158	if(from) return convert(from, nl_langinfo(CODESET), any, strlen(any) + 1);
	159	else return xstrdup(any);
	160	}
	161
14ad73b9	162	/** @brief Convert from encoding @p from to encoding @p to */
460b9539	163	char any2any(const char from,
	164	const char *to,
	165	const char *any) {
	166	if(from \|\| to) return convert(from, to, any, strlen(any) + 1);
	167	else return xstrdup(any);
	168	}
	169
14ad73b9 RK	170	/** @brief strlen workalike for UCS-4 strings
	171	*
	172	* We don't rely on the local @c wchar_t being UCS-4.
	173	*/
460b9539	174	int ucs4cmp(const uint32_t a, const uint32_t b) {
	175	while(a && b && a == b) ++a, ++b;
	176	if(a > b) return 1;
	177	else if(a < b) return -1;
	178	else return 0;
	179	}
	180
61507e3c RK	181	/** @brief Return nonzero if @p c is a combining character */
	182	static int combining(int c) {
	183	if(c < UNICODE_NCHARS) {
	184	const struct unidata *const ud = &unidata[c / 256][c % 256];
	185
	186	return ud->gc == unicode_gc_Mn \|\| ud->ccc != 0;
	187	}
	188	/* Assume unknown characters are noncombining */
	189	return 0;
	190	}
	191
	192	/** @brief Truncate a string for display purposes
	193	* @param s Pointer to UTF-8 string
	194	* @param max Maximum number of columns
	195	* @return @p or truncated string (never NULL)
	196	*
	197	* We don't correctly support bidi or double-width characters yet, nor
	198	* locate default grapheme cluster boundaries for saner truncation.
	199	*/
	200	const char truncate_for_display(const char s, long max) {
	201	const char t = s, r, *cut = 0;
	202	char *truncated;
	203	uint32_t c;
	204	long n = 0;
	205
	206	/* We need to discover two things: firstly whether the string is
	207	* longer than @p max glyphs and secondly if it is not, where to cut
	208	* the string.
	209	*
	210	* Combining characters follow their base character (unicode
	211	* standard 5.0 s2.11), so after each base character we must
	212	*/
	213	while(*t) {
	214	PARSE_UTF8(t, c, return s);
	215	if(combining(c))
	216	/* This must be an initial combining character. We just skip it. */
	217	continue;
	218	/* So c must be a base character. It may be followed by any
	219	* number of combining characters. We advance past them. */
	220	do {
	221	r = t;
	222	PARSE_UTF8(t, c, return s);
	223	} while(combining(c));
	224	/* Last character wasn't a combining character so back up */
	225	t = r;
	226	++n;
	227	/* So now there are N glyphs before position T. We might
	228	* therefore have reached the cut position. */
	229	if(n == max - 3)
	230	cut = t;
	231	}
	232	/* If the string is short enough we return it unmodified */
	233	if(n < max)
	234	return s;
	235	truncated = xmalloc_noptr(cut - s + 4);
	236	memcpy(truncated, s, cut - s);
	237	strcpy(truncated + (cut - s), "...");
	238	return truncated;
	239	}
	240
460b9539	241	/*
	242	Local Variables:
	243	c-basic-offset:2
	244	comment-column:40
	245	End:
	246	*/