[disorder] / lib / charset.c

/*
 * This file is part of DisOrder.
 * Copyright (C) 2004, 2005 Richard Kettlewell
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 */
/** @file lib/charset.c @brief Character set conversion */

#include <config.h>
#include "types.h"

#include <iconv.h>
#include <string.h>
#include <errno.h>
#include <langinfo.h>

#include "mem.h"
#include "log.h"
#include "charset.h"
#include "configuration.h"
#include "utf8.h"
#include "vector.h"
#include "unidata.h"

/** @brief Low-level conversion routine
 * @param from Source encoding
 * @param to Destination encoding
 * @param ptr First byte to convert
 * @param n Number of bytes to convert
 * @return Converted text, or NULL on error.
 *
 * The output is only 0-terminated if the input bytes include a
 * terminator; the callers in this file pass strlen()+1 bytes for that
 * reason.
 *
 * Failure to open the converter is reported via fatal(); failure to
 * convert the input is reported via error() and yields NULL.
 */
static void *convert(const char *from, const char *to,
		     const void *ptr, size_t n) {
  iconv_t i;
  size_t len;
  char *buf = 0, *s, *d;
  size_t bufsize = 0, sl, dl;
  int err = 0;

  if((i = iconv_open(to, from)) == (iconv_t)-1)
    fatal(errno, "error calling iconv_open");
  /* We don't know the output size in advance, so start small and
   * retry the whole conversion with a doubled buffer each time iconv
   * reports that it ran out of output space. */
  do {
    bufsize = bufsize ? 2 * bufsize : 32;
    buf = xrealloc_noptr(buf, bufsize);
    /* Reset the conversion state before (re)starting from the top */
    iconv(i, 0, 0, 0, 0);
    /* iconv's input pointer is not const-qualified; the input is not
     * written through it by this function */
    s = (char *)ptr;
    sl = n;
    d = buf;
    dl = bufsize;
    /* (void *) to work around FreeBSD's nonstandard iconv prototype */
    len = iconv(i, (void *)&s, &sl, &d, &dl);
    /* Capture errno immediately: later calls (including iconv_close)
     * are allowed to overwrite it */
    err = len == (size_t)-1 ? errno : 0;
  } while(len == (size_t)-1 && err == E2BIG);
  iconv_close(i);
  if(len == (size_t)-1) {
    error(err, "error converting from %s to %s", from, to);
    return 0;
  }
  return buf;
}

/** @brief Convert from the local multibyte encoding to UTF-8
 * @param mb 0-terminated string in the encoding named by nl_langinfo(CODESET)
 * @return Whatever convert() returns: the converted string, or NULL on error
 */
char *mb2utf8(const char *mb) {
  /* Include the terminator so that the output is terminated too */
  const size_t nbytes = strlen(mb) + 1;
  const char *const local = nl_langinfo(CODESET);

  return convert(local, "UTF-8", mb, nbytes);
}

/** @brief Convert from UTF-8 to the local multibyte encoding
 * @param utf8 0-terminated UTF-8 string
 * @return Whatever convert() returns: the converted string, or NULL on error
 *
 * The target encoding is the one named by nl_langinfo(CODESET).
 */
char *utf82mb(const char *utf8) {
  /* Include the terminator so that the output is terminated too */
  const size_t nbytes = strlen(utf8) + 1;
  const char *const local = nl_langinfo(CODESET);

  return convert("UTF-8", local, utf8, nbytes);
}

/** @brief Convert from encoding @p from to UTF-8
 * @param from Name of the source encoding
 * @param any 0-terminated string in encoding @p from
 * @return Whatever convert() returns: the converted string, or NULL on error
 */
char *any2utf8(const char *from, const char *any) {
  /* Include the terminator so that the output is terminated too */
  const size_t nbytes = strlen(any) + 1;

  return convert(from, "UTF-8", any, nbytes);
}

/** @brief Convert from encoding @p from to the local multibyte encoding
 * @param from Name of the source encoding, or NULL for no conversion
 * @param any 0-terminated string to convert
 * @return Copy of @p any if @p from is NULL; otherwise whatever
 * convert() returns (the converted string, or NULL on error)
 */
char *any2mb(const char *from, const char *any) {
  /* No source encoding given: hand back a plain copy */
  if(!from)
    return xstrdup(any);
  /* strlen()+1 so the terminator is converted as well */
  return convert(from, nl_langinfo(CODESET), any, strlen(any) + 1);
}

/** @brief Convert from encoding @p from to encoding @p to
 * @param from Name of the source encoding, or NULL
 * @param to Name of the destination encoding, or NULL
 * @param any 0-terminated string to convert
 * @return Copy of @p any if both encodings are NULL; otherwise whatever
 * convert() returns (the converted string, or NULL on error)
 *
 * If exactly one of @p from and @p to is NULL it is taken to mean the
 * local multibyte encoding (nl_langinfo(CODESET)), in line with
 * any2mb() treating a NULL @p from as "already local".  Previously a
 * NULL pointer was passed straight through to iconv_open() in that
 * case.
 */
char *any2any(const char *from,
	      const char *to,
	      const char *any) {
  /* Neither encoding specified: nothing to convert */
  if(!from && !to)
    return xstrdup(any);
  if(!from || !to) {
    const char *const local = nl_langinfo(CODESET);

    if(!from)
      from = local;
    if(!to)
      to = local;
  }
  /* strlen()+1 so the terminator is converted as well */
  return convert(from, to, any, strlen(any) + 1);
}

/** @brief Return nonzero if @p c is a combining character
 * @param c Code point to test
 * @return Nonzero if @p c is treated as combining, else 0
 *
 * A code point is treated as combining if its table entry has general
 * category Mn or a nonzero @c ccc field.  Code points outside the
 * table (including negative values) are assumed to be noncombining.
 */
static int combining(int c) {
  /* The lower bound matters: a negative c would otherwise index
   * before the start of the table */
  if(c >= 0 && c < UNICODE_NCHARS) {
    const struct unidata *const ud = &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];

    return ud->general_category == unicode_General_Category_Mn || ud->ccc != 0;
  }
  /* Assume unknown characters are noncombining */
  return 0;
}

/** @brief Truncate a string for display purposes
 * @param s Pointer to UTF-8 string
 * @param max Maximum number of columns
 * @return @p s or truncated string (never NULL)
 *
 * A string of at most @p max glyphs is returned unmodified.  A longer
 * string is cut down to @p max - 3 glyphs followed by "...", in newly
 * allocated memory.  If @p max is less than 3 then the result is just
 * @p max dots (negative values are treated as 0).
 *
 * If @p s cannot be parsed by PARSE_UTF8 it is returned unmodified.
 *
 * We don't correctly support bidi or double-width characters yet, nor
 * locate default grapheme cluster boundaries for saner truncation.
 */
const char *truncate_for_display(const char *s, long max) {
  const char *t = s, *r, *cut = s;
  char *truncated;
  uint32_t c;
  long n = 0, dots, keep;
  size_t prefix;

  /* Work out how many dots fit and how many glyphs we may keep in
   * front of them.  For max >= 3 this is the usual "..." after
   * max - 3 glyphs; for tiny widths there is only room for dots. */
  if(max < 0)
    max = 0;
  dots = max < 3 ? max : 3;
  keep = max - dots;

  /* We need to discover two things: firstly whether the string is
   * longer than @p max glyphs and secondly, if it is, where to cut
   * the string.
   *
   * Combining characters follow their base character (unicode
   * standard 5.0 s2.11), so after each base character we must skip
   * any combining characters before counting the glyph as complete.
   */
  while(*t) {
    /* The third macro argument is presumably run on malformed input,
     * making us return the original string untouched */
    PARSE_UTF8(t, c, return s);
    if(combining(c))
      /* This must be an initial combining character.  We just skip it. */
      continue;
    /* So c must be a base character.  It may be followed by any
     * number of combining characters.  We advance past them. */
    do {
      r = t;
      PARSE_UTF8(t, c, return s);
    } while(combining(c));
    /* Last character wasn't a combining character so back up */
    t = r;
    ++n;
    /* So now there are N glyphs before position T.  We might
     * therefore have reached the cut position.  (If keep is 0 the cut
     * position is the start of the string, which is cut's initial
     * value.) */
    if(n == keep)
      cut = t;
  }
  /* If the string fits in max columns we return it unmodified */
  if(n <= max)
    return s;
  /* n > max >= keep, so cut is either s or was set in the loop */
  prefix = (size_t)(cut - s);
  truncated = xmalloc_noptr(prefix + (size_t)dots + 1);
  memcpy(truncated, s, prefix);
  memset(truncated + prefix, '.', (size_t)dots);
  truncated[prefix + (size_t)dots] = 0;
  return truncated;
}

/*
Local Variables:
c-basic-offset:2
comment-column:40
End:
*/
Commit	Line	Data
460b9539	1	/*
	2	* This file is part of DisOrder.
	3	* Copyright (C) 2004, 2005 Richard Kettlewell
	4	*
	5	* This program is free software; you can redistribute it and/or modify
	6	* it under the terms of the GNU General Public License as published by
	7	* the Free Software Foundation; either version 2 of the License, or
	8	* (at your option) any later version.
	9	*
	10	* This program is distributed in the hope that it will be useful, but
	11	* WITHOUT ANY WARRANTY; without even the implied warranty of
	12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	* General Public License for more details.
	14	*
	15	* You should have received a copy of the GNU General Public License
	16	* along with this program; if not, write to the Free Software
	17	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
	18	* USA
	19	*/
14ad73b9	20	/** @file lib/charset.c @brief Character set conversion */
460b9539	21
	22	#include <config.h>
	23	#include "types.h"
	24
	25	#include <iconv.h>
	26	#include <string.h>
	27	#include <errno.h>
	28	#include <langinfo.h>
	29
	30	#include "mem.h"
	31	#include "log.h"
	32	#include "charset.h"
	33	#include "configuration.h"
	34	#include "utf8.h"
	35	#include "vector.h"
61507e3c	36	#include "unidata.h"
460b9539	37
14ad73b9 RK	38	/** @brief Low-level converstion routine
	39	* @param from Source encoding
	40	* @param to Destination encoding
	41	* @param ptr First byte to convert
	42	* @param n Number of bytes to convert
	43	* @return Converted text, 0-terminated; or NULL on error.
	44	*/
460b9539	45	static void convert(const char from, const char *to,
	46	const void *ptr, size_t n) {
	47	iconv_t i;
	48	size_t len;
	49	char buf = 0, s, *d;
	50	size_t bufsize = 0, sl, dl;
	51
	52	if((i = iconv_open(to, from)) == (iconv_t)-1)
	53	fatal(errno, "error calling iconv_open");
	54	do {
	55	bufsize = bufsize ? 2 * bufsize : 32;
	56	buf = xrealloc_noptr(buf, bufsize);
	57	iconv(i, 0, 0, 0, 0);
	58	s = (char *)ptr;
	59	sl = n;
	60	d = buf;
	61	dl = bufsize;
	62	/* (void ) to work around FreeBSD's nonstandard iconv prototype /
	63	len = iconv(i, (void *)&s, &sl, &d, &dl);
	64	} while(len == (size_t)-1 && errno == E2BIG);
	65	iconv_close(i);
	66	if(len == (size_t)-1) {
	67	error(errno, "error converting from %s to %s", from, to);
	68	return 0;
	69	}
	70	return buf;
	71	}
	72
14ad73b9	73	/** @brief Convert from the local multibyte encoding to UTF-8 */
460b9539	74	char mb2utf8(const char mb) {
	75	return convert(nl_langinfo(CODESET), "UTF-8", mb, strlen(mb) + 1);
	76	}
	77
14ad73b9	78	/** @brief Convert from UTF-8 to the local multibyte encoding */
460b9539	79	char utf82mb(const char utf8) {
	80	return convert("UTF-8", nl_langinfo(CODESET), utf8, strlen(utf8) + 1);
	81	}
	82
14ad73b9	83	/** @brief Convert from encoding @p from to UTF-8 */
460b9539	84	char any2utf8(const char from, const char *any) {
	85	return convert(from, "UTF-8", any, strlen(any) + 1);
	86	}
	87
14ad73b9	88	/** @brief Convert from encoding @p from to the local multibyte encoding */
460b9539	89	char any2mb(const char from, const char *any) {
	90	if(from) return convert(from, nl_langinfo(CODESET), any, strlen(any) + 1);
	91	else return xstrdup(any);
	92	}
	93
14ad73b9	94	/** @brief Convert from encoding @p from to encoding @p to */
460b9539	95	char any2any(const char from,
	96	const char *to,
	97	const char *any) {
	98	if(from \|\| to) return convert(from, to, any, strlen(any) + 1);
	99	else return xstrdup(any);
	100	}
	101
61507e3c RK	102	/** @brief Return nonzero if @p c is a combining character */
	103	static int combining(int c) {
	104	if(c < UNICODE_NCHARS) {
e5a5a138	105	const struct unidata *const ud = &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
61507e3c	106
14523635	107	return ud->general_category == unicode_General_Category_Mn \|\| ud->ccc != 0;
61507e3c RK	108	}
	109	/* Assume unknown characters are noncombining */
	110	return 0;
	111	}
	112
	113	/** @brief Truncate a string for display purposes
	114	* @param s Pointer to UTF-8 string
	115	* @param max Maximum number of columns
	116	* @return @p or truncated string (never NULL)
	117	*
	118	* We don't correctly support bidi or double-width characters yet, nor
	119	* locate default grapheme cluster boundaries for saner truncation.
	120	*/
	121	const char truncate_for_display(const char s, long max) {
	122	const char t = s, r, *cut = 0;
	123	char *truncated;
	124	uint32_t c;
	125	long n = 0;
	126
	127	/* We need to discover two things: firstly whether the string is
	128	* longer than @p max glyphs and secondly if it is not, where to cut
	129	* the string.
	130	*
	131	* Combining characters follow their base character (unicode
	132	* standard 5.0 s2.11), so after each base character we must
	133	*/
	134	while(*t) {
	135	PARSE_UTF8(t, c, return s);
	136	if(combining(c))
	137	/* This must be an initial combining character. We just skip it. */
	138	continue;
	139	/* So c must be a base character. It may be followed by any
	140	* number of combining characters. We advance past them. */
	141	do {
	142	r = t;
	143	PARSE_UTF8(t, c, return s);
	144	} while(combining(c));
	145	/* Last character wasn't a combining character so back up */
	146	t = r;
	147	++n;
	148	/* So now there are N glyphs before position T. We might
	149	* therefore have reached the cut position. */
	150	if(n == max - 3)
	151	cut = t;
	152	}
	153	/* If the string is short enough we return it unmodified */
	154	if(n < max)
	155	return s;
	156	truncated = xmalloc_noptr(cut - s + 4);
	157	memcpy(truncated, s, cut - s);
	158	strcpy(truncated + (cut - s), "...");
	159	return truncated;
	160	}
	161
460b9539	162	/*
	163	Local Variables:
	164	c-basic-offset:2
	165	comment-column:40
	166	End:
	167	*/