chiark - git - mdw - disorder/blob - lib/charset.c

   1 /*
   2  * This file is part of DisOrder.
   3  * Copyright (C) 2004, 2005 Richard Kettlewell
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation; either version 2 of the License, or
   8  * (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
  18  * USA
  19  */
  20 /** @file lib/charset.c @brief Character set conversion */
  21
  22 #include <config.h>
  23 #include "types.h"
  24
  25 #include <iconv.h>
  26 #include <string.h>
  27 #include <errno.h>
  28 #include <langinfo.h>
  29
  30 #include "mem.h"
  31 #include "log.h"
  32 #include "charset.h"
  33 #include "configuration.h"
  34 #include "utf8.h"
  35 #include "vector.h"
  36 #include "unidata.h"
  37
  38 /** @brief Low-level converstion routine
  39  * @param from Source encoding
  40  * @param to Destination encoding
  41  * @param ptr First byte to convert
  42  * @param n Number of bytes to convert
  43  * @return Converted text, 0-terminated; or NULL on error.
  44  */
  45 static void *convert(const char *from, const char *to,
  46                      const void *ptr, size_t n) {
  47   iconv_t i;
  48   size_t len;
  49   char *buf = 0, *s, *d;
  50   size_t bufsize = 0, sl, dl;
  51
  52   if((i = iconv_open(to, from)) == (iconv_t)-1)
  53     fatal(errno, "error calling iconv_open");
  54   do {
  55     bufsize = bufsize ? 2 * bufsize : 32;
  56     buf = xrealloc_noptr(buf, bufsize);
  57     iconv(i, 0, 0, 0, 0);
  58     s = (char *)ptr;
  59     sl = n;
  60     d = buf;
  61     dl = bufsize;
  62     /* (void *) to work around FreeBSD's nonstandard iconv prototype */
  63     len = iconv(i, (void *)&s, &sl, &d, &dl);
  64   } while(len == (size_t)-1 && errno == E2BIG);
  65   iconv_close(i);
  66   if(len == (size_t)-1) {
  67     error(errno, "error converting from %s to %s", from, to);
  68     return 0;
  69   }
  70   return buf;
  71 }
  72
  73 /** @brief Convert from the local multibyte encoding to UTF-8 */
  74 char *mb2utf8(const char *mb) {
  75   return convert(nl_langinfo(CODESET), "UTF-8", mb, strlen(mb) + 1);
  76 }
  77
  78 /** @brief Convert from UTF-8 to the local multibyte encoding */
  79 char *utf82mb(const char *utf8) {
  80   return convert("UTF-8", nl_langinfo(CODESET), utf8, strlen(utf8) + 1);
  81 }
  82
  83 /** @brief Convert from encoding @p from to UTF-8 */
  84 char *any2utf8(const char *from, const char *any) {
  85   return convert(from, "UTF-8", any, strlen(any) + 1);
  86 }
  87
  88 /** @brief Convert from encoding @p from to the local multibyte encoding */
  89 char *any2mb(const char *from, const char *any) {
  90   if(from) return convert(from, nl_langinfo(CODESET), any, strlen(any) + 1);
  91   else return xstrdup(any);
  92 }
  93
  94 /** @brief Convert from encoding @p from to encoding @p to */
  95 char *any2any(const char *from,
  96               const char *to,
  97               const char *any) {
  98   if(from || to) return convert(from, to, any, strlen(any) + 1);
  99   else return xstrdup(any);
 100 }
 101
 102 /** @brief Return nonzero if @p c is a combining character */
 103 static int combining(int c) {
 104   if(c < UNICODE_NCHARS) {
 105     const struct unidata *const ud = &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
 106
 107     return ud->general_category == unicode_General_Category_Mn || ud->ccc != 0;
 108   }
 109   /* Assume unknown characters are noncombining */
 110   return 0;
 111 }
 112
 113 /** @brief Truncate a string for display purposes
 114  * @param s Pointer to UTF-8 string
 115  * @param max Maximum number of columns
 116  * @return @p or truncated string (never NULL)
 117  *
 118  * We don't correctly support bidi or double-width characters yet, nor
 119  * locate default grapheme cluster boundaries for saner truncation.
 120  */
 121 const char *truncate_for_display(const char *s, long max) {
 122   const char *t = s, *r, *cut = 0;
 123   char *truncated;
 124   uint32_t c;
 125   long n = 0;
 126
 127   /* We need to discover two things: firstly whether the string is
 128    * longer than @p max glyphs and secondly if it is not, where to cut
 129    * the string.
 130    *
 131    * Combining characters follow their base character (unicode
 132    * standard 5.0 s2.11), so after each base character we must
 133    */
 134   while(*t) {
 135     PARSE_UTF8(t, c, return s);
 136     if(combining(c))
 137       /* This must be an initial combining character.  We just skip it. */
 138       continue;
 139     /* So c must be a base character.  It may be followed by any
 140      * number of combining characters.  We advance past them. */
 141     do {
 142       r = t;
 143       PARSE_UTF8(t, c, return s);
 144     } while(combining(c));
 145     /* Last character wasn't a combining character so back up */
 146     t = r;
 147     ++n;
 148     /* So now there are N glyphs before position T.  We might
 149      * therefore have reached the cut position. */
 150     if(n == max - 3)
 151       cut = t;
 152   }
 153   /* If the string is short enough we return it unmodified */
 154   if(n < max)
 155     return s;
 156   truncated = xmalloc_noptr(cut - s + 4);
 157   memcpy(truncated, s, cut - s);
 158   strcpy(truncated + (cut - s), "...");
 159   return truncated;
 160 }
 161
 162 /*
 163 Local Variables:
 164 c-basic-offset:2
 165 comment-column:40
 166 End:
 167 */