1 /* utf8conf.c - UTF8 character set conversion
2 * Copyright (C) 1994, 1998, 1999, 2000, 2001, 2003, 2006,
3 * 2008, 2010 Free Software Foundation, Inc.
5 * This file is part of GnuPG.
7 * GnuPG is free software; you can redistribute it and/or modify it
8 * under the terms of either
10 * - the GNU Lesser General Public License as published by the Free
11 * Software Foundation; either version 3 of the License, or (at
12 * your option) any later version.
16 * - the GNU General Public License as published by the Free
17 * Software Foundation; either version 2 of the License, or (at
18 * your option) any later version.
20 * or both in parallel, as here.
22 * GnuPG is distributed in the hope that it will be useful, but
23 * WITHOUT ANY WARRANTY; without even the implied warranty of
24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
25 * General Public License for more details.
27 * You should have received copies of the GNU General Public License
28 * and the GNU Lesser General Public License along with this program;
29 * if not, see <https://www.gnu.org/licenses/>.
37 #ifdef HAVE_LANGINFO_CODESET
43 # /* Tell libgpg-error to provide the iconv macros. */
44 # define GPGRT_ENABLE_W32_ICONV_MACROS 1
45 #elif HAVE_ANDROID_SYSTEM
46 # /* No iconv support. */
53 #include "common-defs.h"
55 #include "stringhelp.h"
62 static const char *active_charset_name = "iso-8859-1";
63 static int no_translation; /* Set to true if we let simply pass through. */
64 static int use_iconv; /* iconv conversion functions required. */
67 #ifdef HAVE_ANDROID_SYSTEM
68 /* Fake stuff to get things building. */
69 typedef void *iconv_t;
73 iconv_open (const char *tocode, const char *fromcode)
81 iconv (iconv_t cd, char **inbuf, size_t *inbytesleft,
82 char **outbuf, size_t *outbytesleft)
93 iconv_close (iconv_t cd)
98 #endif /*HAVE_ANDROID_SYSTEM*/
101 /* Error handler for iconv failures. This is needed to not clutter the
102 output with repeated diagnostics about a missing conversion. */
104 handle_iconv_error (const char *to, const char *from, int use_fallback)
108 static int shown1, shown2;
111 if (to && !strcmp (to, "utf-8"))
123 log_info (_("conversion from '%s' to '%s' not available\n"),
131 log_info (_("iconv_open failed: %s\n"), strerror (errno));
137 /* To avoid further error messages we fall back to UTF-8 for the
138 native encoding. Nowadays this seems to be the best bet in
139 case of errors from iconv or nl_langinfo. */
140 active_charset_name = "utf-8";
149 set_native_charset (const char *newset)
151 const char *full_newset;
155 #ifdef HAVE_ANDROID_SYSTEM
157 #elif defined HAVE_W32_SYSTEM
158 static char codepage[30];
162 /* We are a console program thus we need to use the
163 GetConsoleOutputCP function and not the GetACP which
164 would give the codepage for a GUI program. Note this is not
165 a bulletproof detection because GetConsoleCP might return a
166 different one for console input. Not sure how to cope with
167 that. If the console Code page is not known we fall back to
168 the system code page. */
169 #ifndef HAVE_W32CE_SYSTEM
170 cpno = GetConsoleOutputCP ();
174 sprintf (codepage, "CP%u", cpno );
175 /* Resolve alias. We use a long string and not the usual
176 array to optimize if the code is taken to a DSO. Taken from
179 for (aliases = ("CP936" "\0" "GBK" "\0"
180 "CP1361" "\0" "JOHAB" "\0"
181 "CP20127" "\0" "ASCII" "\0"
182 "CP20866" "\0" "KOI8-R" "\0"
183 "CP21866" "\0" "KOI8-RU" "\0"
184 "CP28591" "\0" "ISO-8859-1" "\0"
185 "CP28592" "\0" "ISO-8859-2" "\0"
186 "CP28593" "\0" "ISO-8859-3" "\0"
187 "CP28594" "\0" "ISO-8859-4" "\0"
188 "CP28595" "\0" "ISO-8859-5" "\0"
189 "CP28596" "\0" "ISO-8859-6" "\0"
190 "CP28597" "\0" "ISO-8859-7" "\0"
191 "CP28598" "\0" "ISO-8859-8" "\0"
192 "CP28599" "\0" "ISO-8859-9" "\0"
193 "CP28605" "\0" "ISO-8859-15" "\0"
194 "CP65001" "\0" "UTF-8" "\0");
196 aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
198 if (!strcmp (codepage, aliases) ||(*aliases == '*' && !aliases[1]))
200 newset = aliases + strlen (aliases) + 1;
205 #else /*!HAVE_W32_SYSTEM && !HAVE_ANDROID_SYSTEM*/
207 #ifdef HAVE_LANGINFO_CODESET
208 newset = nl_langinfo (CODESET);
209 #else /*!HAVE_LANGINFO_CODESET*/
210 /* Try to get the used charset from environment variables. */
211 static char codepage[30];
212 const char *lc, *dot, *mod;
214 strcpy (codepage, "iso-8859-1");
215 lc = getenv ("LC_ALL");
218 lc = getenv ("LC_CTYPE");
220 lc = getenv ("LANG");
224 dot = strchr (lc, '.');
227 mod = strchr (++dot, '@');
229 mod = dot + strlen (dot);
230 if (mod - dot < sizeof codepage && dot != mod)
232 memcpy (codepage, dot, mod - dot);
233 codepage [mod - dot] = 0;
238 #endif /*!HAVE_LANGINFO_CODESET*/
239 #endif /*!HAVE_W32_SYSTEM && !HAVE_ANDROID_SYSTEM*/
242 full_newset = newset;
243 if (strlen (newset) > 3 && !ascii_memcasecmp (newset, "iso", 3))
246 if (*newset == '-' || *newset == '_')
250 /* Note that we silently assume that plain ASCII is actually meant
251 as Latin-1. This makes sense because many Unix systems don't have
252 their locale set up properly and thus would get annoying error
253 messages and we have to handle all the "bug" reports. Latin-1 has
254 traditionally been the character set used for 8 bit characters on
257 || !ascii_strcasecmp (newset, "8859-1" )
258 || !ascii_strcasecmp (newset, "646" )
259 || !ascii_strcasecmp (newset, "ASCII" )
260 || !ascii_strcasecmp (newset, "ANSI_X3.4-1968" )
263 active_charset_name = "iso-8859-1";
267 else if ( !ascii_strcasecmp (newset, "utf8" )
268 || !ascii_strcasecmp(newset, "utf-8") )
270 active_charset_name = "utf-8";
278 cd = iconv_open (full_newset, "utf-8");
279 if (cd == (iconv_t)-1)
281 handle_iconv_error (full_newset, "utf-8", 0);
285 cd = iconv_open ("utf-8", full_newset);
286 if (cd == (iconv_t)-1)
288 handle_iconv_error ("utf-8", full_newset, 0);
292 active_charset_name = full_newset;
300 get_native_charset ()
302 return active_charset_name;
305 /* Return true if the native charset is utf-8. */
307 is_native_utf8 (void)
309 return no_translation;
313 /* Convert string, which is in native encoding to UTF8 and return a
314 newly allocated UTF-8 string. This function terminates the process
315 on memory shortage. */
317 native_to_utf8 (const char *orig_string)
319 const unsigned char *string = (const unsigned char *)orig_string;
320 const unsigned char *s;
327 /* Already utf-8 encoded. */
328 buffer = xstrdup (orig_string);
332 /* For Latin-1 we can avoid the iconv overhead. */
333 for (s = string; *s; s++)
339 buffer = xmalloc (length + 1);
340 for (p = (unsigned char *)buffer, s = string; *s; s++)
344 *p++ = 0xc0 | ((*s >> 6) & 3);
345 *p++ = 0x80 | (*s & 0x3f);
354 /* Need to use iconv. */
358 size_t inbytes, outbytes;
360 cd = iconv_open ("utf-8", active_charset_name);
361 if (cd == (iconv_t)-1)
363 handle_iconv_error ("utf-8", active_charset_name, 1);
364 return native_to_utf8 (string);
367 for (s=string; *s; s++ )
371 length += 5; /* We may need up to 6 bytes for the utf8 output. */
373 buffer = xmalloc (length + 1);
376 inbytes = strlen (string);
379 if ( iconv (cd, (ICONV_CONST char **)&inptr, &inbytes,
380 &outptr, &outbytes) == (size_t)-1)
385 log_info (_("conversion from '%s' to '%s' failed: %s\n"),
386 active_charset_name, "utf-8", strerror (errno));
388 /* We don't do any conversion at all but use the strings as is. */
389 strcpy (buffer, string);
394 /* We could realloc the buffer now but I doubt that it makes
395 much sense given that it will get freed anyway soon
406 do_utf8_to_native (const char *string, size_t length, int delim,
411 unsigned char encbuf[8];
413 const unsigned char *s;
417 unsigned long val = 0;
421 /* First pass (p==NULL): count the extended utf-8 characters. */
422 /* Second pass (p!=NULL): create string. */
425 for (slen = length, nleft = encidx = 0, n = 0,
426 s = (const unsigned char *)string;
432 if (!(*s < 128 || (*s >= 0xc0 && *s <= 0xfd)))
437 sprintf (p, "\\x%02x", *s);
451 && (*s < 0x20 || *s == 0x7f || *s == delim
452 || (delim && *s == '\\')))
459 case '\n': n++; if ( p ) *p++ = 'n'; break;
460 case '\r': n++; if ( p ) *p++ = 'r'; break;
461 case '\f': n++; if ( p ) *p++ = 'f'; break;
462 case '\v': n++; if ( p ) *p++ = 'v'; break;
463 case '\b': n++; if ( p ) *p++ = 'b'; break;
464 case 0: n++; if ( p ) *p++ = '0'; break;
469 sprintf (p, "x%02x", *s);
482 else if ((*s & 0xe0) == 0xc0) /* 110x xxxx */
487 encbuf[encidx++] = *s;
489 else if ((*s & 0xf0) == 0xe0) /* 1110 xxxx */
494 encbuf[encidx++] = *s;
496 else if ((*s & 0xf8) == 0xf0) /* 1111 0xxx */
501 encbuf[encidx++] = *s;
503 else if ((*s & 0xfc) == 0xf8) /* 1111 10xx */
508 encbuf[encidx++] = *s;
510 else if ((*s & 0xfe) == 0xfc) /* 1111 110x */
515 encbuf[encidx++] = *s;
517 else /* Invalid encoding: print as \xNN. */
521 sprintf (p, "\\x%02x", *s);
528 else if (*s < 0x80 || *s >= 0xc0) /* Invalid utf-8 */
532 for (i = 0; i < encidx; i++)
534 sprintf (p, "\\x%02x", encbuf[i]);
537 sprintf (p, "\\x%02x", *s);
547 encbuf[encidx++] = *s;
550 if (!--nleft) /* Ready. */
556 for (i = 0; i < encidx; i++)
564 /* Our strategy for using iconv is a bit strange
565 but it better keeps compatibility with
566 previous versions in regard to how invalid
567 encodings are displayed. What we do is to
568 keep the utf-8 as is and have the real
569 translation step then at the end. Yes, I
570 know that this is ugly. However we are short
571 of the 1.4 release and for this branch we
572 should not mess too much around with iconv
573 things. One reason for this is that we don't
574 know enough about non-GNU iconv
575 implementation and want to minimize the risk
576 of breaking the code on too many platforms. */
579 for (i=0; i < encidx; i++ )
585 else /* Latin-1 case. */
587 if (val >= 0x80 && val < 256)
589 /* We can simply print this character */
596 /* We do not have a translation: print utf8. */
599 for (i = 0; i < encidx; i++)
601 sprintf (p, "\\x%02x", encbuf[i]);
615 /* Allocate the buffer after the first pass. */
616 buffer = p = xmalloc (n + 1);
620 /* Note: See above for comments. */
623 char *outbuf, *outptr;
624 size_t inbytes, outbytes;
626 *p = 0; /* Terminate the buffer. */
628 cd = iconv_open (active_charset_name, "utf-8");
629 if (cd == (iconv_t)-1)
631 handle_iconv_error (active_charset_name, "utf-8", 1);
633 return utf8_to_native (string, length, delim);
636 /* Allocate a new buffer large enough to hold all possible
641 outbytes = n * MB_LEN_MAX;
642 if (outbytes / MB_LEN_MAX != n)
643 BUG (); /* Actually an overflow. */
644 outbuf = outptr = xmalloc (outbytes);
645 if ( iconv (cd, (ICONV_CONST char **)&inptr, &inbytes,
646 &outptr, &outbytes) == (size_t)-1)
651 log_info (_("conversion from '%s' to '%s' failed: %s\n"),
652 "utf-8", active_charset_name, strerror (errno));
654 /* Didn't work out. Try again but without iconv. */
658 outbuf = do_utf8_to_native (string, length, delim, 0);
662 *outptr = 0; /* Make sure it is a string. */
663 /* We could realloc the buffer now but I doubt that it
664 makes much sense given that it will get freed
665 anyway soon after. */
671 else /* Not using iconv. */
673 *p = 0; /* Make sure it is a string. */
679 /* Convert string, which is in UTF-8 to native encoding. Replace
680 illegal encodings by some "\xnn" and quote all control
681 characters. A character with value DELIM will always be quoted, it
682 must be a vanilla ASCII character. A DELIM value of -1 is special:
683 it disables all quoting of control characters. This function
684 terminates the process on memory shortage. */
686 utf8_to_native (const char *string, size_t length, int delim)
688 return do_utf8_to_native (string, length, delim, use_iconv);
694 /* Wrapper function for iconv_open, required for W32 as we dlopen that
695 library on that system. */
697 jnlib_iconv_open (const char *tocode, const char *fromcode)
699 return (jnlib_iconv_t)iconv_open (tocode, fromcode);
703 /* Wrapper function for iconv, required for W32 as we dlopen that
704 library on that system. */
706 jnlib_iconv (jnlib_iconv_t cd,
707 const char **inbuf, size_t *inbytesleft,
708 char **outbuf, size_t *outbytesleft)
710 return iconv ((iconv_t)cd, (ICONV_CONST char**)inbuf, inbytesleft,
711 outbuf, outbytesleft);
714 /* Wrapper function for iconv_close, required for W32 as we dlopen that
715 library on that system. */
717 jnlib_iconv_close (jnlib_iconv_t cd)
719 return iconv_close ((iconv_t)cd);
723 #ifdef HAVE_W32_SYSTEM
724 /* Return a malloced string encoded for CODEPAGE from the wide char input
725 string STRING. Caller must free this value. Returns NULL and sets
726 ERRNO on failure. Calling this function with STRING set to NULL is
729 wchar_to_cp (const wchar_t *string, unsigned int codepage)
734 n = WideCharToMultiByte (codepage, 0, string, -1, NULL, 0, NULL, NULL);
737 gpg_err_set_errno (EINVAL);
741 result = xtrymalloc (n+1);
745 n = WideCharToMultiByte (codepage, 0, string, -1, result, n, NULL, NULL);
749 gpg_err_set_errno (EINVAL);
756 /* Return a malloced wide char string from a CODEPAGE encoded input
757 string STRING. Caller must free this value. Returns NULL and sets
758 ERRNO on failure. Calling this function with STRING set to NULL is
761 cp_to_wchar (const char *string, unsigned int codepage)
767 n = MultiByteToWideChar (codepage, 0, string, -1, NULL, 0);
770 gpg_err_set_errno (EINVAL);
774 nbytes = (size_t)(n+1) * sizeof(*result);
775 if (nbytes / sizeof(*result) != (n+1))
777 gpg_err_set_errno (ENOMEM);
780 result = xtrymalloc (nbytes);
784 n = MultiByteToWideChar (codepage, 0, string, -1, result, n);
788 gpg_err_set_errno (EINVAL);
795 /* Return a malloced string encoded in the active code page from the
796 * wide char input string STRING. Caller must free this value.
797 * Returns NULL and sets ERRNO on failure. Calling this function with
798 * STRING set to NULL is not defined. */
800 wchar_to_native (const wchar_t *string)
802 return wchar_to_cp (string, CP_ACP);
806 /* Return a malloced wide char string from a native (active code page) encoded input
807 * string STRING. Caller must free this value. Returns NULL and sets
808 * ERRNO on failure. Calling this function with STRING set to NULL is
811 native_to_wchar (const char *string)
813 return cp_to_wchar (string, CP_ACP);
817 /* Return a malloced string encoded in UTF-8 from the wide char input
818 * string STRING. Caller must free this value. Returns NULL and sets
819 * ERRNO on failure. Calling this function with STRING set to NULL is
822 wchar_to_utf8 (const wchar_t *string)
824 return wchar_to_cp (string, CP_UTF8);
828 /* Return a malloced wide char string from a UTF-8 encoded input
829 * string STRING. Caller must free this value. Returns NULL and sets
830 * ERRNO on failure. Calling this function with STRING set to NULL is
833 utf8_to_wchar (const char *string)
835 return cp_to_wchar (string, CP_UTF8);
838 #endif /*HAVE_W32_SYSTEM*/