1 /* Conversion between UTF-8 and UTF-16 - s390 version.
3 This module uses the Z9-109 variants of the Convert Unicode instructions.
5 Copyright (C) 1997-2009 Free Software Foundation, Inc.
7 Author: Andreas Krebbel <Andreas.Krebbel@de.ibm.com>
8 Based on the work by Ulrich Drepper <drepper@cygnus.com>, 1997.
10 Thanks to Daniel Appich who covered the relevant performance work
11 in his diploma thesis.
13 This is free software; you can redistribute it and/or
14 modify it under the terms of the GNU Lesser General Public
15 License as published by the Free Software Foundation; either
16 version 2.1 of the License, or (at your option) any later version.
18 This is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 Lesser General Public License for more details.
23 You should have received a copy of the GNU Lesser General Public
24 License along with the GNU C Library; if not, write to the Free
25 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
31 #include <dl-procinfo.h>
34 /* UTF-16 big endian byte order mark. */
35 #define BOM_UTF16 0xfeff
/* Bytes per character on each side of the conversion.  The FROM side is
   UTF-8 (1 to 4 bytes), the TO side is UTF-16 (2 to 4 bytes).
   FROM_DIRECTION tests the local variable dir, which PREPARE_LOOP
   defines.  */
39 #define MIN_NEEDED_FROM 1
40 #define MAX_NEEDED_FROM 4
41 #define MIN_NEEDED_TO 2
42 #define MAX_NEEDED_TO 4
43 #define FROM_LOOP from_utf8_loop
44 #define TO_LOOP to_utf8_loop
45 #define FROM_DIRECTION (dir == from_utf8)
/* Per-call setup for the conversion loops.  Fetches the direction and the
   BOM flag stored in step->__data.  When emit_bom is set, the call is not
   flagged as internal use and the invocation counter is still zero, the
   UTF-16 byte order mark is written to the output; if fewer than two
   bytes of output space remain, __GCONV_FULL_OUTPUT is returned
   instead.  */
46 #define PREPARE_LOOP \
47 enum direction dir = ((struct utf8_data *) step->__data)->dir; \
48 int emit_bom = ((struct utf8_data *) step->__data)->emit_bom; \
50 if (emit_bom && !data->__internal_use \
51 && data->__invocation_counter == 0) \
53 /* Emit the UTF-16 Byte Order Mark. */ \
54 if (__builtin_expect (outbuf + 2 > outend, 0)) \
55 return __GCONV_FULL_OUTPUT; \
57 put16u (outbuf, BOM_UTF16); \
61 /* Direction of the transformation. */
/* Module initialisation.  Works out the conversion direction from the
   step's charset names (compared with __strcasecmp) and attaches a newly
   allocated utf8_data record to step->__data.  A BOM is requested only
   when the target name compares equal to "UTF-16//".  result starts as
   __GCONV_NOCONV for unsupported name pairs and becomes __GCONV_NOMEM
   when the allocation fails; the success value is assigned on lines not
   visible in this excerpt.  */
76 extern int gconv_init (struct __gconv_step *step);
78 gconv_init (struct __gconv_step *step)
80 /* Determine which direction. */
81 struct utf8_data *new_data;
82 enum direction dir = illegal_dir;
86 emit_bom = (__strcasecmp (step->__to_name, "UTF-16//") == 0);
88 if (__strcasecmp (step->__from_name, "ISO-10646/UTF8/") == 0
89 && (__strcasecmp (step->__to_name, "UTF-16//") == 0
90 || __strcasecmp (step->__to_name, "UTF-16BE//") == 0))
94 else if (__strcasecmp (step->__from_name, "UTF-16BE//") == 0
95 && __strcasecmp (step->__to_name, "ISO-10646/UTF8/") == 0)
100 result = __GCONV_NOCONV;
101 if (dir != illegal_dir)
103 new_data = (struct utf8_data *) malloc (sizeof (struct utf8_data));
105 result = __GCONV_NOMEM;
106 if (new_data != NULL)
109 new_data->emit_bom = emit_bom;
110 step->__data = new_data;
/* Advertise the real per-character bounds.  The maxima must come from
   the MAX_NEEDED_* macros (4 bytes on either side); using the
   MIN_NEEDED_* values here would under-report them as 1 and 2.  */
112 if (dir == from_utf8)
114 step->__min_needed_from = MIN_NEEDED_FROM;
115 step->__max_needed_from = MAX_NEEDED_FROM;
116 step->__min_needed_to = MIN_NEEDED_TO;
117 step->__max_needed_to = MAX_NEEDED_TO;
121 step->__min_needed_from = MIN_NEEDED_TO;
122 step->__max_needed_from = MAX_NEEDED_TO;
123 step->__min_needed_to = MIN_NEEDED_FROM;
124 step->__max_needed_to = MAX_NEEDED_FROM;
127 step->__stateful = 0;
137 extern void gconv_end (struct __gconv_step *data);
139 gconv_end (struct __gconv_step *data)
144 /* The macro for the hardware loop. This is used for both
   directions.  INSTRUCTION is the instruction text spliced into the
   inline asm.  The input and output pointers and their remaining
   lengths are pinned to registers 8 to 11 and passed to the asm as
   read-write operands together with cc.  Depending on the outcome,
   result is set to __GCONV_FULL_OUTPUT or __GCONV_ILLEGAL_INPUT; the
   code that selects between them is on lines not visible in this
   excerpt.  */
146 #define HARDWARE_CONVERT(INSTRUCTION) \
148 register const unsigned char* pInput asm ("8") = inptr; \
149 register unsigned long long inlen asm ("9") = inend - inptr; \
150 register unsigned char* pOutput asm ("10") = outptr; \
151 register unsigned long long outlen asm("11") = outend - outptr; \
154 asm volatile ("0: " INSTRUCTION " \n\t" \
157 : "+a" (pOutput), "+a" (pInput), "+d" (cc), \
158 "+d" (outlen), "+d" (inlen) \
168 result = __GCONV_FULL_OUTPUT; \
173 result = __GCONV_ILLEGAL_INPUT; \
178 /* Conversion function from UTF-8 to UTF-16. */
180 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
181 #define MAX_NEEDED_INPUT MAX_NEEDED_FROM
182 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
183 #define LOOPFCT FROM_LOOP
184 /* The software implementation is based on the code in gconv_simple.c.
   When the HWCAP_S390_ETF3EH bit is set in dl_hwcap, the cu12
   instruction is run first through HARDWARE_CONVERT.  If input is left
   over afterwards, the bytes following the first one are scanned for a
   byte that is not a continuation byte (10xxxxxx).  When the scan
   reaches inend the sequence is reported as __GCONV_INCOMPLETE_INPUT;
   otherwise the error handler is invoked for the i bytes scanned.  The
   statement that leaves the scan loop is on a line not visible in this
   excerpt.  */
187 if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \
189 HARDWARE_CONVERT ("cu12 %0, %1, 1"); \
191 if (inptr != inend) \
194 for (i = 1; inptr + i < inend; ++i) \
195 if ((inptr[i] & 0xc0) != 0x80) \
198 if (__builtin_expect (inptr + i == inend, 1)) \
200 result = __GCONV_INCOMPLETE_INPUT; \
203 STANDARD_FROM_LOOP_ERR_HANDLER (i); \
/* Software path.  The lead byte decides how many bytes the character \
   occupies; cnt, tested further down, is assigned on lines not \
   visible in this excerpt.  A lead byte matching none of the patterns \
   is skipped together with its trailing continuation bytes and handed \
   to the error handler. */ \
208 /* Next input byte. */ \
209 uint16_t ch = *inptr; \
211 if (__builtin_expect (ch < 0x80, 1)) \
213 /* One byte sequence. */ \
221 if (ch >= 0xc2 && ch < 0xe0) \
223 /* We expect two bytes. The first byte cannot be 0xc0 \
224 or 0xc1, otherwise the wide character could have been \
225 represented using a single byte. */ \
229 else if (__builtin_expect ((ch & 0xf0) == 0xe0, 1)) \
231 /* We expect three bytes. */ \
235 else if (__builtin_expect ((ch & 0xf8) == 0xf0, 1)) \
237 /* We expect four bytes. */ \
243 /* Search the end of this ill-formed UTF-8 character. This \
244 is the next byte with (x & 0xc0) != 0x80. */ \
248 while (inptr + i < inend \
249 && (*(inptr + i) & 0xc0) == 0x80 \
253 STANDARD_FROM_LOOP_ERR_HANDLER (i); \
/* The sequence announced by the lead byte runs past the end of the \
   input buffer. */ \
256 if (__builtin_expect (inptr + cnt > inend, 0)) \
258 /* We don't have enough input. But before we report \
259 that check that all the bytes are correct. */ \
260 for (i = 1; inptr + i < inend; ++i) \
261 if ((inptr[i] & 0xc0) != 0x80) \
264 if (__builtin_expect (inptr + i == inend, 1)) \
266 result = __GCONV_INCOMPLETE_INPUT; \
/* Bit layout used below (letters as in the per-line comments): \
   UTF-8 in:   11110uvw 10xyefgh 10ijklmn 10opqrst \
   UTF-16 out: 110110ab cdefghij 110111kl mnopqrst \
   with zabcd = uvwxy - 1.  uvwxy == 0 wraps around and uvwxy > 0x10 \
   leaves bit 4 set, so both end up with a non-zero z-bit. \
   NOTE(review): no check of the 10xxxxxx tag on inptr[1..3] is \
   visible before these bytes are used - confirm it happens on lines \
   not shown here. */ \
275 /* For 4 byte UTF-8 chars two UTF-16 chars (high and \
276 low) are needed. */ \
277 uint16_t zabcd, high, low; \
279 if (__builtin_expect (outptr + 4 > outend, 0)) \
281 /* Overflow in the output buffer. */ \
282 result = __GCONV_FULL_OUTPUT; \
286 /* See Principles of Operations cu12. */ \
287 zabcd = (((inptr[0] & 0x7) << 2) | \
288 ((inptr[1] & 0x30) >> 4)) - 1; \
290 /* z-bit must be zero after subtracting 1. */ \
292 STANDARD_FROM_LOOP_ERR_HANDLER (4) \
294 high = (uint16_t)(0xd8 << 8); /* high surrogate id */ \
295 high |= zabcd << 6; /* abcd bits */ \
296 high |= (inptr[1] & 0xf) << 2; /* efgh bits */ \
297 high |= (inptr[2] & 0x30) >> 4; /* ij bits */ \
299 low = (uint16_t)(0xdc << 8); /* low surrogate id */ \
300 low |= ((uint16_t)inptr[2] & 0xc) << 6; /* kl bits */ \
301 low |= (inptr[2] & 0x3) << 6; /* mn bits */ \
302 low |= inptr[3] & 0x3f; /* opqrst bits */ \
304 put16 (outptr, high); \
306 put16 (outptr, low); \
/* Sequences of fewer than four bytes: every byte after the lead byte \
   must carry the 10xxxxxx continuation tag.  How ch is assembled from \
   these bytes is on lines not visible in this excerpt. */ \
313 /* Read the possible remaining bytes. */ \
314 for (i = 1; i < cnt; ++i) \
316 uint16_t byte = inptr[i]; \
318 if ((byte & 0xc0) != 0x80) \
319 /* This is an illegal encoding. */ \
/* NOTE(review): the store below writes ch through a uint16_t pointer \
   in host byte order, while the surrogate path uses put16 - confirm \
   the two are meant to be equivalent on this target. */ \
329 /* Now adjust the pointers and store the result. */ \
330 *((uint16_t *) outptr) = ch; \
331 outptr += sizeof (uint16_t); \
334 #define LOOP_NEED_FLAGS
335 #include <iconv/loop.c>
337 /* Conversion from UTF-16 to UTF-8. */
339 #define MIN_NEEDED_INPUT MIN_NEEDED_TO
340 #define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM
341 #define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM
342 #define LOOPFCT TO_LOOP
343 /* The software routine is based on the functionality of the S/390
344 hardware instruction (cu21) as described in the Principles of Operation. */
348 /* The hardware instruction currently fails to report an error for \
349 isolated low surrogates so we have to disable the instruction \
350 until this gets resolved. */ \
351 if (0) /* (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) */ \
353 HARDWARE_CONVERT ("cu21 %0, %1, 1"); \
354 if (inptr != inend) \
356 /* Check if the third byte is \
357 a valid start of a UTF-16 surrogate. \
   With exactly three bytes left the third byte is inptr[2]; \
   inptr[3] would be the byte at inend, outside the input buffer. \
   The byte must have the form 110111xx, the leading byte of a \
   big-endian low surrogate, for the input to be incomplete rather \
   than invalid. */ \
358 if (inend - inptr == 3 && (inptr[2] & 0xfc) != 0xdc) \
359 STANDARD_TO_LOOP_ERR_HANDLER (3); \
361 result = __GCONV_INCOMPLETE_INPUT; \
/* Encode one UTF-16 code unit by range: up to 0x7f as one byte, up to \
   0x7ff as two bytes, and the remaining values outside the surrogate \
   range 0xd800..0xdfff as three bytes.  Only the payload bits are \
   OR-ed in here; the lead and continuation tag bits are presumably \
   stored on lines not visible in this excerpt. */ \
367 uint16_t c = get16 (inptr); \
369 if (__builtin_expect (c <= 0x007f, 1)) \
371 /* Single byte UTF-8 char. */ \
372 *outptr = c & 0xff; \
375 else if (c >= 0x0080 && c <= 0x07ff) \
377 /* Two byte UTF-8 char. */ \
379 if (__builtin_expect (outptr + 2 > outend, 0)) \
381 /* Overflow in the output buffer. */ \
382 result = __GCONV_FULL_OUTPUT; \
387 outptr[0] |= c >> 6; \
390 outptr[1] |= c & 0x3f; \
394 else if ((c >= 0x0800 && c <= 0xd7ff) || c > 0xdfff) \
396 /* Three byte UTF-8 char. */ \
398 if (__builtin_expect (outptr + 3 > outend, 0)) \
400 /* Overflow in the output buffer. */ \
401 result = __GCONV_FULL_OUTPUT; \
405 outptr[0] |= c >> 12; \
408 outptr[1] |= (c >> 6) & 0x3f; \
411 outptr[2] |= c & 0x3f; \
/* A high surrogate (0xd800..0xdbff) must be followed by a low \
   surrogate (110111xx xxxxxxxx); the pair becomes four UTF-8 bytes. \
   With c = 110110ab cdefghij and low = 110111kl mnopqrst the four \
   output bytes carry uvw, xyefgh, ijklmn and opqrst, where \
   uvwxy = abcd + 1.  Tag bits and pointer adjustments are on lines \
   not visible in this excerpt.  The final error handler call \
   presumably covers the remaining case, a low surrogate without a \
   preceding high one. */ \
415 else if (c >= 0xd800 && c <= 0xdbff) \
417 /* Four byte UTF-8 char. */ \
418 uint16_t low, uvwxy; \
420 if (__builtin_expect (outptr + 4 > outend, 0)) \
422 /* Overflow in the output buffer. */ \
423 result = __GCONV_FULL_OUTPUT; \
427 if (__builtin_expect (inptr + 2 > inend, 0)) \
429 result = __GCONV_INCOMPLETE_INPUT; \
433 low = get16 (inptr); \
435 if ((low & 0xfc00) != 0xdc00) \
438 STANDARD_TO_LOOP_ERR_HANDLER (2); \
440 uvwxy = ((c >> 6) & 0xf) + 1; \
442 outptr[0] |= uvwxy >> 2; \
445 outptr[1] |= (uvwxy << 4) & 0x30; \
446 outptr[1] |= (c >> 2) & 0x0f; \
449 outptr[2] |= (c & 0x03) << 4; \
450 outptr[2] |= (low >> 6) & 0x0f; \
453 outptr[3] |= low & 0x3f; \
459 STANDARD_TO_LOOP_ERR_HANDLER (2); \
463 #define LOOP_NEED_FLAGS
464 #include <iconv/loop.c>
466 #include <iconv/skeleton.c>