chiark / gitweb /
eglibc (2.11.3-4+deb6u3) squeeze-lts; urgency=medium
[eglibc.git] / sysdeps / s390 / s390-64 / utf8-utf16-z9.c
1 /* Conversion between UTF-8 and UTF-16 (big endian).
2
3    This module uses the Z9-109 variants of the Convert Unicode
4    instructions.
5    Copyright (C) 1997-2009 Free Software Foundation, Inc.
6
7    Author: Andreas Krebbel  <Andreas.Krebbel@de.ibm.com>
8    Based on the work by Ulrich Drepper  <drepper@cygnus.com>, 1997.
9
10    Thanks to Daniel Appich who covered the relevant performance work
11    in his diploma thesis.
12
13    This is free software; you can redistribute it and/or
14    modify it under the terms of the GNU Lesser General Public
15    License as published by the Free Software Foundation; either
16    version 2.1 of the License, or (at your option) any later version.
17
18    This is distributed in the hope that it will be useful,
19    but WITHOUT ANY WARRANTY; without even the implied warranty of
20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21    Lesser General Public License for more details.
22
23    You should have received a copy of the GNU Lesser General Public
24    License along with the GNU C Library; if not, write to the Free
25    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
26    02111-1307 USA.  */
27
28 #include <dlfcn.h>
29 #include <stdint.h>
30 #include <unistd.h>
31 #include <dl-procinfo.h>
32 #include <gconv.h>
33
/* Byte order mark put in front of "UTF-16" output (big endian UTF-16).  */
#define BOM_UTF16       0xfeff

/* gconv_init and gconv_end are written out by hand further down, so
   skeleton.c is presumably asked not to generate them.  */
#define DEFINE_INIT             0
#define DEFINE_FINI             0

/* Bytes per character: the "from" side is UTF-8, the "to" side is
   UTF-16.  */
#define MIN_NEEDED_FROM         1
#define MAX_NEEDED_FROM         4
#define MIN_NEEDED_TO           2
#define MAX_NEEDED_TO           4

/* Names of the two loop functions and the test that picks between
   them.  `dir' is the local variable declared by PREPARE_LOOP.  */
#define FROM_LOOP               from_utf8_loop
#define TO_LOOP                 to_utf8_loop
#define FROM_DIRECTION          (dir == from_utf8)

/* Fetch the per-step settings stored by gconv_init and, if requested,
   start the output with the byte order mark.  The mark is written only
   when the invocation counter is still zero and the step is not used
   internally.  If the output buffer has no room for the two bytes the
   enclosing function returns __GCONV_FULL_OUTPUT.  */
#define PREPARE_LOOP                                                    \
  enum direction dir = ((struct utf8_data *) step->__data)->dir;        \
  int emit_bom = ((struct utf8_data *) step->__data)->emit_bom;         \
                                                                        \
  if (data->__invocation_counter == 0                                   \
      && !data->__internal_use                                          \
      && emit_bom)                                                      \
    {                                                                   \
      /* The mark needs two bytes of output.  */                        \
      if (__builtin_expect (outbuf + 2 > outend, 0))                    \
        return __GCONV_FULL_OUTPUT;                                     \
                                                                        \
      put16u (outbuf, BOM_UTF16);                                       \
      outbuf += 2;                                                      \
    }
60
/* Which way a conversion step translates.  illegal_dir is the "not
   decided / not supported" value gconv_init starts from.  */
enum direction
{
  illegal_dir = 0,
  to_utf8 = 1,          /* UTF-16BE -> UTF-8.  */
  from_utf8 = 2         /* UTF-8 -> UTF-16 or UTF-16BE.  */
};

/* Private data of one conversion step; gconv_init allocates it and
   stores it in step->__data, gconv_end frees it.  */
struct utf8_data
{
  /* Direction chosen from the charset names of the step.  */
  enum direction dir;
  /* Nonzero if the output has to start with a byte order mark.  */
  int emit_bom;
};
74
75
76 extern int gconv_init (struct __gconv_step *step);
77 int
78 gconv_init (struct __gconv_step *step)
79 {
80   /* Determine which direction.  */
81   struct utf8_data *new_data;
82   enum direction dir = illegal_dir;
83   int emit_bom;
84   int result;
85
86   emit_bom = (__strcasecmp (step->__to_name, "UTF-16//") == 0);
87
88   if (__strcasecmp (step->__from_name, "ISO-10646/UTF8/") == 0
89       && (__strcasecmp (step->__to_name, "UTF-16//") == 0
90           || __strcasecmp (step->__to_name, "UTF-16BE//") == 0))
91     {
92       dir = from_utf8;
93     }
94   else if (__strcasecmp (step->__from_name, "UTF-16BE//") == 0
95            && __strcasecmp (step->__to_name, "ISO-10646/UTF8/") == 0)
96     {
97       dir = to_utf8;
98     }
99
100   result = __GCONV_NOCONV;
101   if (dir != illegal_dir)
102     {
103       new_data = (struct utf8_data *) malloc (sizeof (struct utf8_data));
104
105       result = __GCONV_NOMEM;
106       if (new_data != NULL)
107         {
108           new_data->dir = dir;
109           new_data->emit_bom = emit_bom;
110           step->__data = new_data;
111
112           if (dir == from_utf8)
113             {
114               step->__min_needed_from = MIN_NEEDED_FROM;
115               step->__max_needed_from = MIN_NEEDED_FROM;
116               step->__min_needed_to = MIN_NEEDED_TO;
117               step->__max_needed_to = MIN_NEEDED_TO;
118             }
119           else
120             {
121               step->__min_needed_from = MIN_NEEDED_TO;
122               step->__max_needed_from = MIN_NEEDED_TO;
123               step->__min_needed_to = MIN_NEEDED_FROM;
124               step->__max_needed_to = MIN_NEEDED_FROM;
125             }
126
127           step->__stateful = 0;
128
129           result = __GCONV_OK;
130         }
131     }
132
133   return result;
134 }
135
136
137 extern void gconv_end (struct __gconv_step *data);
138 void
139 gconv_end (struct __gconv_step *data)
140 {
141   free (data->__data);
142 }
143
/* Hardware conversion loop shared by both directions.

   INSTRUCTION is the assembler text of a Convert Unicode instruction
   using operand %0 for the output and %1 for the input.  The macro
   binds input pointer/length to registers 8/9 and output
   pointer/length to registers 10/11, reruns the instruction for as
   long as the `jo' branch is taken, and then copies the advanced
   pointers back to inptr and outptr.

   The condition code is extracted with `ipm'.  A value of 1 ends the
   enclosing loop with __GCONV_FULL_OUTPUT, a value of 2 ends it with
   __GCONV_ILLEGAL_INPUT; for any other value execution continues
   after the macro.  Must be expanded inside a loop that provides
   inptr, inend, outptr, outend and result.  */
#define HARDWARE_CONVERT(INSTRUCTION)                                   \
  {                                                                     \
    register const unsigned char *src_reg asm ("8") = inptr;            \
    register unsigned long long src_len asm ("9") = inend - inptr;      \
    register unsigned char *dst_reg asm ("10") = outptr;                \
    register unsigned long long dst_len asm ("11") = outend - outptr;   \
    uint64_t cc = 0;                                                    \
                                                                        \
    asm volatile ("0: " INSTRUCTION "  \n\t"                            \
                  "   jo     0b        \n\t"                            \
                  "   ipm    %2        \n"                              \
                  : "+a" (dst_reg), "+a" (src_reg), "+d" (cc),          \
                    "+d" (dst_len), "+d" (src_len)                      \
                  :                                                     \
                  : "cc", "memory");                                    \
                                                                        \
    inptr = src_reg;                                                    \
    outptr = dst_reg;                                                   \
    /* Move the condition code down into the low bits.  */              \
    cc >>= 28;                                                          \
                                                                        \
    if (cc == 1 || cc == 2)                                             \
      {                                                                 \
        result = (cc == 1                                               \
                  ? __GCONV_FULL_OUTPUT : __GCONV_ILLEGAL_INPUT);       \
        break;                                                          \
      }                                                                 \
  }
177
/* Conversion function from UTF-8 to UTF-16.  */

#define MIN_NEEDED_INPUT        MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT        MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT       MIN_NEEDED_TO
#define LOOPFCT                 FROM_LOOP
/* Loop body converting one UTF-8 character (or, with the hardware
   instruction, as much input as possible) to UTF-16.

   The software implementation is based on the code in gconv_simple.c.
   It rejects everything that is not well-formed UTF-8: stray or
   missing continuation bytes, the lead bytes 0xc0/0xc1, overlong
   three and four byte forms, encoded surrogates (U+D800..U+DFFF) and
   values above U+10FFFF.  A character that is merely cut off at the
   end of the input yields __GCONV_INCOMPLETE_INPUT with inptr left at
   its first byte.  */
#define BODY                                                            \
  {                                                                     \
    if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH)                            \
      {                                                                 \
        HARDWARE_CONVERT ("cu12 %0, %1, 1");                            \
                                                                        \
        if (inptr != inend)                                             \
          {                                                             \
            /* Input is left over although no error was reported.       \
               If every remaining byte after the first one is a         \
               continuation byte the character is just incomplete,      \
               otherwise it is malformed.  */                           \
            int i;                                                      \
            for (i = 1; inptr + i < inend; ++i)                         \
              if ((inptr[i] & 0xc0) != 0x80)                            \
                break;                                                  \
                                                                        \
            if (__builtin_expect (inptr + i == inend, 1))               \
              {                                                         \
                result = __GCONV_INCOMPLETE_INPUT;                      \
                break;                                                  \
              }                                                         \
            STANDARD_FROM_LOOP_ERR_HANDLER (i);                         \
          }                                                             \
        continue;                                                       \
      }                                                                 \
                                                                        \
    /* Next input byte.  */                                             \
    uint16_t ch = *inptr;                                               \
                                                                        \
    if (__builtin_expect (ch < 0x80, 1))                                \
      {                                                                 \
        /* One byte sequence.  */                                       \
        ++inptr;                                                        \
      }                                                                 \
    else                                                                \
      {                                                                 \
        uint_fast32_t cnt;                                              \
        uint_fast32_t i;                                                \
                                                                        \
        if (ch >= 0xc2 && ch < 0xe0)                                    \
          {                                                             \
            /* We expect two bytes.  The first byte cannot be 0xc0      \
               or 0xc1, otherwise the wide character could have been    \
               represented using a single byte.  */                     \
            cnt = 2;                                                    \
            ch &= 0x1f;                                                 \
          }                                                             \
        else if (__builtin_expect ((ch & 0xf0) == 0xe0, 1))             \
          {                                                             \
            /* We expect three bytes.  */                               \
            cnt = 3;                                                    \
            ch &= 0x0f;                                                 \
          }                                                             \
        else if (__builtin_expect ((ch & 0xf8) == 0xf0, 1))             \
          {                                                             \
            /* We expect four bytes.  */                                \
            cnt = 4;                                                    \
            ch &= 0x07;                                                 \
          }                                                             \
        else                                                            \
          {                                                             \
            /* Search the end of this ill-formed UTF-8 character.  This \
               is the next byte with (x & 0xc0) != 0x80.  */            \
            i = 0;                                                      \
            do                                                          \
              ++i;                                                      \
            while (inptr + i < inend                                    \
                   && (*(inptr + i) & 0xc0) == 0x80                     \
                   && i < 5);                                           \
                                                                        \
            /* Common error exit: `i' is the number of input bytes      \
               that make up the rejected sequence.  */                  \
          errout:                                                       \
            STANDARD_FROM_LOOP_ERR_HANDLER (i);                         \
          }                                                             \
                                                                        \
        if (__builtin_expect (inptr + cnt > inend, 0))                  \
          {                                                             \
            /* We don't have enough input.  But before we report        \
               that check that all the bytes are correct.  */           \
            for (i = 1; inptr + i < inend; ++i)                         \
              if ((inptr[i] & 0xc0) != 0x80)                            \
                break;                                                  \
                                                                        \
            if (__builtin_expect (inptr + i == inend, 1))               \
              {                                                         \
                result = __GCONV_INCOMPLETE_INPUT;                      \
                break;                                                  \
              }                                                         \
                                                                        \
            goto errout;                                                \
          }                                                             \
                                                                        \
        if (cnt == 4)                                                   \
          {                                                             \
            /* For 4 byte UTF-8 chars two UTF-16 chars (high and        \
               low) are needed.  */                                     \
            uint16_t zabcd, high, low;                                  \
                                                                        \
            if (__builtin_expect (outptr + 4 > outend, 0))              \
              {                                                         \
                /* Overflow in the output buffer.  */                   \
                result = __GCONV_FULL_OUTPUT;                           \
                break;                                                  \
              }                                                         \
                                                                        \
            /* All three trailing bytes must be continuation bytes.     \
               On failure `i' is the offset of the offending byte.  */  \
            for (i = 1; i < cnt; ++i)                                   \
              if ((inptr[i] & 0xc0) != 0x80)                            \
                goto errout;                                            \
                                                                        \
            /* See Principles of Operations cu12.  */                   \
            zabcd = (((inptr[0] & 0x7) << 2) |                          \
                     ((inptr[1] & 0x30) >> 4)) - 1;                     \
                                                                        \
            /* z-bit must be zero after subtracting 1.  This rejects    \
               overlong forms and values above U+10FFFF.  */            \
            if (zabcd & 0x10)                                           \
              {                                                         \
                STANDARD_FROM_LOOP_ERR_HANDLER (4);                     \
              }                                                         \
                                                                        \
            high = (uint16_t)(0xd8 << 8);       /* high surrogate id */ \
            high |= zabcd << 6;                         /* abcd bits */ \
            high |= (inptr[1] & 0xf) << 2;              /* efgh bits */ \
            high |= (inptr[2] & 0x30) >> 4;               /* ij bits */ \
                                                                        \
            low = (uint16_t)(0xdc << 8);         /* low surrogate id */ \
            low |= ((uint16_t)inptr[2] & 0xc) << 6;       /* kl bits */ \
            low |= (inptr[2] & 0x3) << 6;                 /* mn bits */ \
            low |= inptr[3] & 0x3f;                   /* opqrst bits */ \
                                                                        \
            put16 (outptr, high);                                       \
            outptr += 2;                                                \
            put16 (outptr, low);                                        \
            outptr += 2;                                                \
            inptr += 4;                                                 \
            continue;                                                   \
          }                                                             \
        else                                                            \
          {                                                             \
            /* Read the possible remaining bytes.  */                   \
            for (i = 1; i < cnt; ++i)                                   \
              {                                                         \
                uint16_t byte = inptr[i];                               \
                                                                        \
                if ((byte & 0xc0) != 0x80)                              \
                  /* This is an illegal encoding.  */                   \
                  break;                                                \
                                                                        \
                ch <<= 6;                                               \
                ch |= byte & 0x3f;                                      \
              }                                                         \
                                                                        \
            /* i < cnt means a continuation byte was missing.           \
               Otherwise refuse overlong three byte forms and           \
               encoded surrogates, neither of which may appear in       \
               UTF-8.  */                                               \
            if (i < cnt                                                 \
                || (cnt == 3 && ch < 0x800)                             \
                || (ch >= 0xd800 && ch <= 0xdfff))                      \
              goto errout;                                              \
                                                                        \
            inptr += cnt;                                               \
          }                                                             \
      }                                                                 \
    /* Now adjust the pointers and store the result.  */                \
    put16 (outptr, ch);                                                 \
    outptr += sizeof (uint16_t);                                        \
  }
333
334 #define LOOP_NEED_FLAGS
335 #include <iconv/loop.c>
336
/* Conversion from UTF-16 to UTF-8.  */

#define MIN_NEEDED_INPUT        MIN_NEEDED_TO
/* A surrogate pair is a single character made of two 16 bit units,
   so up to MAX_NEEDED_TO input bytes belong together.  */
#define MAX_NEEDED_INPUT        MAX_NEEDED_TO
#define MIN_NEEDED_OUTPUT       MIN_NEEDED_FROM
#define MAX_NEEDED_OUTPUT       MAX_NEEDED_FROM
#define LOOPFCT                 TO_LOOP
/* Loop body converting one UTF-16 character to UTF-8.

   The software routine is based on the functionality of the S/390
   hardware instruction (cu21) as described in the Principles of
   Operation.  inptr is only advanced once a character has been
   converted completely: on __GCONV_FULL_OUTPUT,
   __GCONV_INCOMPLETE_INPUT and on errors it still points at the first
   unit of the character, so a high surrogate whose partner has not
   arrived yet is not lost.  */
#define BODY                                                            \
  {                                                                     \
    /* The hardware instruction currently fails to report an error for  \
       isolated low surrogates so we have to disable the instruction    \
       until this gets resolved.  */                                    \
    if (0) /* (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) */                  \
      {                                                                 \
        HARDWARE_CONVERT ("cu21 %0, %1, 1");                            \
        if (inptr != inend)                                             \
          {                                                             \
            /* Check if the third byte is                               \
               a valid start of a UTF-16 surrogate.  With three bytes   \
               left that byte is inptr[2]; inptr[3] would already be    \
               outside the input.  */                                   \
            if (inend - inptr == 3 && (inptr[2] & 0xfc) != 0xdc)        \
              STANDARD_TO_LOOP_ERR_HANDLER (3);                         \
                                                                        \
            result = __GCONV_INCOMPLETE_INPUT;                          \
            break;                                                      \
          }                                                             \
        continue;                                                       \
      }                                                                 \
                                                                        \
    uint16_t c = get16 (inptr);                                         \
                                                                        \
    if (__builtin_expect (c <= 0x007f, 1))                              \
      {                                                                 \
        /* Single byte UTF-8 char.  */                                  \
        *outptr = c & 0xff;                                             \
        outptr++;                                                       \
      }                                                                 \
    else if (c >= 0x0080 && c <= 0x07ff)                                \
      {                                                                 \
        /* Two byte UTF-8 char.  */                                     \
                                                                        \
        if (__builtin_expect (outptr + 2 > outend, 0))                  \
          {                                                             \
            /* Overflow in the output buffer.  */                       \
            result = __GCONV_FULL_OUTPUT;                               \
            break;                                                      \
          }                                                             \
                                                                        \
        outptr[0] = 0xc0;                                               \
        outptr[0] |= c >> 6;                                            \
                                                                        \
        outptr[1] = 0x80;                                               \
        outptr[1] |= c & 0x3f;                                          \
                                                                        \
        outptr += 2;                                                    \
      }                                                                 \
    else if ((c >= 0x0800 && c <= 0xd7ff) || c > 0xdfff)                \
      {                                                                 \
        /* Three byte UTF-8 char.  */                                   \
                                                                        \
        if (__builtin_expect (outptr + 3 > outend, 0))                  \
          {                                                             \
            /* Overflow in the output buffer.  */                       \
            result = __GCONV_FULL_OUTPUT;                               \
            break;                                                      \
          }                                                             \
        outptr[0] = 0xe0;                                               \
        outptr[0] |= c >> 12;                                           \
                                                                        \
        outptr[1] = 0x80;                                               \
        outptr[1] |= (c >> 6) & 0x3f;                                   \
                                                                        \
        outptr[2] = 0x80;                                               \
        outptr[2] |= c & 0x3f;                                          \
                                                                        \
        outptr += 3;                                                    \
      }                                                                 \
    else if (c >= 0xd800 && c <= 0xdbff)                                \
      {                                                                 \
        /* Four byte UTF-8 char.  */                                    \
        uint16_t low, uvwxy;                                            \
                                                                        \
        if (__builtin_expect (outptr + 4 > outend, 0))                  \
          {                                                             \
            /* Overflow in the output buffer.  */                       \
            result = __GCONV_FULL_OUTPUT;                               \
            break;                                                      \
          }                                                             \
        /* The low surrogate must be available as well.  Check this     \
           before touching inptr so that the high surrogate stays       \
           unconsumed when the pair is split across two calls.  */      \
        if (__builtin_expect (inptr + 4 > inend, 0))                    \
          {                                                             \
            result = __GCONV_INCOMPLETE_INPUT;                          \
            break;                                                      \
          }                                                             \
                                                                        \
        low = get16 (inptr + 2);                                        \
                                                                        \
        if ((low & 0xfc00) != 0xdc00)                                   \
          {                                                             \
            /* High surrogate without partner; only the high            \
               surrogate itself is skipped.  */                         \
            STANDARD_TO_LOOP_ERR_HANDLER (2);                           \
          }                                                             \
        uvwxy = ((c >> 6) & 0xf) + 1;                                   \
        outptr[0] = 0xf0;                                               \
        outptr[0] |= uvwxy >> 2;                                        \
                                                                        \
        outptr[1] = 0x80;                                               \
        outptr[1] |= (uvwxy << 4) & 0x30;                               \
        outptr[1] |= (c >> 2) & 0x0f;                                   \
                                                                        \
        outptr[2] = 0x80;                                               \
        outptr[2] |= (c & 0x03) << 4;                                   \
        outptr[2] |= (low >> 6) & 0x0f;                                 \
                                                                        \
        outptr[3] = 0x80;                                               \
        outptr[3] |= low & 0x3f;                                        \
                                                                        \
        outptr += 4;                                                    \
        /* Step over the high surrogate; the low one is accounted       \
           for by the common increment below.  */                       \
        inptr += 2;                                                     \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* Isolated low surrogate.  */                                  \
        STANDARD_TO_LOOP_ERR_HANDLER (2);                               \
      }                                                                 \
    inptr += 2;                                                         \
  }
463 #define LOOP_NEED_FLAGS
464 #include <iconv/loop.c>
465
466 #include <iconv/skeleton.c>