src/utf8.c

   1 /*
   2  * utf8.c:
   3  *
   4  * Copyright (c) 2008 James McKenzie <sympathy@madingley.org>,
   5  * All rights reserved.
   6  *
   7  */
   8
   9 static char rcsid[] = "$Id: utf8.c,v 1.16 2010/07/27 14:49:35 james Exp $";
  10
  11 /*
  12  * $Log: utf8.c,v $
  13  * Revision 1.16  2010/07/27 14:49:35  james
  14  * add support for byte logging
  15  *
  16  * Revision 1.15  2008/03/07 13:16:02  james
  17  * *** empty log message ***
  18  *
  19  * Revision 1.14  2008/03/07 12:37:04  james
  20  * *** empty log message ***
  21  *
  22  * Revision 1.13  2008/03/06 16:49:39  james
  23  * *** empty log message ***
  24  *
  25  * Revision 1.12  2008/03/06 16:49:05  james
  26  * *** empty log message ***
  27  *
  28  * Revision 1.11  2008/03/03 06:04:42  james
  29  * *** empty log message ***
  30  *
  31  * Revision 1.10  2008/03/02 10:37:56  james
  32  * *** empty log message ***
  33  *
  34  * Revision 1.9  2008/02/27 01:31:14  james
  35  * *** empty log message ***
  36  *
  37  * Revision 1.8  2008/02/27 00:54:16  james
  38  * *** empty log message ***
  39  *
  40  * Revision 1.7  2008/02/26 23:56:12  james
  41  * *** empty log message ***
  42  *
  43  * Revision 1.6  2008/02/26 23:23:17  james
  44  * *** empty log message ***
  45  *
  46  * Revision 1.5  2008/02/24 00:42:53  james
  47  * *** empty log message ***
  48  *
  49  * Revision 1.4  2008/02/23 13:05:58  staffcvs
  50  * *** empty log message ***
  51  *
  52  * Revision 1.3  2008/02/23 11:48:37  james
  53  * *** empty log message ***
  54  *
  55  * Revision 1.2  2008/02/22 23:39:27  james
  56  * *** empty log message ***
  57  *
  58  * Revision 1.1  2008/02/22 19:12:05  james
  59  * *** empty log message ***
  60  *
  61  */
  62
  63 #include "project.h"
  64
  65
  66 int
  67 utf8_flush (Context * c)
  68 {
  69   UTF8 *u = c->u;
  70   int i;
  71   int err = 0;
  72
  73   switch (u->utf_ptr) {
  74   case 1:
  75     log_f (c->l, "<invalid utf-8 sequence: \\%03o>", u->utf_buf[0]);
  76     break;
  77   case 2:
  78     log_f (c->l, "<invalid utf-8 sequence: \\%03o \\%03o>",
  79            u->utf_buf[0], u->utf_buf[1]);
  80     break;
  81   case 3:
  82     log_f (c->l, "<invalid utf-8 sequence: \\%03o \\%03o \\%03o>",
  83            u->utf_buf[0], u->utf_buf[1], u->utf_buf[2]);
  84     break;
  85   case 4:
  86     log_f (c->l,
  87            "<invalid utf-8 sequence: \\%03o \\%03o \\%03o \\%03o>",
  88            u->utf_buf[0], u->utf_buf[1], u->utf_buf[2], u->utf_buf[3]);
  89     break;
  90   }
  91
  92   for (i = 0; i < u->utf_ptr; ++i)
  93     err += vt102_parse_char (c, u->utf_buf[i]);
  94
  95   u->utf_ptr = 0;
  96   u->in_utf8 = 0;
  97
  98   return err;
  99
 100 }
 101
 102 int
 103 utf8_parse (Context * c, uint32_t ch)
 104 {
 105   UTF8 *u = c->u;
 106   int err = 0;
 107
 108   if (ch == SYM_CHAR_RESET) {
 109     u->in_utf8 = 0;
 110     err += vt102_parse_char (c, ch);
 111     return err;
 112   }
 113
 114   if (c->l && c->byte_logging) {
 115         uint8_t ch8=(uint8_t) ch;
 116         c->l->log_bytes(c->l,&ch8,1);
 117   }
 118
 119   if (!u->in_utf8) {
 120     /* FIXME: for the moment we bodge utf8 support - need to do */
 121     /* L->R and R->L and double width characters */
 122     if (ch == 0xb9)             // FIXME - OTHER 8 bit control chars
 123     {                           /* CSI, not a valid utf8 start char */
 124       err += vt102_parse_char (c, ch);
 125     } else if ((ch & 0xe0) == 0xc0) { /* Start of two byte unicode sequence */
 126       u->in_utf8 = 1;
 127       u->utf_ptr = 0;
 128       u->utf_buf[u->utf_ptr++] = ch;
 129       u->ch = (ch & 0x1f) << 6;
 130       u->sh = 0;
 131     } else if ((ch & 0xf0) == 0xe0) { /* Start of three byte unicode sequence
 132                                        */
 133       u->in_utf8 = 2;
 134       u->utf_ptr = 0;
 135       u->utf_buf[u->utf_ptr++] = ch;
 136       u->ch = (ch & 0x0f) << 12;
 137       u->sh = 6;
 138     } else if ((ch & 0xf8) == 0xf0) {
 139       u->in_utf8 = 3;
 140       u->utf_ptr = 0;
 141       u->utf_buf[u->utf_ptr++] = ch;
 142       u->ch = (ch & 0x07) << 18;
 143       u->sh = 12;
 144     } else {
 145       err += vt102_parse_char (c, ch);
 146     }
 147   } else {
 148     if ((ch & 0xc0) != 0x80) {
 149       err += utf8_flush (c);
 150       err += vt102_parse_char (c, ch);
 151     } else {
 152       u->utf_buf[u->utf_ptr++] = ch;
 153       u->ch |= (ch & 0x3f) << u->sh;
 154       u->sh -= 6;
 155       u->in_utf8--;
 156
 157       if (!u->in_utf8)
 158         err += vt102_parse_char (c, u->ch);
 159     }
 160   }
 161   return err;
 162 }
 163
 164
 165
 166 UTF8 *
 167 utf8_new (void)
 168 {
 169   UTF8 *ret;
 170
 171   ret = (UTF8 *) xmalloc (sizeof (UTF8));
 172
 173   ret->in_utf8 = 0;
 174
 175 }
 176
 177 int
 178 utf8_encode (char *ptr, int ch)
 179 {
 180
 181   if (ch < 0x80) {
 182     ptr[0] = ch;
 183     return 1;
 184   } else if (ch < 0x800) {
 185     ptr[0] = 0xc0 | (ch >> 6);
 186     ptr[1] = 0x80 | (ch & 0x3f);
 187     return 2;
 188   } else if (ch < 0x10000) {
 189     ptr[0] = 0xe0 | (ch >> 12);
 190     ptr[1] = 0x80 | ((ch >> 6) & 0x3f);
 191     ptr[2] = 0x80 | (ch & 0x3f);
 192     return 3;
 193   } else if (ch < 0x1fffff) {
 194     ptr[0] = 0xf0 | (ch >> 18);
 195     ptr[1] = 0x80 | ((ch >> 12) & 0x3f);
 196     ptr[2] = 0x80 | ((ch >> 6) & 0x3f);
 197     ptr[3] = 0x80 | (ch & 0x3f);
 198     return 4;
 199   }
 200   return 0;
 201 }
 202
 203 int
 204 utf8_emit (TTY * t, int ch)
 205 {
 206   uint8_t buf[4];
 207   int i;
 208   i = utf8_encode (buf, ch);
 209   if (!i)
 210     return 0;
 211
 212   if (t->xmit (t, buf, i) != i)
 213     return -1;
 214   return 0;
 215 }