transition various bits of code to unicode.h interfaces

author Richard Kettlewell <rjk@greenend.org.uk>

Sun, 18 Nov 2007 22:23:35 +0000 (22:23 +0000)

committer Richard Kettlewell <rjk@greenend.org.uk>

Sun, 18 Nov 2007 22:23:35 +0000 (22:23 +0000)
author Richard Kettlewell <rjk@greenend.org.uk>
Sun, 18 Nov 2007 22:23:35 +0000 (22:23 +0000)
committer Richard Kettlewell <rjk@greenend.org.uk>
Sun, 18 Nov 2007 22:23:35 +0000 (22:23 +0000)
diff --git a/lib/charset.c b/lib/charset.c

index c763d1070a675cdbf0cf16a9bae9b98e447663f2..c1b07bfcdd89d2a42c93059a0642b0eaa4fc80d6 100644 (file)
--- a/lib/charset.c
+++ b/lib/charset.c
@@ -70,74 +70,6 @@ static void *convert(const char *from, const char *to,
    return buf;
  }
  
-/** @brief Convert UTF-8 to UCS-4
- * @param mb Pointer to 0-terminated UTF-8 string
- * @return Pointer to 0-terminated UCS-4 string
- *
- * Not everybody's iconv supports UCS-4, and it's inconvenient to have to know
- * our endianness, and it's easy to convert it ourselves, so we do.  See also
- * @ref ucs42utf8().
- */ 
-uint32_t *utf82ucs4(const char *mb) {
-  struct dynstr_ucs4 d;
-  uint32_t c;
-
-  dynstr_ucs4_init(&d);
-  while(*mb) {
-    PARSE_UTF8(mb, c,
-              error(0, "invalid UTF-8 sequence"); return 0;);
-    dynstr_ucs4_append(&d, c);
-  }
-  dynstr_ucs4_terminate(&d);
-  return d.vec;
-}
-
-/** @brief Convert one UCS-4 character to UTF-8
- * @param c Character to convert
- * @param d Dynamic string to append UTF-8 sequence to
- * @return 0 on success, -1 on error
- */
-int one_ucs42utf8(uint32_t c, struct dynstr *d) {
-  if(c < 0x80)
-    dynstr_append(d, c);
-  else if(c < 0x800) {
-    dynstr_append(d, 0xC0 | (c >> 6));
-    dynstr_append(d, 0x80 | (c & 0x3F));
-  } else if(c < 0x10000) {
-    dynstr_append(d, 0xE0 | (c >> 12));
-    dynstr_append(d, 0x80 | ((c >> 6) & 0x3F));
-    dynstr_append(d, 0x80 | (c & 0x3F));
-  } else if(c < 0x110000) {
-    dynstr_append(d, 0xF0 | (c >> 18));
-    dynstr_append(d, 0x80 | ((c >> 12) & 0x3F));
-    dynstr_append(d, 0x80 | ((c >> 6) & 0x3F));
-    dynstr_append(d, 0x80 | (c & 0x3F));
-  } else {
-    error(0, "invalid UCS-4 character %#"PRIx32, c);
-    return -1;
-  }
-  return 0;
-}
-
-/** @brief Convert UCS-4 to UTF-8
- * @param u Pointer to 0-terminated UCS-4 string
- * @return Pointer to 0-terminated UTF-8 string
- *
- * See @ref utf82ucs4().
- */
-char *ucs42utf8(const uint32_t *u) {
-  struct dynstr d;
-  uint32_t c;
-
-  dynstr_init(&d);
-  while((c = *u++)) {
-    if(one_ucs42utf8(c, &d))
-      return 0;
-  }
-  dynstr_terminate(&d);
-  return d.vec;
-}
-
  /** @brief Convert from the local multibyte encoding to UTF-8 */
  char *mb2utf8(const char *mb) {
    return convert(nl_langinfo(CODESET), "UTF-8", mb, strlen(mb) + 1);
@@ -167,17 +99,6 @@ char *any2any(const char *from,
    else return xstrdup(any);
  }
  
-/** @brief strcmp workalike for UCS-4 strings
- *
- * We don't rely on the local @c wchar_t being UCS-4.
- */
-int ucs4cmp(const uint32_t *a, const uint32_t *b) {
-  while(*a && *b && *a == *b) ++a, ++b;
-  if(*a > *b) return 1;
-  else if(*a < *b) return -1;
-  else return 0;
-}
-
  /** @brief Return nonzero if @p c is a combining character */
  static int combining(int c) {
    if(c < UNICODE_NCHARS) {
diff --git a/lib/charset.h b/lib/charset.h

index 70b5b2f40d68baa141e1f73497a87ddc6c5f92d6..f7860df1a2a6c457afc91c12b6f51319b3583d80 100644 (file)
--- a/lib/charset.h
+++ b/lib/charset.h
@@ -24,10 +24,6 @@ struct dynstr;
  
  /* Character encoding conversion routines */
  
-int one_ucs42utf8(uint32_t c, struct dynstr *d);
-
-uint32_t *utf82ucs4(const char *mb);
-char *ucs42utf8(const uint32_t *u);
  char *mb2utf8(const char *mb);
  char *utf82mb(const char *utf8);
  /* various conversions, between multibyte strings (mb) in
@@ -63,9 +59,6 @@ static inline char *nullcheck(char *s) {
    return s;
  }
  
-int ucs4cmp(const uint32_t *a, const uint32_t *b);
-/* like strcmp */
-
  const char *truncate_for_display(const char *s, long max);
  
  #endif /* CHARSET_H */
diff --git a/lib/test.c b/lib/test.c

index b0b5395db12c0da9b3e2a7c6f32bb9de803a3706..4b073a8b8fdfc75cab205d6a03ad1cd49705f7c3 100644 (file)
--- a/lib/test.c
+++ b/lib/test.c
@@ -139,10 +139,10 @@ static void test_utf8(void) {
    char *u8;                                    \
                                                 \
    insist(validutf8(CHARS));                    \
-  ucs = utf82ucs4(CHARS);                      \
+  ucs = utf8_to_utf32(CHARS, strlen(CHARS), 0); \
    insist(ucs != 0);                            \
-  insist(!ucs4cmp(w, ucs));                    \
-  u8 = ucs42utf8(ucs);                         \
+  insist(!utf32_cmp(w, ucs));                  \
+  u8 = utf32_to_utf8(ucs, utf32_len(ucs), 0);   \
    insist(u8 != 0);                             \
    insist(!strcmp(u8, CHARS));                  \
  } while(0)
@@ -393,8 +393,10 @@ static void test_casefold(void) {
        break;
      }
      if(l) {
+      uint32_t *d;
        /* Case-folded data is now normalized */
-      canon_expected = ucs42utf8(utf32_decompose_canon(&l, 1, 0));
+      d = utf32_decompose_canon(&l, 1, 0);
+      canon_expected = utf32_to_utf8(d, utf32_len(d), 0);
        if(strcmp(canon_folded, canon_expected)) {
         fprintf(stderr, "%s:%d: canon-casefolding %#lx got '%s', expected '%s'\n",
                 __FILE__, __LINE__, (unsigned long)c,
@@ -402,7 +404,8 @@ static void test_casefold(void) {
         count_error();
        }
        ++tests;
-      compat_expected = ucs42utf8(utf32_decompose_compat(&l, 1, 0));
+      d = utf32_decompose_compat(&l, 1, 0);
+      compat_expected = utf32_to_utf8(d, utf32_len(d), 0);
        if(strcmp(compat_folded, compat_expected)) {
         fprintf(stderr, "%s:%d: compat-casefolding %#lx got '%s', expected '%s'\n",
                 __FILE__, __LINE__, (unsigned long)c,
diff --git a/lib/unicode.h b/lib/unicode.h

index 1eb8c6876822b8418d57cd76a9451a9889e76a5d..47b39f0ed04960338ce0a054dd36982ad2245704 100644 (file)
--- a/lib/unicode.h
+++ b/lib/unicode.h
@@ -68,6 +68,26 @@ uint32_t utf32_iterator_code(utf32_iterator it);
  int utf32_iterator_grapheme_boundary(utf32_iterator it);
  int utf32_iterator_word_boundary(utf32_iterator it);
  
+/** @brief Convert 0-terminated UTF-32 to UTF-8
+ * @param s 0-terminated UTF-32 string
+ * @return 0-terminated UTF-8 string or 0 on error
+ *
+ * See utf32_to_utf8() for possible causes of errors.
+ */
+static inline char *utf32nt_to_utf8(const uint32_t *s) {
+  return utf32_to_utf8(s, utf32_len(s), 0);
+}
+
+/** @brief Convert 0-terminated UTF-8 to UTF-32
+ * @param s 0-terminated UTF-8 string
+ * @return 0-terminated UTF-32 string or 0 on error
+ *
+ * See utf8_to_utf32() for possible causes of errors.
+ */
+static inline uint32_t *utf8nt_to_utf32(const char *s) {
+  return utf8_to_utf32(s, strlen(s), 0);
+}
+
  #endif /* UNICODE_H */
  
  /*
diff --git a/server/cgi.c b/server/cgi.c

index 37ac724dd867946d79efa58428f619fabed2ce44..21895325578242ec298bba528b8bc1c57a27ff46 100644 (file)
--- a/server/cgi.c
+++ b/server/cgi.c
@@ -225,7 +225,7 @@ char *cgi_sgmlquote(const char *s, int raw) {
    int n;
  
    if(!raw) {
-    if(!(ucs = utf82ucs4(s))) exit(EXIT_FAILURE);
+    if(!(ucs = utf8_to_utf32(s, strlen(s), 0))) exit(EXIT_FAILURE);
    } else {
      ucs = xmalloc_noptr((strlen(s) + 1) * sizeof(uint32_t));
      for(n = 0; s[n]; ++n)
author	Richard Kettlewell <rjk@greenend.org.uk>
	Sun, 18 Nov 2007 22:23:35 +0000 (22:23 +0000)
committer	Richard Kettlewell <rjk@greenend.org.uk>
	Sun, 18 Nov 2007 22:23:35 +0000 (22:23 +0000)
lib/charset.c		patch \| blob \| blame \| history
lib/charset.h		patch \| blob \| blame \| history
lib/test.c		patch \| blob \| blame \| history
lib/unicode.h		patch \| blob \| blame \| history
server/cgi.c		patch \| blob \| blame \| history