From caecd4f412422d8130d616da33325e03557218e7 Mon Sep 17 00:00:00 2001
Message-Id: <caecd4f412422d8130d616da33325e03557218e7.1714351371.git.mdw@distorted.org.uk>
From: Mark Wooding <mdw@chiark.greenend.org.uk>
Date: Sun, 18 Nov 2007 22:23:35 +0000
Subject: [PATCH] transition various bits of code to unicode.h interfaces
Organization: Straylight/Edgeware

From: Richard Kettlewell <rjk@greenend.org.uk>

---
 lib/charset.c | 79 ---------------------------------------------------
 lib/charset.h |  7 -----
 lib/test.c    | 13 +++++----
 lib/unicode.h | 20 +++++++++++++
 server/cgi.c  |  2 +-
 5 files changed, 29 insertions(+), 92 deletions(-)

diff --git a/lib/charset.c b/lib/charset.c
index c763d10..c1b07bf 100644
--- a/lib/charset.c
+++ b/lib/charset.c
@@ -70,74 +70,6 @@ static void *convert(const char *from, const char *to,
   return buf;
 }
 
-/** @brief Convert UTF-8 to UCS-4
- * @param mb Pointer to 0-terminated UTF-8 string
- * @return Pointer to 0-terminated UCS-4 string
- *
- * Not everybody's iconv supports UCS-4, and it's inconvenient to have to know
- * our endianness, and it's easy to convert it ourselves, so we do.  See also
- * @ref ucs42utf8().
- */ 
-uint32_t *utf82ucs4(const char *mb) {
-  struct dynstr_ucs4 d;
-  uint32_t c;
-
-  dynstr_ucs4_init(&d);
-  while(*mb) {
-    PARSE_UTF8(mb, c,
-	       error(0, "invalid UTF-8 sequence"); return 0;);
-    dynstr_ucs4_append(&d, c);
-  }
-  dynstr_ucs4_terminate(&d);
-  return d.vec;
-}
-
-/** @brief Convert one UCS-4 character to UTF-8
- * @param c Character to convert
- * @param d Dynamic string to append UTF-8 sequence to
- * @return 0 on success, -1 on error
- */
-int one_ucs42utf8(uint32_t c, struct dynstr *d) {
-  if(c < 0x80)
-    dynstr_append(d, c);
-  else if(c < 0x800) {
-    dynstr_append(d, 0xC0 | (c >> 6));
-    dynstr_append(d, 0x80 | (c & 0x3F));
-  } else if(c < 0x10000) {
-    dynstr_append(d, 0xE0 | (c >> 12));
-    dynstr_append(d, 0x80 | ((c >> 6) & 0x3F));
-    dynstr_append(d, 0x80 | (c & 0x3F));
-  } else if(c < 0x110000) {
-    dynstr_append(d, 0xF0 | (c >> 18));
-    dynstr_append(d, 0x80 | ((c >> 12) & 0x3F));
-    dynstr_append(d, 0x80 | ((c >> 6) & 0x3F));
-    dynstr_append(d, 0x80 | (c & 0x3F));
-  } else {
-    error(0, "invalid UCS-4 character %#"PRIx32, c);
-    return -1;
-  }
-  return 0;
-}
-
-/** @brief Convert UCS-4 to UTF-8
- * @param u Pointer to 0-terminated UCS-4 string
- * @return Pointer to 0-terminated UTF-8 string
- *
- * See @ref utf82ucs4().
- */
-char *ucs42utf8(const uint32_t *u) {
-  struct dynstr d;
-  uint32_t c;
-
-  dynstr_init(&d);
-  while((c = *u++)) {
-    if(one_ucs42utf8(c, &d))
-      return 0;
-  }
-  dynstr_terminate(&d);
-  return d.vec;
-}
-
 /** @brief Convert from the local multibyte encoding to UTF-8 */
 char *mb2utf8(const char *mb) {
   return convert(nl_langinfo(CODESET), "UTF-8", mb, strlen(mb) + 1);
@@ -167,17 +99,6 @@ char *any2any(const char *from,
   else return xstrdup(any);
 }
 
-/** @brief strlen workalike for UCS-4 strings
- *
- * We don't rely on the local @c wchar_t being UCS-4.
- */
-int ucs4cmp(const uint32_t *a, const uint32_t *b) {
-  while(*a && *b && *a == *b) ++a, ++b;
-  if(*a > *b) return 1;
-  else if(*a < *b) return -1;
-  else return 0;
-}
-
 /** @brief Return nonzero if @p c is a combining character */
 static int combining(int c) {
   if(c < UNICODE_NCHARS) {
diff --git a/lib/charset.h b/lib/charset.h
index 70b5b2f..f7860df 100644
--- a/lib/charset.h
+++ b/lib/charset.h
@@ -24,10 +24,6 @@ struct dynstr;
 
 /* Character encoding conversion routines */
 
-int one_ucs42utf8(uint32_t c, struct dynstr *d);
-
-uint32_t *utf82ucs4(const char *mb);
-char *ucs42utf8(const uint32_t *u);
 char *mb2utf8(const char *mb);
 char *utf82mb(const char *utf8);
 /* various conversions, between multibyte strings (mb) in
@@ -63,9 +59,6 @@ static inline char *nullcheck(char *s) {
   return s;
 }
 
-int ucs4cmp(const uint32_t *a, const uint32_t *b);
-/* like strcmp */
-
 const char *truncate_for_display(const char *s, long max);
 
 #endif /* CHARSET_H */
diff --git a/lib/test.c b/lib/test.c
index b0b5395..4b073a8 100644
--- a/lib/test.c
+++ b/lib/test.c
@@ -139,10 +139,10 @@ static void test_utf8(void) {
   char *u8;					\
 						\
   insist(validutf8(CHARS));			\
-  ucs = utf82ucs4(CHARS);			\
+  ucs = utf8_to_utf32(CHARS, strlen(CHARS), 0); \
   insist(ucs != 0);				\
-  insist(!ucs4cmp(w, ucs));			\
-  u8 = ucs42utf8(ucs);				\
+  insist(!utf32_cmp(w, ucs));			\
+  u8 = utf32_to_utf8(ucs, utf32_len(ucs), 0);   \
   insist(u8 != 0);				\
   insist(!strcmp(u8, CHARS));			\
 } while(0)
@@ -393,8 +393,10 @@ static void test_casefold(void) {
       break;
     }
     if(l) {
+      uint32_t *d;
       /* Case-folded data is now normalized */
-      canon_expected = ucs42utf8(utf32_decompose_canon(&l, 1, 0));
+      d = utf32_decompose_canon(&l, 1, 0);
+      canon_expected = utf32_to_utf8(d, utf32_len(d), 0);
       if(strcmp(canon_folded, canon_expected)) {
 	fprintf(stderr, "%s:%d: canon-casefolding %#lx got '%s', expected '%s'\n",
 		__FILE__, __LINE__, (unsigned long)c,
@@ -402,7 +404,8 @@ static void test_casefold(void) {
 	count_error();
       }
       ++tests;
-      compat_expected = ucs42utf8(utf32_decompose_compat(&l, 1, 0));
+      d = utf32_decompose_compat(&l, 1, 0);
+      compat_expected = utf32_to_utf8(d, utf32_len(d), 0);
       if(strcmp(compat_folded, compat_expected)) {
 	fprintf(stderr, "%s:%d: compat-casefolding %#lx got '%s', expected '%s'\n",
 		__FILE__, __LINE__, (unsigned long)c,
diff --git a/lib/unicode.h b/lib/unicode.h
index 1eb8c68..47b39f0 100644
--- a/lib/unicode.h
+++ b/lib/unicode.h
@@ -68,6 +68,26 @@ uint32_t utf32_iterator_code(utf32_iterator it);
 int utf32_iterator_grapheme_boundary(utf32_iterator it);
 int utf32_iterator_word_boundary(utf32_iterator it);
 
+/** @brief Convert 0-terminated UTF-32 to UTF-8
+ * @param s 0-terminated UTF-32 string; length is measured with utf32_len()
+ * @return 0-terminated UTF-8 string or 0 on error
+ *
+ * Wraps utf32_to_utf8(s, utf32_len(s), 0); see it for possible causes of errors.
+ */
+static inline char *utf32nt_to_utf8(const uint32_t *s) {
+  return utf32_to_utf8(s, utf32_len(s), 0);
+}
+
+/** @brief Convert 0-terminated UTF-8 to UTF-32
+ * @param s 0-terminated UTF-8 string; length is measured with strlen()
+ * @return 0-terminated UTF-32 string or 0 on error
+ *
+ * Wraps utf8_to_utf32(s, strlen(s), 0); see it for possible causes of errors.
+ */
+static inline uint32_t *utf8nt_to_utf32(const char *s) {
+  return utf8_to_utf32(s, strlen(s), 0);
+}
+
 #endif /* UNICODE_H */
 
 /*
diff --git a/server/cgi.c b/server/cgi.c
index 37ac724..2189532 100644
--- a/server/cgi.c
+++ b/server/cgi.c
@@ -225,7 +225,7 @@ char *cgi_sgmlquote(const char *s, int raw) {
   int n;
 
   if(!raw) {
-    if(!(ucs = utf82ucs4(s))) exit(EXIT_FAILURE);
+    if(!(ucs = utf8_to_utf32(s, strlen(s), 0))) exit(EXIT_FAILURE);
   } else {
     ucs = xmalloc_noptr((strlen(s) + 1) * sizeof(uint32_t));
     for(n = 0; s[n]; ++n)
-- 
[mdw]