From f405e86de361ec305dc2b8634efeaa23dc144053 Mon Sep 17 00:00:00 2001 From: Shawn Landden Date: Fri, 20 Sep 2013 18:37:33 -0700 Subject: [PATCH] util, utf8: make ellipsize take multi-byte characters into account rename old versions to ascii_* Do not take into account zerowidth characters, but do consider double-wide characters. Import needed utf8 helper code from glib. v3: rebase ontop of utf8 restructuring work [zj: tweak the algorithm a bit, move new code to separate file] --- Makefile.am | 2 + TODO | 4 -- src/shared/gunicode.c | 108 ++++++++++++++++++++++++++++++++++++++++++ src/shared/gunicode.h | 28 +++++++++++ src/shared/utf8.c | 2 +- src/shared/utf8.h | 1 + src/shared/util.c | 78 +++++++++++++++++++++++++++++- src/shared/util.h | 1 + 8 files changed, 218 insertions(+), 6 deletions(-) create mode 100644 src/shared/gunicode.c create mode 100644 src/shared/gunicode.h diff --git a/Makefile.am b/Makefile.am index e5cd7a5ed..fcdb69fd1 100644 --- a/Makefile.am +++ b/Makefile.am @@ -683,6 +683,8 @@ libsystemd_shared_la_SOURCES = \ src/shared/exit-status.h \ src/shared/utf8.c \ src/shared/utf8.h \ + src/shared/gunicode.c \ + src/shared/gunicode.h \ src/shared/pager.c \ src/shared/pager.h \ src/shared/ioprio.h \ diff --git a/TODO b/TODO index 4a498b909..955241a90 100644 --- a/TODO +++ b/TODO @@ -17,10 +17,6 @@ Bugfixes: * properly handle .mount unit state tracking when two mount points are stacked one on top of another on the exact same mount point. -* ellipsize_mem must take into account multi-byte unicode characters, and - - make the resulting line the requested number of *characters*, not *bytes*, - - avoid truncuating multi-byte sequences in the middle. - * When we detect invalid UTF-8, we cant't use it in an error message: log...("Path is not UTF-8 clean, ignoring assignment: %s", rvalue); diff --git a/src/shared/gunicode.c b/src/shared/gunicode.c new file mode 100644 index 000000000..f2d460834 --- /dev/null +++ b/src/shared/gunicode.c @@ -0,0 +1,108 @@ +/* gunicode.c - Unicode manipulation functions + * + * Copyright (C) 1999, 2000 Tom Tromey + * Copyright 2000, 2005 Red Hat, Inc. + */ + +#include "gunicode.h" + +#define unichar uint32_t + +/** + * g_utf8_prev_char: + * @p: a pointer to a position within a UTF-8 encoded string + * + * Finds the previous UTF-8 character in the string before @p. + * + * @p does not have to be at the beginning of a UTF-8 character. No check + * is made to see if the character found is actually valid other than + * it starts with an appropriate byte. If @p might be the first + * character of the string, you must use g_utf8_find_prev_char() instead. + * + * Return value: a pointer to the found character. + **/ +char * +utf8_prev_char (const char *p) +{ + while (1) + { + p--; + if ((*p & 0xc0) != 0x80) + return (char *)p; + } +} + +struct Interval +{ + unichar start, end; +}; + +static int +interval_compare (const void *key, const void *elt) +{ + unichar c = (unichar) (long) (key); + struct Interval *interval = (struct Interval *)elt; + + if (c < interval->start) + return -1; + if (c > interval->end) + return +1; + + return 0; +} + +/* + * NOTE: + * + * The tables for g_unichar_iswide() and g_unichar_iswide_cjk() are + * generated from the Unicode Character Database's file + * extracted/DerivedEastAsianWidth.txt using the gen-iswide-table.py + * in this way: + * + * ./gen-iswide-table.py < path/to/ucd/extracted/DerivedEastAsianWidth.txt | fmt + * + * Last update for Unicode 6.0. + */ + +/** + * g_unichar_iswide: + * @c: a Unicode character + * + * Determines if a character is typically rendered in a double-width + * cell. + * + * Return value: %TRUE if the character is wide + **/ +bool +unichar_iswide (unichar c) +{ + /* See NOTE earlier for how to update this table. */ + static const struct Interval wide[] = { + {0x1100, 0x115F}, {0x2329, 0x232A}, {0x2E80, 0x2E99}, {0x2E9B, 0x2EF3}, + {0x2F00, 0x2FD5}, {0x2FF0, 0x2FFB}, {0x3000, 0x303E}, {0x3041, 0x3096}, + {0x3099, 0x30FF}, {0x3105, 0x312D}, {0x3131, 0x318E}, {0x3190, 0x31BA}, + {0x31C0, 0x31E3}, {0x31F0, 0x321E}, {0x3220, 0x3247}, {0x3250, 0x32FE}, + {0x3300, 0x4DBF}, {0x4E00, 0xA48C}, {0xA490, 0xA4C6}, {0xA960, 0xA97C}, + {0xAC00, 0xD7A3}, {0xF900, 0xFAFF}, {0xFE10, 0xFE19}, {0xFE30, 0xFE52}, + {0xFE54, 0xFE66}, {0xFE68, 0xFE6B}, {0xFF01, 0xFF60}, {0xFFE0, 0xFFE6}, + {0x1B000, 0x1B001}, {0x1F200, 0x1F202}, {0x1F210, 0x1F23A}, {0x1F240, + 0x1F248}, {0x1F250, 0x1F251}, {0x20000, 0x2FFFD}, {0x30000, 0x3FFFD} + }; + + if (bsearch ((void *)(uintptr_t)c, wide, (sizeof (wide) / sizeof ((wide)[0])), sizeof wide[0], + interval_compare)) + return true; + + return false; +} + +const char utf8_skip_data[256] = { + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1 +}; diff --git a/src/shared/gunicode.h b/src/shared/gunicode.h new file mode 100644 index 000000000..a4b2934a4 --- /dev/null +++ b/src/shared/gunicode.h @@ -0,0 +1,28 @@ +/* gunicode.h - Unicode manipulation functions + * + * Copyright (C) 1999, 2000 Tom Tromey + * Copyright 2000, 2005 Red Hat, Inc. + */ + +#include +#include +#include + +char *utf8_prev_char (const char *p); + +extern const char utf8_skip_data[256]; + +/** + * g_utf8_next_char: + * @p: Pointer to the start of a valid UTF-8 character + * + * Skips to the next character in a UTF-8 string. The string must be + * valid; this macro is as fast as possible, and has no error-checking. + * You would use this macro to iterate over a string character by + * character. The macro returns the start of the next UTF-8 character. + * Before using this macro, use g_utf8_validate() to validate strings + * that may contain invalid UTF-8. + */ +#define utf8_next_char(p) (char *)((p) + utf8_skip_data[*(const unsigned char *)(p)]) + +bool unichar_iswide (uint32_t c); diff --git a/src/shared/utf8.c b/src/shared/utf8.c index 31120af04..98b68ef65 100644 --- a/src/shared/utf8.c +++ b/src/shared/utf8.c @@ -98,7 +98,7 @@ static int utf8_encoded_expected_len(const char *str) { } /* decode one unicode char */ -static int utf8_encoded_to_unichar(const char *str) { +int utf8_encoded_to_unichar(const char *str) { int unichar; int len; int i; diff --git a/src/shared/utf8.h b/src/shared/utf8.h index 96a03ea7c..e3eef11bb 100644 --- a/src/shared/utf8.h +++ b/src/shared/utf8.h @@ -35,3 +35,4 @@ char *ascii_filter(const char *s); char *utf16_to_utf8(const void *s, size_t length); int utf8_encoded_valid_unichar(const char *str); +int utf8_encoded_to_unichar(const char *str); diff --git a/src/shared/util.c b/src/shared/util.c index 31cea79f8..e18421e52 100644 --- a/src/shared/util.c +++ b/src/shared/util.c @@ -74,6 +74,8 @@ #include "env-util.h" #include "fileio.h" #include "device-nodes.h" +#include "utf8.h" +#include "gunicode.h" int saved_argc = 0; char **saved_argv = NULL; @@ -3288,7 +3290,7 @@ int running_in_chroot(void) { a.st_ino != b.st_ino; } -char *ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigned percent) { +static char *ascii_ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigned percent) { size_t x; char *r; @@ -3319,6 +3321,80 @@ char *ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigne return r; } +char *ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigned percent) { + size_t x; + char *e; + const char *i, *j; + unsigned k, len, len2; + + assert(s); + assert(percent <= 100); + assert(new_length >= 3); + + /* if no multibyte characters use ascii_ellipsize_mem for speed */ + if (ascii_is_valid(s)) + return ascii_ellipsize_mem(s, old_length, new_length, percent); + + if (old_length <= 3 || old_length <= new_length) + return strndup(s, old_length); + + x = (new_length * percent) / 100; + + if (x > new_length - 3) + x = new_length - 3; + + k = 0; + for (i = s; k < x && i < s + old_length; i = utf8_next_char(i)) { + int c; + + c = utf8_encoded_to_unichar(i); + if (c < 0) + return NULL; + k += unichar_iswide(c) ? 2 : 1; + } + + if (k > x) /* last character was wide and went over quota */ + x ++; + + for (j = s + old_length; k < new_length && j > i; ) { + int c; + + j = utf8_prev_char(j); + c = utf8_encoded_to_unichar(j); + if (c < 0) + return NULL; + k += unichar_iswide(c) ? 2 : 1; + } + assert(i <= j); + + /* we don't actually need to ellipsize */ + if (i == j) + return memdup(s, old_length + 1); + + /* make space for ellipsis */ + j = utf8_next_char(j); + + len = i - s; + len2 = s + old_length - j; + e = new(char, len + 3 + len2 + 1); + if (!e) + return NULL; + + /* + printf("old_length=%zu new_length=%zu x=%zu len=%u len2=%u k=%u\n", + old_length, new_length, x, len, len2, k); + */ + + memcpy(e, s, len); + e[len] = 0xe2; /* tri-dot ellipsis: … */ + e[len + 1] = 0x80; + e[len + 2] = 0xa6; + + memcpy(e + len + 3, j, len2 + 1); + + return e; +} + char *ellipsize(const char *s, size_t length, unsigned percent) { return ellipsize_mem(s, strlen(s), length, percent); } diff --git a/src/shared/util.h b/src/shared/util.h index c9d078257..26af5b30a 100644 --- a/src/shared/util.h +++ b/src/shared/util.h @@ -405,6 +405,7 @@ static inline const char *ansi_highlight_off(void) { int running_in_chroot(void); char *ellipsize(const char *s, size_t length, unsigned percent); + /* bytes columns */ char *ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigned percent); int touch(const char *path); -- 2.30.2