chiark / gitweb /
journalctl: show any printable Unicode character
author Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl>
Thu, 12 Jul 2012 23:07:41 +0000 (01:07 +0200)
committer Lennart Poettering <lennart@poettering.net>
Thu, 12 Jul 2012 23:07:41 +0000 (01:07 +0200)
This makes sure we can output all valid, non-control UTF-8
characters, instead of just printable 7-bit ASCII.

Makefile.am
TODO
src/shared/logs-show.c
src/shared/utf8.c
src/shared/utf8.h

index 14f9455..507ea3a 100644 (file)
@@ -726,7 +726,8 @@ libsystemd_logs_la_CFLAGS = \
 
 libsystemd_logs_la_LIBADD = \
        libsystemd-journal-internal.la \
-       libsystemd-id128-internal.la
+       libsystemd-id128-internal.la \
+       libsystemd-shared.la
 
 # ------------------------------------------------------------------------------
 noinst_LTLIBRARIES += \
diff --git a/TODO b/TODO
index e15d4b9..25266b2 100644 (file)
--- a/TODO
+++ b/TODO
@@ -121,8 +121,6 @@ Features:
 
 * drop accountsservice's StandardOutput=syslog and Type=dbus fields
 
-* make sure show-logs checks for utf8 validity, not ascii validity
-
 * when breaking cycles drop sysv services first, then services from /run, then from /etc, then from /usr
 
 * readahead: when bumping /sys readahead variable save mtime and compare later to detect changes
index 540b5a2..e111922 100644 (file)
 #include "logs-show.h"
 #include "log.h"
 #include "util.h"
+#include "utf8.h"
 
 #define PRINT_THRESHOLD 128
 
-static bool contains_unprintable(const void *p, size_t l) {
-        const char *j;
-
-        for (j = p; j < (const char *) p + l; j++)
-                if (*j < ' ' || *j >= 127)
-                        return true;
-
-        return false;
-}
-
 static int parse_field(const void *data, size_t length, const char *field, char **target, size_t *target_size) {
         size_t fl, nl;
         void *buf;
@@ -80,7 +71,7 @@ static bool shall_print(bool show_all, char *p, size_t l) {
         if (l > PRINT_THRESHOLD)
                 return false;
 
-        if (contains_unprintable(p, l))
+        if (!utf8_is_printable_n(p, l))
                 return false;
 
         return true;
@@ -226,7 +217,7 @@ static int output_short(sd_journal *j, unsigned line, unsigned n_columns, bool s
 
         if (show_all)
                 printf(": %.*s\n", (int) message_len, message);
-        else if (contains_unprintable(message, message_len)) {
+        else if (!utf8_is_printable_n(message, message_len)) {
                 char bytes[FORMAT_BYTES_MAX];
                 printf(": [%s blob data]\n", format_bytes(bytes, sizeof(bytes), message_len));
         } else if (message_len + n < n_columns)
@@ -298,7 +289,7 @@ static int output_verbose(sd_journal *j, unsigned line, unsigned n_columns, bool
 
         SD_JOURNAL_FOREACH_DATA(j, data, length) {
                 if (!show_all && (length > PRINT_THRESHOLD ||
-                                  contains_unprintable(data, length))) {
+                                  !utf8_is_printable_n(data, length))) {
                         const char *c;
                         char bytes[FORMAT_BYTES_MAX];
 
@@ -367,7 +358,7 @@ static int output_export(sd_journal *j, unsigned line, unsigned n_columns, bool
                     memcmp(data, "_BOOT_ID=", 9) == 0)
                         continue;
 
-                if (contains_unprintable(data, length)) {
+                if (!utf8_is_printable_n(data, length)) {
                         const char *c;
                         uint64_t le64;
 
@@ -394,8 +385,7 @@ static int output_export(sd_journal *j, unsigned line, unsigned n_columns, bool
 }
 
 static void json_escape(const char* p, size_t l) {
-
-        if (contains_unprintable(p, l)) {
+        if (!utf8_is_printable_n(p, l)) {
                 bool not_first = false;
 
                 fputs("[ ", stdout);
index 13f0521..a6f5b3f 100644 (file)
@@ -78,6 +78,77 @@ static inline void merge_continuation_char(uint32_t *u_ch, uint8_t ch) {
         *u_ch |= ch & 0x3f;
 }
 
+/* Tells whether ch is DEL or lies in the C0 or C1 control ranges, with
+ * the tab character exempted. */
+static bool is_unicode_control(uint32_t ch) {
+
+        /* Tab sits in the C0 range, but is more or less harmless and
+         * commonly used, so it is never reported as a control character. */
+        if (ch == '\t')
+                return false;
+
+        /* C0 range: everything from 0 up to, but excluding, the space. */
+        if (ch < ' ')
+                return true;
+
+        /* DEL (0x7F), directly followed by the C1 range up to 0x9F. */
+        return ch >= 0x7F && ch <= 0x9F;
+}
+
+/*
+ * Checks whether the first 'length' bytes of 'str' form well-formed UTF-8
+ * that contains no control characters as defined by is_unicode_control()
+ * (C0, DEL and C1, with tab being allowed).
+ *
+ * The buffer does not have to be NUL-terminated; a NUL byte inside the
+ * range is a C0 character and therefore fails the check.
+ *
+ * Returns 'str' if every character passes, NULL if a stray, truncated or
+ * overlong sequence or a control character is encountered.
+ */
+char* utf8_is_printable_n(const char* str, size_t length) {
+        uint32_t val = 0;
+        uint32_t min = 0;
+        const uint8_t *p;
+
+        assert(str);
+
+        for (p = (const uint8_t*) str; length; p++, length--) {
+                if (*p < 128) {
+                        val = *p;
+                } else {
+                        if ((*p & 0xe0) == 0xc0) { /* 110xxxxx two-char seq. */
+                                min = 128;
+                                /* Keep all five payload bits of the lead
+                                 * byte; a 0x1e mask would lose the lowest
+                                 * one and decode the wrong code point. */
+                                val = (uint32_t) (*p & 0x1f);
+                                goto ONE_REMAINING;
+                        } else if ((*p & 0xf0) == 0xe0) { /* 1110xxxx three-char seq.*/
+                                min = (1 << 11);
+                                val = (uint32_t) (*p & 0x0f);
+                                goto TWO_REMAINING;
+                        } else if ((*p & 0xf8) == 0xf0) { /* 11110xxx four-char seq */
+                                min = (1 << 16);
+                                val = (uint32_t) (*p & 0x07);
+                        } else
+                                goto error;
+
+                        /* Before each continuation byte is read, make
+                         * sure it is still inside the buffer: a sequence
+                         * cut off at the end must fail instead of reading
+                         * past 'length' and wrapping the counter. */
+                        p++;
+                        length--;
+                        if (!length || !is_continuation_char(*p))
+                                goto error;
+                        merge_continuation_char(&val, *p);
+
+                TWO_REMAINING:
+                        p++;
+                        length--;
+                        if (!length || !is_continuation_char(*p))
+                                goto error;
+                        merge_continuation_char(&val, *p);
+
+                ONE_REMAINING:
+                        p++;
+                        length--;
+                        if (!length || !is_continuation_char(*p))
+                                goto error;
+                        merge_continuation_char(&val, *p);
+
+                        /* Reject overlong encodings: the value must need
+                         * as many bytes as were actually used. */
+                        if (val < min)
+                                goto error;
+                }
+
+                if (is_unicode_control(val))
+                        goto error;
+        }
+
+        return (char*) str;
+
+error:
+        return NULL;
+}
+
 static char* utf8_validate(const char *str, char *output) {
         uint32_t val = 0;
         uint32_t min = 0;
index af2420f..fec76b4 100644 (file)
@@ -27,6 +27,8 @@
 char *utf8_is_valid(const char *s) _pure_;
 char *ascii_is_valid(const char *s) _pure_;
 
+char *utf8_is_printable_n(const char* str, size_t length) _pure_;
+
 char *utf8_filter(const char *s);
 char *ascii_filter(const char *s);