From 0d3fd8ae03eb180e1309f8b0bfd2fbce6c3cfdcc Mon Sep 17 00:00:00 2001
From: =?utf8?q?Zbigniew=20J=C4=99drzejewski-Szmek?= <zbyszek@in.waw.pl>
Date: Thu, 17 May 2018 10:55:21 +0200
Subject: [PATCH] basic/string-util: add a convenience function to cescape
 mostly-ascii fields
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

It's not supposed to be the most efficient, but instead fast and simple to use.

I kept the logic in ellipsize_mem() to use unicode ellipsis even in non-unicode
locales. I'm not quite convinced things should be this way, especially since with
this patch it'd actually be simpler to always use "…" in unicode locale and "..."
otherwise, but Lennart wanted it this way for some reason.
---
 src/basic/string-util.c     | 70 ++++++++++++++++++++++++++++---------
 src/basic/string-util.h     |  1 +
 src/test/test-string-util.c | 25 +++++++++++++
 3 files changed, 80 insertions(+), 16 deletions(-)

diff --git a/src/basic/string-util.c b/src/basic/string-util.c
index 498e31122..f241a3376 100644
--- a/src/basic/string-util.c
+++ b/src/basic/string-util.c
@@ -14,6 +14,7 @@
 #include <string.h>
 
 #include "alloc-util.h"
+//#include "escape.h"
 #include "gunicode.h"
 //#include "locale-util.h"
 #include "macro.h"
@@ -457,6 +458,20 @@ bool string_has_cc(const char *p, const char *ok) {
         return false;
 }
 
+static int write_ellipsis(char *buf, bool unicode) {
+        if (unicode || is_locale_utf8()) { /* unicode=true forces the UTF-8 form regardless of locale */
+                buf[0] = 0xe2; /* tri-dot ellipsis: … (U+2026 encoded as UTF-8) */
+                buf[1] = 0x80;
+                buf[2] = 0xa6;
+        } else {
+                buf[0] = '.'; /* fallback: three plain ASCII dots */
+                buf[1] = '.';
+                buf[2] = '.';
+        }
+
+        return 3; /* both forms occupy exactly three bytes; no terminating NUL is written */
+}
+
 static char *ascii_ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigned percent) {
         size_t x, need_space;
         char *r;
@@ -505,17 +520,7 @@ static char *ascii_ellipsize_mem(const char *s, size_t old_length, size_t new_le
         assert(x <= new_length - need_space);
 
         memcpy(r, s, x);
-
-        if (is_locale_utf8()) {
-                r[x+0] = 0xe2; /* tri-dot ellipsis: â¦ */
-                r[x+1] = 0x80;
-                r[x+2] = 0xa6;
-        } else {
-                r[x+0] = '.';
-                r[x+1] = '.';
-                r[x+2] = '.';
-        }
-
+        write_ellipsis(r + x, false);
         memcpy(r + x + 3,
                s + old_length - (new_length - x - need_space),
                new_length - x - need_space + 1);
@@ -600,23 +605,56 @@ char *ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigne
         */
 
         memcpy(e, s, len);
-        e[len + 0] = 0xe2; /* tri-dot ellipsis: â¦ */
-        e[len + 1] = 0x80;
-        e[len + 2] = 0xa6;
-
+        write_ellipsis(e + len, true);
         memcpy(e + len + 3, j, len2 + 1);
 
         return e;
 }
 
 char *ellipsize(const char *s, size_t length, unsigned percent) {
-
         if (length == (size_t) -1)
                 return strdup(s);
 
         return ellipsize_mem(s, strlen(s), length, percent);
 }
 
+char *cellescape(char *buf, size_t len, const char *s) {
+        /* Escape and ellipsize s into buffer buf of size len. Only non-control ASCII
+         * characters are copied as they are, everything else is escaped. The result
+         * is different than if escaping and ellipsization were performed in two
+         * separate steps, because each sequence is either stored in full or skipped.
+         *
+         * This function should be used for logging about strings which are expected
+         * to be plain ASCII in a safe way.
+         *
+         * An ellipsis will be used if s is too long. It is always placed at the
+         * very end. len must be at least 10 (asserted below); buf is returned.
+         */
+
+        size_t i;
+        const char *t = s;
+
+        assert(len > 4 + 4 + 1); /* two chars and the terminator */
+
+        for (i = 0; i < len - 9; t++) {
+                if (!*t)
+                        goto done;
+                i += cescape_char(*t, buf + i); /* presumably writes at most 4 bytes, per the assert above — confirm */
+        }
+
+        /* We have space for one more char and terminating nul at this point */
+        if (*t) {
+                if (*(t+1)) /* more than one char left: give up and ellipsize */
+                        i += write_ellipsis(buf + i, false);
+                else /* exactly one char left: it is escaped in place of the ellipsis */
+                        i += cescape_char(*t, buf + i);
+        }
+
+ done:
+        buf[i] = '\0';
+        return buf;
+}
+
 bool nulstr_contains(const char *nulstr, const char *needle) {
         const char *i;
 
diff --git a/src/basic/string-util.h b/src/basic/string-util.h
index 52a1d7f2b..99b515569 100644
--- a/src/basic/string-util.h
+++ b/src/basic/string-util.h
@@ -165,6 +165,7 @@ bool string_has_cc(const char *p, const char *ok) _pure_;
 
 char *ellipsize_mem(const char *s, size_t old_length_bytes, size_t new_length_columns, unsigned percent);
 char *ellipsize(const char *s, size_t length, unsigned percent);
+char *cellescape(char *buf, size_t len, const char *s);
 
 bool nulstr_contains(const char *nulstr, const char *needle);
 
diff --git a/src/test/test-string-util.c b/src/test/test-string-util.c
index a7c863c89..8c7226e5f 100644
--- a/src/test/test-string-util.c
+++ b/src/test/test-string-util.c
@@ -6,6 +6,7 @@
 ***/
 
 #include "alloc-util.h"
+//#include "locale-util.h"
 #include "macro.h"
 #include "string-util.h"
 #include "strv.h"
@@ -79,6 +80,29 @@ static void test_ascii_strcasecmp_nn(void) {
 }
 #endif // 0
 
+static void test_cellescape(void) {
+        char buf[40];
+        /* Ellipsis literals are spelled as raw UTF-8 bytes (e2 80 a6 = U+2026), the bytes write_ellipsis() emits */
+        assert_se(streq(cellescape(buf, 10, "1"), "1"));
+        assert_se(streq(cellescape(buf, 10, "12"), "12"));
+        assert_se(streq(cellescape(buf, 10, "123"), is_locale_utf8() ? "1\xe2\x80\xa6" : "1..."));
+
+        assert_se(streq(cellescape(buf, 10, "1\011"), "1\\t"));
+        assert_se(streq(cellescape(buf, 10, "1\020"), "1\\020"));
+        assert_se(streq(cellescape(buf, 10, "1\020x"), is_locale_utf8() ? "1\xe2\x80\xa6" : "1..."));
+
+        assert_se(streq(cellescape(buf, 40, "1\020"), "1\\020"));
+        assert_se(streq(cellescape(buf, 40, "1\020x"), "1\\020x"));
+
+        assert_se(streq(cellescape(buf, 40, "\a\b\f\n\r\t\v\\\"'"), "\\a\\b\\f\\n\\r\\t\\v\\\\\\\"\\'"));
+        assert_se(streq(cellescape(buf, 10, "\a\b\f\n\r\t\v\\\"'"), is_locale_utf8() ? "\\a\xe2\x80\xa6" : "\\a..."));
+        assert_se(streq(cellescape(buf, 11, "\a\b\f\n\r\t\v\\\"'"), is_locale_utf8() ? "\\a\xe2\x80\xa6" : "\\a..."));
+        assert_se(streq(cellescape(buf, 12, "\a\b\f\n\r\t\v\\\"'"), is_locale_utf8() ? "\\a\\b\xe2\x80\xa6" : "\\a\\b..."));
+
+        assert_se(streq(cellescape(buf, sizeof buf, "1\020"), "1\\020"));
+        assert_se(streq(cellescape(buf, sizeof buf, "1\020x"), "1\\020x"));
+}
+
 static void test_streq_ptr(void) {
         assert_se(streq_ptr(NULL, NULL));
         assert_se(!streq_ptr("abc", "cdef"));
@@ -432,6 +456,7 @@ int main(int argc, char *argv[]) {
         test_ascii_strcasecmp_n();
         test_ascii_strcasecmp_nn();
 #endif // 0
+        test_cellescape();
         test_streq_ptr();
         test_strstrip();
         test_strextend();
-- 
2.30.2