From 1c9633d669948155455e29b0c6e770995a8b1ca3 Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Sun, 15 Jun 2014 14:50:00 +0200 Subject: [PATCH] terminal: add parser state-machine The term-parser is used to parse any input from TTY-clients. It reads CSI, DCS, OSC and ST control sequences and normal escape sequences. It doesn't do anything with the parsed data besides detecting the sequence and returning it. The caller has to react to them. The parser also comes with its own UTF-8 helpers. The reason for that is that we don't want to assert() or hard-fail on parsing errors. Instead, we treat any invalid UTF-8 sequences as ISO-8859-1. This allows pasting invalid data into a terminal (which cannot be controlled through the TTY, anyway) and we still deal with it in a proper manner. This is _required_ for 8-bit and 7-bit DEC modes (including the g0-g3 mappings), so it's not just an ugly fallback because we can (it's still horribly ugly but at least we have an excuse). --- .gitignore | 1 + Makefile.am | 13 +- src/libsystemd-terminal/term-charset.c | 491 ++++++ src/libsystemd-terminal/term-internal.h | 349 +++++ src/libsystemd-terminal/term-parser.c | 1626 ++++++++++++++++++++ src/libsystemd-terminal/test-term-parser.c | 143 ++ 6 files changed, 2622 insertions(+), 1 deletion(-) create mode 100644 src/libsystemd-terminal/term-charset.c create mode 100644 src/libsystemd-terminal/term-parser.c create mode 100644 src/libsystemd-terminal/test-term-parser.c diff --git a/.gitignore b/.gitignore index 0ba01daeb..288cde670 100644 --- a/.gitignore +++ b/.gitignore @@ -220,6 +220,7 @@ /test-strxcpyx /test-tables /test-term-page +/test-term-parser /test-time /test-tmpfiles /test-udev diff --git a/Makefile.am b/Makefile.am index db846ad42..73f1252d3 100644 --- a/Makefile.am +++ b/Makefile.am @@ -2839,7 +2839,9 @@ libsystemd_terminal_la_CFLAGS = \ libsystemd_terminal_la_SOURCES = \ src/libsystemd-terminal/term-internal.h \ + src/libsystemd-terminal/term-charset.c \ src/libsystemd-terminal/term-page.c \ + src/libsystemd-terminal/term-parser.c \ src/libsystemd-terminal/term-wcwidth.c libsystemd_terminal_la_LIBADD = \ @@ -2854,8 +2856,17 @@ test_term_page_LDADD = \ libsystemd-internal.la \ libsystemd-shared.la +test_term_parser_SOURCES = \ + src/libsystemd-terminal/test-term-parser.c + +test_term_parser_LDADD = \ + libsystemd-terminal.la \ + libsystemd-internal.la \ + libsystemd-shared.la + tests += \ - test-term-page + test-term-page \ + test-term-parser # ------------------------------------------------------------------------------ if ENABLE_GTK_DOC diff --git a/src/libsystemd-terminal/term-charset.c b/src/libsystemd-terminal/term-charset.c new file mode 100644 index 000000000..a00a1912d --- /dev/null +++ b/src/libsystemd-terminal/term-charset.c @@ -0,0 +1,491 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ + +/*** + This file is part of systemd. + + Copyright (C) 2014 David Herrmann + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . +***/ + +/* + * VTE Character Sets + * These are predefined charactersets that can be loaded into GL and GR. By + * default we use unicode_lower and unicode_upper, that is, both sets have the + * exact unicode mapping. unicode_lower is effectively ASCII and unicode_upper + * as defined by the unicode standard (I guess, ISO 8859-1). + * Several other character sets are defined here. However, all of them are + * limited to the 96 character space of GL or GR. Everything beyond GR (which + * was not supported by the classic VTs by DEC but is available in VT emulators + * that support unicode/UTF8) is always mapped to unicode and cannot be changed + * by these character sets. Even mapping GL and GR is only available for + * backwards compatibility as new applications can use the Unicode functionality + * of the VTE. + * + * Moreover, mapping GR is almost unnecessary to support. In fact, Unicode UTF-8 + * support in VTE works by reading every incoming data as UTF-8 stream. This + * maps GL/ASCII to ASCII, as UTF-8 is backwards compatible to ASCII, however, + * everything that has the 8th bit set is a >=2-byte haracter in UTF-8. That is, + * this is in no way backwards compatible to >=VT220 8bit support. Therefore, if + * someone maps a character set into GR and wants to use them with this VTE, + * then they must already send UTF-8 characters to use GR (all GR characters are + * 8-bits). Hence, they can easily also send the correct UTF-8 character for the + * unicode mapping. + * The only advantage is that most characters in many sets are 3-byte UTF-8 + * characters and by mapping the set into GR/GL you can use 2 or 1 byte UTF-8 + * characters which saves bandwidth. + * Another reason is, if you have older applications that use the VT220 8-bit + * support and you put a ASCII/8bit-extension to UTF-8 converter in between, you + * need these mappings to have the application behave correctly if it uses GL/GR + * mappings extensively. + * + * Anyway, we support GL/GR mappings so here are the most commonly used maps as + * defined by Unicode-standard, DEC-private maps and other famous charmaps. + * + * Characters 1-32 are always the control characters (part of CL) and cannot be + * mapped. Characters 34-127 (94 characters) are part of GL and can be mapped. + * Characters 33 and 128 are not part of GL and always mapped by the VTE. + * However, for GR they can be mapped differently (96 chars) so we have to + * include them. The mapper has to take care not to use them in GL. + */ + +#include +#include +#include +#include "term-internal.h" + +/* + * Lower Unicode character set. This maps the characters to the basic ASCII + * characters 33-126. These are all graphics characters defined in ASCII. + */ +term_charset term_unicode_lower = { + [0] = 32, + [1] = 33, + [2] = 34, + [3] = 35, + [4] = 36, + [5] = 37, + [6] = 38, + [7] = 39, + [8] = 40, + [9] = 41, + [10] = 42, + [11] = 43, + [12] = 44, + [13] = 45, + [14] = 46, + [15] = 47, + [16] = 48, + [17] = 49, + [18] = 50, + [19] = 51, + [20] = 52, + [21] = 53, + [22] = 54, + [23] = 55, + [24] = 56, + [25] = 57, + [26] = 58, + [27] = 59, + [28] = 60, + [29] = 61, + [30] = 62, + [31] = 63, + [32] = 64, + [33] = 65, + [34] = 66, + [35] = 67, + [36] = 68, + [37] = 69, + [38] = 70, + [39] = 71, + [40] = 72, + [41] = 73, + [42] = 74, + [43] = 75, + [44] = 76, + [45] = 77, + [46] = 78, + [47] = 79, + [48] = 80, + [49] = 81, + [50] = 82, + [51] = 83, + [52] = 84, + [53] = 85, + [54] = 86, + [55] = 87, + [56] = 88, + [57] = 89, + [58] = 90, + [59] = 91, + [60] = 92, + [61] = 93, + [62] = 94, + [63] = 95, + [64] = 96, + [65] = 97, + [66] = 98, + [67] = 99, + [68] = 100, + [69] = 101, + [70] = 102, + [71] = 103, + [72] = 104, + [73] = 105, + [74] = 106, + [75] = 107, + [76] = 108, + [77] = 109, + [78] = 110, + [79] = 111, + [80] = 112, + [81] = 113, + [82] = 114, + [83] = 115, + [84] = 116, + [85] = 117, + [86] = 118, + [87] = 119, + [88] = 120, + [89] = 121, + [90] = 122, + [91] = 123, + [92] = 124, + [93] = 125, + [94] = 126, + [95] = 127, +}; + +/* + * Upper Unicode Table + * This maps all characters to the upper unicode characters 161-254. These are + * not compatible to any older 8 bit character sets. See the Unicode standard + * for the definitions of each symbol. + */ +term_charset term_unicode_upper = { + [0] = 160, + [1] = 161, + [2] = 162, + [3] = 163, + [4] = 164, + [5] = 165, + [6] = 166, + [7] = 167, + [8] = 168, + [9] = 169, + [10] = 170, + [11] = 171, + [12] = 172, + [13] = 173, + [14] = 174, + [15] = 175, + [16] = 176, + [17] = 177, + [18] = 178, + [19] = 179, + [20] = 180, + [21] = 181, + [22] = 182, + [23] = 183, + [24] = 184, + [25] = 185, + [26] = 186, + [27] = 187, + [28] = 188, + [29] = 189, + [30] = 190, + [31] = 191, + [32] = 192, + [33] = 193, + [34] = 194, + [35] = 195, + [36] = 196, + [37] = 197, + [38] = 198, + [39] = 199, + [40] = 200, + [41] = 201, + [42] = 202, + [43] = 203, + [44] = 204, + [45] = 205, + [46] = 206, + [47] = 207, + [48] = 208, + [49] = 209, + [50] = 210, + [51] = 211, + [52] = 212, + [53] = 213, + [54] = 214, + [55] = 215, + [56] = 216, + [57] = 217, + [58] = 218, + [59] = 219, + [60] = 220, + [61] = 221, + [62] = 222, + [63] = 223, + [64] = 224, + [65] = 225, + [66] = 226, + [67] = 227, + [68] = 228, + [69] = 229, + [70] = 230, + [71] = 231, + [72] = 232, + [73] = 233, + [74] = 234, + [75] = 235, + [76] = 236, + [77] = 237, + [78] = 238, + [79] = 239, + [80] = 240, + [81] = 241, + [82] = 242, + [83] = 243, + [84] = 244, + [85] = 245, + [86] = 246, + [87] = 247, + [88] = 248, + [89] = 249, + [90] = 250, + [91] = 251, + [92] = 252, + [93] = 253, + [94] = 254, + [95] = 255, +}; + +/* + * The DEC supplemental graphics set. For its definition see here: + * http://vt100.net/docs/vt220-rm/table2-3b.html + * Its basically a mixture of common European symbols that are not part of + * ASCII. Most often, this is mapped into GR to extend the basci ASCII part. + * + * This is very similar to unicode_upper, however, few symbols differ so do not + * mix them up! + */ +term_charset term_dec_supplemental_graphics = { + [0] = -1, /* undefined */ + [1] = 161, + [2] = 162, + [3] = 163, + [4] = 0, + [5] = 165, + [6] = 0, + [7] = 167, + [8] = 164, + [9] = 169, + [10] = 170, + [11] = 171, + [12] = 0, + [13] = 0, + [14] = 0, + [15] = 0, + [16] = 176, + [17] = 177, + [18] = 178, + [19] = 179, + [20] = 0, + [21] = 181, + [22] = 182, + [23] = 183, + [24] = 0, + [25] = 185, + [26] = 186, + [27] = 187, + [28] = 188, + [29] = 189, + [30] = 0, + [31] = 191, + [32] = 192, + [33] = 193, + [34] = 194, + [35] = 195, + [36] = 196, + [37] = 197, + [38] = 198, + [39] = 199, + [40] = 200, + [41] = 201, + [42] = 202, + [43] = 203, + [44] = 204, + [45] = 205, + [46] = 206, + [47] = 207, + [48] = 0, + [49] = 209, + [50] = 210, + [51] = 211, + [52] = 212, + [53] = 213, + [54] = 214, + [55] = 338, + [56] = 216, + [57] = 217, + [58] = 218, + [59] = 219, + [60] = 220, + [61] = 376, + [62] = 0, + [63] = 223, + [64] = 224, + [65] = 225, + [66] = 226, + [67] = 227, + [68] = 228, + [69] = 229, + [70] = 230, + [71] = 231, + [72] = 232, + [73] = 233, + [74] = 234, + [75] = 235, + [76] = 236, + [77] = 237, + [78] = 238, + [79] = 239, + [80] = 0, + [81] = 241, + [82] = 242, + [83] = 243, + [84] = 244, + [85] = 245, + [86] = 246, + [87] = 339, + [88] = 248, + [89] = 249, + [90] = 250, + [91] = 251, + [92] = 252, + [93] = 255, + [94] = 0, + [95] = -1, /* undefined */ +}; + +/* + * DEC special graphics character set. See here for its definition: + * http://vt100.net/docs/vt220-rm/table2-4.html + * This contains several characters to create ASCII drawings and similar. Its + * commonly mapped into GR to extend the basic ASCII characters. + * + * Lower 62 characters map to ASCII 33-64, everything beyond is special and + * commonly used for ASCII drawings. It depends on the Unicode Standard 3.2 for + * the extended horizontal scan-line characters 3, 5, 7, and 9. + */ +term_charset term_dec_special_graphics = { + [0] = -1, /* undefined */ + [1] = 33, + [2] = 34, + [3] = 35, + [4] = 36, + [5] = 37, + [6] = 38, + [7] = 39, + [8] = 40, + [9] = 41, + [10] = 42, + [11] = 43, + [12] = 44, + [13] = 45, + [14] = 46, + [15] = 47, + [16] = 48, + [17] = 49, + [18] = 50, + [19] = 51, + [20] = 52, + [21] = 53, + [22] = 54, + [23] = 55, + [24] = 56, + [25] = 57, + [26] = 58, + [27] = 59, + [28] = 60, + [29] = 61, + [30] = 62, + [31] = 63, + [32] = 64, + [33] = 65, + [34] = 66, + [35] = 67, + [36] = 68, + [37] = 69, + [38] = 70, + [39] = 71, + [40] = 72, + [41] = 73, + [42] = 74, + [43] = 75, + [44] = 76, + [45] = 77, + [46] = 78, + [47] = 79, + [48] = 80, + [49] = 81, + [50] = 82, + [51] = 83, + [52] = 84, + [53] = 85, + [54] = 86, + [55] = 87, + [56] = 88, + [57] = 89, + [58] = 90, + [59] = 91, + [60] = 92, + [61] = 93, + [62] = 94, + [63] = 0, + [64] = 9830, + [65] = 9618, + [66] = 9225, + [67] = 9228, + [68] = 9229, + [69] = 9226, + [70] = 176, + [71] = 177, + [72] = 9252, + [73] = 9227, + [74] = 9496, + [75] = 9488, + [76] = 9484, + [77] = 9492, + [78] = 9532, + [79] = 9146, + [80] = 9147, + [81] = 9472, + [82] = 9148, + [83] = 9149, + [84] = 9500, + [85] = 9508, + [86] = 9524, + [87] = 9516, + [88] = 9474, + [89] = 8804, + [90] = 8805, + [91] = 960, + [92] = 8800, + [93] = 163, + [94] = 8901, + [95] = -1, /* undefined */ +}; diff --git a/src/libsystemd-terminal/term-internal.h b/src/libsystemd-terminal/term-internal.h index d7d2f98b3..a3d1f5458 100644 --- a/src/libsystemd-terminal/term-internal.h +++ b/src/libsystemd-terminal/term-internal.h @@ -37,6 +37,11 @@ typedef struct term_line term_line; typedef struct term_page term_page; typedef struct term_history term_history; +typedef struct term_utf8 term_utf8; +typedef struct term_seq term_seq; +typedef struct term_parser term_parser; +typedef uint32_t term_charset[96]; + /* * Miscellaneous * Sundry things and external helpers. @@ -335,3 +340,347 @@ void term_history_trim(term_history *history, unsigned int max); void term_history_push(term_history *history, term_line *line); term_line *term_history_pop(term_history *history, unsigned int reserve_width, const term_attr *attr, term_age_t age); unsigned int term_history_peek(term_history *history, unsigned int max, unsigned int reserve_width, const term_attr *attr, term_age_t age); + +/* + * UTF-8 + * The UTF-decoder and encoder are adjusted for terminals and provide proper + * fallbacks for invalid UTF-8. In terminals it's quite usual to use fallbacks + * instead of rejecting invalid input. This way, old legacy applications still + * work (this is especially important for 7bit/ASCII DEC modes). + */ + +struct term_utf8 { + uint32_t chars[5]; + uint32_t ucs4; + + unsigned int i_bytes : 3; + unsigned int n_bytes : 3; + unsigned int valid : 1; +}; + +size_t term_utf8_encode(char *out_utf8, uint32_t g); +const uint32_t *term_utf8_decode(term_utf8 *p, size_t *out_len, char c); + +/* + * Parsers + * The term_parser object parses control-sequences for both host and terminal + * side. Based on this parser, there is a set of command-parsers that take a + * term_seq sequence and returns the command it represents. This is different + * for host and terminal side so a different set of parsers is provided. + */ + +enum { + TERM_SEQ_NONE, /* placeholder, no sequence parsed */ + + TERM_SEQ_IGNORE, /* no-op character */ + TERM_SEQ_GRAPHIC, /* graphic character */ + TERM_SEQ_CONTROL, /* control character */ + TERM_SEQ_ESCAPE, /* escape sequence */ + TERM_SEQ_CSI, /* control sequence function */ + TERM_SEQ_DCS, /* device control string */ + TERM_SEQ_OSC, /* operating system control */ + + TERM_SEQ_CNT +}; + +enum { + /* these must be kept compatible to (1U << (ch - 0x20)) */ + + TERM_SEQ_FLAG_SPACE = (1U << 0), /* char: */ + TERM_SEQ_FLAG_BANG = (1U << 1), /* char: ! */ + TERM_SEQ_FLAG_DQUOTE = (1U << 2), /* char: " */ + TERM_SEQ_FLAG_HASH = (1U << 3), /* char: # */ + TERM_SEQ_FLAG_CASH = (1U << 4), /* char: $ */ + TERM_SEQ_FLAG_PERCENT = (1U << 5), /* char: % */ + TERM_SEQ_FLAG_AND = (1U << 6), /* char: & */ + TERM_SEQ_FLAG_SQUOTE = (1U << 7), /* char: ' */ + TERM_SEQ_FLAG_POPEN = (1U << 8), /* char: ( */ + TERM_SEQ_FLAG_PCLOSE = (1U << 9), /* char: ) */ + TERM_SEQ_FLAG_MULT = (1U << 10), /* char: * */ + TERM_SEQ_FLAG_PLUS = (1U << 11), /* char: + */ + TERM_SEQ_FLAG_COMMA = (1U << 12), /* char: , */ + TERM_SEQ_FLAG_MINUS = (1U << 13), /* char: - */ + TERM_SEQ_FLAG_DOT = (1U << 14), /* char: . */ + TERM_SEQ_FLAG_SLASH = (1U << 15), /* char: / */ + + /* 16-35 is reserved for numbers; unused */ + + /* COLON is reserved = (1U << 26), char: : */ + /* SEMICOLON is reserved = (1U << 27), char: ; */ + TERM_SEQ_FLAG_LT = (1U << 28), /* char: < */ + TERM_SEQ_FLAG_EQUAL = (1U << 29), /* char: = */ + TERM_SEQ_FLAG_GT = (1U << 30), /* char: > */ + TERM_SEQ_FLAG_WHAT = (1U << 31), /* char: ? */ +}; + +enum { + TERM_CMD_NONE, /* placeholder */ + TERM_CMD_GRAPHIC, /* graphics character */ + + TERM_CMD_BEL, /* bell */ + TERM_CMD_BS, /* backspace */ + TERM_CMD_CBT, /* cursor-backward-tabulation */ + TERM_CMD_CHA, /* cursor-horizontal-absolute */ + TERM_CMD_CHT, /* cursor-horizontal-forward-tabulation */ + TERM_CMD_CNL, /* cursor-next-line */ + TERM_CMD_CPL, /* cursor-previous-line */ + TERM_CMD_CR, /* carriage-return */ + TERM_CMD_CUB, /* cursor-backward */ + TERM_CMD_CUD, /* cursor-down */ + TERM_CMD_CUF, /* cursor-forward */ + TERM_CMD_CUP, /* cursor-position */ + TERM_CMD_CUU, /* cursor-up */ + TERM_CMD_DA1, /* primary-device-attributes */ + TERM_CMD_DA2, /* secondary-device-attributes */ + TERM_CMD_DA3, /* tertiary-device-attributes */ + TERM_CMD_DC1, /* device-control-1 */ + TERM_CMD_DC3, /* device-control-3 */ + TERM_CMD_DCH, /* delete-character */ + TERM_CMD_DECALN, /* screen-alignment-pattern */ + TERM_CMD_DECANM, /* ansi-mode */ + TERM_CMD_DECBI, /* back-index */ + TERM_CMD_DECCARA, /* change-attributes-in-rectangular-area */ + TERM_CMD_DECCRA, /* copy-rectangular-area */ + TERM_CMD_DECDC, /* delete-column */ + TERM_CMD_DECDHL_BH, /* double-width-double-height-line: bottom half */ + TERM_CMD_DECDHL_TH, /* double-width-double-height-line: top half */ + TERM_CMD_DECDWL, /* double-width-single-height-line */ + TERM_CMD_DECEFR, + TERM_CMD_DECELF, + TERM_CMD_DECELR, + TERM_CMD_DECERA, + TERM_CMD_DECFI, + TERM_CMD_DECFRA, + TERM_CMD_DECIC, + TERM_CMD_DECID, + TERM_CMD_DECINVM, + TERM_CMD_DECKBD, + TERM_CMD_DECKPAM, + TERM_CMD_DECKPNM, + TERM_CMD_DECLFKC, + TERM_CMD_DECLL, + TERM_CMD_DECLTOD, + TERM_CMD_DECPCTERM, + TERM_CMD_DECPKA, + TERM_CMD_DECPKFMR, + TERM_CMD_DECRARA, + TERM_CMD_DECRC, + TERM_CMD_DECREQTPARM, + TERM_CMD_DECRPKT, + TERM_CMD_DECRQCRA, + TERM_CMD_DECRQDE, + TERM_CMD_DECRQKT, + TERM_CMD_DECRQLP, + TERM_CMD_DECRQM_ANSI, + TERM_CMD_DECRQM_DEC, + TERM_CMD_DECRQPKFM, + TERM_CMD_DECRQPSR, + TERM_CMD_DECRQTSR, + TERM_CMD_DECRQUPSS, + TERM_CMD_DECSACE, + TERM_CMD_DECSASD, + TERM_CMD_DECSC, + TERM_CMD_DECSCA, + TERM_CMD_DECSCL, + TERM_CMD_DECSCP, + TERM_CMD_DECSCPP, + TERM_CMD_DECSCS, + TERM_CMD_DECSCUSR, + TERM_CMD_DECSDDT, + TERM_CMD_DECSDPT, + TERM_CMD_DECSED, + TERM_CMD_DECSEL, + TERM_CMD_DECSERA, + TERM_CMD_DECSFC, + TERM_CMD_DECSKCV, + TERM_CMD_DECSLCK, + TERM_CMD_DECSLE, + TERM_CMD_DECSLPP, + TERM_CMD_DECSLRM_OR_SC, + TERM_CMD_DECSMBV, + TERM_CMD_DECSMKR, + TERM_CMD_DECSNLS, + TERM_CMD_DECSPP, + TERM_CMD_DECSPPCS, + TERM_CMD_DECSPRTT, + TERM_CMD_DECSR, + TERM_CMD_DECSRFR, + TERM_CMD_DECSSCLS, + TERM_CMD_DECSSDT, + TERM_CMD_DECSSL, + TERM_CMD_DECST8C, + TERM_CMD_DECSTBM, + TERM_CMD_DECSTR, + TERM_CMD_DECSTRL, + TERM_CMD_DECSWBV, + TERM_CMD_DECSWL, + TERM_CMD_DECTID, + TERM_CMD_DECTME, + TERM_CMD_DECTST, + TERM_CMD_DL, + TERM_CMD_DSR_ANSI, + TERM_CMD_DSR_DEC, + TERM_CMD_ECH, + TERM_CMD_ED, + TERM_CMD_EL, + TERM_CMD_ENQ, + TERM_CMD_EPA, + TERM_CMD_FF, + TERM_CMD_HPA, + TERM_CMD_HPR, + TERM_CMD_HT, + TERM_CMD_HTS, + TERM_CMD_HVP, + TERM_CMD_ICH, + TERM_CMD_IL, + TERM_CMD_IND, + TERM_CMD_LF, + TERM_CMD_LS1R, + TERM_CMD_LS2, + TERM_CMD_LS2R, + TERM_CMD_LS3, + TERM_CMD_LS3R, + TERM_CMD_MC_ANSI, + TERM_CMD_MC_DEC, + TERM_CMD_NEL, + TERM_CMD_NP, + TERM_CMD_NULL, + TERM_CMD_PP, + TERM_CMD_PPA, + TERM_CMD_PPB, + TERM_CMD_PPR, + TERM_CMD_RC, + TERM_CMD_REP, + TERM_CMD_RI, + TERM_CMD_RIS, + TERM_CMD_RM_ANSI, + TERM_CMD_RM_DEC, + TERM_CMD_S7C1T, + TERM_CMD_S8C1T, + TERM_CMD_SCS, + TERM_CMD_SD, + TERM_CMD_SGR, + TERM_CMD_SI, + TERM_CMD_SM_ANSI, + TERM_CMD_SM_DEC, + TERM_CMD_SO, + TERM_CMD_SPA, + TERM_CMD_SS2, + TERM_CMD_SS3, + TERM_CMD_ST, + TERM_CMD_SU, + TERM_CMD_SUB, + TERM_CMD_TBC, + TERM_CMD_VPA, + TERM_CMD_VPR, + TERM_CMD_VT, + TERM_CMD_XTERM_CLLHP, /* xterm-cursor-lower-left-hp-bugfix */ + TERM_CMD_XTERM_IHMT, /* xterm-initiate-highlight-mouse-tracking*/ + TERM_CMD_XTERM_MLHP, /* xterm-memory-lock-hp-bugfix */ + TERM_CMD_XTERM_MUHP, /* xterm-memory-unlock-hp-bugfix */ + TERM_CMD_XTERM_RPM, /* xterm-restore-private-mode */ + TERM_CMD_XTERM_RRV, /* xterm-reset-resource-value */ + TERM_CMD_XTERM_RTM, /* xterm-reset-title-mode */ + TERM_CMD_XTERM_SACL1, /* xterm-set-ansi-conformance-level-1 */ + TERM_CMD_XTERM_SACL2, /* xterm-set-ansi-conformance-level-2 */ + TERM_CMD_XTERM_SACL3, /* xterm-set-ansi-conformance-level-3 */ + TERM_CMD_XTERM_SDCS, /* xterm-set-default-character-set */ + TERM_CMD_XTERM_SGFX, /* xterm-sixel-graphics */ + TERM_CMD_XTERM_SPM, /* xterm-set-private-mode */ + TERM_CMD_XTERM_SRV, /* xterm-set-resource-value */ + TERM_CMD_XTERM_STM, /* xterm-set-title-mode */ + TERM_CMD_XTERM_SUCS, /* xterm-set-utf8-character-set */ + TERM_CMD_XTERM_WM, /* xterm-window-management */ + + TERM_CMD_CNT +}; + +enum { + /* + * Charsets: DEC marks charsets according to "Digital Equ. Corp.". + * NRCS marks charsets according to the "National Replacement + * Character Sets". ISO marks charsets according to ISO-8859. + * The USERDEF charset is special and can be modified by the host. + */ + + TERM_CHARSET_NONE, + + /* 96-compat charsets */ + TERM_CHARSET_ISO_LATIN1_SUPPLEMENTAL, + TERM_CHARSET_BRITISH_NRCS = TERM_CHARSET_ISO_LATIN1_SUPPLEMENTAL, + TERM_CHARSET_ISO_LATIN2_SUPPLEMENTAL, + TERM_CHARSET_AMERICAN_NRCS = TERM_CHARSET_ISO_LATIN2_SUPPLEMENTAL, + TERM_CHARSET_ISO_LATIN5_SUPPLEMENTAL, + TERM_CHARSET_ISO_GREEK_SUPPLEMENTAL, + TERM_CHARSET_ISO_HEBREW_SUPPLEMENTAL, + TERM_CHARSET_ISO_LATIN_CYRILLIC, + + TERM_CHARSET_96_CNT, + + /* 94-compat charsets */ + TERM_CHARSET_DEC_SPECIAL_GRAPHIC = TERM_CHARSET_96_CNT, + TERM_CHARSET_DEC_SUPPLEMENTAL, + TERM_CHARSET_DEC_TECHNICAL, + TERM_CHARSET_CYRILLIC_DEC, + TERM_CHARSET_DUTCH_NRCS, + TERM_CHARSET_FINNISH_NRCS, + TERM_CHARSET_FRENCH_NRCS, + TERM_CHARSET_FRENCH_CANADIAN_NRCS, + TERM_CHARSET_GERMAN_NRCS, + TERM_CHARSET_GREEK_DEC, + TERM_CHARSET_GREEK_NRCS, + TERM_CHARSET_HEBREW_DEC, + TERM_CHARSET_HEBREW_NRCS, + TERM_CHARSET_ITALIAN_NRCS, + TERM_CHARSET_NORWEGIAN_DANISH_NRCS, + TERM_CHARSET_PORTUGUESE_NRCS, + TERM_CHARSET_RUSSIAN_NRCS, + TERM_CHARSET_SCS_NRCS, + TERM_CHARSET_SPANISH_NRCS, + TERM_CHARSET_SWEDISH_NRCS, + TERM_CHARSET_SWISS_NRCS, + TERM_CHARSET_TURKISH_DEC, + TERM_CHARSET_TURKISH_NRCS, + + TERM_CHARSET_94_CNT, + + /* special charsets */ + TERM_CHARSET_USERPREF_SUPPLEMENTAL = TERM_CHARSET_94_CNT, + + TERM_CHARSET_CNT, +}; + +extern term_charset term_unicode_lower; +extern term_charset term_unicode_upper; +extern term_charset term_dec_supplemental_graphics; +extern term_charset term_dec_special_graphics; + +#define TERM_PARSER_ARG_MAX (16) +#define TERM_PARSER_ST_MAX (4096) + +struct term_seq { + unsigned int type; + unsigned int command; + uint32_t terminator; + unsigned int intermediates; + unsigned int charset; + unsigned int n_args; + int args[TERM_PARSER_ARG_MAX]; + unsigned int n_st; + char *st; +}; + +struct term_parser { + term_seq seq; + size_t st_alloc; + unsigned int state; + + bool is_host : 1; +}; + +int term_parser_new(term_parser **out, bool host); +term_parser *term_parser_free(term_parser *parser); +int term_parser_feed(term_parser *parser, const term_seq **seq_out, uint32_t raw); + +#define _term_parser_free_ _cleanup_(term_parser_freep) +DEFINE_TRIVIAL_CLEANUP_FUNC(term_parser*, term_parser_free); diff --git a/src/libsystemd-terminal/term-parser.c b/src/libsystemd-terminal/term-parser.c new file mode 100644 index 000000000..1c968520b --- /dev/null +++ b/src/libsystemd-terminal/term-parser.c @@ -0,0 +1,1626 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ + +/*** + This file is part of systemd. + + Copyright (C) 2014 David Herrmann + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . +***/ + +/* + * Terminal Parser + * This file contains a bunch of UTF-8 helpers and the main ctlseq-parser. The + * parser is a simple state-machine that correctly parses all CSI, DCS, OSC, ST + * control sequences and generic escape sequences. + * The parser itself does not perform any actions but lets the caller react to + * detected sequences. + */ + +#include +#include +#include +#include "macro.h" +#include "term-internal.h" +#include "util.h" + +/** + * term_utf8_encode() - Encode single UCS-4 character as UTF-8 + * @out_utf8: output buffer of at least 4 bytes or NULL + * @g: UCS-4 character to encode + * + * This encodes a single UCS-4 character as UTF-8 and writes it into @out_utf8. + * The length of the character is returned. It is not zero-terminated! If the + * output buffer is NULL, only the length is returned. + * + * Returns: The length in bytes that the UTF-8 representation does or would + * occupy. + */ +size_t term_utf8_encode(char *out_utf8, uint32_t g) { + if (g < (1 << 7)) { + if (out_utf8) + out_utf8[0] = g & 0x7f; + return 1; + } else if (g < (1 << 11)) { + if (out_utf8) { + out_utf8[0] = 0xc0 | ((g >> 6) & 0x1f); + out_utf8[1] = 0x80 | (g & 0x3f); + } + return 2; + } else if (g < (1 << 16)) { + if (out_utf8) { + out_utf8[0] = 0xe0 | ((g >> 12) & 0x0f); + out_utf8[1] = 0x80 | ((g >> 6) & 0x3f); + out_utf8[2] = 0x80 | (g & 0x3f); + } + return 3; + } else if (g < (1 << 21)) { + if (out_utf8) { + out_utf8[0] = 0xf0 | ((g >> 18) & 0x07); + out_utf8[1] = 0x80 | ((g >> 12) & 0x3f); + out_utf8[2] = 0x80 | ((g >> 6) & 0x3f); + out_utf8[3] = 0x80 | (g & 0x3f); + } + return 4; + } else { + return 0; + } +} + +/** + * term_utf8_decode() - Try decoding the next UCS-4 character + * @p: decoder object to operate on or NULL + * @out_len: output buffer for length of decoded UCS-4 string or NULL + * @c: next char to push into decoder + * + * This decodes a UTF-8 stream. It must be called for each input-byte of the + * UTF-8 stream and returns a UCS-4 stream. The length of the returned UCS-4 + * string (number of parsed characters) is stored in @out_len if non-NULL. A + * pointer to the string is returned (or NULL if none was parsed). The string + * is not zero-terminated! Furthermore, the string is only valid until the next + * invokation of this function. It is also bound to the parser-state @p. + * + * This function is highly optimized to work with terminal-emulators. Instead + * of being strict about UTF-8 validity, this tries to perform a fallback to + * ISO-8859-1 in case a wrong series was detected. Therefore, this function + * might return multiple UCS-4 characters by parsing just a single UTF-8 byte. + * + * The parser state @p should be allocated and managed by the caller. There're + * no helpers to do that for you. To initialize it, simply reset it to all + * zero. You can reset or free the object at any point in time. + * + * Returns: Pointer to the UCS-4 string or NULL. + */ +const uint32_t *term_utf8_decode(term_utf8 *p, size_t *out_len, char c) { + uint32_t t, *res = NULL; + uint8_t byte; + size_t len = 0; + + if (!p) + goto out; + + byte = c; + + if (!p->valid || p->i_bytes >= p->n_bytes) { + /* + * If the previous sequence was invalid or fully parsed, start + * parsing a fresh new sequence. + */ + + if ((byte & 0xE0) == 0xC0) { + /* start of two byte sequence */ + t = byte & 0x1F; + p->n_bytes = 2; + p->i_bytes = 1; + p->valid = 1; + } else if ((byte & 0xF0) == 0xE0) { + /* start of three byte sequence */ + t = byte & 0x0F; + p->n_bytes = 3; + p->i_bytes = 1; + p->valid = 1; + } else if ((byte & 0xF8) == 0xF0) { + /* start of four byte sequence */ + t = byte & 0x07; + p->n_bytes = 4; + p->i_bytes = 1; + p->valid = 1; + } else { + /* Either of: + * - single ASCII 7-bit char + * - out-of-sync continuation byte + * - overlong encoding + * All of them are treated as single byte ISO-8859-1 */ + t = byte; + p->n_bytes = 1; + p->i_bytes = 1; + p->valid = 0; + } + + p->chars[0] = byte; + p->ucs4 = t << (6 * (p->n_bytes - p->i_bytes)); + } else { + /* + * ..otherwise, try to continue the previous sequence.. + */ + + if ((byte & 0xC0) == 0x80) { + /* + * Valid continuation byte. Append to sequence and + * update the ucs4 cache accordingly. + */ + + t = byte & 0x3F; + p->chars[p->i_bytes++] = byte; + p->ucs4 |= t << (6 * (p->n_bytes - p->i_bytes)); + } else { + /* + * Invalid continuation? Treat cached sequence as + * ISO-8859-1, but parse the new char as valid new + * starting character. If it's a new single-byte UTF-8 + * sequence, we immediately return it in the same run, + * otherwise, we might suffer from starvation. + */ + + if ((byte & 0xE0) == 0xC0 || + (byte & 0xF0) == 0xE0 || + (byte & 0xF8) == 0xF0) { + /* + * New multi-byte sequence. Move to-be-returned + * data at the end and start new sequence. Only + * return the old sequence. + */ + + memmove(p->chars + 1, + p->chars, + sizeof(*p->chars) * p->i_bytes); + res = p->chars + 1; + len = p->i_bytes; + + if ((byte & 0xE0) == 0xC0) { + /* start of two byte sequence */ + t = byte & 0x1F; + p->n_bytes = 2; + p->i_bytes = 1; + p->valid = 1; + } else if ((byte & 0xF0) == 0xE0) { + /* start of three byte sequence */ + t = byte & 0x0F; + p->n_bytes = 3; + p->i_bytes = 1; + p->valid = 1; + } else if ((byte & 0xF8) == 0xF0) { + /* start of four byte sequence */ + t = byte & 0x07; + p->n_bytes = 4; + p->i_bytes = 1; + p->valid = 1; + } + + p->chars[0] = byte; + p->ucs4 = t << (6 * (p->n_bytes - p->i_bytes)); + + goto out; + } else { + /* + * New single byte sequence, append to output + * and return combined sequence. + */ + + p->chars[p->i_bytes++] = byte; + p->valid = 0; + } + } + } + + /* + * Check whether a full sequence (valid or invalid) has been parsed and + * then return it. Otherwise, return nothing. + */ + if (p->valid) { + /* still parsing? then bail out */ + if (p->i_bytes < p->n_bytes) + goto out; + + res = &p->ucs4; + len = 1; + } else { + res = p->chars; + len = p->i_bytes; + } + + p->valid = 0; + p->i_bytes = 0; + p->n_bytes = 0; + +out: + if (out_len) + *out_len = len; + return len > 0 ? res : NULL; +} + +/* + * Command Parser + * The ctl-seq parser "term_parser" only detects whole sequences, it does not + * detect the specific command. Once a sequence is parsed, the command-parsers + * are used to figure out their meaning. Note that this depends on whether we + * run on the host or terminal side. + */ + +static unsigned int term_parse_host_control(const term_seq *seq) { + assert_return(seq, TERM_CMD_NONE); + + switch (seq->terminator) { + case 0x00: /* NUL */ + return TERM_CMD_NULL; + case 0x05: /* ENQ */ + return TERM_CMD_ENQ; + case 0x07: /* BEL */ + return TERM_CMD_BEL; + case 0x08: /* BS */ + return TERM_CMD_BS; + case 0x09: /* HT */ + return TERM_CMD_HT; + case 0x0a: /* LF */ + return TERM_CMD_LF; + case 0x0b: /* VT */ + return TERM_CMD_VT; + case 0x0c: /* FF */ + return TERM_CMD_FF; + case 0x0d: /* CR */ + return TERM_CMD_CR; + case 0x0e: /* SO */ + return TERM_CMD_SO; + case 0x0f: /* SI */ + return TERM_CMD_SI; + case 0x11: /* DC1 */ + return TERM_CMD_DC1; + case 0x13: /* DC3 */ + return TERM_CMD_DC3; + case 0x18: /* CAN */ + /* this is already handled by the state-machine */ + break; + case 0x1a: /* SUB */ + return TERM_CMD_SUB; + case 0x1b: /* ESC */ + /* this is already handled by the state-machine */ + break; + case 0x1f: /* DEL */ + /* this is already handled by the state-machine */ + break; + case 0x84: /* IND */ + return TERM_CMD_IND; + case 0x85: /* NEL */ + return TERM_CMD_NEL; + case 0x88: /* HTS */ + return TERM_CMD_HTS; + case 0x8d: /* RI */ + return TERM_CMD_RI; + case 0x8e: /* SS2 */ + return TERM_CMD_SS2; + case 0x8f: /* SS3 */ + return TERM_CMD_SS3; + case 0x90: /* DCS */ + /* this is already handled by the state-machine */ + break; + case 0x96: /* SPA */ + return TERM_CMD_SPA; + case 0x97: /* EPA */ + return TERM_CMD_EPA; + case 0x98: /* SOS */ + /* this is already handled by the state-machine */ + break; + case 0x9a: /* DECID */ + return TERM_CMD_DECID; + case 0x9b: /* CSI */ + /* this is already handled by the state-machine */ + break; + case 0x9c: /* ST */ + return TERM_CMD_ST; + case 0x9d: /* OSC */ + /* this is already handled by the state-machine */ + break; + case 0x9e: /* PM */ + /* this is already handled by the state-machine */ + break; + case 0x9f: /* APC */ + /* this is already handled by the state-machine */ + break; + } + + return TERM_CMD_NONE; +} + +static inline int charset_from_cmd(uint32_t raw, unsigned int flags, bool require_96) { + static const struct { + uint32_t raw; + unsigned int flags; + } charset_cmds[] = { + /* 96-compat charsets */ + [TERM_CHARSET_ISO_LATIN1_SUPPLEMENTAL] = { .raw = 'A', .flags = 0 }, + [TERM_CHARSET_ISO_LATIN2_SUPPLEMENTAL] = { .raw = 'B', .flags = 0 }, + [TERM_CHARSET_ISO_LATIN5_SUPPLEMENTAL] = { .raw = 'M', .flags = 0 }, + [TERM_CHARSET_ISO_GREEK_SUPPLEMENTAL] = { .raw = 'F', .flags = 0 }, + [TERM_CHARSET_ISO_HEBREW_SUPPLEMENTAL] = { .raw = 'H', .flags = 0 }, + [TERM_CHARSET_ISO_LATIN_CYRILLIC] = { .raw = 'L', .flags = 0 }, + + /* 94-compat charsets */ + [TERM_CHARSET_DEC_SPECIAL_GRAPHIC] = { .raw = '0', .flags = 0 }, + [TERM_CHARSET_DEC_SUPPLEMENTAL] = { .raw = '5', .flags = TERM_SEQ_FLAG_PERCENT }, + [TERM_CHARSET_DEC_TECHNICAL] = { .raw = '>', .flags = 0 }, + [TERM_CHARSET_CYRILLIC_DEC] = { .raw = '4', .flags = TERM_SEQ_FLAG_AND }, + [TERM_CHARSET_DUTCH_NRCS] = { .raw = '4', .flags = 0 }, + [TERM_CHARSET_FINNISH_NRCS] = { .raw = '5', .flags = 0 }, + [TERM_CHARSET_FRENCH_NRCS] = { .raw = 'R', .flags = 0 }, + [TERM_CHARSET_FRENCH_CANADIAN_NRCS] = { .raw = '9', .flags = 0 }, + [TERM_CHARSET_GERMAN_NRCS] = { .raw = 'K', .flags = 0 }, + [TERM_CHARSET_GREEK_DEC] = { .raw = '?', .flags = TERM_SEQ_FLAG_DQUOTE }, + [TERM_CHARSET_GREEK_NRCS] = { .raw = '>', .flags = TERM_SEQ_FLAG_DQUOTE }, + [TERM_CHARSET_HEBREW_DEC] = { .raw = '4', .flags = TERM_SEQ_FLAG_DQUOTE }, + [TERM_CHARSET_HEBREW_NRCS] = { .raw = '=', .flags = TERM_SEQ_FLAG_PERCENT }, + [TERM_CHARSET_ITALIAN_NRCS] = { .raw = 'Y', .flags = 0 }, + [TERM_CHARSET_NORWEGIAN_DANISH_NRCS] = { .raw = '`', .flags = 0 }, + [TERM_CHARSET_PORTUGUESE_NRCS] = { .raw = '6', .flags = TERM_SEQ_FLAG_PERCENT }, + [TERM_CHARSET_RUSSIAN_NRCS] = { .raw = '5', .flags = TERM_SEQ_FLAG_AND }, + [TERM_CHARSET_SCS_NRCS] = { .raw = '3', .flags = TERM_SEQ_FLAG_PERCENT }, + [TERM_CHARSET_SPANISH_NRCS] = { .raw = 'Z', .flags = 0 }, + [TERM_CHARSET_SWEDISH_NRCS] = { .raw = '7', .flags = 0 }, + [TERM_CHARSET_SWISS_NRCS] = { .raw = '=', .flags = 0 }, + [TERM_CHARSET_TURKISH_DEC] = { .raw = '0', .flags = TERM_SEQ_FLAG_PERCENT }, + [TERM_CHARSET_TURKISH_NRCS] = { .raw = '2', .flags = TERM_SEQ_FLAG_PERCENT }, + + /* special charsets */ + [TERM_CHARSET_USERPREF_SUPPLEMENTAL] = { .raw = '<', .flags = 0 }, + + /* secondary choices */ + [TERM_CHARSET_CNT + TERM_CHARSET_FINNISH_NRCS] = { .raw = 'C', .flags = 0 }, + [TERM_CHARSET_CNT + TERM_CHARSET_FRENCH_NRCS] = { .raw = 'f', .flags = 0 }, + [TERM_CHARSET_CNT + TERM_CHARSET_FRENCH_CANADIAN_NRCS] = { .raw = 'Q', .flags = 0 }, + [TERM_CHARSET_CNT + TERM_CHARSET_NORWEGIAN_DANISH_NRCS] = { .raw = 'E', .flags = 0 }, + [TERM_CHARSET_CNT + TERM_CHARSET_SWEDISH_NRCS] = { .raw = 'H', .flags = 0 }, /* unused; conflicts with ISO_HEBREW */ + + /* tertiary choices */ + [TERM_CHARSET_CNT + TERM_CHARSET_CNT + TERM_CHARSET_NORWEGIAN_DANISH_NRCS] = { .raw = '6', .flags = 0 }, + }; + size_t i, cs; + + /* + * Secondary choice on SWEDISH_NRCS and primary choice on + * ISO_HEBREW_SUPPLEMENTAL have a conflict: raw=="H", flags==0. + * We always choose the ISO 96-compat set, which is what VT510 does. + */ + + for (i = 0; i < ELEMENTSOF(charset_cmds); ++i) { + if (charset_cmds[i].raw == raw && charset_cmds[i].flags == flags) { + cs = i; + while (cs >= TERM_CHARSET_CNT) + cs -= TERM_CHARSET_CNT; + + if (!require_96 || cs < TERM_CHARSET_96_CNT || cs >= TERM_CHARSET_94_CNT) + return cs; + } + } + + return -ENOENT; +} + +/* true if exactly one bit in @value is set */ +static inline bool exactly_one_bit_set(unsigned int value) { + return __builtin_popcount(value) == 1; +} + +static unsigned int term_parse_host_escape(const term_seq *seq, unsigned int *cs_out) { + unsigned int t, flags; + int cs; + + assert_return(seq, TERM_CMD_NONE); + + flags = seq->intermediates; + t = TERM_SEQ_FLAG_POPEN | TERM_SEQ_FLAG_PCLOSE | TERM_SEQ_FLAG_MULT | + TERM_SEQ_FLAG_PLUS | TERM_SEQ_FLAG_MINUS | TERM_SEQ_FLAG_DOT | + TERM_SEQ_FLAG_SLASH; + + if (exactly_one_bit_set(flags & t)) { + switch (flags & t) { + case TERM_SEQ_FLAG_POPEN: + case TERM_SEQ_FLAG_PCLOSE: + case TERM_SEQ_FLAG_MULT: + case TERM_SEQ_FLAG_PLUS: + cs = charset_from_cmd(seq->terminator, flags & ~t, false); + break; + case TERM_SEQ_FLAG_MINUS: + case TERM_SEQ_FLAG_DOT: + case TERM_SEQ_FLAG_SLASH: + cs = charset_from_cmd(seq->terminator, flags & ~t, true); + break; + default: + cs = -ENOENT; + break; + } + + if (cs >= 0) { + if (cs_out) + *cs_out = cs; + return TERM_CMD_SCS; + } + + /* looked like a charset-cmd but wasn't; continue */ + } + + switch (seq->terminator) { + case '3': + if (flags == TERM_SEQ_FLAG_HASH) /* DECDHL top-half */ + return TERM_CMD_DECDHL_TH; + break; + case '4': + if (flags == TERM_SEQ_FLAG_HASH) /* DECDHL bottom-half */ + return TERM_CMD_DECDHL_BH; + break; + case '5': + if (flags == TERM_SEQ_FLAG_HASH) /* DECSWL */ + return TERM_CMD_DECSWL; + break; + case '6': + if (flags == 0) /* DECBI */ + return TERM_CMD_DECBI; + else if (flags == TERM_SEQ_FLAG_HASH) /* DECDWL */ + return TERM_CMD_DECDWL; + break; + case '7': + if (flags == 0) /* DECSC */ + return TERM_CMD_DECSC; + break; + case '8': + if (flags == 0) /* DECRC */ + return TERM_CMD_DECRC; + else if (flags == TERM_SEQ_FLAG_HASH) /* DECALN */ + return TERM_CMD_DECALN; + break; + case '9': + if (flags == 0) /* DECFI */ + return TERM_CMD_DECFI; + break; + case '<': + if (flags == 0) /* DECANM */ + return TERM_CMD_DECANM; + break; + case '=': + if (flags == 0) /* DECKPAM */ + return TERM_CMD_DECKPAM; + break; + case '>': + if (flags == 0) /* DECKPNM */ + return TERM_CMD_DECKPNM; + break; + case '@': + if (flags == TERM_SEQ_FLAG_PERCENT) { + /* Select default character set */ + return TERM_CMD_XTERM_SDCS; + } + break; + case 'D': + if (flags == 0) /* IND */ + return TERM_CMD_IND; + break; + case 'E': + if (flags == 0) /* NEL */ + return TERM_CMD_NEL; + break; + case 'F': + if (flags == 0) /* Cursor to lower-left corner of screen */ + return TERM_CMD_XTERM_CLLHP; + else if (flags == TERM_SEQ_FLAG_SPACE) /* S7C1T */ + return TERM_CMD_S7C1T; + break; + case 'G': + if (flags == TERM_SEQ_FLAG_SPACE) { /* S8C1T */ + return TERM_CMD_S8C1T; + } else if (flags == TERM_SEQ_FLAG_PERCENT) { + /* Select UTF-8 character set */ + return TERM_CMD_XTERM_SUCS; + } + break; + case 'H': + if (flags == 0) /* HTS */ + return TERM_CMD_HTS; + break; + case 'L': + if (flags == TERM_SEQ_FLAG_SPACE) { + /* Set ANSI conformance level 1 */ + return TERM_CMD_XTERM_SACL1; + } + break; + case 'M': + if (flags == 0) { /* RI */ + return TERM_CMD_RI; + } else if (flags == TERM_SEQ_FLAG_SPACE) { + /* Set ANSI conformance level 2 */ + return TERM_CMD_XTERM_SACL2; + } + break; + case 'N': + if (flags == 0) { /* SS2 */ + return TERM_CMD_SS2; + } else if (flags == TERM_SEQ_FLAG_SPACE) { + /* Set ANSI conformance level 3 */ + return TERM_CMD_XTERM_SACL3; + } + break; + case 'O': + if (flags == 0) /* SS3 */ + return TERM_CMD_SS3; + break; + case 'P': + if (flags == 0) /* DCS: this is already handled by the state-machine */ + return 0; + break; + case 'V': + if (flags == 0) /* SPA */ + return TERM_CMD_SPA; + break; + case 'W': + if (flags == 0) /* EPA */ + return TERM_CMD_EPA; + break; + case 'X': + if (flags == 0) { /* SOS */ + /* this is already handled by the state-machine */ + break; + } + break; + case 'Z': + if (flags == 0) /* DECID */ + return TERM_CMD_DECID; + break; + case '[': + if (flags == 0) { /* CSI */ + /* this is already handled by the state-machine */ + break; + } + break; + case '\\': + if (flags == 0) /* ST */ + return TERM_CMD_ST; + break; + case ']': + if (flags == 0) { /* OSC */ + /* this is already handled by the state-machine */ + break; + } + break; + case '^': + if (flags == 0) { /* PM */ + /* this is already handled by the state-machine */ + break; + } + break; + case '_': + if (flags == 0) { /* APC */ + /* this is already handled by the state-machine */ + break; + } + break; + case 'c': + if (flags == 0) /* RIS */ + return TERM_CMD_RIS; + break; + case 'l': + if (flags == 0) /* Memory lock */ + return TERM_CMD_XTERM_MLHP; + break; + case 'm': + if (flags == 0) /* Memory unlock */ + return TERM_CMD_XTERM_MUHP; + break; + case 'n': + if (flags == 0) /* LS2 */ + return TERM_CMD_LS2; + break; + case 'o': + if (flags == 0) /* LS3 */ + return TERM_CMD_LS3; + break; + case '|': + if (flags == 0) /* LS3R */ + return TERM_CMD_LS3R; + break; + case '}': + if (flags == 0) /* LS2R */ + return TERM_CMD_LS2R; + break; + case '~': + if (flags == 0) /* LS1R */ + return TERM_CMD_LS1R; + break; + } + + return TERM_CMD_NONE; +} + +static unsigned int term_parse_host_csi(const term_seq *seq) { + unsigned int flags; + + assert_return(seq, TERM_CMD_NONE); + + flags = seq->intermediates; + + switch (seq->terminator) { + case 'A': + if (flags == 0) /* CUU */ + return TERM_CMD_CUU; + break; + case 'a': + if (flags == 0) /* HPR */ + return TERM_CMD_HPR; + break; + case 'B': + if (flags == 0) /* CUD */ + return TERM_CMD_CUD; + break; + case 'b': + if (flags == 0) /* REP */ + return TERM_CMD_REP; + break; + case 'C': + if (flags == 0) /* CUF */ + return TERM_CMD_CUF; + break; + case 'c': + if (flags == 0) /* DA1 */ + return TERM_CMD_DA1; + else if (flags == TERM_SEQ_FLAG_GT) /* DA2 */ + return TERM_CMD_DA2; + else if (flags == TERM_SEQ_FLAG_EQUAL) /* DA3 */ + return TERM_CMD_DA3; + break; + case 'D': + if (flags == 0) /* CUB */ + return TERM_CMD_CUB; + break; + case 'd': + if (flags == 0) /* VPA */ + return TERM_CMD_VPA; + break; + case 'E': + if (flags == 0) /* CNL */ + return TERM_CMD_CNL; + break; + case 'e': + if (flags == 0) /* VPR */ + return TERM_CMD_VPR; + break; + case 'F': + if (flags == 0) /* CPL */ + return TERM_CMD_CPL; + break; + case 'f': + if (flags == 0) /* HVP */ + return TERM_CMD_HVP; + break; + case 'G': + if (flags == 0) /* CHA */ + return TERM_CMD_CHA; + break; + case 'g': + if (flags == 0) /* TBC */ + return TERM_CMD_TBC; + else if (flags == TERM_SEQ_FLAG_MULT) /* DECLFKC */ + return TERM_CMD_DECLFKC; + break; + case 'H': + if (flags == 0) /* CUP */ + return TERM_CMD_CUP; + break; + case 'h': + if (flags == 0) /* SM ANSI */ + return TERM_CMD_SM_ANSI; + else if (flags == TERM_SEQ_FLAG_WHAT) /* SM DEC */ + return TERM_CMD_SM_DEC; + break; + case 'I': + if (flags == 0) /* CHT */ + return TERM_CMD_CHT; + break; + case 'i': + if (flags == 0) /* MC ANSI */ + return TERM_CMD_MC_ANSI; + else if (flags == TERM_SEQ_FLAG_WHAT) /* MC DEC */ + return TERM_CMD_MC_DEC; + break; + case 'J': + if (flags == 0) /* ED */ + return TERM_CMD_ED; + else if (flags == TERM_SEQ_FLAG_WHAT) /* DECSED */ + return TERM_CMD_DECSED; + break; + case 'K': + if (flags == 0) /* EL */ + return TERM_CMD_EL; + else if (flags == TERM_SEQ_FLAG_WHAT) /* DECSEL */ + return TERM_CMD_DECSEL; + break; + case 'L': + if (flags == 0) /* IL */ + return TERM_CMD_IL; + break; + case 'l': + if (flags == 0) /* RM ANSI */ + return TERM_CMD_RM_ANSI; + else if (flags == TERM_SEQ_FLAG_WHAT) /* RM DEC */ + return TERM_CMD_RM_DEC; + break; + case 'M': + if (flags == 0) /* DL */ + return TERM_CMD_DL; + break; + case 'm': + if (flags == 0) /* SGR */ + return TERM_CMD_SGR; + else if (flags == TERM_SEQ_FLAG_GT) /* XTERM SMR */ + return TERM_CMD_XTERM_SRV; + break; + case 'n': + if (flags == 0) /* DSR ANSI */ + return TERM_CMD_DSR_ANSI; + else if (flags == TERM_SEQ_FLAG_GT) /* XTERM RMR */ + return TERM_CMD_XTERM_RRV; + else if (flags == TERM_SEQ_FLAG_WHAT) /* DSR DEC */ + return TERM_CMD_DSR_DEC; + break; + case 'P': + if (flags == 0) /* DCH */ + return TERM_CMD_DCH; + else if (flags == TERM_SEQ_FLAG_SPACE) /* PPA */ + return TERM_CMD_PPA; + break; + case 'p': + if (flags == 0) /* DECSSL */ + return TERM_CMD_DECSSL; + else if (flags == TERM_SEQ_FLAG_SPACE) /* DECSSCLS */ + return TERM_CMD_DECSSCLS; + else if (flags == TERM_SEQ_FLAG_BANG) /* DECSTR */ + return TERM_CMD_DECSTR; + else if (flags == TERM_SEQ_FLAG_DQUOTE) /* DECSCL */ + return TERM_CMD_DECSCL; + else if (flags == TERM_SEQ_FLAG_CASH) /* DECRQM-ANSI */ + return TERM_CMD_DECRQM_ANSI; + else if (flags == (TERM_SEQ_FLAG_CASH | TERM_SEQ_FLAG_WHAT)) /* DECRQM-DEC */ + return TERM_CMD_DECRQM_DEC; + else if (flags == TERM_SEQ_FLAG_PCLOSE) /* DECSDPT */ + return TERM_CMD_DECSDPT; + else if (flags == TERM_SEQ_FLAG_MULT) /* DECSPPCS */ + return TERM_CMD_DECSPPCS; + else if (flags == TERM_SEQ_FLAG_PLUS) /* DECSR */ + return TERM_CMD_DECSR; + else if (flags == TERM_SEQ_FLAG_COMMA) /* DECLTOD */ + return TERM_CMD_DECLTOD; + else if (flags == TERM_SEQ_FLAG_GT) /* XTERM SPM */ + return TERM_CMD_XTERM_SPM; + break; + case 'Q': + if (flags == TERM_SEQ_FLAG_SPACE) /* PPR */ + return TERM_CMD_PPR; + break; + case 'q': + if (flags == 0) /* DECLL */ + return TERM_CMD_DECLL; + else if (flags == TERM_SEQ_FLAG_SPACE) /* DECSCUSR */ + return TERM_CMD_DECSCUSR; + else if (flags == TERM_SEQ_FLAG_DQUOTE) /* DECSCA */ + return TERM_CMD_DECSCA; + else if (flags == TERM_SEQ_FLAG_CASH) /* DECSDDT */ + return TERM_CMD_DECSDDT; + else if (flags == TERM_SEQ_FLAG_MULT) /* DECSRC */ + return TERM_CMD_DECSR; + else if (flags == TERM_SEQ_FLAG_PLUS) /* DECELF */ + return TERM_CMD_DECELF; + else if (flags == TERM_SEQ_FLAG_COMMA) /* DECTID */ + return TERM_CMD_DECTID; + break; + case 'R': + if (flags == TERM_SEQ_FLAG_SPACE) /* PPB */ + return TERM_CMD_PPB; + break; + case 'r': + if (flags == 0) { + /* DECSTBM */ + return TERM_CMD_DECSTBM; + } else if (flags == TERM_SEQ_FLAG_SPACE) { + /* DECSKCV */ + return TERM_CMD_DECSKCV; + } else if (flags == TERM_SEQ_FLAG_CASH) { + /* DECCARA */ + return TERM_CMD_DECCARA; + } else if (flags == TERM_SEQ_FLAG_MULT) { + /* DECSCS */ + return TERM_CMD_DECSCS; + } else if (flags == TERM_SEQ_FLAG_PLUS) { + /* DECSMKR */ + return TERM_CMD_DECSMKR; + } else if (flags == TERM_SEQ_FLAG_WHAT) { + /* + * There's a conflict between DECPCTERM and XTERM-RPM. + * XTERM-RPM takes a single argument, DECPCTERM takes 2. + * Split both up and forward the call to the closer + * match. + */ + if (seq->n_args <= 1) /* XTERM RPM */ + return TERM_CMD_XTERM_RPM; + else if (seq->n_args >= 2) /* DECPCTERM */ + return TERM_CMD_DECPCTERM; + } + break; + case 'S': + if (flags == 0) /* SU */ + return TERM_CMD_SU; + else if (flags == TERM_SEQ_FLAG_WHAT) /* XTERM SGFX */ + return TERM_CMD_XTERM_SGFX; + break; + case 's': + if (flags == 0) { + /* + * There's a conflict between DECSLRM and SC-ANSI which + * cannot be resolved without knowing the state of + * DECLRMM. We leave that decision up to the caller. + */ + return TERM_CMD_DECSLRM_OR_SC; + } else if (flags == TERM_SEQ_FLAG_CASH) { + /* DECSPRTT */ + return TERM_CMD_DECSPRTT; + } else if (flags == TERM_SEQ_FLAG_MULT) { + /* DECSFC */ + return TERM_CMD_DECSFC; + } else if (flags == TERM_SEQ_FLAG_WHAT) { + /* XTERM SPM */ + return TERM_CMD_XTERM_SPM; + } + break; + case 'T': + if (flags == 0) { + /* + * Awesome: There's a conflict between SD and XTERM IHMT + * that we have to resolve by checking the parameter + * count.. XTERM_IHMT needs exactly 5 arguments, SD + * takes 0 or 1. We're conservative here and give both + * a wider range to allow unused arguments (compat...). + */ + if (seq->n_args >= 5) { + /* XTERM IHMT */ + return TERM_CMD_XTERM_IHMT; + } else if (seq->n_args < 5) { + /* SD */ + return TERM_CMD_SD; + } + } else if (flags == TERM_SEQ_FLAG_GT) { + /* XTERM RTM */ + return TERM_CMD_XTERM_RTM; + } + break; + case 't': + if (flags == 0) { + if (seq->n_args > 0 && seq->args[0] < 24) { + /* XTERM WM */ + return TERM_CMD_XTERM_WM; + } else { + /* DECSLPP */ + return TERM_CMD_DECSLPP; + } + } else if (flags == TERM_SEQ_FLAG_SPACE) { + /* DECSWBV */ + return TERM_CMD_DECSWBV; + } else if (flags == TERM_SEQ_FLAG_DQUOTE) { + /* DECSRFR */ + return TERM_CMD_DECSRFR; + } else if (flags == TERM_SEQ_FLAG_CASH) { + /* DECRARA */ + return TERM_CMD_DECRARA; + } else if (flags == TERM_SEQ_FLAG_GT) { + /* XTERM STM */ + return TERM_CMD_XTERM_STM; + } + break; + case 'U': + if (flags == 0) /* NP */ + return TERM_CMD_NP; + break; + case 'u': + if (flags == 0) { + /* RC */ + return TERM_CMD_RC; + } else if (flags == TERM_SEQ_FLAG_SPACE) { + /* DECSMBV */ + return TERM_CMD_DECSMBV; + } else if (flags == TERM_SEQ_FLAG_DQUOTE) { + /* DECSTRL */ + return TERM_CMD_DECSTRL; + } else if (flags == TERM_SEQ_FLAG_WHAT) { + /* DECRQUPSS */ + return TERM_CMD_DECRQUPSS; + } else if (seq->args[0] == 1 && flags == TERM_SEQ_FLAG_CASH) { + /* DECRQTSR */ + return TERM_CMD_DECRQTSR; + } else if (flags == TERM_SEQ_FLAG_MULT) { + /* DECSCP */ + return TERM_CMD_DECSCP; + } else if (flags == TERM_SEQ_FLAG_COMMA) { + /* DECRQKT */ + return TERM_CMD_DECRQKT; + } + break; + case 'V': + if (flags == 0) /* PP */ + return TERM_CMD_PP; + break; + case 'v': + if (flags == TERM_SEQ_FLAG_SPACE) /* DECSLCK */ + return TERM_CMD_DECSLCK; + else if (flags == TERM_SEQ_FLAG_DQUOTE) /* DECRQDE */ + return TERM_CMD_DECRQDE; + else if (flags == TERM_SEQ_FLAG_CASH) /* DECCRA */ + return TERM_CMD_DECCRA; + else if (flags == TERM_SEQ_FLAG_COMMA) /* DECRPKT */ + return TERM_CMD_DECRPKT; + break; + case 'W': + if (seq->args[0] == 5 && flags == TERM_SEQ_FLAG_WHAT) { + /* DECST8C */ + return TERM_CMD_DECST8C; + } + break; + case 'w': + if (flags == TERM_SEQ_FLAG_CASH) /* DECRQPSR */ + return TERM_CMD_DECRQPSR; + else if (flags == TERM_SEQ_FLAG_SQUOTE) /* DECEFR */ + return TERM_CMD_DECEFR; + else if (flags == TERM_SEQ_FLAG_PLUS) /* DECSPP */ + return TERM_CMD_DECSPP; + break; + case 'X': + if (flags == 0) /* ECH */ + return TERM_CMD_ECH; + break; + case 'x': + if (flags == 0) /* DECREQTPARM */ + return TERM_CMD_DECREQTPARM; + else if (flags == TERM_SEQ_FLAG_CASH) /* DECFRA */ + return TERM_CMD_DECFRA; + else if (flags == TERM_SEQ_FLAG_MULT) /* DECSACE */ + return TERM_CMD_DECSACE; + else if (flags == TERM_SEQ_FLAG_PLUS) /* DECRQPKFM */ + return TERM_CMD_DECRQPKFM; + break; + case 'y': + if (flags == 0) /* DECTST */ + return TERM_CMD_DECTST; + else if (flags == TERM_SEQ_FLAG_MULT) /* DECRQCRA */ + return TERM_CMD_DECRQCRA; + else if (flags == TERM_SEQ_FLAG_PLUS) /* DECPKFMR */ + return TERM_CMD_DECPKFMR; + break; + case 'Z': + if (flags == 0) /* CBT */ + return TERM_CMD_CBT; + break; + case 'z': + if (flags == TERM_SEQ_FLAG_CASH) /* DECERA */ + return TERM_CMD_DECERA; + else if (flags == TERM_SEQ_FLAG_SQUOTE) /* DECELR */ + return TERM_CMD_DECELR; + else if (flags == TERM_SEQ_FLAG_MULT) /* DECINVM */ + return TERM_CMD_DECINVM; + else if (flags == TERM_SEQ_FLAG_PLUS) /* DECPKA */ + return TERM_CMD_DECPKA; + break; + case '@': + if (flags == 0) /* ICH */ + return TERM_CMD_ICH; + break; + case '`': + if (flags == 0) /* HPA */ + return TERM_CMD_HPA; + break; + case '{': + if (flags == TERM_SEQ_FLAG_CASH) /* DECSERA */ + return TERM_CMD_DECSERA; + else if (flags == TERM_SEQ_FLAG_SQUOTE) /* DECSLE */ + return TERM_CMD_DECSLE; + break; + case '|': + if (flags == TERM_SEQ_FLAG_CASH) /* DECSCPP */ + return TERM_CMD_DECSCPP; + else if (flags == TERM_SEQ_FLAG_SQUOTE) /* DECRQLP */ + return TERM_CMD_DECRQLP; + else if (flags == TERM_SEQ_FLAG_MULT) /* DECSNLS */ + return TERM_CMD_DECSNLS; + break; + case '}': + if (flags == TERM_SEQ_FLAG_SPACE) /* DECKBD */ + return TERM_CMD_DECKBD; + else if (flags == TERM_SEQ_FLAG_CASH) /* DECSASD */ + return TERM_CMD_DECSASD; + else if (flags == TERM_SEQ_FLAG_SQUOTE) /* DECIC */ + return TERM_CMD_DECIC; + break; + case '~': + if (flags == TERM_SEQ_FLAG_SPACE) /* DECTME */ + return TERM_CMD_DECTME; + else if (flags == TERM_SEQ_FLAG_CASH) /* DECSSDT */ + return TERM_CMD_DECSSDT; + else if (flags == TERM_SEQ_FLAG_SQUOTE) /* DECDC */ + return TERM_CMD_DECDC; + break; + } + + return TERM_CMD_NONE; +} + +/* + * State Machine + * This parser controls the parser-state and returns any detected sequence to + * the caller. The parser is based on this state-diagram from Paul Williams: + * http://vt100.net/emu/ + * It was written from scratch and extended where needed. + * This parser is fully compatible up to the vt500 series. We expect UCS-4 as + * input. It's the callers responsibility to do any UTF-8 parsing. + */ + +enum parser_state { + STATE_NONE, /* placeholder */ + STATE_GROUND, /* initial state and ground */ + STATE_ESC, /* ESC sequence was started */ + STATE_ESC_INT, /* intermediate escape characters */ + STATE_CSI_ENTRY, /* starting CSI sequence */ + STATE_CSI_PARAM, /* CSI parameters */ + STATE_CSI_INT, /* intermediate CSI characters */ + STATE_CSI_IGNORE, /* CSI error; ignore this CSI sequence */ + STATE_DCS_ENTRY, /* starting DCS sequence */ + STATE_DCS_PARAM, /* DCS parameters */ + STATE_DCS_INT, /* intermediate DCS characters */ + STATE_DCS_PASS, /* DCS data passthrough */ + STATE_DCS_IGNORE, /* DCS error; ignore this DCS sequence */ + STATE_OSC_STRING, /* parsing OSC sequence */ + STATE_ST_IGNORE, /* unimplemented seq; ignore until ST */ + STATE_NUM +}; + +enum parser_action { + ACTION_NONE, /* placeholder */ + ACTION_CLEAR, /* clear parameters */ + ACTION_IGNORE, /* ignore the character entirely */ + ACTION_PRINT, /* print the character on the console */ + ACTION_EXECUTE, /* execute single control character (C0/C1) */ + ACTION_COLLECT, /* collect intermediate character */ + ACTION_PARAM, /* collect parameter character */ + ACTION_ESC_DISPATCH, /* dispatch escape sequence */ + ACTION_CSI_DISPATCH, /* dispatch csi sequence */ + ACTION_DCS_START, /* start of DCS data */ + ACTION_DCS_COLLECT, /* collect DCS data */ + ACTION_DCS_CONSUME, /* consume DCS terminator */ + ACTION_DCS_DISPATCH, /* dispatch dcs sequence */ + ACTION_OSC_START, /* start of OSC data */ + ACTION_OSC_COLLECT, /* collect OSC data */ + ACTION_OSC_CONSUME, /* consume OSC terminator */ + ACTION_OSC_DISPATCH, /* dispatch osc sequence */ + ACTION_NUM +}; + +int term_parser_new(term_parser **out, bool host) { + _term_parser_free_ term_parser *parser = NULL; + + assert_return(out, -EINVAL); + + parser = new0(term_parser, 1); + if (!parser) + return -ENOMEM; + + parser->is_host = host; + parser->st_alloc = 64; + parser->seq.st = new0(char, parser->st_alloc + 1); + if (!parser->seq.st) + return -ENOMEM; + + *out = parser; + parser = NULL; + return 0; +} + +term_parser *term_parser_free(term_parser *parser) { + if (!parser) + return NULL; + + free(parser->seq.st); + free(parser); + return NULL; +} + +static inline void parser_clear(term_parser *parser) { + unsigned int i; + + parser->seq.command = TERM_CMD_NONE; + parser->seq.terminator = 0; + parser->seq.intermediates = 0; + parser->seq.charset = TERM_CHARSET_NONE; + parser->seq.n_args = 0; + for (i = 0; i < TERM_PARSER_ARG_MAX; ++i) + parser->seq.args[i] = -1; + + parser->seq.n_st = 0; + parser->seq.st[0] = 0; +} + +static int parser_ignore(term_parser *parser, uint32_t raw) { + parser_clear(parser); + parser->seq.type = TERM_SEQ_IGNORE; + parser->seq.command = TERM_CMD_NONE; + parser->seq.terminator = raw; + parser->seq.charset = TERM_CHARSET_NONE; + + return parser->seq.type; +} + +static int parser_print(term_parser *parser, uint32_t raw) { + parser_clear(parser); + parser->seq.type = TERM_SEQ_GRAPHIC; + parser->seq.command = TERM_CMD_GRAPHIC; + parser->seq.terminator = raw; + parser->seq.charset = TERM_CHARSET_NONE; + + return parser->seq.type; +} + +static int parser_execute(term_parser *parser, uint32_t raw) { + parser_clear(parser); + parser->seq.type = TERM_SEQ_CONTROL; + parser->seq.command = TERM_CMD_GRAPHIC; + parser->seq.terminator = raw; + parser->seq.charset = TERM_CHARSET_NONE; + if (!parser->is_host) + parser->seq.command = term_parse_host_control(&parser->seq); + + return parser->seq.type; +} + +static void parser_collect(term_parser *parser, uint32_t raw) { + /* + * Usually, characters from 0x30 to 0x3f are only allowed as leading + * markers (or as part of the parameters), characters from 0x20 to 0x2f + * are only allowed as trailing markers. However, our state-machine + * already verifies those restrictions so we can handle them the same + * way here. Note that we safely allow markers to be specified multiple + * times. + */ + + if (raw >= 0x20 && raw <= 0x3f) + parser->seq.intermediates |= 1 << (raw - 0x20); +} + +static void parser_param(term_parser *parser, uint32_t raw) { + int new; + + if (raw == ';') { + if (parser->seq.n_args < TERM_PARSER_ARG_MAX) + ++parser->seq.n_args; + + return; + } + + if (parser->seq.n_args >= TERM_PARSER_ARG_MAX) + return; + + if (raw >= '0' && raw <= '9') { + new = parser->seq.args[parser->seq.n_args]; + if (new < 0) + new = 0; + new = new * 10 + raw - '0'; + + /* VT510 tells us to clamp all values to [0, 9999], however, it + * also allows commands with values up to 2^15-1. We simply use + * 2^16 as maximum here to be compatible to all commands, but + * avoid overflows in any calculations. */ + if (new > 0xffff) + new = 0xffff; + + parser->seq.args[parser->seq.n_args] = new; + } +} + +static int parser_esc(term_parser *parser, uint32_t raw) { + parser->seq.type = TERM_SEQ_ESCAPE; + parser->seq.command = TERM_CMD_NONE; + parser->seq.terminator = raw; + parser->seq.charset = TERM_CHARSET_NONE; + if (!parser->is_host) + parser->seq.command = term_parse_host_escape(&parser->seq, &parser->seq.charset); + + return parser->seq.type; +} + +static int parser_csi(term_parser *parser, uint32_t raw) { + /* parser->seq is cleared during CSI-ENTER state, thus there's no need + * to clear invalid fields here. */ + + if (parser->seq.n_args < TERM_PARSER_ARG_MAX) { + if (parser->seq.n_args > 0 || + parser->seq.args[parser->seq.n_args] >= 0) + ++parser->seq.n_args; + } + + parser->seq.type = TERM_SEQ_CSI; + parser->seq.command = TERM_CMD_NONE; + parser->seq.terminator = raw; + parser->seq.charset = TERM_CHARSET_NONE; + if (!parser->is_host) + parser->seq.command = term_parse_host_csi(&parser->seq); + + return parser->seq.type; +} + +/* perform state transition and dispatch related actions */ +static int parser_transition(term_parser *parser, uint32_t raw, unsigned int state, unsigned int action) { + if (state != STATE_NONE) + parser->state = state; + + switch (action) { + case ACTION_NONE: + return TERM_SEQ_NONE; + case ACTION_CLEAR: + parser_clear(parser); + return TERM_SEQ_NONE; + case ACTION_IGNORE: + return parser_ignore(parser, raw); + case ACTION_PRINT: + return parser_print(parser, raw); + case ACTION_EXECUTE: + return parser_execute(parser, raw); + case ACTION_COLLECT: + parser_collect(parser, raw); + return TERM_SEQ_NONE; + case ACTION_PARAM: + parser_param(parser, raw); + return TERM_SEQ_NONE; + case ACTION_ESC_DISPATCH: + return parser_esc(parser, raw); + case ACTION_CSI_DISPATCH: + return parser_csi(parser, raw); + case ACTION_DCS_START: + /* not implemented */ + return TERM_SEQ_NONE; + case ACTION_DCS_COLLECT: + /* not implemented */ + return TERM_SEQ_NONE; + case ACTION_DCS_CONSUME: + /* not implemented */ + return TERM_SEQ_NONE; + case ACTION_DCS_DISPATCH: + /* not implemented */ + return TERM_SEQ_NONE; + case ACTION_OSC_START: + /* not implemented */ + return TERM_SEQ_NONE; + case ACTION_OSC_COLLECT: + /* not implemented */ + return TERM_SEQ_NONE; + case ACTION_OSC_CONSUME: + /* not implemented */ + return TERM_SEQ_NONE; + case ACTION_OSC_DISPATCH: + /* not implemented */ + return TERM_SEQ_NONE; + default: + assert_not_reached("invalid vte-parser action"); + return TERM_SEQ_NONE; + } +} + +static int parser_feed_to_state(term_parser *parser, uint32_t raw) { + switch (parser->state) { + case STATE_NONE: + /* + * During initialization, parser->state is cleared. Treat this + * as STATE_GROUND. We will then never get to STATE_NONE again. + */ + case STATE_GROUND: + switch (raw) { + case 0x00 ... 0x1f: /* C0 */ + case 0x80 ... 0x9b: /* C1 \ { ST } */ + case 0x9d ... 0x9f: + return parser_transition(parser, raw, STATE_NONE, ACTION_EXECUTE); + case 0x9c: /* ST */ + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + } + + return parser_transition(parser, raw, STATE_NONE, ACTION_PRINT); + case STATE_ESC: + switch (raw) { + case 0x00 ... 0x1f: /* C0 */ + return parser_transition(parser, raw, STATE_NONE, ACTION_EXECUTE); + case 0x20 ... 0x2f: /* [' ' - '\'] */ + return parser_transition(parser, raw, STATE_ESC_INT, ACTION_COLLECT); + case 0x30 ... 0x4f: /* ['0' - '~'] \ { 'P', 'X', '[', ']', '^', '_' } */ + case 0x51 ... 0x57: + case 0x59 ... 0x5a: + case 0x5c: + case 0x60 ... 0x7e: + return parser_transition(parser, raw, STATE_GROUND, ACTION_ESC_DISPATCH); + case 0x50: /* 'P' */ + return parser_transition(parser, raw, STATE_DCS_ENTRY, ACTION_CLEAR); + case 0x5b: /* '[' */ + return parser_transition(parser, raw, STATE_CSI_ENTRY, ACTION_CLEAR); + case 0x5d: /* ']' */ + return parser_transition(parser, raw, STATE_OSC_STRING, ACTION_CLEAR); + case 0x58: /* 'X' */ + case 0x5e: /* '^' */ + case 0x5f: /* '_' */ + return parser_transition(parser, raw, STATE_ST_IGNORE, ACTION_NONE); + case 0x7f: /* DEL */ + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + case 0x9c: /* ST */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_IGNORE); + } + + return parser_transition(parser, raw, STATE_ESC_INT, ACTION_COLLECT); + case STATE_ESC_INT: + switch (raw) { + case 0x00 ... 0x1f: /* C0 */ + return parser_transition(parser, raw, STATE_NONE, ACTION_EXECUTE); + case 0x20 ... 0x2f: /* [' ' - '\'] */ + return parser_transition(parser, raw, STATE_NONE, ACTION_COLLECT); + case 0x30 ... 0x7e: /* ['0' - '~'] */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_ESC_DISPATCH); + case 0x7f: /* DEL */ + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + case 0x9c: /* ST */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_IGNORE); + } + + return parser_transition(parser, raw, STATE_NONE, ACTION_COLLECT); + case STATE_CSI_ENTRY: + switch (raw) { + case 0x00 ... 0x1f: /* C0 */ + return parser_transition(parser, raw, STATE_NONE, ACTION_EXECUTE); + case 0x20 ... 0x2f: /* [' ' - '\'] */ + return parser_transition(parser, raw, STATE_CSI_INT, ACTION_COLLECT); + case 0x3a: /* ':' */ + return parser_transition(parser, raw, STATE_CSI_IGNORE, ACTION_NONE); + case 0x30 ... 0x39: /* ['0' - '9'] */ + case 0x3b: /* ';' */ + return parser_transition(parser, raw, STATE_CSI_PARAM, ACTION_PARAM); + case 0x3c ... 0x3f: /* ['<' - '?'] */ + return parser_transition(parser, raw, STATE_CSI_PARAM, ACTION_COLLECT); + case 0x40 ... 0x7e: /* ['@' - '~'] */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_CSI_DISPATCH); + case 0x7f: /* DEL */ + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + case 0x9c: /* ST */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_IGNORE); + } + + return parser_transition(parser, raw, STATE_CSI_IGNORE, ACTION_NONE); + case STATE_CSI_PARAM: + switch (raw) { + case 0x00 ... 0x1f: /* C0 */ + return parser_transition(parser, raw, STATE_NONE, ACTION_EXECUTE); + case 0x20 ... 0x2f: /* [' ' - '\'] */ + return parser_transition(parser, raw, STATE_CSI_INT, ACTION_COLLECT); + case 0x30 ... 0x39: /* ['0' - '9'] */ + case 0x3b: /* ';' */ + return parser_transition(parser, raw, STATE_NONE, ACTION_PARAM); + case 0x3a: /* ':' */ + case 0x3c ... 0x3f: /* ['<' - '?'] */ + return parser_transition(parser, raw, STATE_CSI_IGNORE, ACTION_NONE); + case 0x40 ... 0x7e: /* ['@' - '~'] */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_CSI_DISPATCH); + case 0x7f: /* DEL */ + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + case 0x9c: /* ST */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_IGNORE); + } + + return parser_transition(parser, raw, STATE_CSI_IGNORE, ACTION_NONE); + case STATE_CSI_INT: + switch (raw) { + case 0x00 ... 0x1f: /* C0 */ + return parser_transition(parser, raw, STATE_NONE, ACTION_EXECUTE); + case 0x20 ... 0x2f: /* [' ' - '\'] */ + return parser_transition(parser, raw, STATE_NONE, ACTION_COLLECT); + case 0x30 ... 0x3f: /* ['0' - '?'] */ + return parser_transition(parser, raw, STATE_CSI_IGNORE, ACTION_NONE); + case 0x40 ... 0x7e: /* ['@' - '~'] */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_CSI_DISPATCH); + case 0x7f: /* DEL */ + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + case 0x9c: /* ST */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_IGNORE); + } + + return parser_transition(parser, raw, STATE_CSI_IGNORE, ACTION_NONE); + case STATE_CSI_IGNORE: + switch (raw) { + case 0x00 ... 0x1f: /* C0 */ + return parser_transition(parser, raw, STATE_NONE, ACTION_EXECUTE); + case 0x20 ... 0x3f: /* [' ' - '?'] */ + return parser_transition(parser, raw, STATE_NONE, ACTION_NONE); + case 0x40 ... 0x7e: /* ['@' - '~'] */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_NONE); + case 0x7f: /* DEL */ + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + case 0x9c: /* ST */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_IGNORE); + } + + return parser_transition(parser, raw, STATE_NONE, ACTION_NONE); + case STATE_DCS_ENTRY: + switch (raw) { + case 0x00 ... 0x1f: /* C0 */ + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + case 0x20 ... 0x2f: /* [' ' - '\'] */ + return parser_transition(parser, raw, STATE_DCS_INT, ACTION_COLLECT); + case 0x3a: /* ':' */ + return parser_transition(parser, raw, STATE_DCS_IGNORE, ACTION_NONE); + case 0x30 ... 0x39: /* ['0' - '9'] */ + case 0x3b: /* ';' */ + return parser_transition(parser, raw, STATE_DCS_PARAM, ACTION_PARAM); + case 0x3c ... 0x3f: /* ['<' - '?'] */ + return parser_transition(parser, raw, STATE_DCS_PARAM, ACTION_COLLECT); + case 0x40 ... 0x7e: /* ['@' - '~'] */ + return parser_transition(parser, raw, STATE_DCS_PASS, ACTION_DCS_CONSUME); + case 0x7f: /* DEL */ + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + case 0x9c: /* ST */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_IGNORE); + } + + return parser_transition(parser, raw, STATE_DCS_PASS, ACTION_DCS_CONSUME); + case STATE_DCS_PARAM: + switch (raw) { + case 0x00 ... 0x1f: /* C0 */ + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + case 0x20 ... 0x2f: /* [' ' - '\'] */ + return parser_transition(parser, raw, STATE_DCS_INT, ACTION_COLLECT); + case 0x30 ... 0x39: /* ['0' - '9'] */ + case 0x3b: /* ';' */ + return parser_transition(parser, raw, STATE_NONE, ACTION_PARAM); + case 0x3a: /* ':' */ + case 0x3c ... 0x3f: /* ['<' - '?'] */ + return parser_transition(parser, raw, STATE_DCS_IGNORE, ACTION_NONE); + case 0x40 ... 0x7e: /* ['@' - '~'] */ + return parser_transition(parser, raw, STATE_DCS_PASS, ACTION_DCS_CONSUME); + case 0x7f: /* DEL */ + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + case 0x9c: /* ST */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_IGNORE); + } + + return parser_transition(parser, raw, STATE_DCS_PASS, ACTION_DCS_CONSUME); + case STATE_DCS_INT: + switch (raw) { + case 0x00 ... 0x1f: /* C0 */ + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + case 0x20 ... 0x2f: /* [' ' - '\'] */ + return parser_transition(parser, raw, STATE_NONE, ACTION_COLLECT); + case 0x30 ... 0x3f: /* ['0' - '?'] */ + return parser_transition(parser, raw, STATE_DCS_IGNORE, ACTION_NONE); + case 0x40 ... 0x7e: /* ['@' - '~'] */ + return parser_transition(parser, raw, STATE_DCS_PASS, ACTION_DCS_CONSUME); + case 0x7f: /* DEL */ + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + case 0x9c: /* ST */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_IGNORE); + } + + return parser_transition(parser, raw, STATE_DCS_PASS, ACTION_DCS_CONSUME); + case STATE_DCS_PASS: + switch (raw) { + case 0x00 ... 0x7e: /* ASCII \ { DEL } */ + return parser_transition(parser, raw, STATE_NONE, ACTION_DCS_COLLECT); + case 0x7f: /* DEL */ + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + case 0x9c: /* ST */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_DCS_DISPATCH); + } + + return parser_transition(parser, raw, STATE_NONE, ACTION_DCS_COLLECT); + case STATE_DCS_IGNORE: + switch (raw) { + case 0x00 ... 0x7f: /* ASCII */ + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + case 0x9c: /* ST */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_NONE); + } + + return parser_transition(parser, raw, STATE_NONE, ACTION_NONE); + case STATE_OSC_STRING: + switch (raw) { + case 0x00 ... 0x06: /* C0 \ { BEL } */ + case 0x08 ... 0x1f: + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + case 0x20 ... 0x7f: /* [' ' - DEL] */ + return parser_transition(parser, raw, STATE_NONE, ACTION_OSC_COLLECT); + case 0x07: /* BEL */ + case 0x9c: /* ST */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_OSC_DISPATCH); + } + + return parser_transition(parser, raw, STATE_NONE, ACTION_OSC_COLLECT); + case STATE_ST_IGNORE: + switch (raw) { + case 0x00 ... 0x7f: /* ASCII */ + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + case 0x9c: /* ST */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_IGNORE); + } + + return parser_transition(parser, raw, STATE_NONE, ACTION_NONE); + } + + assert_not_reached("bad vte-parser state"); + return -EINVAL; +} + +int term_parser_feed(term_parser *parser, const term_seq **seq_out, uint32_t raw) { + int r; + + assert_return(parser, -EINVAL); + assert_return(seq_out, -EINVAL); + + /* + * Notes: + * * DEC treats GR codes as GL. We don't do that as we require UTF-8 + * as charset and, thus, it doesn't make sense to treat GR special. + * * During control sequences, unexpected C1 codes cancel the sequence + * and immediately start a new one. C0 codes, however, may or may not + * be ignored/executed depending on the sequence. + */ + + switch (raw) { + case 0x18: /* CAN */ + r = parser_transition(parser, raw, STATE_GROUND, ACTION_IGNORE); + break; + case 0x1a: /* SUB */ + r = parser_transition(parser, raw, STATE_GROUND, ACTION_EXECUTE); + break; + case 0x80 ... 0x8f: /* C1 \ {DCS, SOS, CSI, ST, OSC, PM, APC} */ + case 0x91 ... 0x97: + case 0x99 ... 0x9a: + r = parser_transition(parser, raw, STATE_GROUND, ACTION_EXECUTE); + break; + case 0x1b: /* ESC */ + r = parser_transition(parser, raw, STATE_ESC, ACTION_CLEAR); + break; + case 0x98: /* SOS */ + case 0x9e: /* PM */ + case 0x9f: /* APC */ + r = parser_transition(parser, raw, STATE_ST_IGNORE, ACTION_NONE); + break; + case 0x90: /* DCS */ + r = parser_transition(parser, raw, STATE_DCS_ENTRY, ACTION_CLEAR); + break; + case 0x9d: /* OSC */ + r = parser_transition(parser, raw, STATE_OSC_STRING, ACTION_CLEAR); + break; + case 0x9b: /* CSI */ + r = parser_transition(parser, raw, STATE_CSI_ENTRY, ACTION_CLEAR); + break; + default: + r = parser_feed_to_state(parser, raw); + break; + } + + if (r <= 0) + *seq_out = NULL; + else + *seq_out = &parser->seq; + + return r; +} diff --git a/src/libsystemd-terminal/test-term-parser.c b/src/libsystemd-terminal/test-term-parser.c new file mode 100644 index 000000000..ed16f5f27 --- /dev/null +++ b/src/libsystemd-terminal/test-term-parser.c @@ -0,0 +1,143 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ +/*** + This file is part of systemd. + + Copyright (C) 2014 David Herrmann + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . +***/ + +/* + * Terminal Parser Tests + */ + +#include +#include +#include +#include +#include +#include "macro.h" +#include "term-internal.h" +#include "util.h" + +static void test_term_utf8_invalid(void) { + term_utf8 p = { }; + const uint32_t *res; + size_t len; + + res = term_utf8_decode(NULL, NULL, 0); + assert_se(res == NULL); + + res = term_utf8_decode(&p, NULL, 0); + assert_se(res != NULL); + + len = 5; + res = term_utf8_decode(NULL, &len, 0); + assert_se(res == NULL); + assert_se(len == 0); + + len = 5; + res = term_utf8_decode(&p, &len, 0); + assert_se(res != NULL); + assert_se(len == 1); + + len = 5; + res = term_utf8_decode(&p, &len, 0xCf); + assert_se(res == NULL); + assert_se(len == 0); + + len = 5; + res = term_utf8_decode(&p, &len, 0x0); + assert_se(res != NULL); + assert_se(len == 2); +} + +static void test_term_utf8_range(void) { + term_utf8 p = { }; + const uint32_t *res; + char u8[4]; + uint32_t i, j; + size_t ulen, len; + + /* Convert all ucs-4 chars to utf-8 and back */ + + for (i = 0; i < 0x10FFFF; ++i) { + ulen = term_utf8_encode(u8, i); + if (!ulen) + continue; + + for (j = 0; j < ulen; ++j) { + res = term_utf8_decode(&p, &len, u8[j]); + if (!res) { + assert_se(j + 1 != ulen); + continue; + } + + assert_se(j + 1 == ulen); + assert_se(len == 1 && *res == i); + assert_se(i <= 127 || ulen >= 2); + } + } +} + +static void test_term_utf8_mix(void) { + static const char source[] = { + 0x00, /* normal 0 */ + 0xC0, 0x80, /* overlong 0 */ + 0xC0, 0x81, /* overlong 1 */ + 0xE0, 0x80, 0x81, /* overlong 1 */ + 0xF0, 0x80, 0x80, 0x81, /* overlong 1 */ + 0xC0, 0x00, /* invalid continuation */ + 0xC0, 0xC0, 0x81, /* invalid continuation with a following overlong 1 */ + 0xF8, 0x80, 0x80, 0x80, 0x81, /* overlong 1 with 5 bytes */ + 0xE0, 0x80, 0xC0, 0x81, /* invalid 3-byte followed by valid 2-byte */ + 0xF0, 0x80, 0x80, 0xC0, 0x81, /* invalid 4-byte followed by valid 2-byte */ + }; + static const uint32_t result[] = { + 0x0000, + 0x0000, + 0x0001, + 0x0001, + 0x0001, + 0x00C0, 0x0000, + 0x00C0, 0x0001, + 0x00F8, 0x0080, 0x0080, 0x0080, 0x0081, + 0x00E0, 0x0080, 0x0001, + 0x00F0, 0x0080, 0x0080, 0x0001, + }; + term_utf8 p = { }; + const uint32_t *res; + unsigned int i, j; + size_t len; + + for (i = 0, j = 0; i < sizeof(source); ++i) { + res = term_utf8_decode(&p, &len, source[i]); + if (!res) + continue; + + assert_se(j + len <= ELEMENTSOF(result)); + assert_se(!memcmp(res, &result[j], sizeof(uint32_t) * len)); + j += len; + } + + assert_se(j == ELEMENTSOF(result)); +} + +int main(int argc, char *argv[]) { + test_term_utf8_invalid(); + test_term_utf8_range(); + test_term_utf8_mix(); + + return 0; +} -- 2.30.2