2 * This file is part of DisOrder
3 * Copyright (C) 2007 Richard Kettlewell
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
21 * @brief Noddy HTML parser
39 /** @brief Entity table type */
45 /** @brief Known entities
47 * We only support the entities that turn up in the HTML files we
48 * actually care about.
50 * Keep in alphabetical order.
52 static const struct entity entities[] = {
58 /** @brief Skip whitespace */
59 static const char *skipwhite(const char *input) {
60 while(*input && isspace((unsigned char)*input))
65 /** @brief Parse an entity */
66 static const char *parse_entity(const char *input,
68 input = skipwhite(input);
70 input = skipwhite(input + 1);
72 *entityp = strtoul(skipwhite(input + 1), (char **)&input, 16);
74 *entityp = strtoul(input, (char **)&input, 10);
76 struct dynstr name[1];
80 while(isalnum((unsigned char)*input))
81 dynstr_append(name, tolower((unsigned char)*input++));
82 dynstr_terminate(name);
83 if((n = TABLE_FIND(entities, struct entity, name, name->vec)) < 0) {
84 error(0, "unknown entity '%s'", name->vec);
87 *entityp = entities[n].value;
89 input = skipwhite(input);
95 /** @brief Parse one character or entity and append it to a @ref dynstr */
96 static const char *parse_one(const char *input, struct dynstr *d) {
99 input = parse_entity(input + 1, &c);
100 if(one_ucs42utf8(c, d))
101 dynstr_append(d, '?'); /* U+FFFD might be a better choice */
103 dynstr_append(d, *input++);
107 /** @brief Too-stupid-to-live HTML parser
108 * @param callbacks Parser callbacks
109 * @param input HTML document
110 * @param u User data pointer
111 * @return 0 on success, -1 on error
113 int html_parse(const struct html_parser_callbacks *callbacks,
116 struct dynstr text[1];
121 struct dynstr tag[1];
124 /* flush collected text */
126 dynstr_terminate(text);
127 callbacks->text(text->vec, u);
131 input = skipwhite(input + 1);
132 /* see if it's an open or close tag */
134 input = skipwhite(input + 1);
137 attrs = hash_new(sizeof(char *));
139 while(isalnum((unsigned char)*input))
140 dynstr_append(tag, tolower((unsigned char)*input++));
141 dynstr_terminate(tag);
142 input = skipwhite(input);
144 /* gather attributes */
145 while(*input && *input != '>') {
146 struct dynstr name[1], value[1];
151 while(isalnum((unsigned char)*input))
152 dynstr_append(name, tolower((unsigned char)*input++));
153 dynstr_terminate(name);
154 input = skipwhite(input);
156 /* attribute value */
157 input = skipwhite(input + 1);
158 if(*input == '"' || *input == '\'') {
160 const int q = *input++;
161 while(*input && *input != q)
162 input = parse_one(input, value);
167 while(*input && *input != '>' && !isspace((unsigned char)*input))
168 input = parse_one(input, value);
170 dynstr_terminate(value);
172 /* stash the value */
173 hash_add(attrs, name->vec, value->vec, HASH_INSERT_OR_REPLACE);
174 input = skipwhite(input);
178 error(0, "unterminated tag %s", tag->vec);
183 callbacks->open(tag->vec, attrs, u);
185 callbacks->close(tag->vec, u);
187 input = parse_one(input, text);
189 /* flush any trailing text */
191 dynstr_terminate(text);
192 callbacks->text(text->vec, u);