chiark / gitweb /
doxygen; chatty logging in hope of catching a bug
[disorder] / lib / html.c
CommitLineData
13affe66
RK
1/*
2 * This file is part of DisOrder
3 * Copyright (C) 2007 Richard Kettlewell
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
18 * USA
19 */
20/** @file lib/html.c
21 * @brief Noddy HTML parser
22 */
23
24#include <config.h>
25#include "types.h"
26
27#include <string.h>
28#include <ctype.h>
29#include <stddef.h>
30
31#include "hash.h"
32#include "html.h"
33#include "mem.h"
34#include "log.h"
35#include "vector.h"
36#include "charset.h"
37#include "table.h"
38
/** @brief One row of the named-entity lookup table
 *
 * Associates the textual name of an entity (the part between '&' and
 * ';') with the code point it stands for.
 */
struct entity {
  /** @brief Entity name, without the surrounding '&' and ';' */
  const char *name;

  /** @brief Code point the entity expands to */
  uint32_t value;
};
44
45/** @brief Known entities
46 *
47 * We only support the entities that turn up in the HTML files we
48 * actually care about.
49 *
50 * Keep in alphabetical order.
51 */
52static const struct entity entities[] = {
53 { "amp", '&' },
54 { "gt", '>' },
55 { "lt", '<' }
56};
57
/** @brief Advance past leading whitespace
 * @param input NUL-terminated string
 * @return Pointer to the first non-whitespace character of @p input, or
 * to its terminating NUL if there is none
 */
static const char *skipwhite(const char *input) {
  for(; *input != '\0'; ++input) {
    /* ctype functions need an unsigned char value */
    if(!isspace((unsigned char)*input))
      break;
  }
  return input;
}
64
65/** @brief Parse an entity */
66static const char *parse_entity(const char *input,
67 uint32_t *entityp) {
68 input = skipwhite(input);
69 if(*input == '#') {
70 input = skipwhite(input + 1);
71 if(*input == 'x')
72 *entityp = strtoul(skipwhite(input + 1), (char **)&input, 16);
73 else
74 *entityp = strtoul(input, (char **)&input, 10);
75 } else {
76 struct dynstr name[1];
77 int n;
78
79 dynstr_init(name);
80 while(isalnum((unsigned char)*input))
81 dynstr_append(name, tolower((unsigned char)*input++));
82 dynstr_terminate(name);
83 if((n = TABLE_FIND(entities, struct entity, name, name->vec)) < 0) {
84 error(0, "unknown entity '%s'", name->vec);
85 *entityp = '?';
86 } else
87 *entityp = entities[n].value;
88 }
89 input = skipwhite(input);
90 if(*input == ';')
91 ++input;
92 return input;
93}
94
/** @brief Consume one character or entity, appending the result to a @ref dynstr
 * @param input Current parse position (must not be at the terminating NUL)
 * @param d String to append to
 * @return Updated parse position
 *
 * An ordinary character is copied through unchanged.  An entity is
 * decoded by parse_entity() and appended via one_ucs42utf8(); if that
 * call reports failure a '?' is appended instead.
 */
static const char *parse_one(const char *input, struct dynstr *d) {
  uint32_t codepoint;

  /* common case: a literal character */
  if(*input != '&') {
    dynstr_append(d, *input);
    return input + 1;
  }
  input = parse_entity(input + 1, &codepoint);
  if(one_ucs42utf8(codepoint, d) != 0)
    dynstr_append(d, '?');              /* U+FFFD might be a better choice */
  return input;
}
106
107/** @brief Too-stupid-to-live HTML parser
108 * @param callbacks Parser callbacks
109 * @param input HTML document
110 * @param u User data pointer
111 * @return 0 on success, -1 on error
112 */
113int html_parse(const struct html_parser_callbacks *callbacks,
114 const char *input,
115 void *u) {
116 struct dynstr text[1];
117
118 dynstr_init(text);
119 while(*input) {
120 if(*input == '<') {
121 struct dynstr tag[1];
122 hash *attrs;
123
124 /* flush collected text */
125 if(text->nvec) {
126 dynstr_terminate(text);
127 callbacks->text(text->vec, u);
128 text->nvec = 0;
129 }
130 dynstr_init(tag);
131 input = skipwhite(input + 1);
132 /* see if it's an open or close tag */
133 if(*input == '/') {
134 input = skipwhite(input + 1);
135 attrs = 0;
136 } else
137 attrs = hash_new(sizeof(char *));
138 /* gather tag */
139 while(isalnum((unsigned char)*input))
140 dynstr_append(tag, tolower((unsigned char)*input++));
141 dynstr_terminate(tag);
142 input = skipwhite(input);
143 if(attrs) {
144 /* gather attributes */
145 while(*input && *input != '>') {
146 struct dynstr name[1], value[1];
147
148 dynstr_init(name);
149 dynstr_init(value);
150 /* attribute name */
151 while(isalnum((unsigned char)*input))
152 dynstr_append(name, tolower((unsigned char)*input++));
153 dynstr_terminate(name);
154 input = skipwhite(input);
155 if(*input == '=') {
156 /* attribute value */
157 input = skipwhite(input + 1);
158 if(*input == '"' || *input == '\'') {
159 /* quoted value */
160 const int q = *input++;
161 while(*input && *input != q)
162 input = parse_one(input, value);
163 if(*input == q)
164 ++input;
165 } else {
166 /* unquoted value */
167 while(*input && *input != '>' && !isspace((unsigned char)*input))
168 input = parse_one(input, value);
169 }
170 dynstr_terminate(value);
171 }
172 /* stash the value */
173 hash_add(attrs, name->vec, value->vec, HASH_INSERT_OR_REPLACE);
174 input = skipwhite(input);
175 }
176 }
177 if(*input != '>') {
178 error(0, "unterminated tag %s", tag->vec);
179 return -1;
180 }
181 ++input;
182 if(attrs)
183 callbacks->open(tag->vec, attrs, u);
184 else
185 callbacks->close(tag->vec, u);
186 } else
187 input = parse_one(input, text);
188 }
189 /* flush any trailing text */
190 if(text->nvec) {
191 dynstr_terminate(text);
192 callbacks->text(text->vec, u);
193 text->nvec = 0;
194 }
195 return 0;
196}
197
198/*
199Local Variables:
200c-basic-offset:2
201comment-column:40
202fill-column:79
203indent-tabs-mode:nil
204End:
205*/