Commit | Line | Data |
---|---|---|
13affe66 RK |
1 | /* |
2 | * This file is part of DisOrder | |
3 | * Copyright (C) 2007 Richard Kettlewell | |
4 | * | |
5 | * This program is free software; you can redistribute it and/or modify | |
6 | * it under the terms of the GNU General Public License as published by | |
7 | * the Free Software Foundation; either version 2 of the License, or | |
8 | * (at your option) any later version. | |
9 | * | |
10 | * This program is distributed in the hope that it will be useful, but | |
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | * General Public License for more details. | |
14 | * | |
15 | * You should have received a copy of the GNU General Public License | |
16 | * along with this program; if not, write to the Free Software | |
17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 | |
18 | * USA | |
19 | */ | |
20 | /** @file lib/html.c | |
21 | * @brief Noddy HTML parser | |
22 | */ | |
23 | ||
24 | #include <config.h> | |
25 | #include "types.h" | |
26 | ||
27 | #include <string.h> | |
28 | #include <ctype.h> | |
29 | #include <stddef.h> | |
30 | ||
31 | #include "hash.h" | |
32 | #include "html.h" | |
33 | #include "mem.h" | |
34 | #include "log.h" | |
35 | #include "vector.h" | |
36 | #include "charset.h" | |
37 | #include "table.h" | |
38 | ||
/** @brief One row of the named-entity lookup table */
struct entity {
  /** @brief Entity name, without the leading '&' or trailing ';' */
  const char *name;

  /** @brief Code point that the entity expands to */
  uint32_t value;
};
44 | ||
45 | /** @brief Known entities | |
46 | * | |
47 | * We only support the entities that turn up in the HTML files we | |
48 | * actually care about. | |
49 | * | |
50 | * Keep in alphabetical order. | |
51 | */ | |
52 | static const struct entity entities[] = { | |
53 | { "amp", '&' }, | |
54 | { "gt", '>' }, | |
55 | { "lt", '<' } | |
56 | }; | |
57 | ||
/** @brief Step over leading whitespace
 * @param input NUL-terminated string
 * @return Pointer to the first character of @p input that is not
 * whitespace according to isspace(); this may be the terminating NUL
 */
static const char *skipwhite(const char *input) {
  const char *p;

  /* isspace(0) is false, so the scan always stops at the terminator */
  for(p = input; isspace((unsigned char)*p); ++p)
    ;
  return p;
}
64 | ||
65 | /** @brief Parse an entity */ | |
66 | static const char *parse_entity(const char *input, | |
67 | uint32_t *entityp) { | |
68 | input = skipwhite(input); | |
69 | if(*input == '#') { | |
70 | input = skipwhite(input + 1); | |
71 | if(*input == 'x') | |
72 | *entityp = strtoul(skipwhite(input + 1), (char **)&input, 16); | |
73 | else | |
74 | *entityp = strtoul(input, (char **)&input, 10); | |
75 | } else { | |
76 | struct dynstr name[1]; | |
77 | int n; | |
78 | ||
79 | dynstr_init(name); | |
80 | while(isalnum((unsigned char)*input)) | |
81 | dynstr_append(name, tolower((unsigned char)*input++)); | |
82 | dynstr_terminate(name); | |
83 | if((n = TABLE_FIND(entities, struct entity, name, name->vec)) < 0) { | |
84 | error(0, "unknown entity '%s'", name->vec); | |
85 | *entityp = '?'; | |
86 | } else | |
87 | *entityp = entities[n].value; | |
88 | } | |
89 | input = skipwhite(input); | |
90 | if(*input == ';') | |
91 | ++input; | |
92 | return input; | |
93 | } | |
94 | ||
/** @brief Consume one character or entity and append it to a @ref dynstr
 * @param input Current parse position; must not be at the terminating NUL
 * @param d String to append the decoded result to
 * @return Pointer to the first unconsumed character
 *
 * A plain character is copied through unchanged.  An entity is decoded by
 * parse_entity() and appended via one_ucs42utf8(); if that call returns
 * nonzero a '?' is appended instead.
 */
static const char *parse_one(const char *input, struct dynstr *d) {
  uint32_t codepoint;

  if(*input != '&') {
    /* ordinary character: copy it straight through */
    dynstr_append(d, *input);
    return input + 1;
  }
  input = parse_entity(input + 1, &codepoint);
  if(one_ucs42utf8(codepoint, d))
    dynstr_append(d, '?');              /* U+FFFD might be a better choice */
  return input;
}
106 | ||
107 | /** @brief Too-stupid-to-live HTML parser | |
108 | * @param callbacks Parser callbacks | |
109 | * @param input HTML document | |
110 | * @param u User data pointer | |
111 | * @return 0 on success, -1 on error | |
112 | */ | |
113 | int html_parse(const struct html_parser_callbacks *callbacks, | |
114 | const char *input, | |
115 | void *u) { | |
116 | struct dynstr text[1]; | |
117 | ||
118 | dynstr_init(text); | |
119 | while(*input) { | |
120 | if(*input == '<') { | |
121 | struct dynstr tag[1]; | |
122 | hash *attrs; | |
123 | ||
124 | /* flush collected text */ | |
125 | if(text->nvec) { | |
126 | dynstr_terminate(text); | |
127 | callbacks->text(text->vec, u); | |
128 | text->nvec = 0; | |
129 | } | |
130 | dynstr_init(tag); | |
131 | input = skipwhite(input + 1); | |
132 | /* see if it's an open or close tag */ | |
133 | if(*input == '/') { | |
134 | input = skipwhite(input + 1); | |
135 | attrs = 0; | |
136 | } else | |
137 | attrs = hash_new(sizeof(char *)); | |
138 | /* gather tag */ | |
139 | while(isalnum((unsigned char)*input)) | |
140 | dynstr_append(tag, tolower((unsigned char)*input++)); | |
141 | dynstr_terminate(tag); | |
142 | input = skipwhite(input); | |
143 | if(attrs) { | |
144 | /* gather attributes */ | |
145 | while(*input && *input != '>') { | |
146 | struct dynstr name[1], value[1]; | |
147 | ||
148 | dynstr_init(name); | |
149 | dynstr_init(value); | |
150 | /* attribute name */ | |
151 | while(isalnum((unsigned char)*input)) | |
152 | dynstr_append(name, tolower((unsigned char)*input++)); | |
153 | dynstr_terminate(name); | |
154 | input = skipwhite(input); | |
155 | if(*input == '=') { | |
156 | /* attribute value */ | |
157 | input = skipwhite(input + 1); | |
158 | if(*input == '"' || *input == '\'') { | |
159 | /* quoted value */ | |
160 | const int q = *input++; | |
161 | while(*input && *input != q) | |
162 | input = parse_one(input, value); | |
163 | if(*input == q) | |
164 | ++input; | |
165 | } else { | |
166 | /* unquoted value */ | |
167 | while(*input && *input != '>' && !isspace((unsigned char)*input)) | |
168 | input = parse_one(input, value); | |
169 | } | |
170 | dynstr_terminate(value); | |
171 | } | |
172 | /* stash the value */ | |
173 | hash_add(attrs, name->vec, value->vec, HASH_INSERT_OR_REPLACE); | |
174 | input = skipwhite(input); | |
175 | } | |
176 | } | |
177 | if(*input != '>') { | |
178 | error(0, "unterminated tag %s", tag->vec); | |
179 | return -1; | |
180 | } | |
181 | ++input; | |
182 | if(attrs) | |
183 | callbacks->open(tag->vec, attrs, u); | |
184 | else | |
185 | callbacks->close(tag->vec, u); | |
186 | } else | |
187 | input = parse_one(input, text); | |
188 | } | |
189 | /* flush any trailing text */ | |
190 | if(text->nvec) { | |
191 | dynstr_terminate(text); | |
192 | callbacks->text(text->vec, u); | |
193 | text->nvec = 0; | |
194 | } | |
195 | return 0; | |
196 | } | |
197 | ||
198 | /* | |
199 | Local Variables: | |
200 | c-basic-offset:2 | |
201 | comment-column:40 | |
202 | fill-column:79 | |
203 | indent-tabs-mode:nil | |
204 | End: | |
205 | */ |