Commit | Line | Data |
---|---|---|
dea4d055 MW |
1 | ;;; -*-lisp-*- |
2 | ;;; | |
3 | ;;; Efficient buffering character scanner | |
4 | ;;; | |
5 | ;;; (c) 2009 Straylight/Edgeware | |
6 | ;;; | |
7 | ||
8 | ;;;----- Licensing notice --------------------------------------------------- | |
9 | ;;; | |
e0808c47 | 10 | ;;; This file is part of the Sensible Object Design, an object system for C. |
dea4d055 MW |
11 | ;;; |
12 | ;;; SOD is free software; you can redistribute it and/or modify | |
13 | ;;; it under the terms of the GNU General Public License as published by | |
14 | ;;; the Free Software Foundation; either version 2 of the License, or | |
15 | ;;; (at your option) any later version. | |
16 | ;;; | |
17 | ;;; SOD is distributed in the hope that it will be useful, | |
18 | ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
20 | ;;; GNU General Public License for more details. | |
21 | ;;; | |
22 | ;;; You should have received a copy of the GNU General Public License | |
23 | ;;; along with SOD; if not, write to the Free Software Foundation, | |
24 | ;;; Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | |
25 | ||
26 | (cl:in-package #:sod-parser) | |
27 | ||
28 | ;;;-------------------------------------------------------------------------- | |
29 | ;;; Infrastructure types. | |
30 | ||
1d8cc67a MW |
31 | (eval-when (:compile-toplevel :load-toplevel :execute) |
32 | (defconstant charbuf-size 4096 | |
33 | "Number of characters in a character buffer.")) | |
dea4d055 MW |
34 | |
35 | (deftype charbuf () | |
36 | "Type of character buffers." | |
37 | `(simple-string ,charbuf-size)) | |
38 | ||
39 | (deftype charbuf-index () | |
40 | "Type of indices into character buffers." | |
41 | `(integer 0 ,charbuf-size)) | |
42 | ||
43 | (declaim (inline make-charbuf)) | |
44 | (defun make-charbuf () | |
45 | "Return a fresh uninitialized character buffer." | |
46 | (make-array charbuf-size :element-type 'character)) | |
47 | ||
48 | (defstruct charbuf-chain-link | |
49 | "A link in the charbuf scanner's buffer chain. | |
50 | ||
51 | Usually the scanner doesn't bother maintaining a buffer chain; but if | |
52 | we've rewound to a captured place then we need to be able to retrace our | |
53 | steps on to later buffers. | |
54 | ||
55 | It turns out to be easier to have an explicit link to the next structure | |
56 | in the chain than to maintain a spine of cons cells, so we do that; the | |
57 | only other things we need are the buffer itself and its length, which | |
58 | might be shorter than `charbuf-size', e.g., if we hit end-of-file." | |
59 | (next nil :type (or charbuf-chain-link null)) | |
60 | (buf nil :type (or charbuf (member nil :eof)) :read-only t) | |
61 | (size 0 :type charbuf-index :read-only t)) | |
62 | ||
dea4d055 MW |
63 | ;;;-------------------------------------------------------------------------- |
64 | ;;; Main class. | |
65 | ||
66 | (export 'charbuf-scanner) | |
67 | (defclass charbuf-scanner (character-scanner) | |
4b8e5c03 | 68 | ((%stream :initarg :stream :type stream) |
dea4d055 | 69 | (buf :initform nil :type (or charbuf (member nil :eof))) |
55f543fa MW |
70 | (size :initform 0 :type charbuf-index) |
71 | (index :initform 0 :type charbuf-index) | |
dea4d055 MW |
72 | (captures :initform 0 :type (and fixnum unsigned-byte)) |
73 | (tail :initform nil :type (or charbuf-chain-link null)) | |
74 | (unread :initform nil :type (or charbuf-chain-link nil)) | |
75 | (filename :initarg :filename :type (or string null) | |
76 | :reader scanner-filename) | |
bf090e02 | 77 | (line :initarg :line :initform 1 :type fixnum :reader scanner-line) |
dea4d055 MW |
78 | (column :initarg :column :initform 0 :type fixnum :reader scanner-column)) |
79 | (:documentation | |
80 | "An efficient rewindable scanner for character streams. | |
81 | ||
82 | The scanner should be used via the parser protocol. The following notes | |
83 | describe the class's slots and the invariants maintained by the class. | |
84 | ||
85 | The scanner reads characters from STREAM. It reads in chunks, | |
86 | `charbuf-size' characters at a time, into freshly allocated arrays. At | |
87 | the beginning of time, BUF is nil; and SIZE is 0, indicating that a new | |
88 | buffer needs to be read in; this anomalous situation is remedied during | |
89 | instance initialization. At all times thereafter: | |
90 | ||
91 | * If SIZE > 0 then BUF is a `charbuf' containing characters. | |
92 | ||
93 | * (<= 0 INDEX SIZE charbuf-size). | |
94 | ||
95 | When the current buffer is finished with, another one is fetched. If | |
96 | we've rewound the scanner to a captured place, then there'll be a chain of | |
97 | buffers starting at TAIL (which corresponds to the current buffer); and we | |
98 | should use its NEXT buffer when we've finished this one. | |
99 | ||
100 | If there is no next buffer then we should acquire a new one and fill it | |
101 | from the input stream. If there is an outstanding captured place then we | |
102 | must also create a buffer chain entry for this new buffer and link it onto | |
103 | the chain. If there aren't outstanding captures then we don't need to | |
104 | bother with any of that -- earlier places certainly can't be captured and | |
105 | a capture of the current position can allocate its own buffer chain | |
106 | entry. | |
107 | ||
108 | Which leaves us with the need to determine whether there are outstanding | |
109 | captures. We simply maintain a counter, and rely on the client releasing | |
110 | captured places properly when he's finished. In practice, this is usually | |
111 | done using the `peek' parser macro so there isn't a problem.")) | |
112 | ||
502df9a2 MW |
113 | (export 'charbuf-scanner-place-p) |
114 | (defstruct charbuf-scanner-place | |
115 | "A captured place we can return to later. | |
116 | ||
117 | We remember the buffer-chain link, so that we can retrace our steps up to | |
118 | the present. We also need the index at which we continue reading | |
119 | characters; and the line and column numbers to resume from." | |
120 | (scanner nil :type charbuf-scanner :read-only t) | |
121 | (link nil :type charbuf-chain-link :read-only t) | |
122 | (index 0 :type charbuf-index :read-only t) | |
123 | (line 0 :type fixnum :read-only t) | |
124 | (column 0 :type fixnum :read-only t)) | |
125 | ||
126 | (defmethod file-location ((place charbuf-scanner-place)) | |
127 | (make-file-location (scanner-filename | |
128 | (charbuf-scanner-place-scanner place)) | |
129 | (charbuf-scanner-place-line place) | |
130 | (charbuf-scanner-place-column place))) | |
131 | ||
dea4d055 MW |
132 | ;;;-------------------------------------------------------------------------- |
133 | ;;; Utilities. | |
134 | ||
135 | (defgeneric charbuf-scanner-fetch (scanner) | |
136 | (:documentation | |
137 | "Refill the scanner buffer. | |
138 | ||
139 | This is an internal method, which is really only a method so that the | |
140 | compiler will optimize slot references. | |
141 | ||
142 | Replace the current buffer with the next one, either from the buffer chain | |
143 | (if we're currently rewound) or with a new buffer from the stream.")) | |
144 | ||
145 | (defmethod charbuf-scanner-fetch ((scanner charbuf-scanner)) | |
4b8e5c03 | 146 | (with-slots ((stream %stream) buf size index tail captures) scanner |
dea4d055 MW |
147 | (loop |
148 | (acond | |
149 | ||
150 | ;; If we've hit the end of the line, stop. | |
151 | ((eq buf :eof) | |
152 | (return nil)) | |
153 | ||
154 | ;; If there's another buffer, we should check it out. | |
155 | ((and tail (charbuf-chain-link-next tail)) | |
156 | (setf tail it | |
157 | buf (charbuf-chain-link-buf it) | |
158 | size (charbuf-chain-link-size it) | |
159 | index 0)) | |
160 | ||
161 | ;; No joy: try reading more stuff from the input stream. | |
162 | (t | |
163 | (let* ((new (make-charbuf)) | |
164 | (n (read-sequence new stream :start 0 :end charbuf-size))) | |
165 | ||
166 | ;; If there's nothing coming in then store a magical marker. | |
167 | (when (zerop n) (setf new :eof)) | |
168 | ||
169 | ;; If there's someone watching, link a new entry onto the chain. | |
170 | ;; There must, under these circumstances, be a `tail'. | |
171 | (if (plusp captures) | |
172 | (let ((next (make-charbuf-chain-link :buf new :size n))) | |
173 | (setf (charbuf-chain-link-next tail) next | |
174 | tail next)) | |
175 | (setf tail nil)) | |
176 | ||
177 | ;; Store the new state. | |
178 | (setf buf new | |
179 | size n | |
180 | index 0)))) | |
181 | ||
182 | ;; If there's stuff in the current buffer, we're done. | |
183 | (when (< index size) | |
184 | (return t))))) | |
185 | ||
186 | (export 'charbuf-scanner-map) | |
187 | (defgeneric charbuf-scanner-map (scanner func &optional fail) | |
188 | (:documentation | |
189 | "Read characters from the SCANNER's raw buffers. | |
190 | ||
191 | This is intended to be an efficient and versatile interface for reading | |
192 | characters from a scanner in bulk. The FUNC is invoked repeatedly with | |
193 | three arguments: a simple string BUF and two nonnegative fixnums START and | |
194 | END, indicating that the subsequence of BUF between START (inclusive) and | |
195 | END (exclusive) should be processed. The FUNC returns two values: a | |
196 | generalized boolean DONEP and a nonnegative fixnum USED. If DONEP is | |
197 | false then USED is ignored: the function has consumed the entire buffer | |
198 | and wishes to read more. If DONEP is true then the condition (<= START | |
199 | USED END) must hold; the FUNC has consumed the buffer as far as USED | |
200 | (exclusive) and has completed successfully; the values DONEP and `t' are | |
6e5e1905 MW |
201 | returned as the result of `charbuf-scanner-map', along with a CONSUMEDP |
202 | flag. | |
dea4d055 MW |
203 | |
204 | If end-of-file is encountered before FUNC completes successfully then FAIL | |
6e5e1905 MW |
205 | is called with no arguments and expected to return two values, and |
206 | `charbuf-scanner-map' returns these values, along with a CONSUMEDP flag. | |
dea4d055 MW |
207 | |
208 | Observe that, if FAIL returns a second value of nil, then | |
209 | `charbuf-scanner-map' is usable as a parser expression.")) | |
210 | ||
211 | (defmethod charbuf-scanner-map | |
212 | ((scanner charbuf-scanner) func &optional fail) | |
213 | (with-slots (buf index size) scanner | |
6e5e1905 MW |
214 | (let ((consumedp nil)) |
215 | (flet ((offer (buf start end) | |
216 | ||
217 | ;; Pass the buffer to the function, and see what it thought. | |
218 | (multiple-value-bind (donep used) (funcall func buf start end) | |
219 | ||
220 | ;; Update the position as far as the function read. | |
221 | (with-slots (line column) scanner | |
222 | (let ((l line) (c column) (limit (if donep used end))) | |
223 | (do ((i start (1+ i))) | |
224 | ((>= i limit)) | |
225 | (setf (values l c) | |
226 | (update-position (char buf i) l c))) | |
227 | (setf line l column c))) | |
228 | ||
229 | ;; If the function is finished then update our state and | |
230 | ;; return. | |
231 | (when donep | |
232 | (setf index used) | |
233 | (when (>= index size) | |
234 | (charbuf-scanner-fetch scanner)) | |
235 | (return-from charbuf-scanner-map | |
236 | (values donep t (or consumedp (> used start))))) | |
237 | ||
238 | ;; We've definitely used that buffer. | |
239 | (setf consumedp t)))) | |
240 | ||
241 | ;; If there's anything in the current buffer, offer it to the | |
242 | ;; function. | |
243 | (when (< index size) | |
244 | (offer buf index size)) | |
245 | ||
246 | ;; Repeatedly fetch new buffers and offer them to the function. | |
247 | ;; Because the buffers are fresh, we know that we must process them | |
248 | ;; from the beginning. Note that `offer' will exit if FUNC has | |
249 | ;; finished, so we don't need to worry about that. | |
250 | (loop | |
251 | (unless (charbuf-scanner-fetch scanner) | |
252 | (return (if fail | |
253 | (multiple-value-bind (result win) (funcall fail) | |
254 | (values result win consumedp)) | |
255 | (values nil nil consumedp)))) | |
256 | (offer buf 0 size)))))) | |
dea4d055 MW |
257 | |
258 | ;;;-------------------------------------------------------------------------- | |
259 | ;;; Initialization. | |
260 | ||
261 | (defmethod shared-initialize :after | |
262 | ((scanner charbuf-scanner) slot-names &key) | |
263 | ||
264 | ;; Grab the filename from the underlying stream if we don't have a better | |
265 | ;; guess. | |
266 | (default-slot (scanner 'filename slot-names) | |
4b8e5c03 | 267 | (with-slots ((stream %stream)) scanner |
dea4d055 MW |
268 | (aif (stream-pathname stream) (namestring it) nil))) |
269 | ||
270 | ;; Get ready with the first character. | |
271 | (charbuf-scanner-fetch scanner)) | |
272 | ||
273 | ;;;-------------------------------------------------------------------------- | |
274 | ;;; Scanner protocol implementation. | |
275 | ||
276 | (defmethod scanner-at-eof-p ((scanner charbuf-scanner)) | |
277 | (with-slots (buf) scanner | |
278 | (eq buf :eof))) | |
279 | ||
280 | (defmethod scanner-current-char ((scanner charbuf-scanner)) | |
281 | (with-slots (buf index) scanner | |
282 | (schar buf index))) | |
283 | ||
284 | (defmethod scanner-step ((scanner charbuf-scanner)) | |
285 | (with-slots (buf size index line column) scanner | |
286 | ||
287 | ;; If there's a current character then update the position from it. When | |
288 | ;; is there a current character? When the index is valid. | |
289 | (when (< index size) | |
290 | (setf (values line column) | |
291 | (update-position (schar buf index) line column))) | |
292 | ||
293 | ;; Now move the position on. If there's still a character left then we | |
294 | ;; win; otherwise fetch another buffer. | |
295 | (or (< (incf index) size) | |
296 | (charbuf-scanner-fetch scanner)))) | |
297 | ||
298 | (defmethod scanner-unread ((scanner charbuf-scanner) char) | |
299 | (with-slots (buf index size unread tail line column) scanner | |
300 | (cond | |
301 | ||
302 | ;; First, let's rewind the buffer index. This isn't going to work if | |
303 | ;; the index is already zero. (Note that this implies that INDEX is | |
304 | ;; zero in the remaining cases.) | |
305 | ((plusp index) | |
306 | (decf index)) | |
307 | ||
308 | ;; Plan B. Maybe we've been here before, in which case we'll have left | |
309 | ;; the appropriate state kicking about already. Note that, according | |
310 | ;; to the `unread' rules, the character must be the same as last time, | |
311 | ;; so we can just reuse the whole thing unchanged. Also, note that | |
312 | ;; the NEXT field in UNREAD is not nil due to the way that we construct | |
313 | ;; the link below. | |
314 | ((and unread (eql (charbuf-chain-link-next unread) tail)) | |
315 | (setf tail unread size 1 | |
316 | buf (charbuf-chain-link-buf unread))) | |
317 | ||
318 | ;; Nope, we've not been here, at least not recently. We'll concoct a | |
319 | ;; new buffer and put the necessary stuff in it. Store it away for | |
320 | ;; later so that repeated read/unread oscillations at this position | |
321 | ;; don't end up consing enormous arrays too much. | |
322 | (t | |
323 | (let* ((next (or tail (make-charbuf-chain-link :buf buf :size size))) | |
324 | (fake (make-charbuf)) | |
325 | (this (make-charbuf-chain-link :buf fake :size 1 :next next))) | |
326 | (setf (schar fake 0) char buf fake size 1 | |
327 | tail this unread this)))) | |
328 | ||
329 | ;; That's that sorted; now we have to fiddle the position. | |
330 | (setf (values line column) (backtrack-position char line column)))) | |
331 | ||
332 | (defmethod scanner-capture-place ((scanner charbuf-scanner)) | |
333 | (with-slots (buf size index captures tail line column) scanner | |
334 | (incf captures) | |
335 | (unless tail | |
336 | (setf tail (make-charbuf-chain-link :buf buf :size size))) | |
9ec578d9 | 337 | (make-charbuf-scanner-place :scanner scanner :link tail :index index |
dea4d055 MW |
338 | :line line :column column))) |
339 | ||
340 | (defmethod scanner-restore-place ((scanner charbuf-scanner) place) | |
341 | (with-slots (buf size index tail line column) scanner | |
342 | (let ((link (charbuf-scanner-place-link place))) | |
343 | (setf buf (charbuf-chain-link-buf link) | |
344 | size (charbuf-chain-link-size link) | |
345 | index (charbuf-scanner-place-index place) | |
346 | line (charbuf-scanner-place-line place) | |
347 | column (charbuf-scanner-place-column place) | |
348 | tail link)))) | |
349 | ||
350 | (defmethod scanner-release-place ((scanner charbuf-scanner) place) | |
1d8cc67a | 351 | (declare (ignore place)) |
dea4d055 MW |
352 | (with-slots (captures) scanner |
353 | (decf captures))) | |
354 | ||
355 | (defstruct (charbuf-slice | |
356 | (:constructor make-charbuf-slice | |
357 | (buf &optional (start 0) %end | |
358 | &aux (end (or %end (length buf)))))) | |
359 | (buf nil :type (or charbuf (eql :eof)) :read-only t) | |
360 | (start 0 :type (and fixnum unsigned-byte) :read-only t) | |
361 | (end 0 :type (and fixnum unsigned-byte) :read-only t)) | |
362 | ||
363 | (declaim (inline charbuf-slice-length)) | |
364 | (defun charbuf-slice-length (slice) | |
365 | (- (charbuf-slice-end slice) (charbuf-slice-start slice))) | |
366 | ||
367 | (defun concatenate-charbuf-slices (slices) | |
368 | (let* ((len (reduce #'+ slices | |
369 | :key #'charbuf-slice-length | |
370 | :initial-value 0)) | |
371 | (string (make-array len :element-type 'character)) | |
372 | (i 0)) | |
373 | (dolist (slice slices) | |
374 | (let ((buf (charbuf-slice-buf slice)) | |
375 | (end (charbuf-slice-end slice))) | |
376 | (do ((j (charbuf-slice-start slice) (1+ j))) | |
377 | ((>= j end)) | |
378 | (setf (schar string i) (schar buf j)) | |
379 | (incf i)))) | |
380 | string)) | |
381 | ||
382 | (defmethod scanner-interval | |
383 | ((scanner charbuf-scanner) place-a &optional place-b) | |
384 | (let* ((slices nil) | |
385 | (place-b (or place-b | |
386 | (with-slots (index tail) scanner | |
9ec578d9 MW |
387 | (make-charbuf-scanner-place :scanner scanner |
388 | :link tail | |
dea4d055 MW |
389 | :index index)))) |
390 | (last-link (charbuf-scanner-place-link place-b))) | |
391 | (flet ((bad () | |
a1985b3c | 392 | (error "Incorrect places ~S and ~S to `scanner-interval'" |
dea4d055 MW |
393 | place-a place-b))) |
394 | (do ((link (charbuf-scanner-place-link place-a) | |
395 | (charbuf-chain-link-next link)) | |
396 | (start (charbuf-scanner-place-index place-a) 0)) | |
397 | ((eq link last-link) | |
398 | (let ((end (charbuf-scanner-place-index place-b))) | |
399 | (when (< end start) | |
400 | (bad)) | |
401 | (push (make-charbuf-slice (charbuf-chain-link-buf link) | |
402 | start end) | |
403 | slices) | |
404 | (concatenate-charbuf-slices (nreverse slices)))) | |
405 | (when (null link) (bad)) | |
406 | (push (make-charbuf-slice (charbuf-chain-link-buf link) | |
407 | start | |
408 | (charbuf-chain-link-size link)) | |
409 | slices))))) | |
410 | ||
411 | ;;;-------------------------------------------------------------------------- | |
412 | ;;; Specialized streams. | |
413 | ||
414 | (export 'charbuf-scanner-stream) | |
415 | (defclass charbuf-scanner-stream (character-scanner-stream) | |
416 | ((scanner :initarg :scanner :type charbuf-scanner))) | |
417 | ||
bf090e02 MW |
418 | (defmethod make-scanner-stream ((scanner charbuf-scanner)) |
419 | (make-instance 'charbuf-scanner-stream :scanner scanner)) | |
420 | ||
dea4d055 | 421 | (defmethod stream-read-sequence |
1d8cc67a MW |
422 | ((stream charbuf-scanner-stream) (seq string) |
423 | #+clisp &key #-clisp &optional (start 0) end) | |
dea4d055 MW |
424 | (with-slots (scanner) stream |
425 | (unless end (setf end (length seq))) | |
426 | (let ((i start) (n (- end start))) | |
427 | (labels ((copy (i buf start end) | |
777e16ac | 428 | (replace seq buf :start1 i :start2 start :end2 end)) |
dea4d055 MW |
429 | (snarf (buf start end) |
430 | (let ((m (- end start))) | |
431 | (cond ((< m n) | |
432 | (copy i buf start end) (decf n m) (incf i m) | |
433 | (values nil 0)) | |
434 | (t | |
435 | (copy i buf start (+ start n)) (incf i n) | |
436 | (values t n)))))) | |
437 | (charbuf-scanner-map scanner #'snarf) | |
438 | i)))) | |
439 | ||
440 | (defmethod stream-read-line ((stream charbuf-scanner-stream)) | |
441 | (with-slots (scanner) stream | |
442 | (let ((slices nil)) | |
443 | (flet ((snarf (buf start end) | |
444 | (let ((pos (position #\newline buf :start start :end end))) | |
445 | (push (make-charbuf-slice buf start (or pos end)) slices) | |
a02384b5 | 446 | (values pos (and pos (1+ pos)))))) |
6e5e1905 | 447 | (multiple-value-bind (result eofp consumedp) |
a02384b5 | 448 | (charbuf-scanner-map scanner #'snarf) |
6e5e1905 | 449 | (declare (ignore result consumedp)) |
a02384b5 | 450 | (values (concatenate-charbuf-slices (nreverse slices))) eofp))))) |
dea4d055 MW |
451 | |
452 | ;;;----- That's all, folks -------------------------------------------------- |