3 %%% Description of the parsing machinery
5 %%% (c) 2015 Straylight/Edgeware
8 %%%----- Licensing notice ---------------------------------------------------
10 %%% This file is part of the Sensble Object Design, an object system for C.
12 %%% SOD is free software; you can redistribute it and/or modify
13 %%% it under the terms of the GNU General Public License as published by
14 %%% the Free Software Foundation; either version 2 of the License, or
15 %%% (at your option) any later version.
17 %%% SOD is distributed in the hope that it will be useful,
18 %%% but WITHOUT ANY WARRANTY; without even the implied warranty of
19 %%% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 %%% GNU General Public License for more details.
22 %%% You should have received a copy of the GNU General Public License
23 %%% along with SOD; if not, write to the Free Software Foundation,
24 %%% Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
26 \chapter{Parsing} \label{ch:parsing}
28 %%%--------------------------------------------------------------------------
29 \section{The parser protocol} \label{sec:parsing.proto}
31 For the purpose of Sod's parsing library, \emph{parsing} is the process of
32 reading a sequence of input items, in order, and computing an output value.
34 A \emph{parser} is an expression which consumes zero or more input items and
35 returns three values: a \emph{result}, a \emph{success flag}, and a
36 \emph{consumed flag}. The two flags are (generalized) booleans. If the
37 success flag is non-nil, then the parser is said to have \emph{succeeded},
38 and the result is the parser's output. If the success flag is nil then the
39 parser is said to have \emph{failed}, and the result is a list of
40 \emph{indicators}. Finally, the consumed flag is non-nil if the parser
41 consumed any input items.
43 \begin{describe}{fun}{combine-parser-failures @<failures> @> @<list>}
46 %%%--------------------------------------------------------------------------
47 \section{File locations} \label{sec:parsing.floc}
49 \begin{describe}{cls}{file-location}
52 \begin{describe}{fun}{file-location-p @<object> @> @<generalized-boolean>}
56 {make-file-location @<filename> \&optional @<line> @<column>
61 {\dhead{fun}{file-location-filename @<floc> @> @<string-or-nil>}
62 \dhead{fun}{file-location-line @<floc> @> @<fixnum-or-nil>}
63 \dhead{fun}{file-location-column @<floc> @> @<fixnum-or-nil>}}
66 \begin{describe}{gf}{file-location @<object> @> @<floc>}
67 \begin{describe}{meth}{file-location (@<floc> file-location) @> @<floc>}
69 \begin{describe}{meth}{file-location (@<stream> stream) @> @<floc>}
71 \begin{describe}{meth}{file-location (@<any> t) @> @<floc>}
75 \begin{describe}{cls}{condition-with-location (condition) \&key :location}
78 \begin{describe}{meth}
79 {file-location (@<condition> condition-with-location) @> @<floc>}
85 {error-with-location (condition-with-location error) \\ \>
88 {warning-with-location (condition-with-location warning) \\ \>
91 {enclosing-error-with-location
92 (enclosing-error-with-location error) \\ \>
93 \&key :condition :location}
95 {enclosing-warning-with-location
96 (enclosing-condition-with-location warning) \\ \>
97 \&key :condition :location}
99 {simple-condition-with-location
100 (condition-with-location simple-condition) \\ \>
101 \&key :format-control :format-arguments :location}
103 {simple-error-with-location
104 (error-with-location simple-error) \\ \>
105 \&key :format-control :format-arguments :location}
107 {simple-warning-with-location
108 (warning-with-location simple-warning) \\ \>
109 \&key :format-control :format-arguments :location}}
112 \begin{describe}{fun}
113 {make-condition-with-location @<default-type> @<floc>
114 @<datum> \&rest @<arguments>
115 \nlret @<condition-with-location>}
119 {\dhead{fun}{error-with-location @<floc> @<datum> \&rest @<arguments>}
120 \dhead{fun}{cerror-with-location @<floc> @<continue-string>
121 @<datum> \&rest @<arguments>}
122 \dhead{fun}{cerror*-with-location @<floc> @<datum> \&rest @<arguments>}
123 \dhead{fun}{warn-with-location @<floc> @<datum> \&rest @<arguments>}}
126 \begin{describe}{mac}
127 {with-default-error-location (@<floc>) @<body-form>^* @> @<value>^*}
130 \begin{describe}{mac}
131 {count-and-report-errors () @<body-form>^*
132 @> @<value> @<n-errors> @<n-warnings>}
135 %%%--------------------------------------------------------------------------
136 \section{Scanners} \label{sec:parsing.scanner}
138 A \emph{scanner} is an object which keeps track of a parser's progress as it
139 works through its input. There's no common base class for scanners: a
140 scanner is simply any object which implements the scanner protocol described
143 A scanner maintains a sequence of items to read. It can step forwards
144 through the items, one at a time, until it reaches the end (if, indeed, the
145 sequence is finite, which it needn't be). Until that point, there is a
146 current item, though there's no protocol for accessing it at this level
147 because the nature of the items is left unspecified.
149 Some scanners support an additional \emph{place-capture} protocol which
150 allows rewinding the scanner to an earlier point in the input so that it can
153 \subsection{Basic scanner protocol} \label{sec:parsing.scanner.basic}
155 The basic protocol supports stepping the scanner forward through its input
156 sequence, and detecting the end of the sequence.
158 \begin{describe}{gf}{scanner-step @<scanner>}
159 Advance the @<scanner> to the next item, which becomes current.
161 It is an error to step the scanner if the scanner is at end-of-file.
164 \begin{describe}{gf}{scanner-at-eof-p @<scanner> @> @<generalized-boolean>}
165 Return non-nil if the scanner is at end-of-file, i.e., there are no more
168 If nil is returned, there is a current item, and it is safe to step the
169 scanner again; otherwise, it is an error to query the current item or to
173 \subsection{Place-capture scanner protocol} \label{sec:parsing.scanner.place}
175 The place-capture protocol allows rewinding to an earlier point in the
176 sequence. Not all scanners support the place-capture protocol.
178 To rewind a scanner to a particular point, that point must be \emph{captured}
179 as a \emph{place} when it's current -- so you must know in advance that this
180 is an interesting place that's worth capturing. The type of place returned
181 depends on the type of scanner. Given a captured place, the scanner can be
182 rewound to the position held in it.
184 Depending on how the scanner works, holding onto a captured place might
185 consume a lot of memory or case poor performance. For example, if the
186 scanner is reading from an input stream, having a captured place means that
187 data from that point on must be buffered in case the program needs to rewind
188 the scanner and read that data again. Therefore it's possible to
189 \emph{release} a place when it turns out not to be needed any more.
191 \begin{describe}{gf}{scanner-capture-place @<scanner> @> @<place>}
192 Capture the @<scanner>'s current position as a place, and return the place.
195 \begin{describe}{gf}{scanner-restore-place @<scanner> @<place>}
196 Rewind the @<scanner> to the state it was in when @<place> was captured.
197 In particular, the item that was current when the @<place> was captured
198 becomes current again.
200 It is an error to restore a @<place> that has been released, or if the
201 @<place> wasn't captured from the @<scanner>.
204 \begin{describe}{gf}{scanner-release-place @<scanner> @<place>}
205 Release the @<place>, to avoid having to maintaining the ability to restore
206 it after it's not needed any more..
208 It is an error if the @<place> wasn't captured from the @<scanner>.
211 \begin{describe}{mac}
212 {with-scanner-place (@<place> @<scanner>) @<body-form>^* @> @<value>^*}
213 Capture the @<scanner>'s current position as a place, evaluate the
214 @<body-form>s as an implicit progn with the variable @<place> bound to the captured
215 place. When control leaves the @<body-form>s, the place is released. The return
216 values are the values of the final @<body-form>.
219 \subsection{Scanner file-location protocol} \label{sec:parsing.scanner.floc}
221 Some scanners participate in the file-location protocol
222 (\xref{sec:parsing.floc}). They implement a method on @|file-location| which
223 collects the necessary information using scanner-specific functions described
226 \begin{describe}{fun}{scanner-file-location @<scanner> @> @<file-location>}
227 Return a @|file-location| object describing the current position of the
230 This calls the @|scanner-filename|, @|scanner-line| and @|scanner-column|
231 generic functions on the scanner, and uses these to fill in an appropriate
234 Since there are default methods on these generic functions, it is not an
235 error to call @|scanner-file-location| on any kind of value, but it might
236 not be very useful. This function exists to do the work of appropriately
237 specialized methods on @|file-location|.
241 {\dhead{gf}{scanner-filename @<scanner> @> @<string>}
242 \dhead{gf}{scanner-line @<scanner> @> @<integer>}
243 \dhead{gf}{scanner-column @<scanner> @> @<integer>}}
244 Return the filename, line and column components of the @<scanner>'s current
245 position, for use in assembling a @<file-location>: see the
246 @|scanner-file-location| function.
248 There are default methods on all three generic functions which simply
252 \subsection{Character scanners} \label{sec:parsing.scanner.char}
254 Character scanners are scanners which read sequences of characters.
256 \begin{describe}{cls}{character-scanner () \&key}
257 Base class for character scanners. This provides some very basic
260 Not all character scanners are subclasses of @|character-scanner|.
263 \begin{describe}{gf}{scanner-current-char @<scanner> @> @<character>}
264 Returns the current character.
267 \begin{describe}{gf}{scanner-unread @<scanner> @<character>}
268 Rewind the @<scanner> by one step. The @<chararacter> must be the previous
269 current character, and becomes the current character again. It is an error
270 if: the @<scanner> has reached end-of-file; the @<scanner> is never been
271 stepped; or @<character> was not the previous current character.
275 {scanner-interval @<scanner> @<place-a> \&optional @<place-b>
277 Return the characters in the @<scanner>'s input from @<place-a> up to (but
278 not including) @<place-b>.
280 The characters are returned as a string. If @<place-b> is omitted, return
281 the characters up to (but not including) the current position. It is an
282 error if @<place-b> precedes @<place-a> or they are from different
285 This function is a character-scanner-specific extension to the
286 place-capture protocol; not all character scanners implement the
287 place-capture protocol, and some that do may not implement this function.
290 \subsubsection{Stream access to character scanners}
291 Sometimes it can be useful to apply the standard Lisp character input
292 operations to the sequence of characters held by a character scanner.
294 \begin{describe}{gf}{make-scanner-stream @<scanner> @> @<stream>}
295 Returns a fresh input @|stream| object which fetches input characters from
296 the character scanner object @<scanner>. Reading characters from the
297 stream steps the scanner. The stream will reach end-of-file when the
298 scanner reports end-of-file. If the scanner implements the file-location
299 protocol then reading from the stream will change the file location in an
302 This is mostly useful for applying standard Lisp stream functions, most
303 particularly the @|read| function, in the middle of a parsing operation.
306 \begin{describe}{cls}{character-scanner-stream (stream) \&key :scanner}
307 A Common Lisp input @|stream| object which works using the character
308 scanner protocol. Any @<scanner> which implements the base scanner and
309 character scanner protocols is suitable. See @|make-scanner-stream|.
312 \subsection{String scanners} \label{sec:parsing.scanner.string}
314 A \emph{string scanner} is a simple kind of character scanner which reads
315 input from a string object. String scanners implement the character scanner
316 and place-capture protocols.
318 \begin{describe}{cls}{string-scanner}
319 The class of string scanners. The @|string-scanner| class is not a
320 subclass of @|character-scanner|.
323 \begin{describe}{fun}{string-scanner-p @<value> @> @<generalized-boolean>}
324 Return non-nil if @<value> is a @|string-scanner| object; otherwise return
328 \begin{describe}{fun}
329 {make-string-scanner @<string> \&key :start :end @> @<string-scanner>}
330 Construct and return a fresh @|string-scanner| object. The new scanner
331 will read characters from @<string>, starting at index @<start> (which
332 defaults to zero), and continuing until it reaches index @<end> (defaults
333 to the end of the @<string>).
336 \subsection{Character buffer scanners} \label{sec:parsing.scanner.charbuf}
338 A \emph{character buffer scanner}, or \emph{charbuf scanner} for short, is an
339 efficient scanner for reading characters from an input stream. Charbuf
340 scanners implements the basic scanner, character buffer, place-capture, and
341 file-location protocols.
343 \begin{describe}{cls}
344 {charbuf-scanner (character-scanner)
345 \&key :stream :filename :line :column}
346 The class of charbuf scanners. The scanner will read characters from
347 @<stream>. Charbuf scanners implement the file-location protocol: the
348 initial location is set from the given @<filename>, @<line> and @<column>;
349 the scanner will update the location as it reads its input.
352 \begin{describe}{cls}{charbuf-scanner-place}
353 The class of place objects captured by a charbuf scanner.
356 \begin{describe}{fun}
357 {charbuf-scanner-place-p @<value> @> @<generalized-boolean>}
358 Type predicate for charbuf scanner places: returns non-nil if @<value> is a
359 place captured by a charbuf scanner, and nil otherwise.
363 {charbuf-scanner-map @<scanner> @<func> \&optional @<fail>
364 \nlret @<result> @<successp> @<consumedp>}
365 Read characters from the @<scanner>'s buffers.
367 This is intended to be an efficient and versatile interface for reading
368 characters from a scanner in bulk. The function @<func> is invoked
371 (multiple-value-bind (@<donep> @<used>) \\ \ind\ind
372 (funcall @<func> @<buf> @<start> @<end>) \- \\
375 The argument @<buf> is a simple string; @<start> and @<end> are two
376 nonnegative fixnums, indicating that the subsequence of @<buf> between
377 @<start> (inclusive) and @<end> (exclusive) should be processed. If
378 @<func>'s return value @<donep> is nil then @<used> is ignored: the
379 function has consumed the entire buffer and wishes to read more. If
380 @<donep> is non-nil, then it must be a fixnum such that $@<start> \le
381 @<used> \le @<end>$: the function has consumed the buffer as far as @<used>
382 (exclusive) and has completed successfully.
384 If end-of-file is encountered before @<func> completes successfully then it
385 fails: the @<fail> function is called with no arguments, and is expected to
386 return two values. If omitted, @<fail> defaults to
392 The @|charbuf-scanner-map| function returns three values. The first value
393 is the non-nil @<donep> value returned by @<func> if @|charbuf-scanner-map|
394 succeeded, or the first value returned by @<fail>; the second value is @|t|
395 on success, or the second value returned by @<fail>; the third value is
396 non-nil if @<func> consumed any input, i.e., it returned with @<donep> nil
397 at least once, or with $@<used> > @<start>$.
400 \subsection{Token scanners} \label{sec:parsing.scanner.token}
402 \begin{describe}{cls}
403 {token-scanner () \&key :filename (:line 1) (:column 0)}
406 \begin{describe}{gf}{token-type @<scanner> @> @<type>}
409 \begin{describe}{gf}{token-value @<scanner> @> @<value>}
412 \begin{describe}{gf}{scanner-token @<scanner> @> @<type> @<value>}
415 \begin{describe}{ty}{token-scanner-place}
418 \begin{describe}{fun}
419 {token-scanner-place-p @<value> @> @<generalized-boolean>}
422 \subsection{List scanners}
424 \begin{describe}{ty}{list-scanner}
427 \begin{describe}{fun}{list-scanner-p @<value> @> @<generalized-boolean>}
430 \begin{describe}{fun}{make-list-scanner @<list> @> @<list-scanner>}
433 %%%--------------------------------------------------------------------------
434 \section{Parsing syntax}
436 \begin{describe}{gf}{expand-parser-spec @<context> @<spec> @> @<form>}
440 {expand-parser-form @<context> @<head> @<tail> @> @<form>}
443 \begin{describe}{gf}{wrap-parser @<context> @<form> @> @<wrapped-form>}
446 \begin{describe}{mac}
447 {defparse @<name> (@[[ :context (@<var> @<context-class>) @]]
448 @<destructuring-lambda-list-item>^*) \\ \ind
453 \begin{describe}{mac}
455 (@<context-class> @{ @<init-keyword> @<value> @}^*) \\ \ind
460 \begin{describe}{lmac}
461 {parse @<parser> @> @<result> @<success-flag> @<consumed-flag>}
464 \begin{describe}{gf}{parser-at-eof-p @<context> @> @<form>}
467 \begin{describe}{gf}{parser-step @<context> @> @<form>}
470 \begin{describe}{sym}{it}
473 \begin{describe}{mac}
474 {if-parse (@[[ \=:result @<result-var> @!
475 :expected @<expected-var> @! \+ \\
476 :consumedp @<consumed-var> @]]) \- \\ \ind\ind
483 \begin{describe}{mac}
484 {when-parse (@[@<result-var>@]) @<parser> \\ \ind
489 \begin{describe}{mac}
490 {cond-parse (@[[ \=:result @<result-var> @!
491 :expected @<expected-var> @! \+ \\
492 :consumedp @<consumed-var> @]]) \- \\ \ind
493 @{ (@<parser> @<form>^*) @}^* \-
497 \begin{describe}{parse}{:eof}
500 \begin{describe}{parseform}{lisp @<form>^*}
503 \begin{describe}{parseform}{label @<parser>}
506 \begin{describe}{parse}{t}
509 \begin{describe}{parseform}{t @<value>}
512 \begin{describe}{parse}{nil}
515 \begin{describe}{parseform}{nil @<indicator>}
518 \begin{describe}{parseform}{when @<cond> @<parser>}
521 \begin{describe}{parseform}
522 {seq (@{ @<atomic-parser-spec> @! (@[@<var>@] @<parser>) @}^*) \\ \ind
526 \begin{describe}{parseform}{and @<parser>^*}
529 \begin{describe}{parseform}{or @<parser>^*}
532 \begin{describe}{parseform}{? @<parser> @[@<default>@]}
535 \begin{describe}{parseform}
536 {many (\=@<accumulator-var> @<init-form> @<update-form> \+ \\
537 @[[ \=:new @<new-var> @! :final @<final-form> @! \+ \\
538 :min @<minimum> @! :max @<maximum> @! \\
539 :commitp @<commitp> @]]) \-\- \\ \ind
540 @<item-parser> @[@<sep-parser>@]}
543 \begin{describe}{parseform}
544 {list (@[[ :min @<minimum> @! :max @<maximum> @!
545 :commitp @<commitp> @]])\\ \ind
546 @<item-parser> @[@<sep-parser>@]}
549 \begin{describe}{parseform}
550 {skip-many (@[[ :min @<minimum> @! :max @<maximum> @!
551 :commitp @<commitp> @]])\\ \ind
552 @<item-parser> @[@<sep-parser>@]}
555 \begin{describe}{fun}{call-pluggable-parser @<symbol> \&rest @<args>}
558 \begin{describe}{parseform}{plug @<symbol> @<arg>^*}
561 \begin{describe}{fun}
562 {pluggable-parser-add @<symbol> @<tag> @<parser-function>}
565 \begin{describe}{mac}
566 {define-pluggable-parser @<symbol> @<tag> @<lambda-list> @<body-form>^*}
569 \begin{describe}{gf}{parser-capture-place @<context> @> @<form>}
572 \begin{describe}{gf}{parser-restore-place @<context> @<place> @> @<form>}
575 \begin{describe}{gf}{parser-release-place @<context> @<place> @> @<form>}
579 {parser-places-must-be-released-p @<context> @> @<generalized-boolean>>}
582 \begin{describe}{mac}
583 {with-parser-place (@<place-var> @<context>) @<body-form>^*}
586 \begin{describe}{parseform}{peek @<parser>}
589 \begin{describe}{cls}{character-parser-context () \&key}
592 \begin{describe}{gf}{parser-current-char @<context> @> @<form>}
595 \begin{describe}{parseform}
596 {if-char (@[@<result-var>@]) @<condition> @<consequent> @<alternative>}
599 \begin{describe}{parseform}{char @<character>}
602 \begin{describe}[char]{parse}{@<character>}
605 \begin{describe}[string]{parse}{@<string>}
608 \begin{describe}{parse}{:any}
611 \begin{describe}{parseform}{satisfies @<predicate>}
614 \begin{describe}{parseform}{not @<character>}
617 \begin{describe}{parseform}{filter @<predicate>}
620 \begin{describe}{parse}{:whitespace}
623 \begin{describe}{cls}{token-parser-context () \&key}
626 \begin{describe}{gf}{parser-token-type @<context> @> @<form>}
629 \begin{describe}{gf}{parser-token-value @<context> @> @<form>}
632 \begin{describe}{parseform}{token @<type> @[@<value>@] @[:peekp @<peek>@]}
635 \begin{describe}[atom]{parse}{@<atom>}
638 \begin{describe}[string]{parse}{@<string>}
641 \begin{describe}{cls}{scanner-context () \&key :scanner}
644 \begin{describe}{gf}{parse-scanner @<context> @> @<symbol>}
647 \begin{describe}{cls}
648 {character-scanner-context (scanner-context character-parser-context)
652 \begin{describe}{cls}
653 {token-scanner-context (scanner-context token-parser-context)
657 \begin{describe}{gf}{push-operator @<operator> @<state>}
660 \begin{describe}{gf}{push-value @<value> @<state>}
663 \begin{describe}{gf}{apply-operator @<operator> @<state>}
666 \begin{describe}{gf}{operator-push-action @<left> @<right>}
669 \begin{describe}{parseform}
670 {expr \=(@[[ :nestedp @<nestedp-var> @]]) \+ \\
671 @<operand-parser> @<binop-parser>
672 @<preop-parser> @<postop-parser>}
675 \begin{describe}{gf}{operator-left-precedence @<operator> @> @<prec>}
678 \begin{describe}{gf}{operator-right-precedence @<operator> @> @<prec>}
681 \begin{describe}{gf}{operator-associativity @<operator> @> @<assoc>}
684 \begin{describe}{cls}{prefix-operator () \&key}
687 \begin{describe}{cls}{simple-operator () \&key :name :function}
690 \begin{describe}{cls}
691 {simple-unary-operator (simple-operator) \&key :name :function}
696 \dhead{cls}{simple-binary-operator (simple-operator) \\ \>
697 \&key :name :function :lprec :rprec :associativity}
698 \dhead{cls}{simple-postfix-operator (simple-unary-operator) \\ \>
699 \&key :name :function :lprec :rprec}
700 \dhead{cls}{simple-prefix-operator
701 (prefix-operator simple-unary-operator) \\ \>
702 \&key :name :function :rprec}}
706 {\dhead{mac}{preop @<name> (@<operand-var> @<lprec>)
708 @> @<prefix-operator>}
709 \dhead{mac}{postop @<name>
710 (@<operand-var> @<lprec> @[[ :rprec @<rprec> @]])
712 @> @<postfix-operator>}
713 \dhead{mac}{binop @<name> (@<operand-var> @<lprec> @<rprec> @<assoc>)
715 @> @<binary-operator>}}
719 {\dhead{cls}{parenthesis () \&key :tag}
720 \dhead{cls}{open-parenthesis (parenthesis prefix-operator) \&key :tag}
721 \dhead{cls}{close-parenthesis (parenthesis) \&key :tag}}
725 {\dhead{fun}{lparen @<tag> @> @<open-paren>}
726 \dhead{fun}{rparen @<tag> @> @<close-paren>}}
729 %%%-------------------------------------------------------------------------
730 \section{Lexical analyser}
732 \begin{describe}{cls}
733 {sod-token-scanner (token-scanner)
734 \&key :filename (:line 1) (:column 0) :char-scanner}
737 \begin{describe}{fun}{define-indicator @<indicator> @<description>}
740 \begin{describe}{fun}{syntax-error @<scanner> @<expected> \&key :continuep}
743 \begin{describe}{fun}
744 {lexer-error @<char-scanner> @<expected> @<consumed-flag>}
747 \begin{describe}{parseform}
748 {skip-until (@[[ :keep-end @<keep-end-flag> @]]) @<token-type>^*}
751 \begin{describe}{parseform}{error () @<sub-parser> @<recover-parser>}
754 \begin{describe}{fun}
755 {scan-comment @<char-scanner>
756 @> @<result> @<success-flag> @<consumed-flag>}
759 %%%----- That's all, folks --------------------------------------------------
763 %%% TeX-master: "sod.tex"