3 %%% Description of the parsing machinery
5 %%% (c) 2015 Straylight/Edgeware
8 %%%----- Licensing notice ---------------------------------------------------
10 %%% This file is part of the Sensible Object Design, an object system for C.
12 %%% SOD is free software; you can redistribute it and/or modify
13 %%% it under the terms of the GNU General Public License as published by
14 %%% the Free Software Foundation; either version 2 of the License, or
15 %%% (at your option) any later version.
17 %%% SOD is distributed in the hope that it will be useful,
18 %%% but WITHOUT ANY WARRANTY; without even the implied warranty of
19 %%% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 %%% GNU General Public License for more details.
22 %%% You should have received a copy of the GNU General Public License
23 %%% along with SOD; if not, write to the Free Software Foundation,
24 %%% Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
26 \chapter{Parsing} \label{ch:parsing}
28 %%%--------------------------------------------------------------------------
29 \section{The parser protocol} \label{sec:parsing.proto}
31 For the purpose of Sod's parsing library, \emph{parsing} is the process of
32 reading a sequence of input items, in order, and computing an output value.
34 A \emph{parser} is an expression which consumes zero or more input items and
35 returns three values: a \emph{result}, a \emph{success flag}, and a
36 \emph{consumed flag}. The two flags are (generalized) booleans. If the
37 success flag is non-nil, then the parser is said to have \emph{succeeded},
38 and the result is the parser's output. If the success flag is nil then the
39 parser is said to have \emph{failed}, and the result is a list of
40 \emph{indicators}. Finally, the consumed flag is non-nil if the parser
41 consumed any input items.
43 \begin{describe}{fun}{combine-parser-failures @<failures> @> @<list>}
46 %%%--------------------------------------------------------------------------
47 \section{File locations} \label{sec:parsing.floc}
49 \begin{describe}{cls}{file-location}
52 \begin{describe}{fun}{file-location-p @<object> @> @<generalized-boolean>}
56 {make-file-location @<filename> \&optional @<line> @<column>
61 {\dhead{fun}{file-location-filename @<floc> @> @<string-or-nil>}
62 \dhead{fun}{file-location-line @<floc> @> @<fixnum-or-nil>}
63 \dhead{fun}{file-location-column @<floc> @> @<fixnum-or-nil>}}
66 \begin{describe}{gf}{file-location @<object> @> @<floc>}
67 \begin{describe}{meth}{file-location (@<floc> file-location) @> @<floc>}
69 \begin{describe}{meth}{file-location (@<stream> stream) @> @<floc>}
71 \begin{describe}{meth}{file-location (@<any> t) @> @<floc>}
75 \begin{describe}{cls}{condition-with-location (condition) \&key :location}
78 \begin{describe}{meth}
79 {file-location (@<condition> condition-with-location) @> @<floc>}
85 {error-with-location (condition-with-location error) \\ \>
88 {warning-with-location (condition-with-location warning) \\ \>
91 {enclosing-error-with-location
92 (enclosing-condition-with-location error) \\ \>
93 \&key :condition :location}
95 {enclosing-warning-with-location
96 (enclosing-condition-with-location warning) \\ \>
97 \&key :condition :location}
99 {simple-condition-with-location
100 (condition-with-location simple-condition) \\ \>
101 \&key :format-control :format-arguments :location}
103 {simple-error-with-location
104 (error-with-location simple-error) \\ \>
105 \&key :format-control :format-arguments :location}
107 {simple-warning-with-location
108 (warning-with-location simple-warning) \\ \>
109 \&key :format-control :format-arguments :location}}
112 \begin{describe}{fun}
113 {make-condition-with-location @<default-type> @<floc>
114 @<datum> \&rest @<arguments>
115 \nlret @<condition-with-location>}
119 {\dhead{fun}{error-with-location @<floc> @<datum> \&rest @<arguments>}
120 \dhead{fun}{cerror-with-location @<floc> @<continue-string>
121 @<datum> \&rest @<arguments>}
122 \dhead{fun}{cerror*-with-location @<floc> @<datum> \&rest @<arguments>}
123 \dhead{fun}{warn-with-location @<floc> @<datum> \&rest @<arguments>}}
126 \begin{describe}{mac}
127 {with-default-error-location (@<floc>) @<declaration>^* @<form>^*
131 \begin{describe}{mac}
132 {count-and-report-errors () @<declaration>^* @<form>^*
133 @> @<value> @<n-errors> @<n-warnings>}
136 %%%--------------------------------------------------------------------------
137 \section{Scanners} \label{sec:parsing.scanner}
139 A \emph{scanner} is an object which keeps track of a parser's progress as it
140 works through its input. There's no common base class for scanners: a
141 scanner is simply any object which implements the scanner protocol described
144 A scanner maintains a sequence of items to read. It can step forwards
145 through the items, one at a time, until it reaches the end (if, indeed, the
146 sequence is finite, which it needn't be). Until that point, there is a
147 current item, though there's no protocol for accessing it at this level
148 because the nature of the items is left unspecified.
150 Some scanners support an additional \emph{place-capture} protocol which
151 allows rewinding the scanner to an earlier point in the input so that it can
154 \subsection{Basic scanner protocol} \label{sec:parsing.scanner.basic}
156 The basic protocol supports stepping the scanner forward through its input
157 sequence, and detecting the end of the sequence.
159 \begin{describe}{gf}{scanner-step @<scanner>}
160 Advance the @<scanner> to the next item, which becomes current.
162 It is an error to step the scanner if the scanner is at end-of-file.
165 \begin{describe}{gf}{scanner-at-eof-p @<scanner> @> @<generalized-boolean>}
166 Return non-nil if the scanner is at end-of-file, i.e., there are no more
169 If nil is returned, there is a current item, and it is safe to step the
170 scanner again; otherwise, it is an error to query the current item or to
174 \subsection{Place-capture scanner protocol} \label{sec:parsing.scanner.place}
176 The place-capture protocol allows rewinding to an earlier point in the
177 sequence. Not all scanners support the place-capture protocol.
179 To rewind a scanner to a particular point, that point must be \emph{captured}
180 as a \emph{place} when it's current -- so you must know in advance that this
181 is an interesting place that's worth capturing. The type of place returned
182 depends on the type of scanner. Given a captured place, the scanner can be
183 rewound to the position held in it.
185 Depending on how the scanner works, holding onto a captured place might
186 consume a lot of memory or cause poor performance. For example, if the
187 scanner is reading from an input stream, having a captured place means that
188 data from that point on must be buffered in case the program needs to rewind
189 the scanner and read that data again. Therefore it's possible to
190 \emph{release} a place when it turns out not to be needed any more.
192 \begin{describe}{gf}{scanner-capture-place @<scanner> @> @<place>}
193 Capture the @<scanner>'s current position as a place, and return the place.
196 \begin{describe}{gf}{scanner-restore-place @<scanner> @<place>}
197 Rewind the @<scanner> to the state it was in when @<place> was captured.
198 In particular, the item that was current when the @<place> was captured
199 becomes current again.
201 It is an error to restore a @<place> that has been released, or if the
202 @<place> wasn't captured from the @<scanner>.
205 \begin{describe}{gf}{scanner-release-place @<scanner> @<place>}
206 Release the @<place>, to avoid having to maintain the ability to restore
207 it after it's not needed any more.
209 It is an error if the @<place> wasn't captured from the @<scanner>.
212 \begin{describe}{mac}
213 {with-scanner-place (@<place> @<scanner>) @<declaration>^* @<form>^*
215 Capture the @<scanner>'s current position as a place, and evaluate the
216 @<form>s as an implicit progn with the variable @<place> bound to the captured
217 place. When control leaves the @<form>s, the place is released. The return
218 values are the values of the final @<form>.
221 \subsection{Scanner file-location protocol} \label{sec:parsing.scanner.floc}
223 Some scanners participate in the file-location protocol
224 (\xref{sec:parsing.floc}). They implement a method on @|file-location| which
225 collects the necessary information using scanner-specific functions described
228 \begin{describe}{fun}{scanner-file-location @<scanner> @> @<file-location>}
229 Return a @|file-location| object describing the current position of the
232 This calls the @|scanner-filename|, @|scanner-line| and @|scanner-column|
233 generic functions on the scanner, and uses these to fill in an appropriate
236 Since there are default methods on these generic functions, it is not an
237 error to call @|scanner-file-location| on any kind of value, but it might
238 not be very useful. This function exists to do the work of appropriately
239 specialized methods on @|file-location|.
243 {\dhead{gf}{scanner-filename @<scanner> @> @<string>}
244 \dhead{gf}{scanner-line @<scanner> @> @<integer>}
245 \dhead{gf}{scanner-column @<scanner> @> @<integer>}}
246 Return the filename, line and column components of the @<scanner>'s current
247 position, for use in assembling a @<file-location>: see the
248 @|scanner-file-location| function.
250 There are default methods on all three generic functions which simply
254 \subsection{Character scanners} \label{sec:parsing.scanner.char}
256 Character scanners are scanners which read sequences of characters.
258 \begin{describe}{cls}{character-scanner () \&key}
259 Base class for character scanners. This provides some very basic
262 Not all character scanners are subclasses of @|character-scanner|.
265 \begin{describe}{gf}{scanner-current-char @<scanner> @> @<character>}
266 Returns the current character.
269 \begin{describe}{gf}{scanner-unread @<scanner> @<character>}
270 Rewind the @<scanner> by one step. The @<character> must be the previous
271 current character, and becomes the current character again. It is an error
272 if: the @<scanner> has reached end-of-file; the @<scanner> has never been
273 stepped; or @<character> was not the previous current character.
277 {scanner-interval @<scanner> @<place-a> \&optional @<place-b>
279 Return the characters in the @<scanner>'s input from @<place-a> up to (but
280 not including) @<place-b>.
282 The characters are returned as a string. If @<place-b> is omitted, return
283 the characters up to (but not including) the current position. It is an
284 error if @<place-b> precedes @<place-a> or they are from different
287 This function is a character-scanner-specific extension to the
288 place-capture protocol; not all character scanners implement the
289 place-capture protocol, and some that do may not implement this function.
292 \subsubsection{Stream access to character scanners}
293 Sometimes it can be useful to apply the standard Lisp character input
294 operations to the sequence of characters held by a character scanner.
296 \begin{describe}{gf}{make-scanner-stream @<scanner> @> @<stream>}
297 Returns a fresh input @|stream| object which fetches input characters from
298 the character scanner object @<scanner>. Reading characters from the
299 stream steps the scanner. The stream will reach end-of-file when the
300 scanner reports end-of-file. If the scanner implements the file-location
301 protocol then reading from the stream will change the file location in an
304 This is mostly useful for applying standard Lisp stream functions, most
305 particularly the @|read| function, in the middle of a parsing operation.
308 \begin{describe}{cls}{character-scanner-stream (stream) \&key :scanner}
309 A Common Lisp input @|stream| object which works using the character
310 scanner protocol. Any @<scanner> which implements the base scanner and
311 character scanner protocols is suitable. See @|make-scanner-stream|.
314 \subsection{String scanners} \label{sec:parsing.scanner.string}
316 A \emph{string scanner} is a simple kind of character scanner which reads
317 input from a string object. String scanners implement the character scanner
318 and place-capture protocols.
320 \begin{describe}{cls}{string-scanner}
321 The class of string scanners. The @|string-scanner| class is not a
322 subclass of @|character-scanner|.
325 \begin{describe}{fun}{string-scanner-p @<value> @> @<generalized-boolean>}
326 Return non-nil if @<value> is a @|string-scanner| object; otherwise return
330 \begin{describe}{fun}
331 {make-string-scanner @<string> \&key :start :end @> @<string-scanner>}
332 Construct and return a fresh @|string-scanner| object. The new scanner
333 will read characters from @<string>, starting at index @<start> (which
334 defaults to zero), and continuing until it reaches index @<end> (defaults
335 to the end of the @<string>).
338 \subsection{Character buffer scanners} \label{sec:parsing.scanner.charbuf}
340 A \emph{character buffer scanner}, or \emph{charbuf scanner} for short, is an
341 efficient scanner for reading characters from an input stream. Charbuf
342 scanners implement the basic scanner, character scanner, place-capture, and
343 file-location protocols.
345 \begin{describe}{cls}
346 {charbuf-scanner (character-scanner)
347 \&key :stream :filename :line :column}
348 The class of charbuf scanners. The scanner will read characters from
349 @<stream>. Charbuf scanners implement the file-location protocol: the
350 initial location is set from the given @<filename>, @<line> and @<column>;
351 the scanner will update the location as it reads its input.
354 \begin{describe}{cls}{charbuf-scanner-place}
355 The class of place objects captured by a charbuf scanner.
358 \begin{describe}{fun}
359 {charbuf-scanner-place-p @<value> @> @<generalized-boolean>}
360 Type predicate for charbuf scanner places: returns non-nil if @<value> is a
361 place captured by a charbuf scanner, and nil otherwise.
365 {charbuf-scanner-map @<scanner> @<func> \&optional @<fail>
366 \nlret @<result> @<successp> @<consumedp>}
367 Read characters from the @<scanner>'s buffers.
369 This is intended to be an efficient and versatile interface for reading
370 characters from a scanner in bulk. The function @<func> is invoked
373 (multiple-value-bind (@<donep> @<used>) \\ \ind\ind
374 (funcall @<func> @<buf> @<start> @<end>) \- \\
377 The argument @<buf> is a simple string; @<start> and @<end> are two
378 nonnegative fixnums, indicating that the subsequence of @<buf> between
379 @<start> (inclusive) and @<end> (exclusive) should be processed. If
380 @<func>'s return value @<donep> is nil then @<used> is ignored: the
381 function has consumed the entire buffer and wishes to read more. If
382 @<donep> is non-nil, then @<used> must be a fixnum such that $@<start> \le
383 @<used> \le @<end>$: the function has consumed the buffer as far as @<used>
384 (exclusive) and has completed successfully.
386 If end-of-file is encountered before @<func> completes successfully then it
387 fails: the @<fail> function is called with no arguments, and is expected to
388 return two values. If omitted, @<fail> defaults to
394 The @|charbuf-scanner-map| function returns three values. The first value
395 is the non-nil @<donep> value returned by @<func> if @|charbuf-scanner-map|
396 succeeded, or the first value returned by @<fail>; the second value is @|t|
397 on success, or the second value returned by @<fail>; the third value is
398 non-nil if @<func> consumed any input, i.e., it returned with @<donep> nil
399 at least once, or with $@<used> > @<start>$.
402 \subsection{Token scanners} \label{sec:parsing.scanner.token}
404 \begin{describe}{cls}
405 {token-scanner () \&key :filename (:line 1) (:column 0)}
408 \begin{describe}{gf}{token-type @<scanner> @> @<type>}
411 \begin{describe}{gf}{token-value @<scanner> @> @<value>}
414 \begin{describe}{gf}{scanner-token @<scanner> @> @<type> @<value>}
417 \begin{describe}{ty}{token-scanner-place}
420 \begin{describe}{fun}
421 {token-scanner-place-p @<value> @> @<generalized-boolean>}
424 \subsection{List scanners}
426 \begin{describe}{ty}{list-scanner}
429 \begin{describe}{fun}{list-scanner-p @<value> @> @<generalized-boolean>}
432 \begin{describe}{fun}{make-list-scanner @<list> @> @<list-scanner>}
435 %%%--------------------------------------------------------------------------
436 \section{Parsing syntax}
438 \begin{describe}{gf}{expand-parser-spec @<context> @<spec> @> @<form>}
442 {expand-parser-form @<context> @<head> @<tail> @> @<form>}
445 \begin{describe}{gf}{wrap-parser @<context> @<form> @> @<wrapped-form>}
448 \begin{describe}{mac}
449 {defparse @<name> (@[[ :context (@<var> @<context-class>) @]]
450 @<destructuring-lambda-list-item>^*) \\ \ind
451 @[[ @<declaration>^* @! @<doc-string> @]] \\
456 \begin{describe}{mac}
458 (@<context-class> @{ @<init-keyword> @<value> @}^*) \\ \ind
464 \begin{describe}{lmac}
465 {parse @<parser> @> @<result> @<success-flag> @<consumed-flag>}
468 \begin{describe}{mac}
469 {parser @<lambda-list>
470 @[[ @<declaration>^* @! @<doc-string> @]]
475 \begin{describe}{gf}{parser-at-eof-p @<context> @> @<form>}
478 \begin{describe}{gf}{parser-step @<context> @> @<form>}
481 \begin{describe}{sym}{it}
484 \begin{describe}{mac}
485 {if-parse (@[[ \=:result @<result-var> @!
486 :expected @<expected-var> @! \+ \\
487 :consumedp @<consumed-var> @]]) \- \\ \ind\ind
494 \begin{describe}{mac}
495 {when-parse (@[@<result-var>@]) @<parser> \\ \ind
500 \begin{describe}{mac}
501 {cond-parse (@[[ \=:result @<result-var> @!
502 :expected @<expected-var> @! \+ \\
503 :consumedp @<consumed-var> @]]) \- \\ \ind
504 @{ (@<parser> @<form>^*) @}^* \-
508 \begin{describe}{parse}{:eof}
511 \begin{describe}{parseform}{lisp @<form>^*}
514 \begin{describe}{parseform}{label @<parser>}
517 \begin{describe}{parse}{t}
520 \begin{describe}{parseform}{t @<value>}
523 \begin{describe}{parse}{nil}
526 \begin{describe}{parseform}{nil @<indicator>}
529 \begin{describe}{parseform}{when @<cond> @<parser>}
532 \begin{describe}{parseform}
533 {seq (@{ @<atomic-parser-spec> @! (@[@<var>@] @<parser>) @}^*) \\ \ind
537 \begin{describe}{parseform}{and @<parser>^*}
540 \begin{describe}{parseform}{or @<parser>^*}
543 \begin{describe}{parseform}{? @<parser> @[@<default>@]}
546 \begin{describe}{parseform}
547 {many (\=@<accumulator-var> @<init-form> @<update-form> \+ \\
548 @[[ \=:new @<new-var> @! :final @<final-form> @! \+ \\
549 :min @<minimum> @! :max @<maximum> @! \\
550 :commitp @<commitp> @]]) \-\- \\ \ind
551 @<item-parser> @[@<sep-parser>@]}
554 \begin{describe}{parseform}
555 {list (@[[ :min @<minimum> @! :max @<maximum> @!
556 :commitp @<commitp> @]])\\ \ind
557 @<item-parser> @[@<sep-parser>@]}
560 \begin{describe}{parseform}
561 {skip-many (@[[ :min @<minimum> @! :max @<maximum> @!
562 :commitp @<commitp> @]])\\ \ind
563 @<item-parser> @[@<sep-parser>@]}
566 \begin{describe}{fun}{call-pluggable-parser @<symbol> \&rest @<args>}
569 \begin{describe}{parseform}{plug @<symbol> @<arg>^*}
572 \begin{describe}{fun}
573 {pluggable-parser-add @<symbol> @<tag> @<parser-function>}
576 \begin{describe}{mac}
577 {define-pluggable-parser @<symbol> @<tag> @<lambda-list>
578 @[[ @<declaration>^* @! @<doc-string> @]]
582 \begin{describe}{gf}{parser-capture-place @<context> @> @<form>}
585 \begin{describe}{gf}{parser-restore-place @<context> @<place> @> @<form>}
588 \begin{describe}{gf}{parser-release-place @<context> @<place> @> @<form>}
592 {parser-places-must-be-released-p @<context> @> @<generalized-boolean>}
595 \begin{describe}{mac}
596 {with-parser-place (@<place-var> @<context>)
597 @[[ @<declaration>^* @! @<doc-string> @]]
601 \begin{describe}{parseform}{peek @<parser>}
604 \begin{describe}{parseform}{commit}
607 \begin{describe}{cls}{character-parser-context () \&key}
610 \begin{describe}{gf}{parser-current-char @<context> @> @<form>}
613 \begin{describe}{parseform}
614 {if-char (@[@<result-var>@]) @<condition> @<consequent> @<alternative>}
617 \begin{describe}{parseform}{char @<character>}
620 \begin{describe}[char]{parse}{@<character>}
623 \begin{describe}[string]{parse}{@<string>}
626 \begin{describe}{parse}{:any}
629 \begin{describe}{parseform}{satisfies @<predicate>}
632 \begin{describe}{parseform}{not @<character>}
635 \begin{describe}{parseform}{filter @<predicate>}
638 \begin{describe}{parse}{:whitespace}
641 \begin{describe}{cls}{token-parser-context () \&key}
644 \begin{describe}{gf}{parser-token-type @<context> @> @<form>}
647 \begin{describe}{gf}{parser-token-value @<context> @> @<form>}
650 \begin{describe}{parseform}{token @<type> @[@<value>@] @[:peekp @<peek>@]}
653 \begin{describe}[atom]{parse}{@<atom>}
656 \begin{describe}[string]{parse}{@<string>}
659 \begin{describe}{cls}{scanner-context () \&key :scanner}
662 \begin{describe}{gf}{parse-scanner @<context> @> @<symbol>}
665 \begin{describe}{cls}
666 {character-scanner-context (scanner-context character-parser-context)
670 \begin{describe}{cls}
671 {token-scanner-context (scanner-context token-parser-context)
675 \begin{describe}{gf}{push-operator @<operator> @<state>}
678 \begin{describe}{gf}{push-value @<value> @<state>}
681 \begin{describe}{gf}{apply-operator @<operator> @<state>}
684 \begin{describe}{gf}{operator-push-action @<left> @<right>}
687 \begin{describe}{parseform}
688 {expr \=(@[[ :nestedp @<nestedp-var> @]]) \+ \\
689 @<operand-parser> @<binop-parser>
690 @<preop-parser> @<postop-parser>}
693 \begin{describe}{gf}{operator-left-precedence @<operator> @> @<prec>}
696 \begin{describe}{gf}{operator-right-precedence @<operator> @> @<prec>}
699 \begin{describe}{gf}{operator-associativity @<operator> @> @<assoc>}
702 \begin{describe}{cls}{prefix-operator () \&key}
705 \begin{describe}{cls}{simple-operator () \&key :name :function}
708 \begin{describe}{cls}
709 {simple-unary-operator (simple-operator) \&key :name :function}
714 \dhead{cls}{simple-binary-operator (simple-operator) \\ \>
715 \&key :name :function :lprec :rprec :associativity}
716 \dhead{cls}{simple-postfix-operator (simple-unary-operator) \\ \>
717 \&key :name :function :lprec :rprec}
718 \dhead{cls}{simple-prefix-operator
719 (prefix-operator simple-unary-operator) \\ \>
720 \&key :name :function :rprec}}
724 {\dhead{mac}{preop @<name> (@<operand-var> @<lprec>)
725 @<declaration>^* @<form>^*
726 @> @<prefix-operator>}
727 \dhead{mac}{postop @<name>
728 (@<operand-var> @<lprec> @[[ :rprec @<rprec> @]])
729 @<declaration>^* @<form>^*
730 \nlret @<postfix-operator>}
731 \dhead{mac}{binop @<name> (@<operand-var> @<lprec> @<rprec> @<assoc>)
732 @<declaration>^*@<form>^*
733 @> @<binary-operator>}}
737 {\dhead{cls}{parenthesis () \&key :tag}
738 \dhead{cls}{open-parenthesis (parenthesis prefix-operator) \&key :tag}
739 \dhead{cls}{close-parenthesis (parenthesis) \&key :tag}}
743 {\dhead{fun}{lparen @<tag> @> @<open-paren>}
744 \dhead{fun}{rparen @<tag> @> @<close-paren>}}
747 %%%-------------------------------------------------------------------------
748 \section{Lexical analyser}
750 \begin{describe}{cls}
751 {sod-token-scanner (token-scanner)
752 \&key :filename (:line 1) (:column 0) :char-scanner}
755 \begin{describe}{fun}{define-indicator @<indicator> @<description>}
758 \begin{describe}{fun}{syntax-error @<scanner> @<expected> \&key :continuep}
761 \begin{describe}{fun}
762 {lexer-error @<char-scanner> @<expected> @<consumed-flag>}
765 \begin{describe}{parseform}
766 {skip-until (@[[ :keep-end @<keep-end-flag> @]]) @<token-type>^*}
769 \begin{describe}{parseform}{error () @<sub-parser> @<recover-parser>}
772 \begin{describe}{fun}
773 {scan-comment @<char-scanner>
774 @> @<result> @<success-flag> @<consumed-flag>}
777 %%%----- That's all, folks --------------------------------------------------
781 %%% TeX-master: "sod.tex"