3 %%% Description of the parsing machinery
5 %%% (c) 2015 Straylight/Edgeware
8 %%%----- Licensing notice ---------------------------------------------------
10 %%% This file is part of the Sensible Object Design, an object system for C.
12 %%% SOD is free software; you can redistribute it and/or modify
13 %%% it under the terms of the GNU General Public License as published by
14 %%% the Free Software Foundation; either version 2 of the License, or
15 %%% (at your option) any later version.
17 %%% SOD is distributed in the hope that it will be useful,
18 %%% but WITHOUT ANY WARRANTY; without even the implied warranty of
19 %%% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 %%% GNU General Public License for more details.
22 %%% You should have received a copy of the GNU General Public License
23 %%% along with SOD; if not, write to the Free Software Foundation,
24 %%% Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
26 \chapter{Parsing} \label{ch:parsing}
28 %%%--------------------------------------------------------------------------
29 \section{The parser protocol} \label{sec:parsing.proto}
31 For the purpose of Sod's parsing library, \emph{parsing} is the process of
32 reading a sequence of input items, in order, and computing an output value.
34 A \emph{parser} is an expression which consumes zero or more input items and
35 returns three values: a \emph{result}, a \emph{success flag}, and a
36 \emph{consumed flag}. The two flags are (generalized) booleans. If the
37 success flag is non-nil, then the parser is said to have \emph{succeeded},
38 and the result is the parser's output. If the success flag is nil then the
39 parser is said to have \emph{failed}, and the result is a list of
40 \emph{indicators}. Finally, the consumed flag is non-nil if the parser
41 consumed any input items.
43 \begin{describe}{fun}{combine-parser-failures @<failures> @> @<list>}
46 \begin{describe}{fun}{parse-empty \&optional @<value> @> @<function>}
50 {parse-fail @<indicator> \&optional @<consumedp> @> @<function>}
53 %%%--------------------------------------------------------------------------
54 \section{File locations} \label{sec:parsing.floc}
56 \begin{describe}{cls}{file-location}
59 \begin{describe}{fun}{file-location-p @<object> @> @<generalized-boolean>}
63 {make-file-location @<filename> \&optional @<line> @<column>
68 {\dhead{fun}{file-location-filename @<floc> @> @<string-or-nil>}
69 \dhead{fun}{file-location-line @<floc> @> @<fixnum-or-nil>}
70 \dhead{fun}{file-location-column @<floc> @> @<fixnum-or-nil>}}
73 \begin{describe}{gf}{file-location @<object> @> @<floc>}
74 \begin{describe}{meth}{file-location (@<floc> file-location) @> @<floc>}
76 \begin{describe}{meth}{file-location (@<stream> stream) @> @<floc>}
78 \begin{describe}{meth}{file-location (@<any> t) @> @<floc>}
82 \begin{describe}{cls}{condition-with-location (condition) \&key :location}
85 \begin{describe}{meth}
86 {file-location (@<condition> condition-with-location) @> @<floc>}
92 {error-with-location (condition-with-location error) \\ \>
95 {warning-with-location (condition-with-location warning) \\ \>
98 {enclosing-error-with-location
99 (enclosing-condition-with-location error) \\ \>
100 \&key :condition :location}
102 {enclosing-warning-with-location
103 (enclosing-condition-with-location warning) \\ \>
104 \&key :condition :location}
106 {simple-condition-with-location
107 (condition-with-location simple-condition) \\ \>
108 \&key :format-control :format-arguments :location}
110 {simple-error-with-location
111 (error-with-location simple-error) \\ \>
112 \&key :format-control :format-arguments :location}
114 {simple-warning-with-location
115 (warning-with-location simple-warning) \\ \>
116 \&key :format-control :format-arguments :location}}
119 \begin{describe}{fun}
120 {make-condition-with-location @<default-type> @<floc>
121 @<datum> \&rest @<arguments>
122 \nlret @<condition-with-location>}
126 {\dhead{fun}{error-with-location @<floc> @<datum> \&rest @<arguments>}
127 \dhead{fun}{cerror-with-location @<floc> @<continue-string>
128 @<datum> \&rest @<arguments>}
129 \dhead{fun}{cerror*-with-location @<floc> @<datum> \&rest @<arguments>}
130 \dhead{fun}{warn-with-location @<floc> @<datum> \&rest @<arguments>}}
133 \begin{describe}{mac}
134 {with-default-error-location (@<floc>) @<declaration>^* @<form>^*
138 \begin{describe}{mac}
139 {count-and-report-errors () @<declaration>^* @<form>^*
140 @> @<value> @<n-errors> @<n-warnings>}
143 %%%--------------------------------------------------------------------------
144 \section{Scanners} \label{sec:parsing.scanner}
146 A \emph{scanner} is an object which keeps track of a parser's progress as it
147 works through its input. There's no common base class for scanners: a
148 scanner is simply any object which implements the scanner protocol described
151 A scanner maintains a sequence of items to read. It can step forwards
152 through the items, one at a time, until it reaches the end (if, indeed, the
153 sequence is finite, which it needn't be). Until that point, there is a
154 current item, though there's no protocol for accessing it at this level
155 because the nature of the items is left unspecified.
157 Some scanners support an additional \emph{place-capture} protocol which
158 allows rewinding the scanner to an earlier point in the input so that it can
161 \subsection{Basic scanner protocol} \label{sec:parsing.scanner.basic}
163 The basic protocol supports stepping the scanner forward through its input
164 sequence, and detecting the end of the sequence.
166 \begin{describe}{gf}{scanner-step @<scanner>}
167 Advance the @<scanner> to the next item, which becomes current.
169 It is an error to step the scanner if the scanner is at end-of-file.
172 \begin{describe}{gf}{scanner-at-eof-p @<scanner> @> @<generalized-boolean>}
173 Return non-nil if the scanner is at end-of-file, i.e., there are no more
176 If nil is returned, there is a current item, and it is safe to step the
177 scanner again; otherwise, it is an error to query the current item or to
181 \subsection{Place-capture scanner protocol} \label{sec:parsing.scanner.place}
183 The place-capture protocol allows rewinding to an earlier point in the
184 sequence. Not all scanners support the place-capture protocol.
186 To rewind a scanner to a particular point, that point must be \emph{captured}
187 as a \emph{place} when it's current -- so you must know in advance that this
188 is an interesting place that's worth capturing. The type of place returned
189 depends on the type of scanner. Given a captured place, the scanner can be
190 rewound to the position held in it.
192 Depending on how the scanner works, holding onto a captured place might
193 consume a lot of memory or cause poor performance. For example, if the
194 scanner is reading from an input stream, having a captured place means that
195 data from that point on must be buffered in case the program needs to rewind
196 the scanner and read that data again. Therefore it's possible to
197 \emph{release} a place when it turns out not to be needed any more.
199 \begin{describe}{gf}{scanner-capture-place @<scanner> @> @<place>}
200 Capture the @<scanner>'s current position as a place, and return the place.
203 \begin{describe}{gf}{scanner-restore-place @<scanner> @<place>}
204 Rewind the @<scanner> to the state it was in when @<place> was captured.
205 In particular, the item that was current when the @<place> was captured
206 becomes current again.
208 It is an error to restore a @<place> that has been released, or if the
209 @<place> wasn't captured from the @<scanner>.
212 \begin{describe}{gf}{scanner-release-place @<scanner> @<place>}
213 Release the @<place>, to avoid having to maintain the ability to restore
214 it after it's not needed any more.
216 It is an error if the @<place> wasn't captured from the @<scanner>.
219 \begin{describe}{mac}
220 {with-scanner-place (@<place> @<scanner>) @<declarations>^* @<form>^*
222 Capture the @<scanner>'s current position as a place, evaluate the @<form>s
223 as an implicit progn with the variable @<place> bound to the captured
224 place. When control leaves the @<form>s, the place is released. The
225 return values are the values of the final @<form>.
228 \subsection{Scanner file-location protocol} \label{sec:parsing.scanner.floc}
230 Some scanners participate in the file-location protocol
231 (\xref{sec:parsing.floc}). They implement a method on @|file-location| which
232 collects the necessary information using scanner-specific functions described
235 \begin{describe}{fun}{scanner-file-location @<scanner> @> @<file-location>}
236 Return a @|file-location| object describing the current position of the
239 This calls the @|scanner-filename|, @|scanner-line| and @|scanner-column|
240 generic functions on the scanner, and uses these to fill in an appropriate
243 Since there are default methods on these generic functions, it is not an
244 error to call @|scanner-file-location| on any kind of value, but it might
245 not be very useful. This function exists to do the work of appropriately
246 specialized methods on @|file-location|.
250 {\dhead{gf}{scanner-filename @<scanner> @> @<string>}
251 \dhead{gf}{scanner-line @<scanner> @> @<integer>}
252 \dhead{gf}{scanner-column @<scanner> @> @<integer>}}
253 Return the filename, line and column components of the @<scanner>'s current
254 position, for use in assembling a @<file-location>: see the
255 @|scanner-file-location| function.
257 There are default methods on all three generic functions which simply
261 \subsection{Character scanners} \label{sec:parsing.scanner.char}
263 Character scanners are scanners which read sequences of characters.
265 \begin{describe}{cls}{character-scanner () \&key}
266 Base class for character scanners. This provides some very basic
269 Not all character scanners are subclasses of @|character-scanner|.
272 \begin{describe}{gf}{scanner-current-char @<scanner> @> @<character>}
273 Returns the current character.
276 \begin{describe}{gf}{scanner-unread @<scanner> @<character>}
277 Rewind the @<scanner> by one step. The @<character> must be the previous
278 current character, and becomes the current character again. It is an error
279 if: the @<scanner> has reached end-of-file; the @<scanner> has never been
280 stepped; or @<character> was not the previous current character.
284 {scanner-interval @<scanner> @<place-a> \&optional @<place-b>
286 Return the characters in the @<scanner>'s input from @<place-a> up to (but
287 not including) @<place-b>.
289 The characters are returned as a string. If @<place-b> is omitted, return
290 the characters up to (but not including) the current position. It is an
291 error if @<place-b> precedes @<place-a> or they are from different
294 This function is a character-scanner-specific extension to the
295 place-capture protocol; not all character scanners implement the
296 place-capture protocol, and some that do may not implement this function.
299 \subsubsection{Stream access to character scanners}
300 Sometimes it can be useful to apply the standard Lisp character input
301 operations to the sequence of characters held by a character scanner.
303 \begin{describe}{gf}{make-scanner-stream @<scanner> @> @<stream>}
304 Returns a fresh input @|stream| object which fetches input characters from
305 the character scanner object @<scanner>. Reading characters from the
306 stream steps the scanner. The stream will reach end-of-file when the
307 scanner reports end-of-file. If the scanner implements the file-location
308 protocol then reading from the stream will change the file location in an
311 This is mostly useful for applying standard Lisp stream functions, most
312 particularly the @|read| function, in the middle of a parsing operation.
315 \begin{describe}{cls}{character-scanner-stream (stream) \&key :scanner}
316 A Common Lisp input @|stream| object which works using the character
317 scanner protocol. Any @<scanner> which implements the base scanner and
318 character scanner protocols is suitable. See @|make-scanner-stream|.
321 \subsection{String scanners} \label{sec:parsing.scanner.string}
323 A \emph{string scanner} is a simple kind of character scanner which reads
324 input from a string object. String scanners implement the character scanner
325 and place-capture protocols.
327 \begin{describe}{cls}{string-scanner}
328 The class of string scanners. The @|string-scanner| class is not a
329 subclass of @|character-scanner|.
332 \begin{describe}{fun}{string-scanner-p @<value> @> @<generalized-boolean>}
333 Return non-nil if @<value> is a @|string-scanner| object; otherwise return
337 \begin{describe}{fun}
338 {make-string-scanner @<string> \&key :start :end @> @<string-scanner>}
339 Construct and return a fresh @|string-scanner| object. The new scanner
340 will read characters from @<string>, starting at index @<start> (which
341 defaults to zero), and continuing until it reaches index @<end> (defaults
342 to the end of the @<string>).
345 \subsection{Character buffer scanners} \label{sec:parsing.scanner.charbuf}
347 A \emph{character buffer scanner}, or \emph{charbuf scanner} for short, is an
348 efficient scanner for reading characters from an input stream. Charbuf
349 scanners implement the basic scanner, character buffer, place-capture, and
350 file-location protocols.
352 \begin{describe}{cls}
353 {charbuf-scanner (character-scanner)
354 \&key :stream :filename :line :column}
355 The class of charbuf scanners. The scanner will read characters from
356 @<stream>. Charbuf scanners implement the file-location protocol: the
357 initial location is set from the given @<filename>, @<line> and @<column>;
358 the scanner will update the location as it reads its input.
361 \begin{describe}{cls}{charbuf-scanner-place}
362 The class of place objects captured by a charbuf scanner.
365 \begin{describe}{fun}
366 {charbuf-scanner-place-p @<value> @> @<generalized-boolean>}
367 Type predicate for charbuf scanner places: returns non-nil if @<value> is a
368 place captured by a charbuf scanner, and nil otherwise.
372 {charbuf-scanner-map @<scanner> @<func> \&optional @<fail>
373 \nlret @<result> @<successp> @<consumedp>}
374 Read characters from the @<scanner>'s buffers.
376 This is intended to be an efficient and versatile interface for reading
377 characters from a scanner in bulk. The function @<func> is invoked
380 (multiple-value-bind (@<donep> @<used>) \\ \ind\ind
381 (funcall @<func> @<buf> @<start> @<end>) \-\\
384 The argument @<buf> is a simple string; @<start> and @<end> are two
385 nonnegative fixnums, indicating that the subsequence of @<buf> between
386 @<start> (inclusive) and @<end> (exclusive) should be processed. If
387 @<func>'s return value @<donep> is nil then @<used> is ignored: the
388 function has consumed the entire buffer and wishes to read more. If
389 @<donep> is non-nil, then @<used> must be a fixnum such that $@<start> \le
390 @<used> \le @<end>$: the function has consumed the buffer as far as @<used>
391 (exclusive) and has completed successfully.
393 If end-of-file is encountered before @<func> completes successfully then it
394 fails: the @<fail> function is called with no arguments, and is expected to
395 return two values. If omitted, @<fail> defaults to
401 The @|charbuf-scanner-map| function returns three values. The first value
402 is the non-nil @<donep> value returned by @<func> if @|charbuf-scanner-map|
403 succeeded, or the first value returned by @<fail>; the second value is @|t|
404 on success, or the second value returned by @<fail>; the third value is
405 non-nil if @<func> consumed any input, i.e., it returned with @<donep> nil
406 at least once, or with $@<used> > @<start>$.
409 \subsection{Token scanners} \label{sec:parsing.scanner.token}
411 \begin{describe}{cls}
412 {token-scanner () \&key :filename (:line 1) (:column 0)}
415 \begin{describe}{gf}{token-type @<scanner> @> @<type>}
418 \begin{describe}{gf}{token-value @<scanner> @> @<value>}
421 \begin{describe}{gf}{scanner-token @<scanner> @> @<type> @<value>}
424 \begin{describe}{ty}{token-scanner-place}
427 \begin{describe}{fun}
428 {token-scanner-place-p @<value> @> @<generalized-boolean>}
431 \subsection{List scanners}
433 \begin{describe}{ty}{list-scanner}
436 \begin{describe}{fun}{list-scanner-p @<value> @> @<generalized-boolean>}
439 \begin{describe}{fun}{make-list-scanner @<list> @> @<list-scanner>}
442 %%%--------------------------------------------------------------------------
443 \section{Parsing syntax}
445 \begin{describe}{gf}{expand-parser-spec @<context> @<spec> @> @<form>}
449 {expand-parser-form @<context> @<head> @<tail> @> @<form>}
452 \begin{describe}{gf}{wrap-parser @<context> @<form> @> @<wrapped-form>}
455 \begin{describe}{mac}
456 {defparse @<name> (@[[ :context (@<var> @<context-class>) @]]
457 @<destructuring-lambda-list-item>^*) \\ \ind
458 @[[ @<declaration>^* @! @<doc-string> @]] \\
463 \begin{describe}{mac}
465 (@<context-class> @{ @<init-keyword> @<value> @}^*) \\ \ind
471 \begin{describe}{lmac}
472 {parse @<parser> @> @<result> @<success-flag> @<consumed-flag>}
475 \begin{describe}{mac}
476 {parser @<lambda-list>
477 @[[ @<declaration>^* @! @<doc-string> @]]
482 \begin{describe}{gf}{parser-at-eof-p @<context> @> @<form>}
485 \begin{describe}{gf}{parser-step @<context> @> @<form>}
488 \begin{describe}{sym}{it}
491 \begin{describe}{mac}
492 {if-parse (@[[ \=:result @<result-var> @!
493 :expected @<expected-var> @! \+\\
494 :consumedp @<consumed-var> @]]) \-\\ \ind\ind
501 \begin{describe}{mac}
502 {when-parse (@[@<result-var>@]) @<parser> \\ \ind
507 \begin{describe}{mac}
508 {cond-parse (@[[ \=:result @<result-var> @!
509 :expected @<expected-var> @! \+\\
510 :consumedp @<consumed-var> @]]) \-\\ \ind
511 @{ (@<parser> @<form>^*) @}^*
515 \begin{describe}{parse}{:eof}
518 \begin{describe}{parseform}{lisp @<form>^*}
521 \begin{describe}{parseform}{label @<parser>}
524 \begin{describe}{parse}{t}
527 \begin{describe}{parseform}{t @<value>}
530 \begin{describe}{parse}{nil}
533 \begin{describe}{parseform}{nil @<indicator>}
536 \begin{describe}{parseform}{when @<cond> @<parser>}
539 \begin{describe}{parseform}
540 {seq (@{ @<atomic-parser-spec> @!
541 (@[@<var>@] @<parser>) @}^*) \\ \ind
545 \begin{describe}{parseform}{and @<parser>^*}
548 \begin{describe}{parseform}{or @<parser>^*}
551 \begin{describe}{parseform}{? @<parser> @[@<default>@]}
554 \begin{describe}{parseform}
555 {many (\=@<accumulator-var> @<init-form> @<update-form> \+\\
556 @[[ \=:new @<new-var> @! :final @<final-form> @! \+\\
557 :min @<minimum> @! :max @<maximum> @! \\
558 :commitp @<commitp> @]]) \-\-\\ \ind
559 @<item-parser> @[@<sep-parser>@]}
562 \begin{describe}{parseform}
563 {list (@[[ :min @<minimum> @! :max @<maximum> @!
564 :commitp @<commitp> @]]) \\ \ind
565 @<item-parser> @[@<sep-parser>@]}
568 \begin{describe}{parseform}
569 {skip-many (@[[ :min @<minimum> @! :max @<maximum> @!
570 :commitp @<commitp> @]]) \\ \ind
571 @<item-parser> @[@<sep-parser>@]}
574 \begin{describe}{fun}{call-pluggable-parser @<symbol> \&rest @<args>}
577 \begin{describe}{parseform}{plug @<symbol> @<arg>^*}
580 \begin{describe}{fun}
581 {pluggable-parser-add @<symbol> @<tag> @<parser-function>}
584 \begin{describe}{mac}
585 {define-pluggable-parser @<symbol> @<tag> @<lambda-list>
586 @[[ @<declaration>^* @! @<doc-string> @]]
590 \begin{describe}{gf}{parser-capture-place @<context> @> @<form>}
593 \begin{describe}{gf}{parser-restore-place @<context> @<place> @> @<form>}
596 \begin{describe}{gf}{parser-release-place @<context> @<place> @> @<form>}
600 {parser-places-must-be-released-p @<context> @> @<generalized-boolean>}
603 \begin{describe}{mac}
604 {with-parser-place (@<place-var> @<context>)
605 @[[ @<declaration>^* @! @<doc-string> @]]
609 \begin{describe}{parseform}{peek @<parser>}
612 \begin{describe}{parseform}{commit}
615 \begin{describe}{cls}{character-parser-context () \&key}
618 \begin{describe}{gf}{parser-current-char @<context> @> @<form>}
621 \begin{describe}{parseform}
622 {if-char (@[@<result-var>@]) @<condition> @<consequent> @<alternative>}
625 \begin{describe}{parseform}{char @<character>}
628 \begin{describe}[char]{parse}{@<character>}
631 \begin{describe}[string]{parse}{@<string>}
634 \begin{describe}{parse}{:any}
637 \begin{describe}{parseform}{satisfies @<predicate>}
640 \begin{describe}{parseform}{not @<character>}
643 \begin{describe}{parseform}{filter @<predicate>}
646 \begin{describe}{parse}{:whitespace}
649 \begin{describe}{cls}{token-parser-context () \&key}
652 \begin{describe}{gf}{parser-token-type @<context> @> @<form>}
655 \begin{describe}{gf}{parser-token-value @<context> @> @<form>}
658 \begin{describe}{parseform}{token @<type> @[@<value>@] @[:peekp @<peek>@]}
661 \begin{describe}[atom]{parse}{@<atom>}
664 \begin{describe}[string]{parse}{@<string>}
667 \begin{describe}{cls}{scanner-context () \&key :scanner}
670 \begin{describe}{gf}{parse-scanner @<context> @> @<symbol>}
673 \begin{describe}{cls}
674 {character-scanner-context (scanner-context character-parser-context)
678 \begin{describe}{cls}
679 {token-scanner-context (scanner-context token-parser-context)
683 \begin{describe}{gf}{push-operator @<operator> @<state>}
686 \begin{describe}{gf}{push-value @<value> @<state>}
689 \begin{describe}{gf}{apply-operator @<operator> @<state>}
692 \begin{describe}{gf}{operator-push-action @<left> @<right>}
695 \begin{describe}{parseform}
696 {expr \=(@[[ :nestedp @<nestedp-var> @]]) \+\\
697 @<operand-parser> @<binop-parser>
698 @<preop-parser> @<postop-parser>}
701 \begin{describe}{gf}{operator-left-precedence @<operator> @> @<prec>}
704 \begin{describe}{gf}{operator-right-precedence @<operator> @> @<prec>}
707 \begin{describe}{gf}{operator-associativity @<operator> @> @<assoc>}
710 \begin{describe}{cls}{prefix-operator () \&key}
713 \begin{describe}{cls}{simple-operator () \&key :name :function}
716 \begin{describe}{cls}
717 {simple-unary-operator (simple-operator) \&key :name :function}
722 \dhead{cls}{simple-binary-operator (simple-operator) \\ \>
723 \&key :name :function
724 :lprec :rprec :associativity}
725 \dhead{cls}{simple-postfix-operator (simple-unary-operator) \\ \>
726 \&key :name :function :lprec :rprec}
727 \dhead{cls}{simple-prefix-operator
728 (prefix-operator simple-unary-operator) \\ \>
729 \&key :name :function :rprec}}
733 {\dhead{mac}{preop @<name> (@<operand-var> @<lprec>)
734 @<declaration>^* @<form>^*
735 @> @<prefix-operator>}
736 \dhead{mac}{postop @<name>
737 (@<operand-var> @<lprec> @[[ :rprec @<rprec> @]])
738 @<declaration>^* @<form>^*
739 \nlret @<postfix-operator>}
740 \dhead{mac}{binop @<name> (@<operand-var> @<lprec> @<rprec> @<assoc>)
741 @<declaration>^* @<form>^*
742 @> @<binary-operator>}}
746 {\dhead{cls}{parenthesis () \&key :tag}
747 \dhead{cls}{open-parenthesis (parenthesis prefix-operator) \&key :tag}
748 \dhead{cls}{close-parenthesis (parenthesis) \&key :tag}}
752 {\dhead{fun}{lparen @<tag> @> @<open-paren>}
753 \dhead{fun}{rparen @<tag> @> @<close-paren>}}
756 %%%-------------------------------------------------------------------------
757 \section{Lexical analyser}
759 \begin{describe}{cls}
760 {sod-token-scanner (token-scanner)
761 \&key :filename (:line 1) (:column 0) :char-scanner}
764 \begin{describe}{fun}{define-indicator @<indicator> @<description>}
767 \begin{describe}{fun}{syntax-error @<scanner> @<expected> \&key :continuep}
770 \begin{describe}{fun}
771 {lexer-error @<char-scanner> @<expected>}
774 \begin{describe}{parseform}
775 {skip-until (@[[ :keep-end @<keep-end-flag> @]]) @<token-type>^*}
778 \begin{describe}{parseform}
779 {error (@[[ :ignore-unconsumed @<flag> @]]) \\ \ind
780 @<sub-parser> @<recover-parser>}
783 \begin{describe}{fun}
784 {scan-comment @<char-scanner>
785 @> @<result> @<success-flag> @<consumed-flag>}
788 %%%----- That's all, folks --------------------------------------------------
792 %%% TeX-master: "sod.tex"