;;;----- Licensing notice ---------------------------------------------------
;;;
-;;; This file is part of the Sensble Object Design, an object system for C.
+;;; This file is part of the Sensible Object Design, an object system for C.
;;;
;;; SOD is free software; you can redistribute it and/or modify
;;; it under the terms of the GNU General Public License as published by
(cl:in-package #:sod)
;;;--------------------------------------------------------------------------
-;;; Basic lexical analyser.
+;;; Class implementation.
-(defstruct (pushed-token
- (:constructor make-pushed-token (type value location)))
- "A token that has been pushed back into a lexer for later processing."
- type value location)
+(defmethod shared-initialize :after
+ ((scanner sod-token-scanner) slot-names &key)
+ (default-slot (scanner 'sod-parser::filename slot-names)
+ (scanner-filename (token-scanner-char-scanner scanner))))
-;;; Class definition.
-
-(export 'basic-lexer)
-(defclass basic-lexer ()
- ((stream :initarg :stream :type stream :reader lexer-stream)
- (char :initform nil :type (or character null) :reader lexer-char)
- (pushback-chars :initform nil :type list)
- (token-type :initform nil :accessor token-type)
- (token-value :initform nil :accessor token-value)
- (location :initform nil :reader file-location)
- (pushback-tokens :initform nil :type list))
- (:documentation
- "Base class for lexical analysers.
-
- The lexer reads characters from STREAM, which, for best results, wants to
- be a `position-aware-input-stream'.
-
- The lexer provides one-character lookahead by default: the current
- lookahead character is available to subclasses in the slot CHAR. Before
- beginning lexical analysis, the lookahead character needs to be
- established with `next-char'. If one-character lookahead is insufficient,
- the analyser can push back an arbitrary number of characters using
- `pushback-char'.
-
- The `next-token' function scans and returns the next token from the
- STREAM, and makes it available as TOKEN-TYPE and TOKEN-VALUE, providing
- one-token lookahead. A parser using the lexical analyser can push back
- tokens using `pushback-tokens'.
-
- For convenience, the lexer implements a `file-location' method (delegated
- to the underlying stream)."))
-
-;;; Reading and pushing back characters.
-
-(defmethod next-char ((lexer basic-lexer))
- (with-slots (stream char pushback-chars) lexer
- (setf char (if pushback-chars
- (pop pushback-chars)
- (read-char stream nil)))))
-
-(defmethod pushback-char ((lexer basic-lexer) new-char)
- (with-slots (char pushback-chars) lexer
- (push char pushback-chars)
- (setf char new-char)))
-
-(defmethod fixup-stream* ((lexer basic-lexer) thunk)
- (with-slots (stream char pushback-chars) lexer
- (when pushback-chars
- (error "Lexer has pushed-back characters."))
- (when (slot-boundp lexer 'char)
- (unread-char char stream))
- (unwind-protect
- (funcall thunk stream)
- (setf char (read-char stream nil)))))
-
-;;; Reading and pushing back tokens.
-
-(defmethod next-token :around ((lexer basic-lexer))
- (unless (slot-boundp lexer 'char)
- (next-char lexer)))
-
-(defmethod next-token ((lexer basic-lexer))
- (with-slots (pushback-tokens token-type token-value location) lexer
- (setf (values token-type token-value)
- (if pushback-tokens
- (let ((pushback (pop pushback-tokens)))
- (setf location (pushed-token-location pushback))
- (values (pushed-token-type pushback)
- (pushed-token-value pushback)))
- (scan-token lexer)))))
-
-(defmethod scan-token :around ((lexer basic-lexer))
- (with-default-error-location (lexer)
- (call-next-method)))
-
-(defmethod pushback-token ((lexer basic-lexer) new-token-type
- &optional new-token-value new-location)
- (with-slots (pushback-tokens token-type token-value location) lexer
- (push (make-pushed-token token-type token-value location)
- pushback-tokens)
- (when new-location (setf location new-location))
- (setf token-type new-token-type
- token-value new-token-value)))
-
-;;; Utilities.
-
-(defmethod skip-spaces ((lexer basic-lexer))
- (do ((ch (lexer-char lexer) (next-char lexer)))
- ((not (whitespace-char-p ch)) ch)))
+(defmethod make-scanner-stream ((scanner sod-token-scanner))
+ (make-scanner-stream (token-scanner-char-scanner scanner)))
;;;--------------------------------------------------------------------------
-;;; Our main lexer.
-
-(export 'sod-lexer)
-(defclass sod-lexer (basic-lexer)
- ()
- (:documentation
- "Lexical analyser for the SOD lanuage.
-
- See the `lexer' class for the gory details about the lexer protocol."))
-
-(defmethod scan-token ((lexer sod-lexer))
- (with-slots (stream char keywords location) lexer
- (prog (ch)
+;;; Indicators and error messages.
+
+(defun show-char (stream char &optional colonp atsignp)
+ "Format CHAR to STREAM in a readable way.
+
+ Usable in `format''s ~/.../ command."
+ (declare (ignore colonp atsignp))
+ (cond ((null char) (write-string "<eof>" stream))
+ ((and (graphic-char-p char) (char/= char #\space))
+ (format stream "`~C'" char))
+ (t (format stream "<~(~:C~)>" char))))
+
+(defun skip-until (scanner token-types &key keep-end)
+ "This is the implementation of the `skip-until' parser."
+ (do ((consumedp nil t))
+ ((member (token-type scanner) token-types)
+ (unless keep-end (scanner-step scanner))
+ (values nil t (or keep-end consumedp)))
+ (when (scanner-at-eof-p scanner)
+ (return (values token-types nil consumedp)))
+ (scanner-step scanner)))
+
+(defun parse-error-recover (scanner parser recover &key ignore-unconsumed)
+ "This is the implementation of the `error' parser."
+ (multiple-value-bind (result win consumedp) (funcall parser)
+ (cond ((or win
+ (and (not consumedp)
+ (or ignore-unconsumed
+ (scanner-at-eof-p scanner))))
+ ;; If we succeeded, or if we didn't consume any tokens and the
+ ;; caller's OK with that, then there's nothing for us to do here.
+ ;; On the other hand, if we failed, didn't consume any tokens, and
+ ;; we're at end-of-file, then there's not much hope of making
+ ;; onward progress, so in this case we propagate the failure
+ ;; rather than trying to recover. And we assume that the
+ ;; continuation will somehow arrange to report the problem, and
+ ;; avoid inundating the user with error reports.
+ (values result win consumedp))
+ (t
+ ;; Now we have to do some kind of sensible error recovery. The
+ ;; important thing to do here is to make sure that we make some
+ ;; progress. If we consumed any tokens then we're fine, and we'll
+ ;; just try the provided recovery strategy. Otherwise, if we're
+ ;; not at EOF, then we can ensure progress by discarding the
+ ;; current token. Finally, if we are at EOF then our best bet is
+ ;; simply to propagate the current failure back to the caller, but
+ ;; we handled that case above.
+ (syntax-error scanner result :continuep t)
+ (unless consumedp (scanner-step scanner))
+ (funcall recover)))))
- consider
-
- ;; Stash the position of this token so that we can report it later.
- (setf ch (skip-spaces lexer)
- location (file-location stream))
-
- ;; Now work out what it is that we're dealing with.
- (cond
-
- ;; End-of-file brings its own peculiar joy.
- ((null ch) (return (values :eof t)))
-
- ;; Strings.
- ((or (char= ch #\") (char= ch #\'))
- (let* ((quote ch)
- (string
- (with-output-to-string (out)
- (loop
- (flet ((getch ()
- (setf ch (next-char lexer))
- (when (null ch)
- (cerror* "Unexpected end of file in ~
- ~:[string~;character~] constant"
- (char= quote #\'))
- (return))))
- (getch)
- (cond ((char= ch quote) (return))
- ((char= ch #\\) (getch)))
- (write-char ch out))))))
- (setf ch (next-char lexer))
- (ecase quote
- (#\" (return (values :string string)))
- (#\' (case (length string)
- (0 (cerror* "Empty character constant")
- (return (values :char #\?)))
- (1 (return (values :char (char string 0))))
- (t (cerror* "Multiple characters in character constant")
- (return (values :char (char string 0)))))))))
-
- ;; Pick out identifiers and keywords.
- ((or (alpha-char-p ch) (char= ch #\_))
-
- ;; Scan a sequence of alphanumerics and underscores. We could
- ;; allow more interesting identifiers, but it would damage our C
- ;; lexical compatibility.
- (let ((id (with-output-to-string (out)
- (loop
- (write-char ch out)
- (setf ch (next-char lexer))
- (when (or (null ch)
- (not (or (alphanumericp ch)
- (char= ch #\_))))
- (return))))))
-
- ;; Done.
- (return (values :id id))))
-
- ;; Pick out numbers. Currently only integers, but we support
- ;; multiple bases.
- ((digit-char-p ch)
-
- ;; Sort out the prefix. If we're looking at `0b', `0o' or `0x'
- ;; (maybe uppercase) then we've got a funny radix to deal with.
- ;; Otherwise, a leading zero signifies octal (daft, I know), else
- ;; we're left with decimal.
- (multiple-value-bind (radix skip-char)
- (if (char/= ch #\0)
- (values 10 nil)
- (case (and (setf ch (next-char lexer))
- (char-downcase ch))
- (#\b (values 2 t))
- (#\o (values 8 t))
- (#\x (values 16 t))
- (t (values 8 nil))))
-
- ;; If we last munched an interesting letter, we need to skip over
- ;; it. That's what the SKIP-CHAR flag is for.
- ;;
- ;; Danger, Will Robinson! If we're just about to eat a radix
- ;; letter, then the next thing must be a digit. For example,
- ;; `0xfatenning' parses as a hex number followed by an identifier
- ;; `0xfa ttening', but `0xturning' is an octal number followed by
- ;; an identifier `0 xturning'.
- (when skip-char
- (let ((peek (next-char lexer)))
- (unless (digit-char-p peek radix)
- (pushback-char lexer ch)
- (return-from scan-token (values :integer 0)))
- (setf ch peek)))
-
- ;; Scan an integer. While there are digits, feed them into the
- ;; accumulator.
- (do ((accum 0 (+ (* accum radix) digit))
- (digit (and ch (digit-char-p ch radix))
- (and ch (digit-char-p ch radix))))
- ((null digit) (return-from scan-token
- (values :integer accum)))
- (setf ch (next-char lexer)))))
-
- ;; A slash might be the start of a comment.
- ((char= ch #\/)
- (setf ch (next-char lexer))
- (case ch
-
- ;; Comment up to the end of the line.
- (#\/
- (loop
- (setf ch (next-char lexer))
- (when (or (null ch) (char= ch #\newline))
- (go scan))))
-
- ;; Comment up to the next `*/'.
- (#\*
- (tagbody
- top
- (case (setf ch (next-char lexer))
- (#\* (go star))
- ((nil) (go done))
- (t (go top)))
- star
- (case (setf ch (next-char lexer))
- (#\* (go star))
- (#\/ (setf ch (next-char lexer))
- (go done))
- ((nil) (go done))
- (t (go top)))
- done)
- (go consider))
-
- ;; False alarm. (The next character is already set up.)
- (t
- (return (values #\/ t)))))
-
- ;; A dot: might be `...'. Tread carefully! We need more lookahead
- ;; than is good for us.
- ((char= ch #\.)
- (setf ch (next-char lexer))
- (cond ((eql ch #\.)
- (setf ch (next-char lexer))
- (cond ((eql ch #\.) (return (values :ellipsis nil)))
- (t (pushback-char lexer #\.)
- (return (values #\. t)))))
- (t
- (return (values #\. t)))))
-
- ;; Anything else is a lone delimiter.
- (t
- (return (multiple-value-prog1
- (values ch t)
- (next-char lexer)))))
-
- scan
- ;; Scan a new character and try again.
- (setf ch (next-char lexer))
- (go consider))))
+;;;--------------------------------------------------------------------------
+;;; Token scanning.
+
+(defmethod scanner-token ((scanner sod-token-scanner))
+ (with-slots (char-scanner line column) scanner
+ (with-parser-context (character-scanner-context :scanner char-scanner)
+
+ (flet ((scan-digits (&key (radix 10) (min 1) (init 0))
+ ;; Scan and return a sequence of digits.
+ (parse (many (acc init (+ (* acc radix) it) :min min)
+ (label (list :digit radix)
+ (filter (lambda (ch)
+ (digit-char-p ch radix))))))))
+
+ ;; Skip initial junk, and remember the place.
+ (loop
+ (setf (scanner-line scanner) (scanner-line char-scanner)
+ (scanner-column scanner) (scanner-column char-scanner))
+ (cond-parse (:consumedp cp :expected exp)
+ ((satisfies whitespace-char-p) (parse :whitespace))
+ ((scan-comment char-scanner))
+ (t (if cp (lexer-error char-scanner exp cp) (return)))))
+
+ ;; Now parse something.
+ (cond-parse (:consumedp cp :expected exp)
+
+ ;; Alphanumerics mean we read an identifier.
+ ((or #\_ (satisfies alpha-char-p))
+ (values :id (with-output-to-string (out)
+ (write-char it out)
+ (parse (many (nil nil (write-char it out))
+ (or #\_ (satisfies alphanumericp)))))))
+
+ ;; Quotes introduce a literal.
+ ((seq ((quote (or #\" #\'))
+ (contents (many (out (make-string-output-stream)
+ (progn (write-char it out) out)
+ :final (get-output-stream-string out))
+ (or (and #\\ :any) (not quote))))
+ (nil (char quote)))
+ (ecase quote
+ (#\" contents)
+ (#\' (case (length contents)
+ (1 (char contents 0))
+ (0 (cerror* "Empty character literal") #\?)
+ (t (cerror* "Too many characters in literal")
+ (char contents 0))))))
+ (values (etypecase it
+ (character :char)
+ (string :string))
+ it))
+
+ ;; Zero introduces a chosen-radix integer.
+ ((and #\0
+ (or (and (or #\b #\B) (scan-digits :radix 2))
+ (and (or #\o #\O) (scan-digits :radix 8))
+ (and (or #\x #\X) (scan-digits :radix 16))
+ (scan-digits :radix 8 :min 0)))
+ (values :int it))
+
+ ;; Any other digit forces radix-10.
+ ((seq ((d (filter digit-char-p))
+ (i (scan-digits :radix 10 :min 0 :init d)))
+ i)
+ (values :int it))
+
+ ;; Some special punctuation sequences are single tokens.
+ ("..." (values :ellipsis nil))
+
+ ;; Any other character is punctuation.
+ (:any (values it nil))
+
+ ;; End of file means precisely that.
+ (:eof (values :eof nil))
+
+ ;; Report errors and try again. Because we must have consumed some
+ ;; input in order to get here (we've matched both :any and :eof) we
+ ;; must make progress on every call.
+ (t
+ (assert cp)
+ (lexer-error char-scanner exp cp)
+ (scanner-token scanner)))))))
;;;----- That's all, folks --------------------------------------------------