X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~mdw/git/sod/blobdiff_plain/aa14a4cddcb96b681d5c19a2ec8bad382f43b264..1c2db39a35b4efe99594b2a53737d4f0971d01d8:/src/lexer-impl.lisp diff --git a/src/lexer-impl.lisp b/src/lexer-impl.lisp index 9f9d31e..1686179 100644 --- a/src/lexer-impl.lisp +++ b/src/lexer-impl.lisp @@ -7,7 +7,7 @@ ;;;----- Licensing notice --------------------------------------------------- ;;; -;;; This file is part of the Sensble Object Design, an object system for C. +;;; This file is part of the Sensible Object Design, an object system for C. ;;; ;;; SOD is free software; you can redistribute it and/or modify ;;; it under the terms of the GNU General Public License as published by @@ -26,272 +26,180 @@ (cl:in-package #:sod) ;;;-------------------------------------------------------------------------- -;;; Basic lexical analyser. +;;; Class implementation. -(defstruct (pushed-token - (:constructor make-pushed-token (type value location))) - "A token that has been pushed back into a lexer for later processing." - type value location) +(defmethod shared-initialize :after + ((scanner sod-token-scanner) slot-names &key) + (default-slot (scanner 'sod-parser::filename slot-names) + (scanner-filename (token-scanner-char-scanner scanner)))) -;;; Class definition. - -(export 'basic-lexer) -(defclass basic-lexer () - ((stream :initarg :stream :type stream :reader lexer-stream) - (char :initform nil :type (or character null) :reader lexer-char) - (pushback-chars :initform nil :type list) - (token-type :initform nil :accessor token-type) - (token-value :initform nil :accessor token-value) - (location :initform nil :reader file-location) - (pushback-tokens :initform nil :type list)) - (:documentation - "Base class for lexical analysers. - - The lexer reads characters from STREAM, which, for best results, wants to - be a POSITION-AWARE-INPUT-STREAM. - - The lexer provides one-character lookahead by default: the current - lookahead character is available to subclasses in the slot CHAR. Before - beginning lexical analysis, the lookahead character needs to be - established with NEXT-CHAR. If one-character lookahead is insufficient, - the analyser can push back an arbitrary number of characters using - PUSHBACK-CHAR. - - The NEXT-TOKEN function scans and returns the next token from the STREAM, - and makes it available as TOKEN-TYPE and TOKEN-VALUE, providing one-token - lookahead. A parser using the lexical analyser can push back tokens using - PUSHBACK-TOKENS. - - For convenience, the lexer implements a FILE-LOCATION method (delegated to - the underlying stream).")) - -;;; Reading and pushing back characters. - -(defmethod next-char ((lexer basic-lexer)) - (with-slots (stream char pushback-chars) lexer - (setf char (if pushback-chars - (pop pushback-chars) - (read-char stream nil))))) - -(defmethod pushback-char ((lexer basic-lexer) new-char) - (with-slots (char pushback-chars) lexer - (push char pushback-chars) - (setf char new-char))) - -(defmethod fixup-stream* ((lexer basic-lexer) thunk) - (with-slots (stream char pushback-chars) lexer - (when pushback-chars - (error "Lexer has pushed-back characters.")) - (when (slot-boundp lexer 'char) - (unread-char char stream)) - (unwind-protect - (funcall thunk stream) - (setf char (read-char stream nil))))) - -;;; Reading and pushing back tokens. - -(defmethod next-token :around ((lexer basic-lexer)) - (unless (slot-boundp lexer 'char) - (next-char lexer))) - -(defmethod next-token ((lexer basic-lexer)) - (with-slots (pushback-tokens token-type token-value location) lexer - (setf (values token-type token-value) - (if pushback-tokens - (let ((pushback (pop pushback-tokens))) - (setf location (pushed-token-location pushback)) - (values (pushed-token-type pushback) - (pushed-token-value pushback))) - (scan-token lexer))))) - -(defmethod scan-token :around ((lexer basic-lexer)) - (with-default-error-location (lexer) - (call-next-method))) - -(defmethod pushback-token ((lexer basic-lexer) new-token-type - &optional new-token-value new-location) - (with-slots (pushback-tokens token-type token-value location) lexer - (push (make-pushed-token token-type token-value location) - pushback-tokens) - (when new-location (setf location new-location)) - (setf token-type new-token-type - token-value new-token-value))) - -;;; Utilities. - -(defmethod skip-spaces ((lexer basic-lexer)) - (do ((ch (lexer-char lexer) (next-char lexer))) - ((not (whitespace-char-p ch)) ch))) +(defmethod make-scanner-stream ((scanner sod-token-scanner)) + (make-scanner-stream (token-scanner-char-scanner scanner))) ;;;-------------------------------------------------------------------------- -;;; Our main lexer. - -(export 'sod-lexer) -(defclass sod-lexer (basic-lexer) - () - (:documentation - "Lexical analyser for the SOD lanuage. - - See the LEXER class for the gory details about the lexer protocol.")) - -(defmethod scan-token ((lexer sod-lexer)) - (with-slots (stream char keywords location) lexer - (prog (ch) +;;; Indicators and error messages. + +(defun show-char (char) + "Format CHAR as a string in a readable way." + (cond ((null char) "") + ((and (graphic-char-p char) (char/= char #\space)) + (format nil "`~C'" char)) + (t (format nil "<~(~:C~)>" char)))) + +(defun %skip-until (scanner token-types + &key (keep-end (not (null (cdr token-types))))) + "This is the implementation of the `skip-until' parser." + (do ((consumedp nil t)) + ((let ((type (token-type scanner)) + (value (token-value scanner))) + (some (lambda (spec) + (multiple-value-bind (want-type want-value) + (cond ((listp spec) (values (car spec) (cadr spec))) + (t (values spec t))) + (and (eq want-type type) + (or (eq want-value t) + (equal want-value value))))) + token-types)) + (unless keep-end (scanner-step scanner)) + (values nil t (or keep-end consumedp))) + (when (scanner-at-eof-p scanner) + (return (values token-types nil consumedp))) + (scanner-step scanner))) + +(defun parse-error-recover (scanner parser recover + &key ignore-unconsumed force-progress) + "This is the implementation of the `error' parser." + (multiple-value-bind (result win consumedp) (funcall parser) + (cond ((or win + (and (not consumedp) + (or ignore-unconsumed + (scanner-at-eof-p scanner)))) + ;; If we succeeded, or if we didn't consume any tokens and the + ;; caller's OK with that, then there's nothing for us to do here. + ;; On the other hand, if we failed, didn't consume any tokens, and + ;; we're at end-of-file, then there's not much hope of making + ;; onward progress, so in this case we propagate the failure + ;; rather than trying to recover. And we assume that the + ;; continuation will somehow arrange to report the problem, and + ;; avoid inundating the user with error reports. + (values result win consumedp)) + (t + ;; Now we have to do some kind of sensible error recovery. The + ;; important thing to do here is to make sure that we make some + ;; progress. If we consumed any tokens then we're fine, and we'll + ;; just try the provided recovery strategy. Otherwise, if we're + ;; not at EOF, then we can ensure progress by discarding the + ;; current token. Finally, if we are at EOF then our best bet is + ;; simply to propagate the current failure back to the caller, but + ;; we handled that case above. + (syntax-error scanner result) + (when (and force-progress (not consumedp)) (scanner-step scanner)) + (funcall recover))))) - consider - - ;; Stash the position of this token so that we can report it later. - (setf ch (skip-spaces lexer) - location (file-location stream)) - - ;; Now work out what it is that we're dealing with. - (cond - - ;; End-of-file brings its own peculiar joy. - ((null ch) (return (values :eof t))) - - ;; Strings. - ((or (char= ch #\") (char= ch #\')) - (let* ((quote ch) - (string - (with-output-to-string (out) - (loop - (flet ((getch () - (setf ch (next-char lexer)) - (when (null ch) - (cerror* "Unexpected end of file in ~ - ~:[string~;character~] constant" - (char= quote #\')) - (return)))) - (getch) - (cond ((char= ch quote) (return)) - ((char= ch #\\) (getch))) - (write-char ch out)))))) - (setf ch (next-char lexer)) - (ecase quote - (#\" (return (values :string string))) - (#\' (case (length string) - (0 (cerror* "Empty character constant") - (return (values :char #\?))) - (1 (return (values :char (char string 0)))) - (t (cerror* "Multiple characters in character constant") - (return (values :char (char string 0))))))))) - - ;; Pick out identifiers and keywords. - ((or (alpha-char-p ch) (char= ch #\_)) - - ;; Scan a sequence of alphanumerics and underscores. We could - ;; allow more interesting identifiers, but it would damage our C - ;; lexical compatibility. - (let ((id (with-output-to-string (out) - (loop - (write-char ch out) - (setf ch (next-char lexer)) - (when (or (null ch) - (not (or (alphanumericp ch) - (char= ch #\_)))) - (return)))))) - - ;; Done. - (return (values :id id)))) - - ;; Pick out numbers. Currently only integers, but we support - ;; multiple bases. - ((digit-char-p ch) - - ;; Sort out the prefix. If we're looking at `0b', `0o' or `0x' - ;; (maybe uppercase) then we've got a funny radix to deal with. - ;; Otherwise, a leading zero signifies octal (daft, I know), else - ;; we're left with decimal. - (multiple-value-bind (radix skip-char) - (if (char/= ch #\0) - (values 10 nil) - (case (and (setf ch (next-char lexer)) - (char-downcase ch)) - (#\b (values 2 t)) - (#\o (values 8 t)) - (#\x (values 16 t)) - (t (values 8 nil)))) - - ;; If we last munched an interesting letter, we need to skip over - ;; it. That's what the SKIP-CHAR flag is for. - ;; - ;; Danger, Will Robinson! If we're just about to eat a radix - ;; letter, then the next thing must be a digit. For example, - ;; `0xfatenning' parses as a hex number followed by an identifier - ;; `0xfa ttening', but `0xturning' is an octal number followed by - ;; an identifier `0 xturning'. - (when skip-char - (let ((peek (next-char lexer))) - (unless (digit-char-p peek radix) - (pushback-char lexer ch) - (return-from scan-token (values :integer 0))) - (setf ch peek))) - - ;; Scan an integer. While there are digits, feed them into the - ;; accumulator. - (do ((accum 0 (+ (* accum radix) digit)) - (digit (and ch (digit-char-p ch radix)) - (and ch (digit-char-p ch radix)))) - ((null digit) (return-from scan-token - (values :integer accum))) - (setf ch (next-char lexer))))) - - ;; A slash might be the start of a comment. - ((char= ch #\/) - (setf ch (next-char lexer)) - (case ch - - ;; Comment up to the end of the line. - (#\/ - (loop - (setf ch (next-char lexer)) - (when (or (null ch) (char= ch #\newline)) - (go scan)))) - - ;; Comment up to the next `*/'. - (#\* - (tagbody - top - (case (setf ch (next-char lexer)) - (#\* (go star)) - ((nil) (go done)) - (t (go top))) - star - (case (setf ch (next-char lexer)) - (#\* (go star)) - (#\/ (setf ch (next-char lexer)) - (go done)) - ((nil) (go done)) - (t (go top))) - done) - (go consider)) - - ;; False alarm. (The next character is already set up.) - (t - (return (values #\/ t))))) - - ;; A dot: might be `...'. Tread carefully! We need more lookahead - ;; than is good for us. - ((char= ch #\.) - (setf ch (next-char lexer)) - (cond ((eql ch #\.) - (setf ch (next-char lexer)) - (cond ((eql ch #\.) (return (values :ellipsis nil))) - (t (pushback-char lexer #\.) - (return (values #\. t))))) - (t - (return (values #\. t))))) - - ;; Anything else is a lone delimiter. - (t - (return (multiple-value-prog1 - (values ch t) - (next-char lexer))))) - - scan - ;; Scan a new character and try again. - (setf ch (next-char lexer)) - (go consider)))) +;;;-------------------------------------------------------------------------- +;;; Token scanning. + +(defmethod scanner-token ((scanner sod-token-scanner)) + (with-slots (char-scanner line column) scanner + (with-parser-context (character-scanner-context :scanner char-scanner) + + (flet ((scan-digits (&key (radix 10) (min 1) (init 0)) + ;; Scan and return a sequence of digits. + (parse (many (acc init (+ (* acc radix) it) :min min) + (label (list :digit radix) + (filter (lambda (ch) + (digit-char-p ch radix))))))) + (start-floc () + ;; This is a little nasty. We scan the first token during + ;; instance initialization, as a result of `shared-initialize' + ;; on `token-scanner'. Unfortunately, this happens before + ;; we've had a chance to initialize our own `filename' slot. + ;; This means that we can't use the SCANNER as a file + ;; location, however tempting it might be. So we have this + ;; hack. + (make-file-location (scanner-filename char-scanner) + (scanner-line scanner) + (scanner-column scanner)))) + + ;; Skip initial junk, and remember the place. + (loop + (setf (scanner-line scanner) (scanner-line char-scanner) + (scanner-column scanner) (scanner-column char-scanner)) + (cond-parse (:consumedp cp :expected exp) + ((satisfies whitespace-char-p) (parse :whitespace)) + ((scan-comment char-scanner)) + (t (if cp (lexer-error char-scanner exp) (return))))) + + ;; Now parse something. + (cond-parse (:consumedp cp :expected exp) + + ;; Alphanumerics mean we read an identifier. + ((or #\_ (satisfies alpha-char-p)) + (values :id (with-output-to-string (out) + (write-char it out) + (parse (many (nil nil (write-char it out)) + (or #\_ (satisfies alphanumericp))))))) + + ;; Quotes introduce a literal. + ((seq ((quote (or #\" #\')) + (contents (many (out (make-string-output-stream) + (progn (write-char it out) out) + :final (get-output-stream-string out)) + (or (and #\\ :any) (not quote)))) + (nil (or (char quote) + (seq (:eof) + (lexer-error char-scanner (list quote)) + (info-with-location + (start-floc) "Literal started here"))))) + (ecase quote + (#\" contents) + (#\' (case (length contents) + (1 (char contents 0)) + (0 (cerror*-with-location (start-floc) + 'simple-lexer-error + :format-control + "Empty character literal") + #\?) + (t (cerror*-with-location (start-floc) + 'simple-lexer-error + :format-control + "Too many characters ~ + in character literal") + (char contents 0)))))) + (values (etypecase it + (character :char) + (string :string)) + it)) + + ;; Zero introduces a chosen-radix integer. + ((and #\0 + (or (and (or #\b #\B) (scan-digits :radix 2)) + (and (or #\o #\O) (scan-digits :radix 8)) + (and (or #\x #\X) (scan-digits :radix 16)) + (scan-digits :radix 8 :min 0))) + (values :int it)) + + ;; Any other digit forces radix-10. + ((seq ((d (filter digit-char-p)) + (i (scan-digits :radix 10 :min 0 :init d))) + i) + (values :int it)) + + ;; Some special punctuation sequences are single tokens. + ("..." (values :ellipsis nil)) + + ;; Any other character is punctuation. + (:any (values it nil)) + + ;; End of file means precisely that. + (:eof (values :eof nil)) + + ;; Report errors and try again. Because we must have consumed some + ;; input in order to get here (we've matched both :any and :eof) we + ;; must make progress on every call. + (t + (assert cp) + (lexer-error char-scanner exp) + (scanner-token scanner))))))) ;;;----- That's all, folks --------------------------------------------------