X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~mdw/git/sod/blobdiff_plain/5b0a2bdbdeabfd02923a9998e6e2dafa614b47f3..239fa5bd3dff0b38b0cebdd3438311f21c24ba4f:/src/lexer-impl.lisp

diff --git a/src/lexer-impl.lisp b/src/lexer-impl.lisp
index 03a6bcc..f474590 100644
--- a/src/lexer-impl.lisp
+++ b/src/lexer-impl.lisp
@@ -26,272 +26,113 @@ (cl:in-package #:sod)
 ;;;--------------------------------------------------------------------------
-;;; Basic lexical analyser.
+;;; Class implementation.
 
-(defstruct (pushed-token
-            (:constructor make-pushed-token (type value location)))
-  "A token that has been pushed back into a lexer for later processing."
-  type value location)
+(defmethod shared-initialize :after
+    ((scanner sod-token-scanner) slot-names &key)
+  (default-slot (scanner 'sod-parser::filename slot-names)
+    (scanner-filename (token-scanner-char-scanner scanner))))
 
-;;; Class definition.
-
-(export 'basic-lexer)
-(defclass basic-lexer ()
-  ((stream :initarg :stream :type stream :reader lexer-stream)
-   (char :initform nil :type (or character null) :reader lexer-char)
-   (pushback-chars :initform nil :type list)
-   (token-type :initform nil :accessor token-type)
-   (token-value :initform nil :accessor token-value)
-   (location :initform nil :reader file-location)
-   (pushback-tokens :initform nil :type list))
-  (:documentation
-   "Base class for lexical analysers.
-
-    The lexer reads characters from STREAM, which, for best results, wants to
-    be a `position-aware-input-stream'.
-
-    The lexer provides one-character lookahead by default: the current
-    lookahead character is available to subclasses in the slot CHAR.  Before
-    beginning lexical analysis, the lookahead character needs to be
-    established with `next-char'.  If one-character lookahead is insufficient,
-    the analyser can push back an arbitrary number of characters using
-    `pushback-char'.
-
-    The `next-token' function scans and returns the next token from the
-    STREAM, and makes it available as TOKEN-TYPE and TOKEN-VALUE, providing
-    one-token lookahead.  A parser using the lexical analyser can push back
-    tokens using `pushback-tokens'.
-
-    For convenience, the lexer implements a `file-location' method (delegated
-    to the underlying stream)."))
-
-;;; Reading and pushing back characters.
-
-(defmethod next-char ((lexer basic-lexer))
-  (with-slots (stream char pushback-chars) lexer
-    (setf char (if pushback-chars
-                   (pop pushback-chars)
-                   (read-char stream nil)))))
-
-(defmethod pushback-char ((lexer basic-lexer) new-char)
-  (with-slots (char pushback-chars) lexer
-    (push char pushback-chars)
-    (setf char new-char)))
-
-(defmethod fixup-stream* ((lexer basic-lexer) thunk)
-  (with-slots (stream char pushback-chars) lexer
-    (when pushback-chars
-      (error "Lexer has pushed-back characters."))
-    (when (slot-boundp lexer 'char)
-      (unread-char char stream))
-    (unwind-protect
-        (funcall thunk stream)
-      (setf char (read-char stream nil)))))
-
-;;; Reading and pushing back tokens.
-
-(defmethod next-token :around ((lexer basic-lexer))
-  (unless (slot-boundp lexer 'char)
-    (next-char lexer)))
-
-(defmethod next-token ((lexer basic-lexer))
-  (with-slots (pushback-tokens token-type token-value location) lexer
-    (setf (values token-type token-value)
-          (if pushback-tokens
-              (let ((pushback (pop pushback-tokens)))
-                (setf location (pushed-token-location pushback))
-                (values (pushed-token-type pushback)
-                        (pushed-token-value pushback)))
-              (scan-token lexer)))))
-
-(defmethod scan-token :around ((lexer basic-lexer))
-  (with-default-error-location (lexer)
-    (call-next-method)))
-
-(defmethod pushback-token ((lexer basic-lexer) new-token-type
-                           &optional new-token-value new-location)
-  (with-slots (pushback-tokens token-type token-value location) lexer
-    (push (make-pushed-token token-type token-value location)
-          pushback-tokens)
-    (when new-location (setf location new-location))
-    (setf token-type new-token-type
-          token-value new-token-value)))
-
-;;; Utilities.
-
-(defmethod skip-spaces ((lexer basic-lexer))
-  (do ((ch (lexer-char lexer) (next-char lexer)))
-      ((not (whitespace-char-p ch)) ch)))
+(defmethod make-scanner-stream ((scanner sod-token-scanner))
+  (make-scanner-stream (token-scanner-char-scanner scanner)))
 
 ;;;--------------------------------------------------------------------------
-;;; Our main lexer.
-
-(export 'sod-lexer)
-(defclass sod-lexer (basic-lexer)
-  ()
-  (:documentation
-   "Lexical analyser for the SOD language.
-
-    See the `lexer' class for the gory details about the lexer protocol."))
+;;; Indicators and error messages.
 
-(defmethod scan-token ((lexer sod-lexer))
-  (with-slots (stream char keywords location) lexer
-    (prog (ch)
+(defvar *indicator-map* (make-hash-table)
+  "Hash table mapping indicator objects to human-readable descriptions.")
 
-     consider
+(defun show-char (stream char &optional colonp atsignp)
+  "Format CHAR to STREAM in a readable way.
 
-      ;; Stash the position of this token so that we can report it later.
-      (setf ch (skip-spaces lexer)
-            location (file-location stream))
+   Usable in `format''s ~/.../ command."
+  (declare (ignore colonp atsignp))
+  (cond ((null char) (write-string "<eof>" stream))
+        ((and (graphic-char-p char) (char/= char #\space))
+         (format stream "`~C'" char))
+        (t (format stream "<~(~:C~)>" char))))
 
-      ;; Now work out what it is that we're dealing with.
-      (cond
-
-        ;; End-of-file brings its own peculiar joy.
-        ((null ch) (return (values :eof t)))
-
-        ;; Strings.
-        ((or (char= ch #\") (char= ch #\'))
-         (let* ((quote ch)
-                (string
-                 (with-output-to-string (out)
-                   (loop
-                     (flet ((getch ()
-                              (setf ch (next-char lexer))
-                              (when (null ch)
-                                (cerror* "Unexpected end of file in ~
-                                          ~:[string~;character~] constant"
-                                         (char= quote #\'))
-                                (return))))
-                       (getch)
-                       (cond ((char= ch quote) (return))
-                             ((char= ch #\\) (getch)))
-                       (write-char ch out))))))
-           (setf ch (next-char lexer))
-           (ecase quote
-             (#\" (return (values :string string)))
-             (#\' (case (length string)
-                    (0 (cerror* "Empty character constant")
-                       (return (values :char #\?)))
-                    (1 (return (values :char (char string 0))))
-                    (t (cerror* "Multiple characters in character constant")
-                       (return (values :char (char string 0)))))))))
-
-        ;; Pick out identifiers and keywords.
-        ((or (alpha-char-p ch) (char= ch #\_))
-
-         ;; Scan a sequence of alphanumerics and underscores.  We could
-         ;; allow more interesting identifiers, but it would damage our C
-         ;; lexical compatibility.
-         (let ((id (with-output-to-string (out)
-                     (loop
-                       (write-char ch out)
-                       (setf ch (next-char lexer))
-                       (when (or (null ch)
-                                 (not (or (alphanumericp ch)
-                                          (char= ch #\_))))
-                         (return))))))
-
-           ;; Done.
-           (return (values :id id))))
-
-        ;; Pick out numbers.  Currently only integers, but we support
-        ;; multiple bases.
-        ((digit-char-p ch)
-
-         ;; Sort out the prefix.  If we're looking at `0b', `0o' or `0x'
-         ;; (maybe uppercase) then we've got a funny radix to deal with.
-         ;; Otherwise, a leading zero signifies octal (daft, I know), else
-         ;; we're left with decimal.
-         (multiple-value-bind (radix skip-char)
-             (if (char/= ch #\0)
-                 (values 10 nil)
-                 (case (and (setf ch (next-char lexer))
-                            (char-downcase ch))
-                   (#\b (values 2 t))
-                   (#\o (values 8 t))
-                   (#\x (values 16 t))
-                   (t (values 8 nil))))
-
-           ;; If we last munched an interesting letter, we need to skip over
-           ;; it.  That's what the SKIP-CHAR flag is for.
-           ;;
-           ;; Danger, Will Robinson!  If we're just about to eat a radix
-           ;; letter, then the next thing must be a digit.  For example,
-           ;; `0xfattening' parses as a hex number followed by an identifier
-           ;; `0xfa ttening', but `0xturning' is an octal number followed by
-           ;; an identifier `0 xturning'.
-           (when skip-char
-             (let ((peek (next-char lexer)))
-               (unless (digit-char-p peek radix)
-                 (pushback-char lexer ch)
-                 (return-from scan-token (values :integer 0)))
-               (setf ch peek)))
-
-           ;; Scan an integer.  While there are digits, feed them into the
-           ;; accumulator.
-           (do ((accum 0 (+ (* accum radix) digit))
-                (digit (and ch (digit-char-p ch radix))
-                       (and ch (digit-char-p ch radix))))
-               ((null digit) (return-from scan-token
-                               (values :integer accum)))
-             (setf ch (next-char lexer)))))
-
-        ;; A slash might be the start of a comment.
-        ((char= ch #\/)
-         (setf ch (next-char lexer))
-         (case ch
-
-           ;; Comment up to the end of the line.
-           (#\/
-            (loop
-              (setf ch (next-char lexer))
-              (when (or (null ch) (char= ch #\newline))
-                (go scan))))
-
-           ;; Comment up to the next `*/'.
-           (#\*
-            (tagbody
-             top
-              (case (setf ch (next-char lexer))
-                (#\* (go star))
-                ((nil) (go done))
-                (t (go top)))
-             star
-              (case (setf ch (next-char lexer))
-                (#\* (go star))
-                (#\/ (setf ch (next-char lexer))
-                     (go done))
-                ((nil) (go done))
-                (t (go top)))
-             done)
-            (go consider))
-
-           ;; False alarm.  (The next character is already set up.)
-           (t
-            (return (values #\/ t)))))
-
-        ;; A dot: might be `...'.  Tread carefully!  We need more lookahead
-        ;; than is good for us.
-        ((char= ch #\.)
-         (setf ch (next-char lexer))
-         (cond ((eql ch #\.)
-                (setf ch (next-char lexer))
-                (cond ((eql ch #\.) (return (values :ellipsis nil)))
-                      (t (pushback-char lexer #\.)
-                         (return (values #\. t)))))
-               (t
-                (return (values #\. t)))))
-
-        ;; Anything else is a lone delimiter.
-        (t
-         (return (multiple-value-prog1
-                     (values ch t)
-                   (next-char lexer)))))
-
-     scan
-      ;; Scan a new character and try again.
-      (setf ch (next-char lexer))
-      (go consider))))
+;;;--------------------------------------------------------------------------
+;;; Token scanning.
+
+(defmethod scanner-token ((scanner sod-token-scanner))
+  (with-slots (char-scanner line column) scanner
+    (with-parser-context (character-scanner-context :scanner char-scanner)
+
+      (flet ((scan-digits (&key (radix 10) (min 1) (init 0))
+               ;; Scan and return a sequence of digits.
+               (parse (many (acc init (+ (* acc radix) it) :min min)
+                        (label (list :digit radix)
+                               (filter (lambda (ch)
+                                         (digit-char-p ch radix))))))))
+
+        ;; Skip initial junk, and remember the place.
+        (loop
+          (setf (scanner-line scanner) (scanner-line char-scanner)
+                (scanner-column scanner) (scanner-column char-scanner))
+          (cond-parse (:consumedp cp :expected exp)
+            ((satisfies whitespace-char-p) (parse :whitespace))
+            ((scan-comment char-scanner))
+            (t (if cp (lexer-error char-scanner exp cp) (return)))))
+
+        ;; Now parse something.
+        (cond-parse (:consumedp cp :expected exp)
+
+          ;; Alphanumerics mean we read an identifier.
+          ((or #\_ (satisfies alpha-char-p))
+           (values :id (with-output-to-string (out)
+                         (write-char it out)
+                         (parse (many (nil nil (write-char it out))
+                                  (or #\_ (satisfies alphanumericp)))))))
+
+          ;; Quotes introduce a literal.
+          ((seq ((quote (or #\" #\'))
+                 (contents (many (out (make-string-output-stream)
+                                      (progn (write-char it out) out)
+                                      :final (get-output-stream-string out))
+                             (or (and #\\ :any) (not quote))))
+                 (nil (char quote)))
+             (ecase quote
+               (#\" contents)
+               (#\' (case (length contents)
+                      (1 (char contents 0))
+                      (0 (cerror* "Empty character literal") #\?)
+                      (t (cerror* "Too many characters in literal")
+                         (char contents 0))))))
+           (values (etypecase it
+                     (character :char)
+                     (string :string))
+                   it))
+
+          ;; Zero introduces a chosen-radix integer.
+          ((and #\0
+                (or (and (or #\b #\B) (scan-digits :radix 2))
+                    (and (or #\o #\O) (scan-digits :radix 8))
+                    (and (or #\x #\X) (scan-digits :radix 16))
+                    (scan-digits :radix 8 :min 0)))
+           (values :int it))
+
+          ;; Any other digit forces radix-10.
+          ((seq ((d (filter digit-char-p))
+                 (i (scan-digits :radix 10 :min 0 :init d)))
+             i)
+           (values :int it))
+
+          ;; Some special punctuation sequences are single tokens.
+          ("..." (values :ellipsis nil))
+
+          ;; Any other character is punctuation.
+          (:any (values it nil))
+
+          ;; End of file means precisely that.
+          (:eof (values :eof nil))
+
+          ;; Report errors and try again.  Because we must have consumed some
+          ;; input in order to get here (we've matched both :any and :eof) we
+          ;; must make progress on every call.
+          (t
+           (assert cp)
+           (lexer-error char-scanner exp cp)
+           (scanner-token scanner)))))))
 
 ;;;----- That's all, folks --------------------------------------------------
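A note on the first hunk: the new `shared-initialize' method appears to default the scanner's filename slot from the wrapped character scanner, via sod's `default-slot' helper.  For readers who don't know that helper, the sketch below shows the same "default a slot after initialization" pattern in plain CLOS; the class, slot and reader names here are invented for illustration and are not part of sod.

(defclass toy-token-scanner ()
  ((char-scanner :initarg :char-scanner :reader toy-char-scanner)
   (filename :initarg :filename :reader toy-filename)))

(defmethod shared-initialize :after ((scanner toy-token-scanner) slot-names &key)
  ;; If FILENAME wasn't supplied and is eligible for (re)initialization,
  ;; derive it from the wrapped character source -- here simply a stream.
  (when (and (or (eq slot-names t) (member 'filename slot-names))
             (not (slot-boundp scanner 'filename)))
    (setf (slot-value scanner 'filename)
          (let ((source (toy-char-scanner scanner)))
            (if (typep source 'file-stream)
                (namestring (pathname source))
                "<unknown>")))))

For example, (toy-filename (make-instance 'toy-token-scanner :char-scanner *standard-input*)) returns "<unknown>", while an open file stream yields that file's name.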
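`show-char' is written for `format''s `~/.../' directive, which calls the named function with the output stream, the argument, and the colon and at-sign flags (plus any prefix parameters).  Assuming the sod system is loaded, the calls below (results in comments) exercise the cases its `cond' distinguishes; a null argument, the scanner's end-of-input representation, prints a placeholder instead.

;; Graphic characters other than space are quoted directly.
(format nil "unexpected ~/sod::show-char/" #\a)        ; => "unexpected `a'"
;; Space and non-graphic characters are shown by name, downcased.
(format nil "unexpected ~/sod::show-char/" #\space)    ; => "unexpected <space>"
(format nil "unexpected ~/sod::show-char/" #\newline)  ; => "unexpected <newline>"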
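On literals: in the old string/character code a backslash simply forces the next character to be taken literally and is itself dropped, with no C-style escape translation, and a character literal must end up exactly one character long; the new `(or (and #\\ :any) (not quote))' grammar appears to implement the same rule.  Here is a rough standalone sketch of that scanning rule, written over a string and an index rather than over the scanner protocol the diff uses:

(defun scan-literal (string &key (start 0) (end (length string)))
  "Scan a quote-delimited literal starting at START; return its contents
   and the index just past the closing quote."
  (let ((quote (char string start))
        (out (make-string-output-stream)))
    (do ((pos (1+ start) (1+ pos)))
        ((>= pos end) (error "Unterminated literal"))
      (let ((ch (char string pos)))
        (cond ((char= ch quote)          ; closing quote: we're done
               (return (values (get-output-stream-string out) (1+ pos))))
              ((char= ch #\\)            ; backslash: keep the next char as-is
               (incf pos)
               (when (>= pos end) (error "Unterminated literal"))
               (write-char (char string pos) out))
              (t (write-char ch out)))))))

So (scan-literal "'x' rest") returns "x" and 3; for a single-quoted literal the caller then checks that the contents are exactly one character long, as both versions of the lexer do.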
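On integers: the comments in the removed number scanner spell out the radix rule, and the new `cond-parse' clauses keep the same prefixes.  `0b'/`0o'/`0x' (upper- or lowercase) select binary, octal or hex, a bare leading zero means octal, anything else is decimal, and the old scanner only commits to a radix letter when a digit of that radix follows it (hence `0xfattening' scanning as the hex number `0xfa', but `0xturning' scanning as `0' followed by an identifier).  Below is a rough standalone sketch of that rule over a string and an index -- an illustration only, not the combinator grammar the diff installs.

(defun scan-integer (string &key (start 0) (end (length string)))
  "Scan an integer at START in STRING; return its value and the next index."
  (flet ((digits (radix pos)
           ;; Accumulate digits of RADIX starting at POS.
           (let ((acc 0))
             (loop (let ((d (and (< pos end)
                                 (digit-char-p (char string pos) radix))))
                     (unless d (return (values acc pos)))
                     (setf acc (+ (* acc radix) d)
                           pos (1+ pos)))))))
    (if (or (>= start end) (char/= (char string start) #\0))
        (digits 10 start)
        (let* ((marker (and (< (1+ start) end)
                            (char-downcase (char string (1+ start)))))
               (radix (case marker (#\b 2) (#\o 8) (#\x 16))))
          (if (and radix
                   (< (+ start 2) end)
                   (digit-char-p (char string (+ start 2)) radix))
              ;; `0x...' and friends: skip the marker, read in that radix.
              (digits radix (+ start 2))
              ;; Otherwise a leading zero means octal; a lone `0' is just zero.
              (digits 8 (1+ start)))))))

;; (scan-integer "0xfattening") => 250, 4    ; leaves `ttening' for the caller
;; (scan-integer "0xturning")   => 0, 1      ; leaves `xturning'
;; (scan-integer "0b1010")      => 10, 6
;; (scan-integer "017")         => 15, 3

Unlike the real scanners, this sketch happily returns 0 for an empty digit string; the grammar in the diff controls that with `scan-digits''s :min argument.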