- consider
-
- ;; Stash the position of this token so that we can report it later.
- (setf ch (skip-spaces lexer)
- location (file-location stream))
-
- ;; Now work out what it is that we're dealing with.
- (cond
-
- ;; End-of-file brings its own peculiar joy.
- ((null ch) (return (values :eof t)))
-
- ;; Strings.
- ((or (char= ch #\") (char= ch #\'))
- (let* ((quote ch)
- (string
- (with-output-to-string (out)
- (loop
- (flet ((getch ()
- (setf ch (next-char lexer))
- (when (null ch)
- (cerror* "Unexpected end of file in ~
- ~:[string~;character~] constant"
- (char= quote #\'))
- (return))))
- (getch)
- (cond ((char= ch quote) (return))
- ((char= ch #\\) (getch)))
- (write-char ch out))))))
- (setf ch (next-char lexer))
- (ecase quote
- (#\" (return (values :string string)))
- (#\' (case (length string)
- (0 (cerror* "Empty character constant")
- (return (values :char #\?)))
- (1 (return (values :char (char string 0))))
- (t (cerror* "Multiple characters in character constant")
- (return (values :char (char string 0)))))))))
-
- ;; Pick out identifiers and keywords.
- ((or (alpha-char-p ch) (char= ch #\_))
-
- ;; Scan a sequence of alphanumerics and underscores. We could
- ;; allow more interesting identifiers, but it would damage our C
- ;; lexical compatibility.
- (let ((id (with-output-to-string (out)
- (loop
- (write-char ch out)
- (setf ch (next-char lexer))
- (when (or (null ch)
- (not (or (alphanumericp ch)
- (char= ch #\_))))
- (return))))))
-
- ;; Done.
- (return (values :id id))))
-
- ;; Pick out numbers. Currently only integers, but we support
- ;; multiple bases.
- ((digit-char-p ch)
-
- ;; Sort out the prefix. If we're looking at `0b', `0o' or `0x'
- ;; (maybe uppercase) then we've got a funny radix to deal with.
- ;; Otherwise, a leading zero signifies octal (daft, I know), else
- ;; we're left with decimal.
- (multiple-value-bind (radix skip-char)
- (if (char/= ch #\0)
- (values 10 nil)
- (case (and (setf ch (next-char lexer))
- (char-downcase ch))
- (#\b (values 2 t))
- (#\o (values 8 t))
- (#\x (values 16 t))
- (t (values 8 nil))))
-
- ;; If we last munched an interesting letter, we need to skip over
- ;; it. That's what the SKIP-CHAR flag is for.
- ;;
- ;; Danger, Will Robinson! If we're just about to eat a radix
- ;; letter, then the next thing must be a digit. For example,
- ;; `0xfatenning' parses as a hex number followed by an identifier
- ;; `0xfa ttening', but `0xturning' is an octal number followed by
- ;; an identifier `0 xturning'.
- (when skip-char
- (let ((peek (next-char lexer)))
- (unless (digit-char-p peek radix)
- (pushback-char lexer ch)
- (return-from scan-token (values :integer 0)))
- (setf ch peek)))
-
- ;; Scan an integer. While there are digits, feed them into the
- ;; accumulator.
- (do ((accum 0 (+ (* accum radix) digit))
- (digit (and ch (digit-char-p ch radix))
- (and ch (digit-char-p ch radix))))
- ((null digit) (return-from scan-token
- (values :integer accum)))
- (setf ch (next-char lexer)))))
-
- ;; A slash might be the start of a comment.
- ((char= ch #\/)
- (setf ch (next-char lexer))
- (case ch
-
- ;; Comment up to the end of the line.
- (#\/
- (loop
- (setf ch (next-char lexer))
- (when (or (null ch) (char= ch #\newline))
- (go scan))))
-
- ;; Comment up to the next `*/'.
- (#\*
- (tagbody
- top
- (case (setf ch (next-char lexer))
- (#\* (go star))
- ((nil) (go done))
- (t (go top)))
- star
- (case (setf ch (next-char lexer))
- (#\* (go star))
- (#\/ (setf ch (next-char lexer))
- (go done))
- ((nil) (go done))
- (t (go top)))
- done)
- (go consider))
-
- ;; False alarm. (The next character is already set up.)
- (t
- (return (values #\/ t)))))
-
- ;; A dot: might be `...'. Tread carefully! We need more lookahead
- ;; than is good for us.
- ((char= ch #\.)
- (setf ch (next-char lexer))
- (cond ((eql ch #\.)
- (setf ch (next-char lexer))
- (cond ((eql ch #\.) (return (values :ellipsis nil)))
- (t (pushback-char lexer #\.)
- (return (values #\. t)))))
- (t
- (return (values #\. t)))))
-
- ;; Anything else is a lone delimiter.
- (t
- (return (multiple-value-prog1
- (values ch t)
- (next-char lexer)))))
-
- scan
- ;; Scan a new character and try again.
- (setf ch (next-char lexer))
- (go consider))))
+;;;--------------------------------------------------------------------------
+;;; Token scanning.
+
+(defmethod scanner-token ((scanner sod-token-scanner))
+ (with-slots (char-scanner line column) scanner
+ (with-parser-context (character-scanner-context :scanner char-scanner)
+
+ (flet ((scan-digits (&key (radix 10) (min 1) (init 0))
+ ;; Scan and return a sequence of digits.
+ (parse (many (acc init (+ (* acc radix) it) :min min)
+ (label (list :digit radix)
+ (filter (lambda (ch)
+ (digit-char-p ch radix)))))))
+ (start-floc ()
+ ;; This is a little nasty. We scan the first token during
+ ;; instance initialization, as a result of `shared-initialize'
+ ;; on `token-scanner'. Unfortunately, this happens before
+ ;; we've had a chance to initialize our own `filename' slot.
+ ;; This means that we can't use the SCANNER as a file
+ ;; location, however tempting it might be. So we have this
+ ;; hack.
+ (make-file-location (scanner-filename char-scanner)
+ (scanner-line scanner)
+ (scanner-column scanner))))
+
+ ;; Skip initial junk, and remember the place.
+ (loop
+ (setf (scanner-line scanner) (scanner-line char-scanner)
+ (scanner-column scanner) (scanner-column char-scanner))
+ (cond-parse (:consumedp cp :expected exp)
+ ((satisfies whitespace-char-p) (parse :whitespace))
+ ((scan-comment char-scanner))
+ (t (if cp (lexer-error char-scanner exp) (return)))))
+
+ ;; Now parse something.
+ (cond-parse (:consumedp cp :expected exp)
+
+ ;; Alphanumerics mean we read an identifier.
+ ((or #\_ (satisfies alpha-char-p))
+ (values :id (with-output-to-string (out)
+ (write-char it out)
+ (parse (many (nil nil (write-char it out))
+ (or #\_ (satisfies alphanumericp)))))))
+
+ ;; Quotes introduce a literal.
+ ((seq ((quote (or #\" #\'))
+ (contents (many (out (make-string-output-stream)
+ (progn (write-char it out) out)
+ :final (get-output-stream-string out))
+ (or (and #\\ :any) (not quote))))
+ (nil (or (char quote)
+ (seq (:eof)
+ (lexer-error char-scanner (list quote))
+ (info-with-location
+ (start-floc) "Literal started here")))))
+ (ecase quote
+ (#\" contents)
+ (#\' (case (length contents)
+ (1 (char contents 0))
+ (0 (cerror*-with-location (start-floc)
+ 'simple-lexer-error
+ :format-control
+ "Empty character literal")
+ #\?)
+ (t (cerror*-with-location (start-floc)
+ 'simple-lexer-error
+ :format-control
+ "Too many characters ~
+ in character literal")
+ (char contents 0))))))
+ (values (etypecase it
+ (character :char)
+ (string :string))
+ it))
+
+ ;; Zero introduces a chosen-radix integer.
+ ((and #\0
+ (or (and (or #\b #\B) (scan-digits :radix 2))
+ (and (or #\o #\O) (scan-digits :radix 8))
+ (and (or #\x #\X) (scan-digits :radix 16))
+ (scan-digits :radix 8 :min 0)))
+ (values :int it))
+
+ ;; Any other digit forces radix-10.
+ ((seq ((d (filter digit-char-p))
+ (i (scan-digits :radix 10 :min 0 :init d)))
+ i)
+ (values :int it))
+
+ ;; Some special punctuation sequences are single tokens.
+ ("..." (values :ellipsis nil))
+
+ ;; Any other character is punctuation.
+ (:any (values it nil))
+
+ ;; End of file means precisely that.
+ (:eof (values :eof nil))
+
+ ;; Report errors and try again. Because we must have consumed some
+ ;; input in order to get here (we've matched both :any and :eof) we
+ ;; must make progress on every call.
+ (t
+ (assert cp)
+ (lexer-error char-scanner exp)
+ (scanner-token scanner)))))))