From 64eee7885954e794de60530b140efb152b2a0e8b Mon Sep 17 00:00:00 2001 From: riton Date: Thu, 10 Jul 2025 00:52:19 +0300 Subject: lexer of monkey language --- monkey.asd | 11 +++++ src/lexer-test.lisp | 119 +++++++++++++++++++++++++++++++++++++++++++++++++ src/lexer.lisp | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/package.lisp | 2 + src/token.lisp | 83 +++++++++++++++++++++++++++++++++++ src/util.lisp | 22 ++++++++++ 6 files changed, 361 insertions(+) create mode 100644 monkey.asd create mode 100644 src/lexer-test.lisp create mode 100644 src/lexer.lisp create mode 100644 src/package.lisp create mode 100644 src/token.lisp create mode 100644 src/util.lisp diff --git a/monkey.asd b/monkey.asd new file mode 100644 index 0000000..b885b76 --- /dev/null +++ b/monkey.asd @@ -0,0 +1,11 @@ +(defsystem monkey + :author "Emre Akan" + :description "A Monkey implementation" + ;; :depends-on () + :pathname "src/" + :components ((:file "package") + (:file "util") + (:file "token") + (:file "lexer") + ;; (:file "parser") + )) diff --git a/src/lexer-test.lisp b/src/lexer-test.lisp new file mode 100644 index 0000000..cf3ba03 --- /dev/null +++ b/src/lexer-test.lisp @@ -0,0 +1,119 @@ +(in-package :monkey) + +(defun make-tokens (&rest tokens) + (loop :for token :in tokens + :if (consp token) + :collect (make-token (car token) (cadr token)) + :if (numberp token) + :collect (as-token token))) + +(defun test-lexer (str expect) + (let ((lexer (make-lexer str))) + (not (find nil (mapcar #'token= (lexer-tokens lexer) expect))))) + + +(defun test-1 () + (test-lexer "=+(){},;" + (make-tokens token-= + token-+ + token-lparen + token-rparen + token-lbrace + token-rbrace + token-comma + token-semicolon + token-eof))) + + +(defun test-2 () + (test-lexer "let five = 5; +let ten = 10; + +let add = fn(x, y) { + x + y; +}; + +let result = add(five, ten); +!-/*; +5 < 10 > 5; + +if (5 < 10) { + return true; +} else { + return false; +} +10 == 10; +10 != 9; +" + (make-tokens token-let + (list token-ident "five") + token-= + (list token-int "5") + token-semicolon + token-let + (list token-ident "ten") + token-= + (list token-int "10") + token-semicolon + token-let + (list token-ident "add") + token-= + token-function + token-lparen + (list token-ident "x") + token-comma + (list token-ident "y") + token-rparen + token-lbrace + (list token-ident "x") + token-+ + (list token-ident "y") + token-rbrace + token-semicolon + token-let + (list token-ident "result") + token-= + (list token-ident "add") + token-lparen + (list token-ident "five") + token-comma + (list token-ident "ten") + token-rparen + token-semicolon + token-! + token-- + token-/ + token-* + token-semicolon + (list token-int "5") + token-< + (list token-int "10") + token-> + (list token-int "5") + token-semicolon + (list token-if "if") + token-lparen + (list token-int "5") + token-< + (list token-int "10") + token-rparen + token-lbrace + token-return + token-true + token-semicolon + token-rbrace + token-else + token-lbrace + token-return + token-false + token-semicolon + token-rbrace + (list token-int "10") + token-== + (list token-int "10") + token-semicolon + (list token-int "10") + token-!= + (list token-int "9") + token-semicolon + token-eof ""))) diff --git a/src/lexer.lisp b/src/lexer.lisp new file mode 100644 index 0000000..cd47881 --- /dev/null +++ b/src/lexer.lisp @@ -0,0 +1,124 @@ +(in-package :monkey) + +(defparameter *eof* (code-char 0)) + +(defun eof? (char) + (char= char *eof*)) + +(defun digit? (char) + (char<= #\0 char #\9)) + +(defun letter? (char) + (or (char<= #\a char #\z) + (char<= #\A char #\Z))) + +(defun whitespace? (char) + (or (char= char #\Space) + (char= char #\Tab) + (char= char #\Newline) + (char= char #\Linefeed) + (char= char #\_))) + + +(defparameter *lexer-start-pos* -1) + +(defclass lexer () + ((input :reader input :initarg :input) + (len :reader len :initarg :length) + (pos :accessor pos :initform *lexer-start-pos*) + (ch :accessor ch) + (peek-ch :accessor peek-ch))) + +(defun make-lexer (input-string &optional len) + (let ((l (make-instance 'lexer :input input-string + :length (if (not (null len)) + len + (length input-string))))) + (lexer-peek* l) + (lexer-read l) + l)) + +(defmethod lexer-reset ((l lexer)) + (setf (pos l) *lexer-start-pos*) + (lexer-peek* l) + (lexer-read l)) + +(defmethod lexer-read ((l lexer)) + (prog1 (setf (ch l) (peek-ch l)) + (incf (pos l)) + (lexer-peek* l))) + +(defmethod lexer-peek* ((l lexer)) + (setf (peek-ch l) + (if (< (+ 1 (pos l)) (len l)) + (char (input l) (+ 1 (pos l))) + *eof*))) + +(defmethod lexer-peek ((l lexer)) + (peek-ch l)) + +(defmethod skip-ws ((l lexer)) + (loop :while (whitespace? (ch l)) + :do (lexer-read l))) + +(defmethod next-token ((l lexer)) + (skip-ws l) + (block nil + (prog1 (case (ch l) + (#\= (cond ((char= #\= (lexer-peek l)) + (lexer-read l) + (make-token token-== "==")) + (t ;;(whitespace? (lexer-peek l)) + (make-token token-= (ch l))))) + (#\+ (make-token token-+ (ch l))) + (#\- (make-token token-- (ch l))) + (#\! (cond ((char= #\= (lexer-peek l)) + (lexer-read l) + (make-token token-!= "!=")) + (t ;; (whitespace? (lexer-peek l)) + (make-token token-! (ch l))))) + (#\/ (make-token token-/ (ch l))) + (#\* (make-token token-* (ch l))) + (#\< (make-token token-< (ch l))) + (#\> (make-token token-> (ch l))) + (#\; (make-token token-semicolon (ch l))) + (#\, (make-token token-comma (ch l))) + (#\( (make-token token-lparen (ch l))) + (#\) (make-token token-rparen (ch l))) + (#\{ (make-token token-lbrace (ch l))) + (#\} (make-token token-rbrace (ch l))) + (otherwise (cond ((eof? (ch l)) + (make-token token-eof "")) + ((letter? (ch l)) + (let ((literal (read-identifier l))) + (return (make-token (lookup-identifier literal) + literal)))) + ((digit? (ch l)) + (return (make-token token-int (read-number l)))) + (t (make-token token-illegal "ILLEGAL"))))) + (lexer-read l)))) + +(defmethod read-number ((l lexer)) + (let ((start (pos l))) + (lexer-read l) + (loop :while (digit? (ch l)) + :do (lexer-read l)) + (make-array (- (pos l) start) :element-type 'character + :displaced-to (input l) + :displaced-index-offset start))) + +(defmethod read-identifier ((l lexer)) + (let ((start (pos l))) + (lexer-read l) + (loop :while (letter? (ch l)) + :do (lexer-read l)) + (make-array (- (pos l) start) :element-type 'character + :displaced-to (input l) + :displaced-index-offset start))) + + +(defmethod lexer-tokens ((l lexer)) + (loop :for token := (next-token l) + :collect token + :until (= token-eof (_type token)))) + diff --git a/src/package.lisp b/src/package.lisp new file mode 100644 index 0000000..aa399d9 --- /dev/null +++ b/src/package.lisp @@ -0,0 +1,2 @@ +(defpackage :monkey + (:use :common-lisp)) diff --git a/src/token.lisp b/src/token.lisp new file mode 100644 index 0000000..7b4f372 --- /dev/null +++ b/src/token.lisp @@ -0,0 +1,83 @@ +(in-package :monkey) + +(eval-always + (defmacro def-tokens (prefix &body tok-def) + (let ((i 0) + (hash-sym (gensym)) + (body (list))) + (loop :for def :in tok-def + :if (consp def) + :do (let ((sym (concat-symbols prefix (car def)))) + (push `(defparameter ,sym ,i) body) + (push `(setf (gethash ,sym ,hash-sym) ,(cadr def)) body)) + :else + :do (push `(defparameter ,(concat-symbols prefix def) ,i) body) + :do (incf i) + :finally (setf body (reverse body))) + `(progn (defparameter ,hash-sym (make-hash-table)) + ,@body + (defparameter ,(concat-symbols prefix 'count) ,i) + (defun as-token (token-type) + (make-token token-type (or (gethash token-type ,hash-sym) ""))))))) + +(def-tokens token + illegal + eof + ident + int + (= "=") + (+ "+") + (- "-") + (! "!") + (* "*") + (/ "/") + (== "==") + (!= "!=") + (< "<") + (> ">") + (comma ",") + (semicolon ";") + (lparen "(") + (rparen ")") + (lbrace "{") + (rbrace "}") + (function "fn") + (let "let") + (true "true") + (false "false") + (if "if") + (else "else") + (return "return")) + + +(defclass token () + ((_type :reader _type :initarg :type) + (literal :reader literal :initarg :literal))) + +(defmethod token= ((t1 token) (t2 token)) + (and (= (_type t1) (_type t2)) + (string= (literal t1) (literal t2)))) + +(defun make-token (type literal) + (make-instance 'token :type type :literal (string literal))) + +(defmethod print-object ((token token) stream) + (print-unreadable-object (token stream :type t :identity t) + (princ (literal token) stream))) + +(defparameter *keywords* + (let ((kw (make-hash-table))) + (setf (gethash "fn" kw) token-function) + (setf (gethash "let" kw) token-let) + (setf (gethash "true" kw) token-true) + (setf (gethash "false" kw) token-false) + (setf (gethash "if" kw) token-if) + (setf (gethash "else" kw) token-else) + (setf (gethash "return" kw) token-return) + kw)) + +(defun lookup-identifier (str) + (multiple-value-bind (val ok) (gethash str *keywords*) + (if (not (null ok)) + val + token-ident))) diff --git a/src/util.lisp b/src/util.lisp new file mode 100644 index 0000000..d58cde8 --- /dev/null +++ b/src/util.lisp @@ -0,0 +1,22 @@ +(in-package :monkey) + +(defun concat-symbols (sym1 sym2) + (intern (format nil "~a-~a" (symbol-name sym1) (symbol-name sym2)))) + +(eval-when (:compile-toplevel :load-toplevel :execute) + (defmacro eval-always (&body body) + `(eval-when (:compile-toplevel :load-toplevel :execute) + ,@body))) + +(defun read-file-contents (file-name) + (let* ((len 0) + (contents + (with-output-to-string (out) + (with-open-file (in file-name :external-format :utf-8) + (loop :with buffer := (make-array 8192 :element-type 'character) + :for n := (read-sequence buffer in) + :while (< 0 n) + :do (incf len n) + (write-sequence buffer out :start 0 :end n)))))) + (values contents len))) + -- cgit v1.2.3